skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +0 -38
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/data/storage.py +50 -7
- sky/jobs/controller.py +5 -6
- sky/jobs/state.py +6 -4
- sky/jobs/utils.py +51 -32
- sky/serve/core.py +30 -5
- sky/serve/replica_managers.py +1 -3
- sky/skylet/job_lib.py +8 -4
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/RECORD +18 -18
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'e71e5a92ccd90a654662121d6f08c4e100377bbf'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250114'
+__version__ = '1.0.0.dev20250116'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -35,7 +35,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -4037,43 +4036,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                              f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
-
-        Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
-
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-        self.run_on_head(
-            handle,
-            code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
-
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
sky/clouds/service_catalog/kubernetes_catalog.py
CHANGED
@@ -115,6 +115,16 @@ def _list_accelerators(
 
     If the user does not have sufficient permissions to list pods in all
     namespaces, the function will return free GPUs as -1.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
+          objects with quantity information.
+        - total_accelerators_capacity: Dict mapping accelerator names to their
+          total capacity in the cluster.
+        - total_accelerators_available: Dict mapping accelerator names to their
+          current availability. Returns -1 for each accelerator if
+          realtime=False or if insufficient permissions.
     """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
@@ -243,6 +253,10 @@ def _list_accelerators(
 
             accelerators_available = accelerator_count - allocated_qty
 
+            # Initialize the entry if it doesn't exist yet
+            if accelerator_name not in total_accelerators_available:
+                total_accelerators_available[accelerator_name] = 0
+
             if accelerators_available >= min_quantity_filter:
                 quantized_availability = min_quantity_filter * (
                     accelerators_available // min_quantity_filter)
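The new guard initializes `total_accelerators_available[accelerator_name]` before the quantity filter runs, so an accelerator whose free count never passes `min_quantity_filter` still shows up as 0 instead of being missing. A minimal, self-contained sketch of that pattern; the data and function name here are hypothetical stand-ins, not the real catalog code:

```python
from typing import Dict

def summarize_availability(free_by_gpu: Dict[str, int],
                           min_quantity_filter: int) -> Dict[str, int]:
    """Aggregate free accelerators, keeping zero-availability entries.

    Without the explicit initialization, a GPU whose free count never
    passes the quantity filter would be absent from the result entirely.
    """
    total_available: Dict[str, int] = {}
    for name, free in free_by_gpu.items():
        # Initialize the entry even if this GPU contributes nothing.
        if name not in total_available:
            total_available[name] = 0
        if free >= min_quantity_filter:
            quantized = min_quantity_filter * (free // min_quantity_filter)
            total_available[name] += quantized
    return total_available

print(summarize_availability({'H100': 1, 'A100': 8}, min_quantity_filter=4))
# {'H100': 0, 'A100': 8} -- H100 appears with 0 rather than being dropped.
```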
sky/data/storage.py
CHANGED
@@ -3968,7 +3968,7 @@ class OciStore(AbstractStore):
 
     def __init__(self,
                  name: str,
-                 source:
+                 source: Optional[SourceType],
                  region: Optional[str] = None,
                  is_sky_managed: Optional[bool] = None,
                  sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3980,53 @@ class OciStore(AbstractStore):
         self.compartment: str
         self.namespace: str
 
-        #
-
+        # Region is from the specified name in <bucket>@<region> format.
+        # Another case is name can also be set by the source, for example:
+        #   /datasets-storage:
+        #       source: oci://RAGData@us-sanjose-1
+        # The name in above mount will be set to RAGData@us-sanjose-1
+        region_in_name = None
+        if name is not None and '@' in name:
+            self._validate_bucket_expr(name)
+            name, region_in_name = name.split('@')
+
+        # Region is from the specified source in oci://<bucket>@<region> format
+        region_in_source = None
+        if isinstance(source,
+                      str) and source.startswith('oci://') and '@' in source:
+            self._validate_bucket_expr(source)
+            source, region_in_source = source.split('@')
+
+        if region_in_name is not None and region_in_source is not None:
+            # This should never happen because name and source will never be
+            # the remote bucket at the same time.
+            assert region_in_name == region_in_source, (
+                f'Mismatch region specified. Region in name {region_in_name}, '
+                f'but region in source is {region_in_source}')
+
+        if region_in_name is not None:
+            region = region_in_name
+        elif region_in_source is not None:
+            region = region_in_source
+
+        # Default region set to what specified in oci config.
+        if region is None:
+            region = oci.get_oci_config()['region']
+
+        # So far from now on, the name and source are canonical, means there
+        # is no region (@<region> suffix) associated with them anymore.
 
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
         # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands
 
+    def _validate_bucket_expr(self, bucket_expr: str):
+        pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+        if not re.match(pattern, bucket_expr):
+            raise ValueError(
+                'The format for the bucket portion is <bucket>@<region> '
+                'when specify a region with a bucket.')
+
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
             if self.source.startswith('oci://'):
@@ -4137,7 +4177,8 @@ class OciStore(AbstractStore):
             sync_command = (
                 'oci os object bulk-upload --no-follow-symlinks --overwrite '
                 f'--bucket-name {self.name} --namespace-name {self.namespace} '
-                f'--src-dir "{base_dir_path}"
+                f'--region {self.region} --src-dir "{base_dir_path}" '
+                f'{includes}')
 
             return sync_command
 
@@ -4157,8 +4198,8 @@ class OciStore(AbstractStore):
             sync_command = (
                 'oci os object bulk-upload --no-follow-symlinks --overwrite '
                 f'--bucket-name {self.name} --namespace-name {self.namespace} '
-                f'--
-                f'{excludes}
+                f'--region {self.region} --object-prefix "{dest_dir_name}" '
+                f'--src-dir "{src_dir_path}" {excludes}')
 
             return sync_command
 
@@ -4289,7 +4330,8 @@ class OciStore(AbstractStore):
         def get_file_download_command(remote_path, local_path):
             download_command = (f'oci os object get --bucket-name {self.name} '
                                 f'--namespace-name {self.namespace} '
-                                f'--
+                                f'--region {self.region} --name {remote_path} '
+                                f'--file {local_path}')
 
             return download_command
 
@@ -4346,6 +4388,7 @@ class OciStore(AbstractStore):
         @oci.with_oci_env
         def get_bucket_delete_command(bucket_name):
             remove_command = (f'oci os bucket delete --bucket-name '
+                              f'--region {self.region} '
                               f'{bucket_name} --empty --force')
 
             return remove_command
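The OciStore constructor now accepts `<bucket>@<region>` in either the store name or an `oci://` source, validates the expression, and strips the suffix before calling the parent constructor. A standalone sketch of the parsing rule; the regex is copied from the diff, while `split_bucket_region` is a hypothetical helper, not SkyPilot API:

```python
import re
from typing import Optional, Tuple

# Pattern from OciStore._validate_bucket_expr in the diff above.
_BUCKET_EXPR = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'

def split_bucket_region(expr: str) -> Tuple[str, Optional[str]]:
    """Split '<bucket>@<region>' into (bucket, region).

    Expressions without '@' pass through unchanged; expressions with '@'
    must match the expected '<bucket>@<region>' format.
    """
    if '@' not in expr:
        return expr, None
    if not re.match(_BUCKET_EXPR, expr):
        raise ValueError('The format for the bucket portion is '
                         '<bucket>@<region> when specifying a region '
                         'with a bucket.')
    bucket, region = expr.split('@')
    return bucket, region

print(split_bucket_region('oci://RAGData@us-sanjose-1'))
# ('oci://RAGData', 'us-sanjose-1')
```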
sky/jobs/controller.py
CHANGED
@@ -256,9 +256,7 @@ class JobsController:
                     task.num_nodes == 1):
                 continue
 
-            if job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            if job_status in job_lib.JobStatus.user_code_failure_states():
                 # Add a grace period before the check of preemption to avoid
                 # false alarm for job failure.
                 time.sleep(5)
@@ -288,9 +286,7 @@ class JobsController:
             if job_status is not None and not job_status.is_terminal():
                 # The multi-node job is still running, continue monitoring.
                 continue
-            elif job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            elif job_status in job_lib.JobStatus.user_code_failure_states():
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.get_job_timestamp(
                     self._backend, cluster_name, get_end_time=True)
@@ -493,6 +489,7 @@ def start(job_id, dag_yaml, retry_until_up):
     """Start the controller."""
     controller_process = None
     cancelling = False
+    task_id = None
     try:
         _handle_signal(job_id)
         # TODO(suquark): In theory, we should make controller process a
@@ -511,6 +508,7 @@ def start(job_id, dag_yaml, retry_until_up):
     except exceptions.ManagedJobUserCancelledError:
         dag, _ = _get_dag_and_name(dag_yaml)
         task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+        assert task_id is not None, job_id
         logger.info(
             f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
         managed_job_state.set_cancelling(
@@ -542,6 +540,7 @@ def start(job_id, dag_yaml, retry_until_up):
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
     if cancelling:
+        assert task_id is not None, job_id  # Since it's set with cancelling
         managed_job_state.set_cancelled(
             job_id=job_id,
             callback_func=managed_job_utils.event_callback_func(
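Initializing `task_id = None` before the `try` block lets the cleanup path assert that the cancellation branch actually ran before the value is used, which also satisfies static type checkers. A minimal sketch of the sentinel-plus-assert pattern; all names here are hypothetical stand-ins for the controller code:

```python
from typing import Optional

def run_with_cancellation(job_id: int) -> None:
    # Sentinel: only assigned if we actually enter the cancellation path.
    task_id: Optional[int] = None
    cancelling = False
    try:
        raise KeyboardInterrupt  # stand-in for ManagedJobUserCancelledError
    except KeyboardInterrupt:
        task_id = 0  # stand-in for get_latest_task_id_status(job_id)
        assert task_id is not None, job_id
        cancelling = True
    finally:
        if cancelling:
            # Safe: task_id was set on the same path that set `cancelling`.
            assert task_id is not None, job_id
            print(f'job {job_id}, task {task_id} cancelled')

run_with_cancellation(1)
```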
sky/jobs/state.py
CHANGED
@@ -620,10 +620,12 @@ def get_latest_task_id_status(
     id_statuses = _get_all_task_ids_statuses(job_id)
     if not id_statuses:
         return None, None
-    task_id, status =
-
-
-
+    task_id, status = next(
+        ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
+        id_statuses[-1],
+    )
+    # Unpack the tuple first, or it triggers a Pylint's bug on recognizing
+    # the return type.
     return task_id, status
 
 
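The rewritten lookup uses `next()` with a default: scan for the first non-terminal task, and fall back to the last `(task_id, status)` pair when every task has finished. A minimal sketch under assumed stand-in types; `Status` and `latest_task` are hypothetical, not the real ManagedJobStatus API:

```python
import enum
from typing import List, Tuple

class Status(enum.Enum):  # stand-in for the managed job status enum
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'

    def is_terminal(self) -> bool:
        return self is Status.SUCCEEDED

def latest_task(id_statuses: List[Tuple[int, Status]]) -> Tuple[int, Status]:
    # First non-terminal task wins; otherwise fall back to the last task.
    task_id, status = next(
        ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
        id_statuses[-1],
    )
    return task_id, status

print(latest_task([(0, Status.SUCCEEDED), (1, Status.RUNNING)]))
# -> (1, Status.RUNNING): the still-running task
print(latest_task([(0, Status.SUCCEEDED), (1, Status.SUCCEEDED)]))
# -> (1, Status.SUCCEEDED): the default, i.e. the last task
```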
sky/jobs/utils.py
CHANGED
@@ -398,32 +398,15 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             job_statuses = backend.get_job_status(handle, stream_logs=False)
             job_status = list(job_statuses.values())[0]
             assert job_status is not None, 'No job found.'
+            assert task_id is not None, job_id
+
             if job_status != job_lib.JobStatus.CANCELLED:
-
-
-
-
-
-
-                        'is finished. Waiting for the next task\'s log '
-                        'to be started.')
-                    # Add a newline to avoid the status display below
-                    # removing the last line of the task output.
-                    print()
-                    status_display.update(
-                        ux_utils.spinner_message(
-                            f'Waiting for the next task: {task_id + 1}'))
-                    status_display.start()
-                    original_task_id = task_id
-                    while True:
-                        task_id, managed_job_status = (
-                            managed_job_state.get_latest_task_id_status(
-                                job_id))
-                        if original_task_id != task_id:
-                            break
-                        time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
-                    continue
-                else:
+                if not follow:
+                    break
+
+                # Logs for retrying failed tasks.
+                if (job_status
+                        in job_lib.JobStatus.user_code_failure_states()):
                     task_specs = managed_job_state.get_task_specs(
                         job_id, task_id)
                     if task_specs.get('max_restarts_on_errors', 0) == 0:
@@ -436,15 +419,51 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                         ux_utils.spinner_message(
                             'Waiting for next restart for the failed task'))
                     status_display.start()
-
-
-
-
-                    if
-
-
+
+                    def is_managed_job_status_updated(
+                            status: Optional[managed_job_state.ManagedJobStatus]
+                    ) -> bool:
+                        """Check if local managed job status reflects remote
+                        job failure.
+
+                        Ensures synchronization between remote cluster
+                        failure detection (JobStatus.FAILED) and controller
+                        retry logic.
+                        """
+                        return (status !=
+                                managed_job_state.ManagedJobStatus.RUNNING)
+
+                    while not is_managed_job_status_updated(
+                            managed_job_status :=
+                            managed_job_state.get_status(job_id)):
                         time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                     continue
+
+                if task_id == num_tasks - 1:
+                    break
+
+                # The log for the current job is finished. We need to
+                # wait until next job to be started.
+                logger.debug(
+                    f'INFO: Log for the current task ({task_id}) '
+                    'is finished. Waiting for the next task\'s log '
+                    'to be started.')
+                # Add a newline to avoid the status display below
+                # removing the last line of the task output.
+                print()
+                status_display.update(
+                    ux_utils.spinner_message(
+                        f'Waiting for the next task: {task_id + 1}'))
+                status_display.start()
+                original_task_id = task_id
+                while True:
+                    task_id, managed_job_status = (
+                        managed_job_state.get_latest_task_id_status(job_id))
+                    if original_task_id != task_id:
+                        break
+                    time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                continue
+
             # The job can be cancelled by the user or the controller (when
             # the cluster is partially preempted).
             logger.debug(
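The new wait loop combines an assignment expression with its predicate, so the status fetched on each poll is both tested and retained for use after the loop. A hedged sketch of that shape (Python 3.8+); `fetch_status` and the string statuses are stand-ins, not the real `managed_job_state` API:

```python
from typing import Optional

_statuses = iter(['RUNNING', 'RUNNING', 'RECOVERING'])

def fetch_status() -> Optional[str]:
    """Hypothetical stand-in for managed_job_state.get_status(job_id)."""
    return next(_statuses)

def status_updated(status: Optional[str]) -> bool:
    # Mirrors is_managed_job_status_updated: anything but RUNNING means the
    # controller has noticed the remote failure and moved the job along.
    return status != 'RUNNING'

# Poll until the controller-side status reflects the remote failure. The
# walrus operator keeps the last fetched value in `managed_job_status`.
while not status_updated(managed_job_status := fetch_status()):
    pass  # in the real loop: time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)

print(managed_job_status)  # 'RECOVERING'
```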
sky/serve/core.py
CHANGED
@@ -1,6 +1,9 @@
 """SkyServe core APIs."""
 import re
+import signal
+import subprocess
 import tempfile
+import threading
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import colorama
@@ -18,6 +21,7 @@ from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
+from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import resources_utils
@@ -731,8 +735,29 @@ def tail_logs(
 
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-
-
-
-
+
+    if target != serve_utils.ServiceComponent.REPLICA:
+        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
+            service_name,
+            stream_controller=(
+                target == serve_utils.ServiceComponent.CONTROLLER),
+            follow=follow)
+    else:
+        assert replica_id is not None, service_name
+        code = serve_utils.ServeCodeGen.stream_replica_logs(
+            service_name, replica_id, follow)
+
+    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+    # kill the process, so we need to handle it manually here.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+    # Refer to the notes in
+    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+    backend.run_on_head(handle,
                         code,
                         stream_logs=True,
                         process_stream=False,
                         ssh_mode=command_runner.SshMode.INTERACTIVE,
                         stdin=subprocess.DEVNULL)
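The relocated `tail_logs` guards its `signal.signal()` calls with a main-thread check, since CPython raises `ValueError` when a handler is installed from any other thread. A small sketch of that guard; `install_interrupt_handlers` is a hypothetical helper, not SkyPilot API:

```python
import signal
import threading

def install_interrupt_handlers(handler) -> bool:
    """Install SIGINT/SIGTSTP handlers only when it is legal to do so.

    signal.signal() raises ValueError if invoked from a non-main thread,
    so callers running inside worker threads simply skip installation.
    """
    if threading.current_thread() is not threading.main_thread():
        return False
    signal.signal(signal.SIGINT, handler)
    # SIGTSTP does not exist on Windows; hedge with a hasattr check.
    if hasattr(signal, 'SIGTSTP'):
        signal.signal(signal.SIGTSTP, handler)
    return True

def _handler(signum, frame):
    raise KeyboardInterrupt

print(install_interrupt_handlers(_handler))  # True when run on the main thread
```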
sky/serve/replica_managers.py
CHANGED
@@ -998,9 +998,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                     # Re-raise the exception if it is not preempted.
                     raise
             job_status = list(job_statuses.values())[0]
-            if job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
                                                   info.replica_id, info)
sky/skylet/job_lib.py
CHANGED
@@ -12,7 +12,7 @@ import signal
 import sqlite3
 import subprocess
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Sequence
 
 import colorama
 import filelock
@@ -162,13 +162,17 @@ class JobStatus(enum.Enum):
     def nonterminal_statuses(cls) -> List['JobStatus']:
         return [cls.INIT, cls.SETTING_UP, cls.PENDING, cls.RUNNING]
 
-    def is_terminal(self):
+    def is_terminal(self) -> bool:
         return self not in self.nonterminal_statuses()
 
-
+    @classmethod
+    def user_code_failure_states(cls) -> Sequence['JobStatus']:
+        return (cls.FAILED, cls.FAILED_SETUP)
+
     def __lt__(self, other: 'JobStatus') -> bool:
         return list(JobStatus).index(self) < list(JobStatus).index(other)
 
-    def colored_str(self):
+    def colored_str(self) -> str:
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
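`user_code_failure_states()` gives the call sites in controller.py, utils.py, and replica_managers.py a single definition of "the user's code failed", replacing the repeated `[FAILED, FAILED_SETUP]` literals. A condensed sketch of the enum pattern, with the statuses trimmed to the relevant ones:

```python
import enum
from typing import Sequence

class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'
    FAILED_SETUP = 'FAILED_SETUP'

    @classmethod
    def user_code_failure_states(cls) -> Sequence['JobStatus']:
        # Single source of truth for "the user's code crashed".
        return (cls.FAILED, cls.FAILED_SETUP)

# Call sites test membership instead of re-spelling the list:
status = JobStatus.FAILED_SETUP
if status in JobStatus.user_code_failure_states():
    print('user code failed')
```

Keeping the tuple behind a classmethod means adding a new failure state later only touches one place.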
sky/utils/db_utils.py
CHANGED
@@ -4,11 +4,27 @@ import sqlite3
 import threading
 from typing import Any, Callable, Optional
 
+# This parameter (passed to sqlite3.connect) controls how long we will wait to
+# obtain a database lock (not necessarily during connection, but whenever it is
+# needed). It is not a connection timeout.
+# Even in WAL mode, only a single writer is allowed at a time. Other writers
+# will block until the write lock can be obtained. This behavior is described in
+# the SQLite documentation for WAL: https://www.sqlite.org/wal.html
+# Python's default timeout is 5s. In normal usage, lock contention is very low,
+# and this is more than sufficient. However, in some highly concurrent cases,
+# such as a jobs controller suddenly recovering thousands of jobs at once, we
+# can see a small number of processes that take much longer to obtain the lock.
+# In contrived highly contentious cases, around 0.1% of transactions will take
+# >30s to take the lock. We have not seen cases that take >60s. For cases up to
+# 1000x parallelism, this is thus thought to be a conservative setting.
+# For more info, see the PR description for #4552.
+_DB_TIMEOUT_S = 60
+
 
 @contextlib.contextmanager
 def safe_cursor(db_path: str):
     """A newly created, auto-committing, auto-closing cursor."""
-    conn = sqlite3.connect(db_path)
+    conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
     cursor = conn.cursor()
     try:
         yield cursor
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
     def __init__(self, db_path: str, create_table: Callable):
         super().__init__()
         self.db_path = db_path
-
-        # errors. This is a hack, but it works.
-        self.conn = sqlite3.connect(db_path, timeout=10)
+        self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
         self.cursor = self.conn.cursor()
         create_table(self.cursor, self.conn)
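As the new comment explains, sqlite3's `timeout` is a busy timeout: it bounds how long each statement waits to acquire the database lock, and is unrelated to opening the connection. A minimal sketch demonstrating the parameter; the paths are temporary and `_DB_TIMEOUT_S` simply mirrors the new constant:

```python
import os
import sqlite3
import tempfile

_DB_TIMEOUT_S = 60  # matches the new module-level constant

with tempfile.TemporaryDirectory() as d:
    db_path = os.path.join(d, 'state.db')
    # The timeout applies to lock acquisition on every statement; if another
    # connection holds the write lock for longer than this, the statement
    # raises sqlite3.OperationalError ('database is locked').
    conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
    conn.execute('CREATE TABLE IF NOT EXISTS jobs (id INTEGER PRIMARY KEY)')
    conn.commit()
    conn.close()
```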
sky/utils/kubernetes/deploy_remote_cluster.sh
CHANGED
@@ -93,11 +93,11 @@ cleanup_agent_node() {
 
 check_gpu() {
     local NODE_IP=$1
-    run_remote "$NODE_IP" "
-
-
-
-
+    if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
+        return 0  # GPU detected
+    else
+        return 1  # No GPU detected
+    fi
 }
 
 # Pre-flight checks
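The rewritten `check_gpu` wraps the remote `nvidia-smi` probe in an explicit `if`/`return`. A rough Python analogue of the same probe, for comparison only; this is a hypothetical helper that assumes passwordless SSH and a bash-compatible shell on the node:

```python
import subprocess

def check_gpu(node_ip: str) -> bool:
    """Return True if nvidia-smi exists and can enumerate GPUs on the node.

    Rough Python analogue of the script's check_gpu(); not part of SkyPilot.
    """
    probe = ('command -v nvidia-smi &> /dev/null && '
             'nvidia-smi --query-gpu=gpu_name --format=csv,noheader '
             '&> /dev/null')
    result = subprocess.run(['ssh', node_ip, probe],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL,
                            check=False)
    return result.returncode == 0
```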
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250114
+Version: 1.0.0.dev20250116
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -351,6 +351,8 @@ Read the research:
 - [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
 - [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)
 
+SkyPilot was initially started at the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley and has since gained many industry contributors. Read more about the project's origin [here](https://docs.skypilot.co/en/latest/sky-computing.html).
+
 ## Support and Questions
 We are excited to hear your feedback!
 * For issues and feature requests, please [open a GitHub issue](https://github.com/skypilot-org/skypilot/issues/new).
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=NOjVo4cLFc0FCj6F1rqnD041xOezBxAjAjjUQqKcaqE,5944
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
 sky/check.py,sha256=s8deMVL-k9y8gd519K7NWZc3DqWsEySwiAr0uH3Vvcc,9459
@@ -32,7 +32,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=iBs5gnMaaUoH2OIQ3xhAjWdrJWqj8T61Za9TGsBFpvQ,7515
 sky/backends/backend_utils.py,sha256=Eeew8YV0VYSYxozqzadNMZrjhEMjlE3yuzTRP7YSl50,137348
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/cloud_vm_ray_backend.py,sha256=rW8YHJsnYwefIXRdIAAiDWEh9NUV7GZ89pmT4iMq0zY,245876
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
 sky/backends/wheel_utils.py,sha256=5BUzBqfYz7p1ME6_0PXGmcsAkLVb8NrFt317p7a4X8s,8278
@@ -68,7 +68,7 @@ sky/clouds/service_catalog/do_catalog.py,sha256=Cug2QaQlSN6nFhba7f1ksyzs6z0ICTj6
 sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
 sky/clouds/service_catalog/gcp_catalog.py,sha256=jJEfWjZ4ItsE657LjIf9mruJVZERFegCD5Qtu20AFNc,24542
 sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
-sky/clouds/service_catalog/kubernetes_catalog.py,sha256=
+sky/clouds/service_catalog/kubernetes_catalog.py,sha256=449eTIw-ZIwliMWGPx6ENAYuX8nW2M4kO4mh5V3cea4,13268
 sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
 sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
 sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
@@ -94,15 +94,15 @@ sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
 sky/data/data_transfer.py,sha256=wixC4_3_JaeJFdGKOp-O5ulcsMugDSgrCR0SnPpugGc,8946
 sky/data/data_utils.py,sha256=HjcgMDuWRR_fNQ9gjuROi9GgPVvTGApiJwxGtdb2_UU,28860
 sky/data/mounting_utils.py,sha256=tJHBPEDP1Wg_r3oSGBwFhMDLnPCMPSFRz26O0QkDd0Y,14908
-sky/data/storage.py,sha256=
+sky/data/storage.py,sha256=jOo3veWVL8JMTP2SVmcsXt-ZpfNbReWdOvEgoeCZIic,203768
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
 sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
-sky/jobs/controller.py,sha256=
+sky/jobs/controller.py,sha256=nJKd_5cPJ14RPm_Gg-JgPquj5W9zfVQ1m6hJxxTKNpE,28577
 sky/jobs/core.py,sha256=AVbboohNCUDqfK_7DDkc-wJOg87nE7L6Vw0wbPTelIA,20022
 sky/jobs/recovery_strategy.py,sha256=eP9CLy5qiNTyMJTWWzAxdQ4YolUZWL1g3cLMH7tw8Es,27312
-sky/jobs/state.py,sha256=
-sky/jobs/utils.py,sha256=
+sky/jobs/state.py,sha256=CaOzoU0mPiXwioyupXol0XsNJsvDC8ApgDyKKE_fIRs,27694
+sky/jobs/utils.py,sha256=0HlO8H1hzTr40XK7xJXseMoeIMQYA01qVuPAuEQFgAE,39596
 sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -190,10 +190,10 @@ sky/serve/__init__.py,sha256=Bqw8nB9u1QF3ryjbV797SPZq0DWAcjT94E_5B8J24ag,1808
 sky/serve/autoscalers.py,sha256=N7yRGT9Ay5_yJUOkqaBGC7jG3eIdzA5d66i8kskGxZc,30351
 sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
 sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
-sky/serve/core.py,sha256=
+sky/serve/core.py,sha256=UAbbnxmOZ8GBT7vaeFvtFC7_qXu05TFsNIFcLrdT3Oo,33341
 sky/serve/load_balancer.py,sha256=nNvDPJPRIrBc_qsBYJz1zzKa_fXDgfi0VDUf4SJEuW8,12990
 sky/serve/load_balancing_policies.py,sha256=XVj76qBgqh7h6wfx53RKQFzBefDWTE4TCdCEtFLLtI4,5398
-sky/serve/replica_managers.py,sha256=
+sky/serve/replica_managers.py,sha256=SFvK7ewilc3NVRcqXg63WtU1WmhJKPtJd27JfKR2aow,57716
 sky/serve/serve_state.py,sha256=MAx63zlGOXaIgXedP9fUFlRxDKiez1shmyMetrJK6yQ,19756
 sky/serve/serve_utils.py,sha256=WgPcqEw3WyMOdgRTFg8DSsWyIG1xnRbRkI1-f09tNKg,39741
 sky/serve/service.py,sha256=7bvK9R9D48PZSYcOKSievXQ2mHUMk1d3AAIxtra7WOI,12083
@@ -208,7 +208,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
 sky/skylet/constants.py,sha256=1h5nhXsAvryo9THpfQ0wQKPSDjXcY9GeN6oX378yAyM,16021
 sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
-sky/skylet/job_lib.py,sha256=
+sky/skylet/job_lib.py,sha256=Vp8rpRYioKYEwV9NkMArMfw-uOUaJMy8eQ2sUwZy-Kc,44014
 sky/skylet/log_lib.py,sha256=fcQzEe4OK8exsNVBhbdYe4uIq2cdSHszsKZTtX8a3-Q,20453
 sky/skylet/log_lib.pyi,sha256=VpA_VoL970Noj-YrBkKqLxFi34JVMY7KLrOQ3o4AqEI,4336
 sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -264,7 +264,7 @@ sky/utils/common_utils.py,sha256=Kh0iymQl9I4HXxYSc3TTcv-xeso27pU_1hGNOc9Xw2o,253
 sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR26H5I,1416
 sky/utils/controller_utils.py,sha256=g4wvp6BrXUcwjRbMvy_LBtZPMPOzHXeRWyEoXORoZrU,44381
 sky/utils/dag_utils.py,sha256=R1yhJssvzDg13p6PJIC8OkYFBiR64eIx5xQeRpAG9n4,6099
-sky/utils/db_utils.py,sha256=
+sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
 sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
 sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
 sky/utils/log_utils.py,sha256=xEbUZfDiIiZkyWoLHXwIcqVMCBDEENsLCiogEXMDLt0,14139
@@ -280,7 +280,7 @@ sky/utils/cli_utils/status_utils.py,sha256=2HrH6IBJCJ__AbuZ0ooIEgarBKIVIA5M3myE5
 sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
 sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
-sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=
+sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=dj_q2LHFgq03bXJWJhtMFFWOpcWnWAYKfFQyMv7Gr5A,8551
 sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
 sky/utils/kubernetes/generate_kubeconfig.sh,sha256=MBvXJio0PeujZSCXiRKE_pa6HCTiU9qBzR1WrXccVSY,10477
 sky/utils/kubernetes/gpu_labeler.py,sha256=4px7FyfsukacPEvKwTLUNb3WwacMIUrHWjP93qTi3kE,6998
@@ -288,9 +288,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250114.dist-info/LICENSE,sha256=
-skypilot_nightly-1.0.0.dev20250114.dist-info/METADATA,sha256=
-skypilot_nightly-1.0.0.dev20250114.dist-info/WHEEL,sha256=
-skypilot_nightly-1.0.0.dev20250114.dist-info/entry_points.txt,sha256=
-skypilot_nightly-1.0.0.dev20250114.dist-info/top_level.txt,sha256=
-skypilot_nightly-1.0.0.dev20250114.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250116.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250116.dist-info/METADATA,sha256=8MvuzpqLuZac2S1Dx80sAsED2p3DgJo6EYa3YpI4GSU,20884
+skypilot_nightly-1.0.0.dev20250116.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+skypilot_nightly-1.0.0.dev20250116.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250116.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250116.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250116.dist-info}/top_level.txt
RENAMED
File without changes