skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py
CHANGED
@@ -206,6 +206,9 @@ def _get_cloud_dependencies_installation_commands(
|
|
206
206
|
# installed, so we don't check that.
|
207
207
|
python_packages: Set[str] = set()
|
208
208
|
|
209
|
+
# add flask to the controller dependencies for dashboard
|
210
|
+
python_packages.add('flask')
|
211
|
+
|
209
212
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
210
213
|
commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
|
211
214
|
f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
|
@@ -649,10 +652,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
649
652
|
still sync up any storage mounts with local source paths (which do not
|
650
653
|
undergo translation).
|
651
654
|
"""
|
655
|
+
|
652
656
|
# ================================================================
|
653
657
|
# Translate the workdir and local file mounts to cloud file mounts.
|
654
658
|
# ================================================================
|
655
659
|
|
660
|
+
def _sub_path_join(sub_path: Optional[str], path: str) -> str:
|
661
|
+
if sub_path is None:
|
662
|
+
return path
|
663
|
+
return os.path.join(sub_path, path).strip('/')
|
664
|
+
|
665
|
+
def assert_no_bucket_creation(store: storage_lib.AbstractStore) -> None:
|
666
|
+
if store.is_sky_managed:
|
667
|
+
# Bucket was created, this should not happen since the user configured
|
668
|
+
# the bucket and we assumed it already exists.
|
669
|
+
store.delete()
|
670
|
+
with ux_utils.print_exception_no_traceback():
|
671
|
+
raise exceptions.StorageBucketCreateError(
|
672
|
+
f'Jobs bucket {store.name!r} does not exist. '
|
673
|
+
'Please check jobs.bucket configuration in '
|
674
|
+
'your SkyPilot config.')
|
675
|
+
|
656
676
|
run_id = common_utils.get_usage_run_id()[:8]
|
657
677
|
original_file_mounts = task.file_mounts if task.file_mounts else {}
|
658
678
|
original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
|
@@ -679,11 +699,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
679
699
|
ux_utils.spinner_message(
|
680
700
|
f'Translating {msg} to SkyPilot Storage...'))
|
681
701
|
|
702
|
+
# Get the bucket name for the workdir and file mounts,
|
703
|
+
# we store all these files in the same bucket from config.
|
704
|
+
bucket_wth_prefix = skypilot_config.get_nested(('jobs', 'bucket'), None)
|
705
|
+
store_kwargs: Dict[str, Any] = {}
|
706
|
+
if bucket_wth_prefix is None:
|
707
|
+
store_type = store_cls = sub_path = None
|
708
|
+
storage_account_name = region = None
|
709
|
+
bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
|
710
|
+
username=common_utils.get_cleaned_username(), id=run_id)
|
711
|
+
else:
|
712
|
+
store_type, store_cls, bucket_name, sub_path, storage_account_name, \
|
713
|
+
region = storage_lib.StoreType.get_fields_from_store_url(
|
714
|
+
bucket_wth_prefix)
|
715
|
+
if storage_account_name is not None:
|
716
|
+
store_kwargs['storage_account_name'] = storage_account_name
|
717
|
+
if region is not None:
|
718
|
+
store_kwargs['region'] = region
|
719
|
+
|
682
720
|
# Step 1: Translate the workdir to SkyPilot storage.
|
683
721
|
new_storage_mounts = {}
|
684
722
|
if task.workdir is not None:
|
685
|
-
bucket_name = constants.WORKDIR_BUCKET_NAME.format(
|
686
|
-
username=common_utils.get_cleaned_username(), id=run_id)
|
687
723
|
workdir = task.workdir
|
688
724
|
task.workdir = None
|
689
725
|
if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
|
@@ -691,14 +727,28 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
691
727
|
raise ValueError(
|
692
728
|
f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the '
|
693
729
|
'workdir and file_mounts contains it as the target.')
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
730
|
+
bucket_sub_path = _sub_path_join(
|
731
|
+
sub_path,
|
732
|
+
constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))
|
733
|
+
stores = None
|
734
|
+
if store_type is not None:
|
735
|
+
assert store_cls is not None
|
736
|
+
with sky_logging.silent():
|
737
|
+
stores = {
|
738
|
+
store_type: store_cls(name=bucket_name,
|
739
|
+
source=workdir,
|
740
|
+
_bucket_sub_path=bucket_sub_path,
|
741
|
+
**store_kwargs)
|
742
|
+
}
|
743
|
+
assert_no_bucket_creation(stores[store_type])
|
744
|
+
|
745
|
+
storage_obj = storage_lib.Storage(name=bucket_name,
|
746
|
+
source=workdir,
|
747
|
+
persistent=False,
|
748
|
+
mode=storage_lib.StorageMode.COPY,
|
749
|
+
stores=stores,
|
750
|
+
_bucket_sub_path=bucket_sub_path)
|
751
|
+
new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj
|
702
752
|
# Check of the existence of the workdir in file_mounts is done in
|
703
753
|
# the task construction.
|
704
754
|
logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
|
@@ -716,27 +766,37 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
716
766
|
if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
|
717
767
|
copy_mounts_with_file_in_src[dst] = src
|
718
768
|
continue
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
769
|
+
bucket_sub_path = _sub_path_join(
|
770
|
+
sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))
|
771
|
+
stores = None
|
772
|
+
if store_type is not None:
|
773
|
+
assert store_cls is not None
|
774
|
+
with sky_logging.silent():
|
775
|
+
store = store_cls(name=bucket_name,
|
776
|
+
source=src,
|
777
|
+
_bucket_sub_path=bucket_sub_path,
|
778
|
+
**store_kwargs)
|
779
|
+
|
780
|
+
stores = {store_type: store}
|
781
|
+
assert_no_bucket_creation(stores[store_type])
|
782
|
+
storage_obj = storage_lib.Storage(name=bucket_name,
|
783
|
+
source=src,
|
784
|
+
persistent=False,
|
785
|
+
mode=storage_lib.StorageMode.COPY,
|
786
|
+
stores=stores,
|
787
|
+
_bucket_sub_path=bucket_sub_path)
|
788
|
+
new_storage_mounts[dst] = storage_obj
|
729
789
|
logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
|
730
790
|
f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
|
731
791
|
|
732
792
|
# Step 3: Translate local file mounts with file in src to SkyPilot storage.
|
733
793
|
# Hard link the files in src to a temporary directory, and upload folder.
|
794
|
+
file_mounts_tmp_subpath = _sub_path_join(
|
795
|
+
sub_path, constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id))
|
734
796
|
local_fm_path = os.path.join(
|
735
797
|
tempfile.gettempdir(),
|
736
798
|
constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
|
737
799
|
os.makedirs(local_fm_path, exist_ok=True)
|
738
|
-
file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format(
|
739
|
-
username=common_utils.get_cleaned_username(), id=run_id)
|
740
800
|
file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
|
741
801
|
path)
|
742
802
|
if copy_mounts_with_file_in_src:
|
@@ -745,14 +805,27 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
745
805
|
src_to_file_id[src] = i
|
746
806
|
os.link(os.path.abspath(os.path.expanduser(src)),
|
747
807
|
os.path.join(local_fm_path, f'file-{i}'))
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
808
|
+
stores = None
|
809
|
+
if store_type is not None:
|
810
|
+
assert store_cls is not None
|
811
|
+
with sky_logging.silent():
|
812
|
+
stores = {
|
813
|
+
store_type: store_cls(
|
814
|
+
name=bucket_name,
|
815
|
+
source=local_fm_path,
|
816
|
+
_bucket_sub_path=file_mounts_tmp_subpath,
|
817
|
+
**store_kwargs)
|
818
|
+
}
|
819
|
+
assert_no_bucket_creation(stores[store_type])
|
820
|
+
storage_obj = storage_lib.Storage(
|
821
|
+
name=bucket_name,
|
822
|
+
source=local_fm_path,
|
823
|
+
persistent=False,
|
824
|
+
mode=storage_lib.StorageMode.MOUNT,
|
825
|
+
stores=stores,
|
826
|
+
_bucket_sub_path=file_mounts_tmp_subpath)
|
827
|
+
|
828
|
+
new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
|
756
829
|
if file_mount_remote_tmp_dir in original_storage_mounts:
|
757
830
|
with ux_utils.print_exception_no_traceback():
|
758
831
|
raise ValueError(
|
@@ -762,8 +835,9 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
762
835
|
sources = list(src_to_file_id.keys())
|
763
836
|
sources_str = '\n '.join(sources)
|
764
837
|
logger.info(f' {colorama.Style.DIM}Files (listed below) '
|
765
|
-
f' -> storage: {
|
838
|
+
f' -> storage: {bucket_name}:'
|
766
839
|
f'\n {sources_str}{colorama.Style.RESET_ALL}')
|
840
|
+
|
767
841
|
rich_utils.force_update_status(
|
768
842
|
ux_utils.spinner_message('Uploading translated local files/folders'))
|
769
843
|
task.update_storage_mounts(new_storage_mounts)
|
@@ -779,7 +853,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
779
853
|
ux_utils.spinner_message('Uploading local sources to storage[/] '
|
780
854
|
'[dim]View storages: sky storage ls'))
|
781
855
|
try:
|
782
|
-
task.sync_storage_mounts()
|
856
|
+
task.sync_storage_mounts(force_sync=bucket_wth_prefix is not None)
|
783
857
|
except (ValueError, exceptions.NoCloudAccessError) as e:
|
784
858
|
if 'No enabled cloud for storage' in str(e) or isinstance(
|
785
859
|
e, exceptions.NoCloudAccessError):
|
@@ -809,10 +883,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
809
883
|
# file_mount_remote_tmp_dir will only exist when there are files in
|
810
884
|
# the src for copy mounts.
|
811
885
|
storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
|
812
|
-
|
813
|
-
store_object = storage_obj.stores[
|
886
|
+
curr_store_type = list(storage_obj.stores.keys())[0]
|
887
|
+
store_object = storage_obj.stores[curr_store_type]
|
814
888
|
bucket_url = storage_lib.StoreType.get_endpoint_url(
|
815
|
-
store_object,
|
889
|
+
store_object, bucket_name)
|
890
|
+
bucket_url += f'/{file_mounts_tmp_subpath}'
|
816
891
|
for dst, src in copy_mounts_with_file_in_src.items():
|
817
892
|
file_id = src_to_file_id[src]
|
818
893
|
new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
|
@@ -829,8 +904,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
829
904
|
store_types = list(storage_obj.stores.keys())
|
830
905
|
assert len(store_types) == 1, (
|
831
906
|
'We only support one store type for now.', storage_obj.stores)
|
832
|
-
|
833
|
-
store_object = storage_obj.stores[
|
907
|
+
curr_store_type = store_types[0]
|
908
|
+
store_object = storage_obj.stores[curr_store_type]
|
834
909
|
storage_obj.source = storage_lib.StoreType.get_endpoint_url(
|
835
910
|
store_object, storage_obj.name)
|
836
911
|
storage_obj.force_delete = True
|
@@ -847,8 +922,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
847
922
|
store_types = list(storage_obj.stores.keys())
|
848
923
|
assert len(store_types) == 1, (
|
849
924
|
'We only support one store type for now.', storage_obj.stores)
|
850
|
-
|
851
|
-
store_object = storage_obj.stores[
|
925
|
+
curr_store_type = store_types[0]
|
926
|
+
store_object = storage_obj.stores[curr_store_type]
|
852
927
|
source = storage_lib.StoreType.get_endpoint_url(
|
853
928
|
store_object, storage_obj.name)
|
854
929
|
new_storage = storage_lib.Storage.from_yaml_config({
|
sky/utils/db_utils.py
CHANGED
@@ -4,11 +4,27 @@ import sqlite3
|
|
4
4
|
import threading
|
5
5
|
from typing import Any, Callable, Optional
|
6
6
|
|
7
|
+
# This parameter (passed to sqlite3.connect) controls how long we will wait to
|
8
|
+
# obtain a database lock (not necessarily during connection, but whenever it is
|
9
|
+
# needed). It is not a connection timeout.
|
10
|
+
# Even in WAL mode, only a single writer is allowed at a time. Other writers
|
11
|
+
# will block until the write lock can be obtained. This behavior is described in
|
12
|
+
# the SQLite documentation for WAL: https://www.sqlite.org/wal.html
|
13
|
+
# Python's default timeout is 5s. In normal usage, lock contention is very low,
|
14
|
+
# and this is more than sufficient. However, in some highly concurrent cases,
|
15
|
+
# such as a jobs controller suddenly recovering thousands of jobs at once, we
|
16
|
+
# can see a small number of processes that take much longer to obtain the lock.
|
17
|
+
# In contrived highly contentious cases, around 0.1% of transactions will take
|
18
|
+
# >30s to take the lock. We have not seen cases that take >60s. For cases up to
|
19
|
+
# 1000x parallelism, this is thus thought to be a conservative setting.
|
20
|
+
# For more info, see the PR description for #4552.
|
21
|
+
_DB_TIMEOUT_S = 60
|
22
|
+
|
7
23
|
|
8
24
|
@contextlib.contextmanager
|
9
25
|
def safe_cursor(db_path: str):
|
10
26
|
"""A newly created, auto-committing, auto-closing cursor."""
|
11
|
-
conn = sqlite3.connect(db_path)
|
27
|
+
conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
|
12
28
|
cursor = conn.cursor()
|
13
29
|
try:
|
14
30
|
yield cursor
|
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
|
|
79
95
|
def __init__(self, db_path: str, create_table: Callable):
|
80
96
|
super().__init__()
|
81
97
|
self.db_path = db_path
|
82
|
-
|
83
|
-
# errors. This is a hack, but it works.
|
84
|
-
self.conn = sqlite3.connect(db_path, timeout=10)
|
98
|
+
self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
|
85
99
|
self.cursor = self.conn.cursor()
|
86
100
|
create_table(self.cursor, self.conn)
|
@@ -93,11 +93,11 @@ cleanup_agent_node() {
|
|
93
93
|
|
94
94
|
check_gpu() {
|
95
95
|
local NODE_IP=$1
|
96
|
-
run_remote "$NODE_IP" "
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
96
|
+
if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
|
97
|
+
return 0 # GPU detected
|
98
|
+
else
|
99
|
+
return 1 # No GPU detected
|
100
|
+
fi
|
101
101
|
}
|
102
102
|
|
103
103
|
# Pre-flight checks
|
sky/utils/log_utils.py
CHANGED
sky/utils/resources_utils.py
CHANGED
@@ -137,31 +137,35 @@ def simplify_ports(ports: List[str]) -> List[str]:
|
|
137
137
|
return port_set_to_ranges(port_ranges_to_set(ports))
|
138
138
|
|
139
139
|
|
140
|
+
def format_resource(resource: 'resources_lib.Resources',
|
141
|
+
simplify: bool = False) -> str:
|
142
|
+
if simplify:
|
143
|
+
cloud = resource.cloud
|
144
|
+
if resource.accelerators is None:
|
145
|
+
vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
|
146
|
+
resource.instance_type)
|
147
|
+
hardware = f'vCPU={int(vcpu)}'
|
148
|
+
else:
|
149
|
+
hardware = f'{resource.accelerators}'
|
150
|
+
spot = '[Spot]' if resource.use_spot else ''
|
151
|
+
return f'{cloud}({spot}{hardware})'
|
152
|
+
else:
|
153
|
+
# accelerator_args is way too long.
|
154
|
+
# Convert from:
|
155
|
+
# GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
|
156
|
+
# to:
|
157
|
+
# GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
|
158
|
+
pattern = ', accelerator_args={.*}'
|
159
|
+
launched_resource_str = re.sub(pattern, '...', str(resource))
|
160
|
+
return launched_resource_str
|
161
|
+
|
162
|
+
|
140
163
|
def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
|
141
164
|
simplify: bool = False) -> str:
|
142
165
|
if (handle.launched_nodes is not None and
|
143
166
|
handle.launched_resources is not None):
|
144
|
-
|
145
|
-
|
146
|
-
if handle.launched_resources.accelerators is None:
|
147
|
-
vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
|
148
|
-
handle.launched_resources.instance_type)
|
149
|
-
hardware = f'vCPU={int(vcpu)}'
|
150
|
-
else:
|
151
|
-
hardware = f'{handle.launched_resources.accelerators}'
|
152
|
-
spot = '[Spot]' if handle.launched_resources.use_spot else ''
|
153
|
-
return f'{handle.launched_nodes}x {cloud}({spot}{hardware})'
|
154
|
-
else:
|
155
|
-
launched_resource_str = str(handle.launched_resources)
|
156
|
-
# accelerator_args is way too long.
|
157
|
-
# Convert from:
|
158
|
-
# GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
|
159
|
-
# to:
|
160
|
-
# GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
|
161
|
-
pattern = ', accelerator_args={.*}'
|
162
|
-
launched_resource_str = re.sub(pattern, '...',
|
163
|
-
launched_resource_str)
|
164
|
-
return f'{handle.launched_nodes}x {launched_resource_str}'
|
167
|
+
return (f'{handle.launched_nodes}x '
|
168
|
+
f'{format_resource(handle.launched_resources, simplify)}')
|
165
169
|
return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
|
166
170
|
|
167
171
|
|
sky/utils/schemas.py
CHANGED
@@ -299,6 +299,12 @@ def get_storage_schema():
|
|
299
299
|
mode.value for mode in storage.StorageMode
|
300
300
|
]
|
301
301
|
},
|
302
|
+
'_is_sky_managed': {
|
303
|
+
'type': 'boolean',
|
304
|
+
},
|
305
|
+
'_bucket_sub_path': {
|
306
|
+
'type': 'string',
|
307
|
+
},
|
302
308
|
'_force_delete': {
|
303
309
|
'type': 'boolean',
|
304
310
|
}
|
@@ -390,6 +396,19 @@ def get_service_schema():
|
|
390
396
|
'case_insensitive_enum': list(
|
391
397
|
load_balancing_policies.LB_POLICIES.keys())
|
392
398
|
},
|
399
|
+
'tls': {
|
400
|
+
'type': 'object',
|
401
|
+
'required': ['keyfile', 'certfile'],
|
402
|
+
'additionalProperties': False,
|
403
|
+
'properties': {
|
404
|
+
'keyfile': {
|
405
|
+
'type': 'string',
|
406
|
+
},
|
407
|
+
'certfile': {
|
408
|
+
'type': 'string',
|
409
|
+
},
|
410
|
+
},
|
411
|
+
},
|
393
412
|
}
|
394
413
|
}
|
395
414
|
|
@@ -721,6 +740,11 @@ def get_config_schema():
|
|
721
740
|
'resources': resources_schema,
|
722
741
|
}
|
723
742
|
},
|
743
|
+
'bucket': {
|
744
|
+
'type': 'string',
|
745
|
+
'pattern': '^(https|s3|gs|r2|cos)://.+',
|
746
|
+
'required': [],
|
747
|
+
}
|
724
748
|
}
|
725
749
|
}
|
726
750
|
cloud_configs = {
|
@@ -875,6 +899,9 @@ def get_config_schema():
|
|
875
899
|
'image_tag_gpu': {
|
876
900
|
'type': 'string',
|
877
901
|
},
|
902
|
+
'vcn_ocid': {
|
903
|
+
'type': 'string',
|
904
|
+
},
|
878
905
|
'vcn_subnet': {
|
879
906
|
'type': 'string',
|
880
907
|
},
|
sky/utils/subprocess_utils.py
CHANGED
@@ -3,9 +3,10 @@ from multiprocessing import pool
|
|
3
3
|
import os
|
4
4
|
import random
|
5
5
|
import resource
|
6
|
+
import shlex
|
6
7
|
import subprocess
|
7
8
|
import time
|
8
|
-
from typing import Any, Callable, Dict,
|
9
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
9
10
|
|
10
11
|
import colorama
|
11
12
|
import psutil
|
@@ -97,12 +98,10 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
|
|
97
98
|
|
98
99
|
|
99
100
|
def run_in_parallel(func: Callable,
|
100
|
-
args:
|
101
|
+
args: List[Any],
|
101
102
|
num_threads: Optional[int] = None) -> List[Any]:
|
102
103
|
"""Run a function in parallel on a list of arguments.
|
103
104
|
|
104
|
-
The function 'func' should raise a CommandError if the command fails.
|
105
|
-
|
106
105
|
Args:
|
107
106
|
func: The function to run in parallel
|
108
107
|
args: Iterable of arguments to pass to func
|
@@ -111,14 +110,23 @@ def run_in_parallel(func: Callable,
|
|
111
110
|
|
112
111
|
Returns:
|
113
112
|
A list of the return values of the function func, in the same order as the
|
114
|
-
|
113
|
+
arguments.
|
114
|
+
|
115
|
+
Raises:
|
116
|
+
Exception: The first exception encountered.
|
115
117
|
"""
|
116
|
-
#
|
117
|
-
|
118
|
-
|
118
|
+
# Short-circuit for short lists
|
119
|
+
if len(args) == 0:
|
120
|
+
return []
|
121
|
+
if len(args) == 1:
|
122
|
+
return [func(args[0])]
|
123
|
+
|
124
|
+
processes = (num_threads
|
125
|
+
if num_threads is not None else get_parallel_threads())
|
126
|
+
|
119
127
|
with pool.ThreadPool(processes=processes) as p:
|
120
|
-
|
121
|
-
return list(
|
128
|
+
ordered_iterators = p.imap(func, args)
|
129
|
+
return list(ordered_iterators)
|
122
130
|
|
123
131
|
|
124
132
|
def handle_returncode(returncode: int,
|
@@ -293,3 +301,39 @@ def kill_process_daemon(process_pid: int) -> None:
|
|
293
301
|
# Disable input
|
294
302
|
stdin=subprocess.DEVNULL,
|
295
303
|
)
|
304
|
+
|
305
|
+
|
306
|
+
def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
|
307
|
+
"""Launch a new process that will not be a child of the current process.
|
308
|
+
|
309
|
+
This will launch bash in a new session, which will launch the given cmd.
|
310
|
+
This will ensure that cmd is in its own process tree, and once bash exits,
|
311
|
+
will not be a descendant of the current process. This is useful for job
|
312
|
+
launching.
|
313
|
+
|
314
|
+
Returns the pid of the launched cmd.
|
315
|
+
"""
|
316
|
+
# Use nohup to ensure the job driver process is a separate process tree,
|
317
|
+
# instead of being a child of the current process. This is important to
|
318
|
+
# avoid a chain of driver processes (job driver can call schedule_step() to
|
319
|
+
# submit new jobs, and the new job can also call schedule_step()
|
320
|
+
# recursively).
|
321
|
+
#
|
322
|
+
# echo $! will output the PID of the last background process started in the
|
323
|
+
# current shell, so we can retrieve it and record in the DB.
|
324
|
+
#
|
325
|
+
# TODO(zhwu): A more elegant solution is to use another daemon process to be
|
326
|
+
# in charge of starting these driver processes, instead of starting them in
|
327
|
+
# the current process.
|
328
|
+
wrapped_cmd = (f'nohup bash -c {shlex.quote(cmd)} '
|
329
|
+
f'</dev/null >{log_output} 2>&1 & echo $!')
|
330
|
+
proc = subprocess.run(wrapped_cmd,
|
331
|
+
stdout=subprocess.PIPE,
|
332
|
+
stderr=subprocess.PIPE,
|
333
|
+
stdin=subprocess.DEVNULL,
|
334
|
+
start_new_session=True,
|
335
|
+
check=True,
|
336
|
+
shell=True,
|
337
|
+
text=True)
|
338
|
+
# Get the PID of the detached process
|
339
|
+
return int(proc.stdout.strip())
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20250124
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -90,11 +90,15 @@ Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and s
|
|
90
90
|
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "remote"
|
91
91
|
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "remote"
|
92
92
|
Provides-Extra: runpod
|
93
|
-
Requires-Dist: runpod>=1.
|
93
|
+
Requires-Dist: runpod>=1.6.1; extra == "runpod"
|
94
94
|
Provides-Extra: fluidstack
|
95
95
|
Provides-Extra: cudo
|
96
96
|
Requires-Dist: cudo-compute>=0.1.10; extra == "cudo"
|
97
97
|
Provides-Extra: paperspace
|
98
|
+
Provides-Extra: do
|
99
|
+
Requires-Dist: pydo>=0.3.0; extra == "do"
|
100
|
+
Requires-Dist: azure-core>=1.24.0; extra == "do"
|
101
|
+
Requires-Dist: azure-common; extra == "do"
|
98
102
|
Provides-Extra: vsphere
|
99
103
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
|
100
104
|
Provides-Extra: all
|
@@ -136,9 +140,21 @@ Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sy
|
|
136
140
|
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "all"
|
137
141
|
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
|
138
142
|
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "all"
|
139
|
-
Requires-Dist: runpod>=1.
|
143
|
+
Requires-Dist: runpod>=1.6.1; extra == "all"
|
140
144
|
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
145
|
+
Requires-Dist: pydo>=0.3.0; extra == "all"
|
146
|
+
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
147
|
+
Requires-Dist: azure-common; extra == "all"
|
141
148
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
149
|
+
Dynamic: author
|
150
|
+
Dynamic: classifier
|
151
|
+
Dynamic: description
|
152
|
+
Dynamic: description-content-type
|
153
|
+
Dynamic: license
|
154
|
+
Dynamic: project-url
|
155
|
+
Dynamic: provides-extra
|
156
|
+
Dynamic: requires-dist
|
157
|
+
Dynamic: summary
|
142
158
|
|
143
159
|
<p align="center">
|
144
160
|
<img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/skypilot-wide-light-1k.png" width=55%>
|
@@ -165,6 +181,7 @@ Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
165
181
|
|
166
182
|
----
|
167
183
|
:fire: *News* :fire:
|
184
|
+
- [Jan 2025] Launch and Serve **[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)** on Kubernetes or Any Cloud: [**example**](./llm/deepseek-r1/)
|
168
185
|
- [Oct 2024] :tada: **SkyPilot crossed 1M+ downloads** :tada:: Thank you to our community! [**Twitter/X**](https://x.com/skypilot_org/status/1844770841718067638)
|
169
186
|
- [Sep 2024] Point, Launch and Serve **Llama 3.2** on Kubernetes or Any Cloud: [**example**](./llm/llama-3_2/)
|
170
187
|
- [Sep 2024] Run and deploy [**Pixtral**](./llm/pixtral), the first open-source multimodal model from Mistral AI.
|
@@ -335,6 +352,8 @@ Read the research:
|
|
335
352
|
- [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
|
336
353
|
- [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)
|
337
354
|
|
355
|
+
SkyPilot was initially started at the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley and has since gained many industry contributors. Read more about the project's origin [here](https://docs.skypilot.co/en/latest/sky-computing.html).
|
356
|
+
|
338
357
|
## Support and Questions
|
339
358
|
We are excited to hear your feedback!
|
340
359
|
* For issues and feature requests, please [open a GitHub issue](https://github.com/skypilot-org/skypilot/issues/new).
|