skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
@@ -25,9 +25,9 @@ from sky.utils import log_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
-
-
-
+SKY_LOG_WAITING_GAP_SECONDS = 1
+SKY_LOG_WAITING_MAX_RETRY = 5
+SKY_LOG_TAILING_GAP_SECONDS = 0.2
 # Peek the head of the lines to check if we need to start
 # streaming when tail > 0.
 PEEK_HEAD_LINES_FOR_START_STREAM = 20
@@ -336,7 +336,7 @@ def _follow_job_logs(file,
         ]:
             if wait_last_logs:
                 # Wait all the logs are printed before exit.
-                time.sleep(1 +
+                time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
                 wait_last_logs = False
                 continue
             status_str = status.value if status is not None else 'None'
@@ -345,7 +345,7 @@ def _follow_job_logs(file,
                     f'Job finished (status: {status_str}).'))
             return
 
-        time.sleep(
+        time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
         status = job_lib.get_status_no_lock(job_id)
 
 
@@ -426,15 +426,15 @@ def tail_logs(job_id: Optional[int],
         retry_cnt += 1
         if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
             break
-        if retry_cnt >=
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
             print(
                 f'{colorama.Fore.RED}ERROR: Logs for '
                 f'{job_str} (status: {status.value}) does not exist '
                 f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
             return
-        print(f'INFO: Waiting {
+        print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
               'to be written...')
-        time.sleep(
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
         status = job_lib.update_job_status([job_id], silent=True)[0]
 
     start_stream_at = LOG_FILE_START_STREAMING_AT
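
The hunks above replace inline literals with the new module-level constants. As a rough illustration of how the waiting constants drive a bounded poll-then-give-up loop, here is a standalone sketch (not the package's tail_logs implementation):

import os
import time

SKY_LOG_WAITING_GAP_SECONDS = 1
SKY_LOG_WAITING_MAX_RETRY = 5


def wait_for_log_file(log_path: str) -> bool:
    """Polls for a log file, mirroring the retry shape used in tail_logs()."""
    for _ in range(SKY_LOG_WAITING_MAX_RETRY):
        if os.path.exists(log_path):
            return True
        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
    return False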
sky/skylet/log_lib.pyi
CHANGED
@@ -13,6 +13,9 @@ from sky.skylet import constants as constants
 from sky.skylet import job_lib as job_lib
 from sky.utils import log_utils as log_utils
 
+SKY_LOG_WAITING_GAP_SECONDS: int = ...
+SKY_LOG_WAITING_MAX_RETRY: int = ...
+SKY_LOG_TAILING_GAP_SECONDS: float = ...
 LOG_FILE_START_STREAMING_AT: str = ...
 
 
sky/skylet/providers/command_runner.py
CHANGED
@@ -25,7 +25,7 @@ def docker_start_cmds(
     docker_cmd,
 ):
     """Generating docker start command without --rm.
-
+
     The code is borrowed from `ray.autoscaler._private.docker`.
 
     Changes we made:
@@ -159,19 +159,17 @@ class SkyDockerCommandRunner(DockerCommandRunner):
             return True
 
         # SkyPilot: Docker login if user specified a private docker registry.
-        if
+        if 'docker_login_config' in self.docker_config:
             # TODO(tian): Maybe support a command to get the login password?
-            docker_login_config: docker_utils.DockerLoginConfig =
-
+            docker_login_config: docker_utils.DockerLoginConfig = (
+                self.docker_config['docker_login_config'])
             self._run_with_retry(
                 f'{self.docker_cmd} login --username '
                 f'{docker_login_config.username} --password '
                 f'{docker_login_config.password} {docker_login_config.server}')
             # We automatically add the server prefix to the image name if
            # the user did not add it.
-
-            if not specific_image.startswith(server_prefix):
-                specific_image = f'{server_prefix}{specific_image}'
+            specific_image = docker_login_config.format_image(specific_image)
 
         if self.docker_config.get('pull_before_run', True):
             assert specific_image, ('Image must be included in config if '
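
The removed inline prefixing is now delegated to DockerLoginConfig.format_image(). A plausible sketch of what such a helper does, inferred only from the deleted lines (the actual DockerLoginConfig in docker_utils may differ):

class DockerLoginConfigSketch:
    """Hypothetical stand-in for docker_utils.DockerLoginConfig."""

    def __init__(self, username: str, password: str, server: str):
        self.username = username
        self.password = password
        self.server = server

    def format_image(self, image: str) -> str:
        # Prepend the registry server if the user did not already include it,
        # matching the inline logic the diff removes.
        prefix = f'{self.server}/'
        if image.startswith(prefix):
            return image
        return f'{prefix}{image}'


cfg = DockerLoginConfigSketch('user', 'secret', 'registry.example.com')
print(cfg.format_image('myorg/app:latest'))  # -> registry.example.com/myorg/app:latest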
sky/skylet/skylet.py
CHANGED
@@ -20,7 +20,7 @@ EVENTS = [
     # The managed job update event should be after the job update event.
     # Otherwise, the abnormal managed job status update will be delayed
     # until the next job update event.
-    events.
+    events.ManagedJobEvent(),
     # This is for monitoring controller job status. If it becomes
     # unhealthy, this event will correctly update the controller
     # status to CONTROLLER_FAILED.
sky/task.py
CHANGED
@@ -948,12 +948,22 @@
         store_type = storage_lib.StoreType.from_cloud(storage_cloud_str)
         return store_type, storage_region
 
-    def sync_storage_mounts(self) -> None:
+    def sync_storage_mounts(self, force_sync: bool = False) -> None:
         """(INTERNAL) Eagerly syncs storage mounts to cloud storage.
 
         After syncing up, COPY-mode storage mounts are translated into regular
         file_mounts of the form ``{ /remote/path: {s3,gs,..}://<bucket path>
         }``.
+
+        Args:
+            force_sync: If True, forces the synchronization of storage mounts.
+                If the store object is added via storage.add_store(),
+                the sync will happen automatically via add_store.
+                However, if it is passed via the construction function
+                of storage, it is usually because the user passed an
+                intermediate bucket name in the config and we need to
+                construct from the user config. In this case, set
+                force_sync to True.
         """
         for storage in self.storage_mounts.values():
             if not storage.stores:
@@ -961,6 +971,8 @@
                 self.storage_plans[storage] = store_type
                 storage.add_store(store_type, store_region)
             else:
+                if force_sync:
+                    storage.sync_all_stores()
                 # We will download the first store that is added to remote.
                 self.storage_plans[storage] = list(storage.stores.keys())[0]
 
@@ -977,6 +989,7 @@
                 else:
                     assert storage.name is not None, storage
                     blob_path = 's3://' + storage.name
+                    blob_path = storage.get_bucket_sub_path_prefix(blob_path)
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
@@ -987,6 +1000,7 @@
                 else:
                     assert storage.name is not None, storage
                     blob_path = 'gs://' + storage.name
+                    blob_path = storage.get_bucket_sub_path_prefix(blob_path)
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
@@ -1005,6 +1019,7 @@
                 blob_path = data_utils.AZURE_CONTAINER_URL.format(
                     storage_account_name=storage_account_name,
                     container_name=storage.name)
+                blob_path = storage.get_bucket_sub_path_prefix(blob_path)
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
@@ -1015,6 +1030,7 @@
                     blob_path = storage.source
                 else:
                     blob_path = 'r2://' + storage.name
+                    blob_path = storage.get_bucket_sub_path_prefix(blob_path)
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
@@ -1030,7 +1046,18 @@
                 cos_region = data_utils.Rclone.get_region_from_rclone(
                     storage.name, data_utils.Rclone.RcloneClouds.IBM)
                 blob_path = f'cos://{cos_region}/{storage.name}'
+                blob_path = storage.get_bucket_sub_path_prefix(blob_path)
                 self.update_file_mounts({mnt_path: blob_path})
+            elif store_type is storage_lib.StoreType.OCI:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('oci://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'oci://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Storage Type {store_type} '
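
Each branch above now routes the store URI through storage.get_bucket_sub_path_prefix() before it becomes a file_mount target. A hypothetical sketch of that behavior, assuming the helper simply appends a configured sub-path (the real Storage class in sky/data/storage.py may behave differently):

from typing import Optional


class StorageSketch:
    """Hypothetical stand-in for the storage object used in sync_storage_mounts."""

    def __init__(self, name: str, bucket_sub_path: Optional[str] = None):
        self.name = name
        self._bucket_sub_path = bucket_sub_path

    def get_bucket_sub_path_prefix(self, blob_path: str) -> str:
        # Append the sub-path (if any) so mounted data lands under a bucket prefix.
        if self._bucket_sub_path:
            return f'{blob_path}/{self._bucket_sub_path}'
        return blob_path


storage = StorageSketch('my-bucket', bucket_sub_path='job-42')
print(storage.get_bucket_sub_path_prefix('s3://my-bucket'))  # -> s3://my-bucket/job-42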
sky/templates/do-ray.yml.j2
ADDED
@@ -0,0 +1,98 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+{%- if docker_image is not none %}
+docker:
+  image: {{docker_image}}
+  container_name: {{docker_container_name}}
+  run_options:
+    - --ulimit nofile=1048576:1048576
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
+{%- endif %}
+
+provider:
+  type: external
+  module: sky.provision.do
+  region: "{{region}}"
+
+auth:
+  ssh_user: root
+  ssh_private_key: {{ssh_private_key}}
+  ssh_public_key: |-
+    skypilot:ssh_public_key_content
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      DiskSize: {{disk_size}}
+      {%- if image_id is not none %}
+      ImageId: {{image_id}}
+      {%- endif %}
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+#   current num items (num SSH connections): 1
+setup_commands:
+  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
+    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
+    sudo pkill -9 apt-get;
+    sudo pkill -9 dpkg;
+    sudo dpkg --configure -a;
+    mkdir -p ~/.ssh; touch ~/.ssh/config;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
+
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -26,16 +26,50 @@ setup: |
   echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
   {% endif %}
 
-  #
-
-
-
+  # Create systemd service file
+  mkdir -p ~/.config/systemd/user/
+
+  # Create systemd user service file
+  cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
+  [Unit]
+  Description=SkyPilot Jobs Dashboard
+  After=network.target
+
+  [Service]
+  Environment="PATH={{ sky_python_env_path }}:\$PATH"
+  Environment="SKYPILOT_USER_ID={{controller_envs.SKYPILOT_USER_ID}}"
+  Environment="SKYPILOT_USER={{controller_envs.SKYPILOT_USER}}"
+  Restart=always
+  StandardOutput=append:/home/$USER/.sky/job-dashboard.log
+  StandardError=append:/home/$USER/.sky/job-dashboard.log
+  ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
+
+  [Install]
+  WantedBy=default.target
+  EOF
+
+  if command -v systemctl &>/dev/null && systemctl --user show &>/dev/null; then
+    systemctl --user daemon-reload
+    systemctl --user enable --now skypilot-dashboard
+  else
+    echo "Systemd user services not found. Setting up SkyPilot dashboard manually."
+    # Kill any old dashboard processes
+    ps aux | grep -v nohup | grep -v grep | grep -- '-m sky.jobs.dashboard.dashboard' \
+      | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
+    # Launch the dashboard in the background if not already running
+    (ps aux | grep -v nohup | grep -v grep | grep -q -- '-m sky.jobs.dashboard.dashboard') || \
+      (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &)
+  fi
 
 run: |
   {{ sky_activate_python_env }}
-  #
-
-
+  # Submit the job to the scheduler.
+  # Note: The job is already in the `spot` table, marked as PENDING.
+  # CloudVmRayBackend._exec_code_on_head() calls
+  # managed_job_codegen.set_pending() before we get here.
+  python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
+    --job-id $SKYPILOT_INTERNAL_JOB_ID
+
 
 envs:
 {%- for env_name, env_value in controller_envs.items() %}
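
The new run section hands the job off to the scheduler module instead of running it inline. Roughly, after template substitution the command amounts to the following Python equivalent (the YAML path here is a placeholder; only the module name and environment variable come from the diff):

import os
import subprocess
import sys

# {{remote_user_yaml_path}} is rendered by the controller; this path is illustrative.
user_yaml = os.path.expanduser('~/.sky/managed_jobs/job-1.yaml')
job_id = os.environ['SKYPILOT_INTERNAL_JOB_ID']

subprocess.run(
    [sys.executable, '-u', '-m', 'sky.jobs.scheduler', user_yaml,
     '--job-id', job_id],
    check=True)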
sky/templates/runpod-ray.yml.j2
CHANGED
@@ -10,6 +10,19 @@ provider:
   module: sky.provision.runpod
   region: "{{region}}"
   disable_launch_config_check: true
+  # For RunPod, we directly set the image id for the docker as runtime environment
+  # support, thus we need to avoid the DockerInitializer detects the docker field
+  # and performs the initialization. Therefore we put the docker login config in
+  # the provider config here.
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
 
 auth:
   ssh_user: root
sky/templates/sky-serve-controller.yaml.j2
CHANGED
@@ -29,6 +29,10 @@ file_mounts:
 {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
   {{remote_catalog_path}}: {{local_catalog_path}}
 {%- endfor %}
+{%- if use_tls %}
+  {{remote_tls_keyfile}}: {{local_tls_keyfile}}
+  {{remote_tls_certfile}}: {{local_tls_certfile}}
+{%- endif %}
 
 run: |
   # Activate the Python environment, so that cloud SDKs can be found in the
sky/usage/usage_lib.py
CHANGED
@@ -3,7 +3,6 @@
 import contextlib
 import datetime
 import enum
-import inspect
 import json
 import os
 import time
@@ -12,19 +11,28 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import click
-import requests
 
 import sky
 from sky import sky_logging
+from sky.adaptors import common as adaptors_common
 from sky.usage import constants
 from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import inspect
+
+    import requests
+
     from sky import resources as resources_lib
     from sky import status_lib
     from sky import task as task_lib
+else:
+    # requests and inspect cost ~100ms to load, which can be postponed to
+    # collection phase or skipped if user specifies no collection
+    requests = adaptors_common.LazyImport('requests')
+    inspect = adaptors_common.LazyImport('inspect')
 
 logger = sky_logging.init_logger(__name__)
 
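
The diff defers the requests and inspect imports behind sky.adaptors.common.LazyImport so that importing usage_lib stays cheap when no usage data is ever collected. A minimal sketch of the lazy-import idea (illustrative only, not the adaptor's actual implementation): the real module is only imported on first attribute access.

import importlib
import types
from typing import Optional


class LazyModule:
    """Defers importing a module until an attribute is first accessed."""

    def __init__(self, name: str):
        self._name = name
        self._module: Optional[types.ModuleType] = None

    def __getattr__(self, attr: str):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)


requests = LazyModule('requests')  # 'import requests' is deferred until e.g. requests.post(...)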
sky/utils/accelerator_registry.py
CHANGED
@@ -3,6 +3,7 @@ import typing
 from typing import Optional
 
 from sky.clouds import service_catalog
+from sky.utils import rich_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -88,14 +89,17 @@ def canonicalize_accelerator_name(accelerator: str,
     if accelerator.lower() in mapping:
         return mapping[accelerator.lower()]
 
-    #
-    #
-
-
-
-
-
-
+    # Listing accelerators can be time-consuming since canonicalizing usually
+    # involves catalog reading with cache not warmed up.
+    with rich_utils.safe_status('Listing accelerators...'):
+        # _ACCELERATORS may not be comprehensive.
+        # Users may manually add new accelerators to the catalogs, or download
+        # new catalogs (that have new accelerators) without upgrading SkyPilot.
+        # To cover such cases, we should search the accelerator name
+        # in the service catalog.
+        searched = service_catalog.list_accelerators(name_filter=accelerator,
+                                                     case_sensitive=False,
+                                                     clouds=cloud_str)
     names = list(searched.keys())
 
     # Exact match.
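
The new code wraps the slow catalog lookup in a rich_utils.safe_status spinner and falls back from the hard-coded mapping to a case-insensitive catalog search. A toy sketch of that resolution order, with made-up catalog data standing in for the service catalog:

_ALIAS_MAPPING = {'v100': 'V100'}  # stands in for the hard-coded mapping


def canonicalize(accelerator: str, catalog_names: list) -> str:
    # 1) Hard-coded aliases win.
    if accelerator.lower() in _ALIAS_MAPPING:
        return _ALIAS_MAPPING[accelerator.lower()]
    # 2) Otherwise search the (possibly user-extended) catalog case-insensitively
    #    and accept an exact case-insensitive match.
    searched = [n for n in catalog_names if accelerator.lower() in n.lower()]
    for name in searched:
        if name.lower() == accelerator.lower():
            return name
    raise ValueError(f'Unknown accelerator: {accelerator}')


print(canonicalize('l4', ['L4', 'L40S', 'T4']))  # -> 'L4'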