skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/templates/aws-ray.yml.j2
CHANGED
@@ -11,9 +11,9 @@ docker:
|
|
11
11
|
container_name: {{docker_container_name}}
|
12
12
|
run_options:
|
13
13
|
- --ulimit nofile=1048576:1048576
|
14
|
-
{%-
|
15
|
-
|
16
|
-
{%-
|
14
|
+
{%- for run_option in docker_run_options %}
|
15
|
+
- {{run_option}}
|
16
|
+
{%- endfor %}
|
17
17
|
{%- if docker_login_config is not none %}
|
18
18
|
docker_login_config:
|
19
19
|
username: |-
|
@@ -72,8 +72,11 @@ available_node_types:
|
|
72
72
|
Ebs:
|
73
73
|
VolumeSize: {{disk_size}}
|
74
74
|
VolumeType: {{disk_tier}}
|
75
|
-
{
|
75
|
+
Encrypted: {{disk_encrypted}}
|
76
|
+
{% if disk_iops %}
|
76
77
|
Iops: {{disk_iops}}
|
78
|
+
{% endif %}
|
79
|
+
{% if disk_throughput %}
|
77
80
|
Throughput: {{disk_throughput}}
|
78
81
|
{% endif %}
|
79
82
|
{% if use_spot %}
|
@@ -83,6 +86,12 @@ available_node_types:
|
|
83
86
|
# SpotOptions:
|
84
87
|
# MaxPrice: MAX_HOURLY_PRICE
|
85
88
|
{% endif %}
|
89
|
+
CapacityReservationSpecification:
|
90
|
+
CapacityReservationPreference: open
|
91
|
+
{% if specific_reservations %}
|
92
|
+
CapacityReservationTarget:
|
93
|
+
CapacityReservationId: {{specific_reservations}}
|
94
|
+
{% endif %}
|
86
95
|
# Use cloud init in UserData to set up the authorized_keys to get
|
87
96
|
# around the number of keys limit and permission issues with
|
88
97
|
# ec2.describe_key_pairs.
|
@@ -113,6 +122,15 @@ available_node_types:
|
|
113
122
|
- path: /etc/apt/apt.conf.d/10cloudinit-disable
|
114
123
|
content: |
|
115
124
|
APT::Periodic::Enable "0";
|
125
|
+
- path: /etc/apt/apt.conf.d/52unattended-upgrades-local
|
126
|
+
content: |
|
127
|
+
Unattended-Upgrade::DevRelease "false";
|
128
|
+
Unattended-Upgrade::Allowed-Origins {};
|
129
|
+
bootcmd:
|
130
|
+
- systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service
|
131
|
+
- systemctl disable apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service
|
132
|
+
- systemctl mask apt-daily.service apt-daily-upgrade.service unattended-upgrades.service
|
133
|
+
- systemctl daemon-reload
|
116
134
|
TagSpecifications:
|
117
135
|
- ResourceType: instance
|
118
136
|
Tags:
|
@@ -122,6 +140,9 @@ available_node_types:
|
|
122
140
|
- Key: {{ label_key }}
|
123
141
|
Value: {{ label_value|tojson }}
|
124
142
|
{%- endfor %}
|
143
|
+
# Use IMDSv2 (Instance Metadata Service v2): require session tokens for metadata requests.
|
144
|
+
MetadataOptions:
|
145
|
+
HttpTokens: required
|
125
146
|
|
126
147
|
head_node_type: ray.head.default
|
127
148
|
|
@@ -131,6 +152,7 @@ file_mounts: {
|
|
131
152
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
132
153
|
{%- for remote_path, local_path in credentials.items() %}
|
133
154
|
"{{remote_path}}": "{{local_path}}",
|
155
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
134
156
|
{%- endfor %}
|
135
157
|
}
|
136
158
|
|
@@ -143,7 +165,7 @@ file_mounts: {
|
|
143
165
|
# Increment the following for catching performance bugs easier:
|
144
166
|
# current num items (num SSH connections): 1
|
145
167
|
setup_commands:
|
146
|
-
#
|
168
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
147
169
|
# We set auto_activate_base to be false for pre-installed conda.
|
148
170
|
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
|
149
171
|
# Line "conda config --remove channels": remove the default channel set in the default AWS image as it cannot be accessed.
|
@@ -152,7 +174,12 @@ setup_commands:
|
|
152
174
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to prevent `ray job submit` from getting stuck when the number of running ray jobs increases.
|
153
175
|
# Line 'mkdir -p ..': disable host key check
|
154
176
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
177
|
+
# Line 'mkdir -p ~/.ssh ...': add the cluster key to the SSH config so that nodes in the cluster can connect to each other
|
178
|
+
# Line 'rm ~/.aws/credentials': explicitly remove the credentials file to be safe. This guards against the case where the credentials file was uploaded by a previous launch in which `remote_identity` was not set.
|
155
179
|
- mkdir -p ~/.ssh; touch ~/.ssh/config;
|
180
|
+
{%- for initial_setup_command in initial_setup_commands %}
|
181
|
+
{{ initial_setup_command }}
|
182
|
+
{%- endfor %}
|
156
183
|
{{ conda_installation_commands }}
|
157
184
|
conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
|
158
185
|
{{ ray_skypilot_installation_commands }}
|
@@ -160,8 +187,11 @@ setup_commands:
|
|
160
187
|
{%- if docker_image is none %}
|
161
188
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
162
189
|
{%- endif %}
|
163
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
164
|
-
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
190
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
191
|
+
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
192
|
+
{%- if remote_identity != 'LOCAL_CREDENTIALS' %}
|
193
|
+
rm ~/.aws/credentials || true;
|
194
|
+
{%- endif %}
|
165
195
|
|
166
196
|
# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
|
167
197
|
# We do not need to list it here anymore.
|
sky/templates/azure-ray.yml.j2
CHANGED
@@ -11,29 +11,30 @@ docker:
|
|
11
11
|
container_name: {{docker_container_name}}
|
12
12
|
run_options:
|
13
13
|
- --ulimit nofile=1048576:1048576
|
14
|
-
{%-
|
15
|
-
|
16
|
-
{%-
|
14
|
+
{%- for run_option in docker_run_options %}
|
15
|
+
- {{run_option}}
|
16
|
+
{%- endfor %}
|
17
|
+
{%- if docker_login_config is not none %}
|
18
|
+
docker_login_config:
|
19
|
+
username: |-
|
20
|
+
{{docker_login_config.username}}
|
21
|
+
password: |-
|
22
|
+
{{docker_login_config.password}}
|
23
|
+
server: |-
|
24
|
+
{{docker_login_config.server}}
|
25
|
+
{%- endif %}
|
17
26
|
{%- endif %}
|
18
27
|
|
19
28
|
provider:
|
20
29
|
type: external
|
21
|
-
module: sky.
|
30
|
+
module: sky.provision.azure
|
22
31
|
location: {{region}}
|
23
32
|
# Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
|
24
33
|
# For Azure, ray distinguishes different instances by the resource_group,
|
25
34
|
# instead of the cluster_name. This ensures that ray creates new instances
|
26
35
|
# for different cluster_name.
|
27
36
|
resource_group: {{resource_group}}
|
28
|
-
{
|
29
|
-
# We put docker login config in provider section because ray's schema disabled
|
30
|
-
# additionalProperties for docker config.
|
31
|
-
# See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227
|
32
|
-
docker_login_config:
|
33
|
-
username: {{docker_login_config.username}}
|
34
|
-
password: {{docker_login_config.password}}
|
35
|
-
server: {{docker_login_config.server}}
|
36
|
-
{%- endif %}
|
37
|
+
use_external_resource_group: {{use_external_resource_group}}
|
37
38
|
# Keep (otherwise cannot reuse when re-provisioning).
|
38
39
|
# teardown(terminate=True) will override this.
|
39
40
|
cache_stopped_nodes: True
|
@@ -67,45 +68,22 @@ available_node_types:
|
|
67
68
|
imageOffer: {{image_offer}}
|
68
69
|
imageSku: "{{image_sku}}"
|
69
70
|
imageVersion: {{image_version}}
|
71
|
+
# Community Gallery Image ID
|
72
|
+
communityGalleryImageId: {{community_gallery_image_id}}
|
70
73
|
osDiskSizeGB: {{disk_size}}
|
71
74
|
osDiskTier: {{disk_tier}}
|
72
|
-
cloudInitSetupCommands: {{cloud_init_setup_commands}}
|
73
|
-
# optionally set priority to use Spot instances
|
74
75
|
{%- if use_spot %}
|
76
|
+
# optionally set priority to use Spot instances
|
75
77
|
priority: Spot
|
76
|
-
# set a maximum price for spot instances if desired
|
77
|
-
# billingProfile:
|
78
|
-
# maxPrice: -1
|
79
78
|
{%- endif %}
|
79
|
+
cloudInitSetupCommands: |-
|
80
|
+
{%- for cmd in cloud_init_setup_commands %}
|
81
|
+
{{ cmd }}
|
82
|
+
{%- endfor %}
|
83
|
+
{%- if disk_performance_tier is not none %}
|
84
|
+
disk_performance_tier: {{disk_performance_tier}}
|
85
|
+
{%- endif %}
|
80
86
|
# TODO: attach disk
|
81
|
-
{% if num_nodes > 1 %}
|
82
|
-
ray.worker.default:
|
83
|
-
min_workers: {{num_nodes - 1}}
|
84
|
-
max_workers: {{num_nodes - 1}}
|
85
|
-
resources: {}
|
86
|
-
node_config:
|
87
|
-
tags:
|
88
|
-
skypilot-user: {{ user }}
|
89
|
-
azure_arm_parameters:
|
90
|
-
adminUsername: skypilot:ssh_user
|
91
|
-
publicKey: |
|
92
|
-
skypilot:ssh_public_key_content
|
93
|
-
vmSize: {{instance_type}}
|
94
|
-
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
95
|
-
imagePublisher: {{image_publisher}}
|
96
|
-
imageOffer: {{image_offer}}
|
97
|
-
imageSku: "{{image_sku}}"
|
98
|
-
imageVersion: {{image_version}}
|
99
|
-
osDiskSizeGB: {{disk_size}}
|
100
|
-
osDiskTier: {{disk_tier}}
|
101
|
-
cloudInitSetupCommands: {{cloud_init_setup_commands}}
|
102
|
-
{%- if use_spot %}
|
103
|
-
priority: Spot
|
104
|
-
# set a maximum price for spot instances if desired
|
105
|
-
# billingProfile:
|
106
|
-
# maxPrice: -1
|
107
|
-
{%- endif %}
|
108
|
-
{%- endif %}
|
109
87
|
|
110
88
|
head_node_type: ray.head.default
|
111
89
|
|
@@ -115,12 +93,10 @@ file_mounts: {
|
|
115
93
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
116
94
|
{%- for remote_path, local_path in credentials.items() %}
|
117
95
|
"{{remote_path}}": "{{local_path}}",
|
96
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
118
97
|
{%- endfor %}
|
119
98
|
}
|
120
99
|
|
121
|
-
rsync_exclude: []
|
122
|
-
|
123
|
-
initialization_commands: []
|
124
100
|
|
125
101
|
# List of shell commands to run to set up nodes.
|
126
102
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
@@ -130,7 +106,7 @@ initialization_commands: []
|
|
130
106
|
# Increment the following for catching performance bugs easier:
|
131
107
|
# current num items (num SSH connections): 1
|
132
108
|
setup_commands:
|
133
|
-
#
|
109
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
134
110
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
135
111
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to prevent `ray job submit` from getting stuck when the number of running ray jobs increases.
|
136
112
|
# Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8888
|
@@ -151,37 +127,6 @@ setup_commands:
|
|
151
127
|
sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;
|
152
128
|
sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;
|
153
129
|
{%- endif %}
|
154
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
130
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
155
131
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
156
132
|
sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true;
|
157
|
-
|
158
|
-
# Command to start ray on the head node. You don't need to change this.
|
159
|
-
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
160
|
-
# connection, which is expensive. Try your best to co-locate commands into fewer
|
161
|
-
# items! The same comment applies for worker_start_ray_commands.
|
162
|
-
#
|
163
|
-
# Increment the following for catching performance bugs easier:
|
164
|
-
# current num items (num SSH connections): 2
|
165
|
-
head_start_ray_commands:
|
166
|
-
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
|
167
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
168
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
169
|
-
{{dump_port_command}};
|
170
|
-
{{ray_head_wait_initialized_command}}
|
171
|
-
|
172
|
-
{%- if num_nodes > 1 %}
|
173
|
-
worker_start_ray_commands:
|
174
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
175
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
176
|
-
{%- else %}
|
177
|
-
worker_start_ray_commands: []
|
178
|
-
{%- endif %}
|
179
|
-
|
180
|
-
head_node: {}
|
181
|
-
worker_nodes: {}
|
182
|
-
|
183
|
-
# These fields are required for external cloud providers.
|
184
|
-
head_setup_commands: []
|
185
|
-
worker_setup_commands: []
|
186
|
-
cluster_synced_files: []
|
187
|
-
file_mounts_sync_continuously: False
|
sky/templates/cudo-ray.yml.j2
CHANGED
@@ -32,6 +32,7 @@ file_mounts: {
|
|
32
32
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
33
33
|
{%- for remote_path, local_path in credentials.items() %}
|
34
34
|
"{{remote_path}}": "{{local_path}}",
|
35
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
35
36
|
{%- endfor %}
|
36
37
|
}
|
37
38
|
|
@@ -48,13 +49,16 @@ initialization_commands: [ ]
|
|
48
49
|
# current num items (num SSH connections): 1
|
49
50
|
setup_commands:
|
50
51
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. This should run at the very beginning, before the upgrade process starts, to avoid being blocked. (This is a temporary fix.)
|
51
|
-
#
|
52
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
52
53
|
# Line 'rm ..': there is another installation of pip.
|
53
54
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
54
55
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to prevent `ray job submit` from getting stuck when the number of running ray jobs increases.
|
55
56
|
# Line 'mkdir -p ..': disable host key check
|
56
57
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
57
|
-
-
|
58
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
59
|
+
{{ initial_setup_command }}
|
60
|
+
{%- endfor %}
|
61
|
+
sudo systemctl stop unattended-upgrades || true;
|
58
62
|
sudo systemctl disable unattended-upgrades || true;
|
59
63
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
60
64
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
@@ -67,5 +71,5 @@ setup_commands:
|
|
67
71
|
touch ~/.sudo_as_admin_successful;
|
68
72
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
69
73
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
70
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
74
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
71
75
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
@@ -0,0 +1,98 @@
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
2
|
+
|
3
|
+
# The maximum number of workers nodes to launch in addition to the head node.
|
4
|
+
max_workers: {{num_nodes - 1}}
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
6
|
+
idle_timeout_minutes: 60
|
7
|
+
|
8
|
+
{%- if docker_image is not none %}
|
9
|
+
docker:
|
10
|
+
image: {{docker_image}}
|
11
|
+
container_name: {{docker_container_name}}
|
12
|
+
run_options:
|
13
|
+
- --ulimit nofile=1048576:1048576
|
14
|
+
{%- for run_option in docker_run_options %}
|
15
|
+
- {{run_option}}
|
16
|
+
{%- endfor %}
|
17
|
+
{%- if docker_login_config is not none %}
|
18
|
+
docker_login_config:
|
19
|
+
username: |-
|
20
|
+
{{docker_login_config.username}}
|
21
|
+
password: |-
|
22
|
+
{{docker_login_config.password}}
|
23
|
+
server: |-
|
24
|
+
{{docker_login_config.server}}
|
25
|
+
{%- endif %}
|
26
|
+
{%- endif %}
|
27
|
+
|
28
|
+
provider:
|
29
|
+
type: external
|
30
|
+
module: sky.provision.do
|
31
|
+
region: "{{region}}"
|
32
|
+
|
33
|
+
auth:
|
34
|
+
ssh_user: root
|
35
|
+
ssh_private_key: {{ssh_private_key}}
|
36
|
+
ssh_public_key: |-
|
37
|
+
skypilot:ssh_public_key_content
|
38
|
+
|
39
|
+
available_node_types:
|
40
|
+
ray_head_default:
|
41
|
+
resources: {}
|
42
|
+
node_config:
|
43
|
+
InstanceType: {{instance_type}}
|
44
|
+
DiskSize: {{disk_size}}
|
45
|
+
{%- if image_id is not none %}
|
46
|
+
ImageId: {{image_id}}
|
47
|
+
{%- endif %}
|
48
|
+
|
49
|
+
head_node_type: ray_head_default
|
50
|
+
|
51
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
52
|
+
file_mounts: {
|
53
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
54
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
55
|
+
{%- for remote_path, local_path in credentials.items() %}
|
56
|
+
"{{remote_path}}": "{{local_path}}",
|
57
|
+
{%- endfor %}
|
58
|
+
}
|
59
|
+
|
60
|
+
rsync_exclude: []
|
61
|
+
|
62
|
+
initialization_commands: []
|
63
|
+
|
64
|
+
# List of shell commands to run to set up nodes.
|
65
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
66
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
67
|
+
# items!
|
68
|
+
#
|
69
|
+
# Increment the following for catching performance bugs easier:
|
70
|
+
# current num items (num SSH connections): 1
|
71
|
+
setup_commands:
|
72
|
+
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
73
|
+
# Create ~/.ssh/config file in case the file does not exist in the image.
|
74
|
+
# Line 'rm ..': there is another installation of pip.
|
75
|
+
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
76
|
+
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
|
77
|
+
# Line 'mkdir -p ..': disable host key check
|
78
|
+
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
79
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
80
|
+
{{ initial_setup_command }}
|
81
|
+
{%- endfor %}
|
82
|
+
sudo systemctl stop unattended-upgrades || true;
|
83
|
+
sudo systemctl disable unattended-upgrades || true;
|
84
|
+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
85
|
+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
86
|
+
sudo pkill -9 apt-get;
|
87
|
+
sudo pkill -9 dpkg;
|
88
|
+
sudo dpkg --configure -a;
|
89
|
+
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
90
|
+
{{ conda_installation_commands }}
|
91
|
+
{{ ray_skypilot_installation_commands }}
|
92
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
93
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
94
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
95
|
+
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
96
|
+
|
97
|
+
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
98
|
+
# We do not need to list it here anymore.
|
@@ -33,6 +33,7 @@ file_mounts: {
|
|
33
33
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
34
34
|
{%- for remote_path, local_path in credentials.items() %}
|
35
35
|
"{{remote_path}}": "{{local_path}}",
|
36
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
36
37
|
{%- endfor %}
|
37
38
|
}
|
38
39
|
|
@@ -49,26 +50,28 @@ initialization_commands: []
|
|
49
50
|
# current num items (num SSH connections): 1
|
50
51
|
setup_commands:
|
51
52
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
52
|
-
#
|
53
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
53
54
|
# Line 'rm ..': there is another installation of pip.
|
54
55
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
55
56
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
|
56
57
|
# Line 'mkdir -p ..': disable host key check
|
57
58
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
58
|
-
-
|
59
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
60
|
+
{{ initial_setup_command }}
|
61
|
+
{%- endfor %}
|
62
|
+
sudo systemctl stop unattended-upgrades || true;
|
59
63
|
sudo systemctl disable unattended-upgrades || true;
|
60
64
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
61
65
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
62
66
|
sudo pkill -9 apt-get;
|
63
67
|
sudo pkill -9 dpkg;
|
64
68
|
sudo dpkg --configure -a;
|
65
|
-
{{ cuda_installation_commands }}
|
66
69
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
67
70
|
{{ conda_installation_commands }}
|
68
71
|
{{ ray_skypilot_installation_commands }}
|
69
72
|
touch ~/.sudo_as_admin_successful;
|
70
73
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
71
74
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
72
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
75
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
73
76
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
74
77
|
|
sky/templates/gcp-ray.yml.j2
CHANGED
@@ -12,9 +12,9 @@ docker:
|
|
12
12
|
container_name: {{docker_container_name}}
|
13
13
|
run_options:
|
14
14
|
- --ulimit nofile=1048576:1048576
|
15
|
-
{%-
|
16
|
-
|
17
|
-
{%-
|
15
|
+
{%- for run_option in docker_run_options %}
|
16
|
+
- {{run_option}}
|
17
|
+
{%- endfor %}
|
18
18
|
{%- if docker_login_config is not none %}
|
19
19
|
docker_login_config:
|
20
20
|
username: |-
|
@@ -29,7 +29,7 @@ docker:
|
|
29
29
|
provider:
|
30
30
|
# We use a custom node provider for GCP to create, stop and reuse instances.
|
31
31
|
type: external # type: gcp
|
32
|
-
module: sky.
|
32
|
+
module: sky.provision.gcp
|
33
33
|
region: {{region}}
|
34
34
|
availability_zone: {{zones}}
|
35
35
|
# Keep (otherwise cannot reuse when re-provisioning).
|
@@ -47,6 +47,7 @@ provider:
|
|
47
47
|
firewall_rule: {{firewall_rule}}
|
48
48
|
{% endif %}
|
49
49
|
use_internal_ips: {{use_internal_ips}}
|
50
|
+
force_enable_external_ips: {{force_enable_external_ips}}
|
50
51
|
{%- if tpu_vm %}
|
51
52
|
_has_tpus: True
|
52
53
|
{%- endif %}
|
@@ -62,6 +63,10 @@ provider:
|
|
62
63
|
# The upper-level SkyPilot code has make sure there will not be resource
|
63
64
|
# leakage.
|
64
65
|
disable_launch_config_check: true
|
66
|
+
use_managed_instance_group: {{ gcp_use_managed_instance_group }}
|
67
|
+
{%- if enable_gvnic %}
|
68
|
+
enable_gvnic: {{ enable_gvnic }}
|
69
|
+
{%- endif %}
|
65
70
|
|
66
71
|
auth:
|
67
72
|
ssh_user: gcpuser
|
@@ -79,6 +84,14 @@ available_node_types:
|
|
79
84
|
{%- for label_key, label_value in labels.items() %}
|
80
85
|
{{ label_key }}: {{ label_value|tojson }}
|
81
86
|
{%- endfor %}
|
87
|
+
use-managed-instance-group: {{ gcp_use_managed_instance_group_value|tojson }}
|
88
|
+
{%- if gcp_use_managed_instance_group %}
|
89
|
+
managed-instance-group:
|
90
|
+
run_duration: {{ run_duration }}
|
91
|
+
{%- if provision_timeout is defined and provision_timeout is not none %}
|
92
|
+
provision_timeout: {{ provision_timeout }}
|
93
|
+
{%- endif %}
|
94
|
+
{%- endif %}
|
82
95
|
{%- if specific_reservations %}
|
83
96
|
reservationAffinity:
|
84
97
|
consumeReservationType: SPECIFIC_RESERVATION
|
@@ -114,6 +127,9 @@ available_node_types:
|
|
114
127
|
sourceImage: {{image_id}}
|
115
128
|
{%- endif %}
|
116
129
|
diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
|
130
|
+
{%- if disk_iops %}
|
131
|
+
provisionedIops: {{disk_iops}}
|
132
|
+
{%- endif %}
|
117
133
|
{%- if gpu is not none %}
|
118
134
|
guestAccelerators:
|
119
135
|
- acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
|
@@ -153,6 +169,7 @@ file_mounts: {
|
|
153
169
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
154
170
|
{%- for remote_path, local_path in credentials.items() %}
|
155
171
|
"{{remote_path}}": "{{local_path}}",
|
172
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
156
173
|
{%- endfor %}
|
157
174
|
}
|
158
175
|
|
@@ -164,8 +181,8 @@ file_mounts: {
|
|
164
181
|
# Increment the following for catching performance bugs easier:
|
165
182
|
# current num items (num SSH connections): 1 (+1 if tpu_vm)
|
166
183
|
setup_commands:
|
184
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
167
185
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
168
|
-
# Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image.
|
169
186
|
# Line 'which conda ..': some images (TPU VM) do not install conda by
|
170
187
|
# default. 'source ~/.bashrc' is needed so conda takes effect for the next
|
171
188
|
# commands.
|
@@ -175,6 +192,9 @@ setup_commands:
|
|
175
192
|
# Line 'mkdir -p ..': disable host key check
|
176
193
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
177
194
|
- function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
|
195
|
+
{%- for initial_setup_command in initial_setup_commands %}
|
196
|
+
{{ initial_setup_command }}
|
197
|
+
{%- endfor %}
|
178
198
|
{%- if docker_image is none %}
|
179
199
|
sudo systemctl stop unattended-upgrades || true;
|
180
200
|
sudo systemctl disable unattended-upgrades || true;
|
@@ -203,7 +223,7 @@ setup_commands:
|
|
203
223
|
sudo systemctl stop jupyter > /dev/null 2>&1 || true;
|
204
224
|
sudo systemctl disable jupyter > /dev/null 2>&1 || true;
|
205
225
|
{%- endif %}
|
206
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
226
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
207
227
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
208
228
|
|
209
229
|
# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
|
sky/templates/ibm-ray.yml.j2
CHANGED
@@ -69,6 +69,7 @@ file_mounts: {
|
|
69
69
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
70
70
|
{%- for remote_path, local_path in credentials.items() %}
|
71
71
|
"{{remote_path}}": "{{local_path}}",
|
72
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
72
73
|
{%- endfor %}
|
73
74
|
}
|
74
75
|
|
@@ -84,7 +85,7 @@ initialization_commands: []
|
|
84
85
|
# Increment the following for catching performance bugs easier:
|
85
86
|
# current num items (num SSH connections): 1
|
86
87
|
setup_commands:
|
87
|
-
#
|
88
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
88
89
|
# We set auto_activate_base to be false for pre-installed conda.
|
89
90
|
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
|
90
91
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
@@ -103,7 +104,7 @@ setup_commands:
|
|
103
104
|
{{ ray_skypilot_installation_commands }}
|
104
105
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
105
106
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
106
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
107
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
107
108
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
108
109
|
|
109
110
|
|
@@ -4,10 +4,15 @@ name: {{dag_name}}
|
|
4
4
|
|
5
5
|
file_mounts:
|
6
6
|
{{remote_user_yaml_path}}: {{user_yaml_path}}
|
7
|
-
{
|
7
|
+
{%- if local_user_config_path is not none %}
|
8
|
+
{{remote_user_config_path}}: {{local_user_config_path}}
|
9
|
+
{%- endif %}
|
8
10
|
{%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
|
9
11
|
{{remote_catalog_path}}: {{local_catalog_path}}
|
10
12
|
{%- endfor %}
|
13
|
+
{%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
|
14
|
+
{{controller_file_mount_path}}: {{local_file_mount_path}}
|
15
|
+
{%- endfor %}
|
11
16
|
|
12
17
|
setup: |
|
13
18
|
{{ sky_activate_python_env }}
|
@@ -19,21 +24,51 @@ setup: |
|
|
19
24
|
{{cmd}}
|
20
25
|
{%- endfor %}
|
21
26
|
|
22
|
-
{% if
|
23
|
-
|
24
|
-
echo '
|
27
|
+
{% if controller_envs.get('SKYPILOT_DEV') != '0' %}
|
28
|
+
grep -q 'export SKYPILOT_DEV=' ~/.bashrc || echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
|
29
|
+
grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
|
25
30
|
{% endif %}
|
26
31
|
|
27
|
-
#
|
28
|
-
|
29
|
-
|
30
|
-
|
32
|
+
# Create systemd service file
|
33
|
+
mkdir -p ~/.config/systemd/user/
|
34
|
+
|
35
|
+
# Create systemd user service file
|
36
|
+
cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
|
37
|
+
[Unit]
|
38
|
+
Description=SkyPilot Jobs Dashboard
|
39
|
+
After=network.target
|
40
|
+
|
41
|
+
[Service]
|
42
|
+
Environment="PATH={{ sky_python_env_path }}:\$PATH"
|
43
|
+
Environment="SKYPILOT_USER_ID={{controller_envs.SKYPILOT_USER_ID}}"
|
44
|
+
Environment="SKYPILOT_USER={{controller_envs.SKYPILOT_USER}}"
|
45
|
+
Restart=always
|
46
|
+
StandardOutput=append:/home/$USER/.sky/job-dashboard.log
|
47
|
+
StandardError=append:/home/$USER/.sky/job-dashboard.log
|
48
|
+
ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
|
49
|
+
|
50
|
+
[Install]
|
51
|
+
WantedBy=default.target
|
52
|
+
EOF
|
53
|
+
|
54
|
+
{{ dashboard_setup_cmd }}
|
31
55
|
|
32
56
|
run: |
|
33
57
|
{{ sky_activate_python_env }}
|
34
|
-
|
35
|
-
|
36
|
-
|
58
|
+
|
59
|
+
# Write env vars to a file
|
60
|
+
{%- for env_name, env_value in controller_envs.items() %}
|
61
|
+
echo "export {{env_name}}='{{env_value}}'" >> {{remote_env_file_path}}
|
62
|
+
{%- endfor %}
|
63
|
+
|
64
|
+
# Submit the job to the scheduler.
|
65
|
+
# Note: The job is already in the `spot` table, marked as PENDING.
|
66
|
+
# CloudVmRayBackend._exec_code_on_head() calls
|
67
|
+
# managed_job_codegen.set_pending() before we get here.
|
68
|
+
python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
|
69
|
+
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
70
|
+
--env-file {{remote_env_file_path}}
|
71
|
+
|
37
72
|
|
38
73
|
envs:
|
39
74
|
{%- for env_name, env_value in controller_envs.items() %}
|