skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
2
|
+
|
3
|
+
# The maximum number of worker nodes to launch in addition to the head node.
|
4
|
+
max_workers: {{num_nodes - 1}}
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
6
|
+
idle_timeout_minutes: 60
|
7
|
+
|
8
|
+
provider:
|
9
|
+
type: external
|
10
|
+
module: sky.provision.nebius
|
11
|
+
region: "{{region}}"
|
12
|
+
|
13
|
+
auth:
|
14
|
+
ssh_user: ubuntu
|
15
|
+
ssh_private_key: {{ssh_private_key}}
|
16
|
+
|
17
|
+
available_node_types:
|
18
|
+
ray_head_default:
|
19
|
+
resources: {}
|
20
|
+
node_config:
|
21
|
+
InstanceType: {{instance_type}}
|
22
|
+
ImageId: {{image_id}}
|
23
|
+
DiskSize: {{disk_size}}
|
24
|
+
UserData: |
|
25
|
+
users:
|
26
|
+
- name: skypilot:ssh_user
|
27
|
+
shell: /bin/bash
|
28
|
+
sudo: ALL=(ALL) NOPASSWD:ALL
|
29
|
+
ssh_authorized_keys:
|
30
|
+
- |-
|
31
|
+
skypilot:ssh_public_key_content
|
32
|
+
|
33
|
+
head_node_type: ray_head_default
|
34
|
+
|
35
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
36
|
+
file_mounts: {
|
37
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
38
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
39
|
+
{%- for remote_path, local_path in credentials.items() %}
|
40
|
+
"{{remote_path}}": "{{local_path}}",
|
41
|
+
{%- endfor %}
|
42
|
+
}
|
43
|
+
|
44
|
+
rsync_exclude: []
|
45
|
+
|
46
|
+
initialization_commands: []
|
47
|
+
|
48
|
+
# List of shell commands to run to set up nodes.
|
49
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
50
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
51
|
+
# items!
|
52
|
+
#
|
53
|
+
# Increment the following to make catching performance bugs easier:
|
54
|
+
# current num items (num SSH connections): 1
|
55
|
+
setup_commands:
|
56
|
+
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
57
|
+
# Create ~/.ssh/config file in case the file does not exist in the image.
|
58
|
+
# Line 'rm ..': there is another installation of pip.
|
59
|
+
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
60
|
+
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
61
|
+
# Line 'mkdir -p ..': disable host key check
|
62
|
+
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
63
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
64
|
+
{{ initial_setup_command }}
|
65
|
+
{%- endfor %}
|
66
|
+
sudo systemctl stop unattended-upgrades || true;
|
67
|
+
sudo systemctl disable unattended-upgrades || true;
|
68
|
+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
69
|
+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
70
|
+
sudo pkill -9 apt-get;
|
71
|
+
sudo pkill -9 dpkg;
|
72
|
+
sudo dpkg --configure -a;
|
73
|
+
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
74
|
+
{{ conda_installation_commands }}
|
75
|
+
{{ ray_skypilot_installation_commands }}
|
76
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
77
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
78
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
79
|
+
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
sky/templates/oci-ray.yml.j2
CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
|
|
7
7
|
|
8
8
|
provider:
|
9
9
|
type: external
|
10
|
-
module: sky.
|
10
|
+
module: sky.provision.oci
|
11
11
|
region: {{region}}
|
12
12
|
cache_stopped_nodes: True
|
13
13
|
# Disable launch config check for worker nodes as it can cause resource leakage.
|
@@ -16,7 +16,11 @@ provider:
|
|
16
16
|
disable_launch_config_check: true
|
17
17
|
|
18
18
|
auth:
|
19
|
+
{% if os_type == "ubuntu" %}
|
19
20
|
ssh_user: ubuntu
|
21
|
+
{% else %}
|
22
|
+
ssh_user: opc
|
23
|
+
{% endif %}
|
20
24
|
ssh_private_key: {{ssh_private_key}}
|
21
25
|
|
22
26
|
available_node_types:
|
@@ -35,25 +39,6 @@ available_node_types:
|
|
35
39
|
Preemptible: {{use_spot}}
|
36
40
|
AuthorizedKey: |
|
37
41
|
skypilot:ssh_public_key_content
|
38
|
-
{% if num_nodes > 1 %}
|
39
|
-
ray_worker_default:
|
40
|
-
min_workers: {{num_nodes - 1}}
|
41
|
-
max_workers: {{num_nodes - 1}}
|
42
|
-
resources: {}
|
43
|
-
node_config:
|
44
|
-
InstanceType: {{instance_type}}
|
45
|
-
VCPUs: {{cpus}}
|
46
|
-
MemoryInGbs: {{memory}}
|
47
|
-
BootVolumeSize: {{disk_size}}
|
48
|
-
BootVolumePerf: {{vpu}}
|
49
|
-
AvailabilityDomain: {{zone}}
|
50
|
-
ImageId: {{image}}
|
51
|
-
AppCatalogListingId: {{app_catalog_listing_id}}
|
52
|
-
ResourceVersion: {{resource_version}}
|
53
|
-
Preemptible: {{use_spot}}
|
54
|
-
AuthorizedKey: |
|
55
|
-
skypilot:ssh_public_key_content
|
56
|
-
{%- endif %}
|
57
42
|
|
58
43
|
head_node_type: ray_head_default
|
59
44
|
|
@@ -63,12 +48,10 @@ file_mounts: {
|
|
63
48
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
64
49
|
{%- for remote_path, local_path in credentials.items() %}
|
65
50
|
"{{remote_path}}": "{{local_path}}",
|
51
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
66
52
|
{%- endfor %}
|
67
53
|
}
|
68
54
|
|
69
|
-
rsync_exclude: []
|
70
|
-
|
71
|
-
initialization_commands: []
|
72
55
|
|
73
56
|
# List of shell commands to run to set up nodes.
|
74
57
|
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
@@ -79,58 +62,36 @@ initialization_commands: []
|
|
79
62
|
# current num items (num SSH connections): 1
|
80
63
|
setup_commands:
|
81
64
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
82
|
-
#
|
65
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
83
66
|
# Line 'rm ..': there is another installation of pip.
|
84
67
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
85
68
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
86
69
|
# Line 'mkdir -p ..': disable host key check
|
87
70
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
88
|
-
-
|
71
|
+
- echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true;
|
72
|
+
{%- if os_type == "ubuntu" %}
|
73
|
+
sudo systemctl stop unattended-upgrades || true;
|
89
74
|
sudo systemctl disable unattended-upgrades || true;
|
90
75
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
91
76
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
92
77
|
sudo pkill -9 apt-get;
|
93
78
|
sudo pkill -9 dpkg;
|
94
79
|
sudo dpkg --configure -a;
|
95
|
-
|
80
|
+
{%- else %}
|
81
|
+
sudo /usr/libexec/oci-growfs -y || true;
|
82
|
+
sudo systemctl stop firewalld || true;
|
83
|
+
sudo systemctl disable firewalld || true;
|
84
|
+
{%- endif %}
|
96
85
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
97
86
|
{{ conda_installation_commands }}
|
98
87
|
{{ ray_skypilot_installation_commands }}
|
99
88
|
touch ~/.sudo_as_admin_successful;
|
100
89
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
101
90
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
102
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
91
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
103
92
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
104
93
|
sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
|
105
94
|
|
106
|
-
# Command to start ray
|
107
|
-
#
|
108
|
-
# connection, which is expensive. Try your best to co-locate commands into fewer
|
109
|
-
# items! The same comment applies for worker_start_ray_commands.
|
110
|
-
#
|
111
|
-
# Increment the following for catching performance bugs easier:
|
112
|
-
# current num items (num SSH connections): 2
|
113
|
-
head_start_ray_commands:
|
114
|
-
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
|
115
|
-
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
|
116
|
-
# all the sessions to be reloaded. This is a workaround.
|
117
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
118
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
119
|
-
{{dump_port_command}}; {{ray_head_wait_initialized_command}}
|
120
|
-
|
121
|
-
{%- if num_nodes > 1 %}
|
122
|
-
worker_start_ray_commands:
|
123
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
124
|
-
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
125
|
-
{%- else %}
|
126
|
-
worker_start_ray_commands: []
|
127
|
-
{%- endif %}
|
128
|
-
|
129
|
-
head_node: {}
|
130
|
-
worker_nodes: {}
|
95
|
+
# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
|
96
|
+
# We do not need to list it here anymore.
|
131
97
|
|
132
|
-
# These fields are required for external cloud providers.
|
133
|
-
head_setup_commands: []
|
134
|
-
worker_setup_commands: []
|
135
|
-
cluster_synced_files: []
|
136
|
-
file_mounts_sync_continuously: False
|
@@ -11,9 +11,9 @@ docker:
|
|
11
11
|
container_name: {{docker_container_name}}
|
12
12
|
run_options:
|
13
13
|
- --ulimit nofile=1048576:1048576
|
14
|
-
{%-
|
15
|
-
|
16
|
-
{%-
|
14
|
+
{%- for run_option in docker_run_options %}
|
15
|
+
- {{run_option}}
|
16
|
+
{%- endfor %}
|
17
17
|
{%- if docker_login_config is not none %}
|
18
18
|
docker_login_config:
|
19
19
|
username: |-
|
@@ -51,6 +51,7 @@ file_mounts: {
|
|
51
51
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
52
52
|
{%- for remote_path, local_path in credentials.items() %}
|
53
53
|
"{{remote_path}}": "{{local_path}}",
|
54
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
54
55
|
{%- endfor %}
|
55
56
|
}
|
56
57
|
|
@@ -67,13 +68,16 @@ initialization_commands: []
|
|
67
68
|
# current num items (num SSH connections): 1
|
68
69
|
setup_commands:
|
69
70
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
70
|
-
#
|
71
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
71
72
|
# Line 'rm ..': there is another installation of pip.
|
72
73
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
73
74
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
74
75
|
# Line 'mkdir -p ..': disable host key check
|
75
76
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
76
|
-
-
|
77
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
78
|
+
{{ initial_setup_command }}
|
79
|
+
{%- endfor %}
|
80
|
+
sudo systemctl stop unattended-upgrades || true;
|
77
81
|
sudo systemctl disable unattended-upgrades || true;
|
78
82
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
79
83
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
@@ -85,5 +89,5 @@ setup_commands:
|
|
85
89
|
{{ ray_skypilot_installation_commands }}
|
86
90
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
87
91
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
88
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
92
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
89
93
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
sky/templates/runpod-ray.yml.j2
CHANGED
@@ -10,9 +10,22 @@ provider:
|
|
10
10
|
module: sky.provision.runpod
|
11
11
|
region: "{{region}}"
|
12
12
|
disable_launch_config_check: true
|
13
|
+
# For RunPod, we directly set the image id for the docker as runtime environment
|
14
|
+
# support, so we need to prevent the DockerInitializer from detecting the docker field
|
15
|
+
# and performing the initialization. Therefore we put the docker login config in
|
16
|
+
# the provider config here.
|
17
|
+
{%- if docker_login_config is not none %}
|
18
|
+
docker_login_config:
|
19
|
+
username: |-
|
20
|
+
{{docker_login_config.username}}
|
21
|
+
password: |-
|
22
|
+
{{docker_login_config.password}}
|
23
|
+
server: |-
|
24
|
+
{{docker_login_config.server}}
|
25
|
+
{%- endif %}
|
13
26
|
|
14
27
|
auth:
|
15
|
-
ssh_user:
|
28
|
+
ssh_user: {{docker_username_for_runpod}}
|
16
29
|
ssh_private_key: {{ssh_private_key}}
|
17
30
|
|
18
31
|
available_node_types:
|
@@ -21,6 +34,11 @@ available_node_types:
|
|
21
34
|
node_config:
|
22
35
|
InstanceType: {{instance_type}}
|
23
36
|
DiskSize: {{disk_size}}
|
37
|
+
ImageId: {{image_id}}
|
38
|
+
PublicKey: |-
|
39
|
+
skypilot:ssh_public_key_content
|
40
|
+
Preemptible: {{use_spot}}
|
41
|
+
BidPerGPU: {{bid_per_gpu}}
|
24
42
|
|
25
43
|
head_node_type: ray_head_default
|
26
44
|
|
@@ -30,6 +48,7 @@ file_mounts: {
|
|
30
48
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
31
49
|
{%- for remote_path, local_path in credentials.items() %}
|
32
50
|
"{{remote_path}}": "{{local_path}}",
|
51
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
33
52
|
{%- endfor %}
|
34
53
|
}
|
35
54
|
|
@@ -46,13 +65,16 @@ initialization_commands: []
|
|
46
65
|
# current num items (num SSH connections): 1
|
47
66
|
setup_commands:
|
48
67
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
49
|
-
#
|
68
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
50
69
|
# Line 'rm ..': there is another installation of pip.
|
51
70
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
52
71
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
53
72
|
# Line 'mkdir -p ..': disable host key check
|
54
73
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
55
|
-
-
|
74
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
75
|
+
{{ initial_setup_command }}
|
76
|
+
{%- endfor %}
|
77
|
+
sudo systemctl stop unattended-upgrades || true;
|
56
78
|
sudo systemctl disable unattended-upgrades || true;
|
57
79
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
58
80
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
@@ -65,7 +87,7 @@ setup_commands:
|
|
65
87
|
touch ~/.sudo_as_admin_successful;
|
66
88
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
67
89
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
68
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
90
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
69
91
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
70
92
|
|
71
93
|
# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
|
sky/templates/scp-ray.yml.j2
CHANGED
@@ -46,6 +46,7 @@ file_mounts: {
|
|
46
46
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
47
47
|
{%- for remote_path, local_path in credentials.items() %}
|
48
48
|
"{{remote_path}}": "{{local_path}}",
|
49
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
49
50
|
{%- endfor %}
|
50
51
|
}
|
51
52
|
|
@@ -61,8 +62,8 @@ initialization_commands: []
|
|
61
62
|
# Increment the following for catching performance bugs easier:
|
62
63
|
# current num items (num SSH connections): 1
|
63
64
|
setup_commands:
|
65
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
64
66
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning, before the process starts, to avoid being blocked. (This is a temporary fix.)
|
65
|
-
# Create ~/.ssh/config file in case the file does not exist in the custom image.
|
66
67
|
# We set auto_activate_base to be false for pre-installed conda.
|
67
68
|
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
|
68
69
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
@@ -74,7 +75,7 @@ setup_commands:
|
|
74
75
|
{{ ray_skypilot_installation_commands }}
|
75
76
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
76
77
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
77
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
78
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
78
79
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
79
80
|
|
80
81
|
# Command to start ray on the head node. You don't need to change this.
|
@@ -15,6 +15,11 @@ setup: |
|
|
15
15
|
{{cmd}}
|
16
16
|
{%- endfor %}
|
17
17
|
|
18
|
+
{% if controller_envs.get('SKYPILOT_DEV') != '0' %}
|
19
|
+
grep -q 'export SKYPILOT_DEV=' ~/.bashrc || echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
|
20
|
+
grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
|
21
|
+
{% endif %}
|
22
|
+
|
18
23
|
# Install serve dependencies.
|
19
24
|
# TODO(tian): Gather those into serve constants.
|
20
25
|
pip list | grep uvicorn > /dev/null 2>&1 || pip install uvicorn > /dev/null 2>&1
|
@@ -23,10 +28,16 @@ setup: |
|
|
23
28
|
|
24
29
|
file_mounts:
|
25
30
|
{{remote_task_yaml_path}}: {{local_task_yaml_path}}
|
26
|
-
{
|
31
|
+
{%- if local_user_config_path is not none %}
|
32
|
+
{{remote_user_config_path}}: {{local_user_config_path}}
|
33
|
+
{%- endif %}
|
27
34
|
{%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
|
28
35
|
{{remote_catalog_path}}: {{local_catalog_path}}
|
29
36
|
{%- endfor %}
|
37
|
+
{%- if use_tls %}
|
38
|
+
{{remote_tls_keyfile}}: {{local_tls_keyfile}}
|
39
|
+
{{remote_tls_certfile}}: {{local_tls_certfile}}
|
40
|
+
{%- endif %}
|
30
41
|
|
31
42
|
run: |
|
32
43
|
# Activate the Python environment, so that cloud SDKs can be found in the
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
# Check if all required arguments are provided
|
4
|
+
if [ "$#" -ne 4 ]; then
|
5
|
+
echo "Usage: $0 <api_server> <context> <namespace> <pod_name>" >&2
|
6
|
+
exit 1
|
7
|
+
fi
|
8
|
+
|
9
|
+
API_SERVER="$1"
|
10
|
+
CONTEXT="$2"
|
11
|
+
NAMESPACE="$3"
|
12
|
+
POD_NAME="$4"
|
13
|
+
|
14
|
+
# Extract host and port from API_SERVER
|
15
|
+
HOST=$(echo $API_SERVER | cut -d: -f1)
|
16
|
+
PORT=$(echo $API_SERVER | cut -d: -f2)
|
17
|
+
|
18
|
+
# Check if nc is installed
|
19
|
+
if ! command -v nc &> /dev/null
|
20
|
+
then
|
21
|
+
echo "nc (netcat) could not be found. Please install it first." >&2
|
22
|
+
echo "You can install it using: sudo apt-get install netcat" >&2
|
23
|
+
exit 1
|
24
|
+
fi
|
25
|
+
|
26
|
+
# Construct the WebSocket upgrade request
|
27
|
+
UPGRADE_REQUEST="GET /ssh-proxy?context=$CONTEXT&namespace=$NAMESPACE&pod_name=$POD_NAME HTTP/1.1\r\n"
|
28
|
+
UPGRADE_REQUEST+="Host: $API_SERVER\r\n"
|
29
|
+
UPGRADE_REQUEST+="Upgrade: websocket\r\n"
|
30
|
+
UPGRADE_REQUEST+="Connection: Upgrade\r\n"
|
31
|
+
UPGRADE_REQUEST+="Sec-WebSocket-Key: dGhlIHNhbXBsZSBub25jZQ==\r\n"
|
32
|
+
UPGRADE_REQUEST+="Sec-WebSocket-Version: 13\r\n"
|
33
|
+
UPGRADE_REQUEST+="\r\n"
|
34
|
+
|
35
|
+
# Send the upgrade request and then relay data
|
36
|
+
(echo -en "$UPGRADE_REQUEST"; cat) | nc $HOST $PORT
|
@@ -0,0 +1,70 @@
|
|
1
|
+
cluster_name: {{cluster_name_on_cloud}}
|
2
|
+
|
3
|
+
# The maximum number of worker nodes to launch in addition to the head node.
|
4
|
+
max_workers: {{num_nodes - 1}}
|
5
|
+
upscaling_speed: {{num_nodes - 1}}
|
6
|
+
idle_timeout_minutes: 60
|
7
|
+
|
8
|
+
provider:
|
9
|
+
type: external
|
10
|
+
module: sky.provision.vast
|
11
|
+
region: "{{region}}"
|
12
|
+
disable_launch_config_check: true
|
13
|
+
|
14
|
+
auth:
|
15
|
+
ssh_user: root
|
16
|
+
ssh_private_key: {{ssh_private_key}}
|
17
|
+
|
18
|
+
available_node_types:
|
19
|
+
ray_head_default:
|
20
|
+
resources: {}
|
21
|
+
node_config:
|
22
|
+
InstanceType: {{instance_type}}
|
23
|
+
DiskSize: {{disk_size}}
|
24
|
+
ImageId: {{image_id}}
|
25
|
+
Preemptible: {{use_spot}}
|
26
|
+
PublicKey: |-
|
27
|
+
skypilot:ssh_public_key_content
|
28
|
+
|
29
|
+
head_node_type: ray_head_default
|
30
|
+
|
31
|
+
# Format: `REMOTE_PATH : LOCAL_PATH`
|
32
|
+
file_mounts: {
|
33
|
+
"{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
|
34
|
+
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
35
|
+
{%- for remote_path, local_path in credentials.items() %}
|
36
|
+
"{{remote_path}}": "{{local_path}}",
|
37
|
+
{%- endfor %}
|
38
|
+
}
|
39
|
+
|
40
|
+
rsync_exclude: []
|
41
|
+
|
42
|
+
initialization_commands: []
|
43
|
+
|
44
|
+
# List of shell commands to run to set up nodes.
|
45
|
+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
|
46
|
+
# connection, which is expensive. Try your best to co-locate commands into fewer
|
47
|
+
# items!
|
48
|
+
#
|
49
|
+
# Increment the following for catching performance bugs easier:
|
50
|
+
# current num items (num SSH connections): 1
|
51
|
+
setup_commands:
|
52
|
+
# Create ~/.ssh/config file in case the file does not exist in the image.
|
53
|
+
# Line 'rm ..': there is another installation of pip.
|
54
|
+
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
55
|
+
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
56
|
+
# Line 'mkdir -p ..': disable host key check
|
57
|
+
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
58
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
59
|
+
{{ initial_setup_command }}
|
60
|
+
{%- endfor %}
|
61
|
+
mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
|
62
|
+
{{ conda_installation_commands }}
|
63
|
+
{{ ray_skypilot_installation_commands }}
|
64
|
+
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
65
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
66
|
+
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
67
|
+
|
68
|
+
|
69
|
+
# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
|
70
|
+
# We do not need to list it here anymore.
|
sky/templates/vsphere-ray.yml.j2
CHANGED
@@ -14,6 +14,7 @@ provider:
|
|
14
14
|
auth:
|
15
15
|
ssh_user: ubuntu
|
16
16
|
ssh_private_key: {{ssh_private_key}}
|
17
|
+
ssh_public_key: skypilot:ssh_public_key_content
|
17
18
|
|
18
19
|
available_node_types:
|
19
20
|
ray.head.default:
|
@@ -29,6 +30,7 @@ file_mounts: {
|
|
29
30
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
30
31
|
{%- for remote_path, local_path in credentials.items() %}
|
31
32
|
"{{remote_path}}": "{{local_path}}",
|
33
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
32
34
|
{%- endfor %}
|
33
35
|
}
|
34
36
|
|
@@ -45,13 +47,16 @@ initialization_commands: []
|
|
45
47
|
# current num items (num SSH connections): 1
|
46
48
|
setup_commands:
|
47
49
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning, before the process starts, to avoid being blocked. (This is a temporary fix.)
|
48
|
-
#
|
50
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
49
51
|
# Line 'rm ..': there is another installation of pip.
|
50
52
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
51
53
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
52
54
|
# Line 'mkdir -p ..': disable host key check
|
53
55
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
54
|
-
-
|
56
|
+
- {%- for initial_setup_command in initial_setup_commands %}
|
57
|
+
{{ initial_setup_command }}
|
58
|
+
{%- endfor %}
|
59
|
+
sudo systemctl stop unattended-upgrades || true;
|
55
60
|
sudo systemctl disable unattended-upgrades || true;
|
56
61
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
57
62
|
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
|
@@ -64,5 +69,5 @@ setup_commands:
|
|
64
69
|
{{ ray_skypilot_installation_commands }}
|
65
70
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
66
71
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
67
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
72
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
68
73
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""Starting a websocket with SkyPilot API server to proxy SSH to a k8s pod.
|
3
|
+
|
4
|
+
This script is useful for users who do not have local Kubernetes credentials.
|
5
|
+
"""
|
6
|
+
import asyncio
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
import websockets
|
11
|
+
|
12
|
+
|
13
|
+
async def main(url: str) -> None:
|
14
|
+
async with websockets.connect(url, ping_interval=None) as websocket:
|
15
|
+
if os.isatty(sys.stdin.fileno()):
|
16
|
+
# pylint: disable=import-outside-toplevel
|
17
|
+
import termios
|
18
|
+
import tty
|
19
|
+
old_settings = termios.tcgetattr(sys.stdin.fileno())
|
20
|
+
tty.setraw(sys.stdin.fileno())
|
21
|
+
else:
|
22
|
+
old_settings = None
|
23
|
+
|
24
|
+
try:
|
25
|
+
await asyncio.gather(stdin_to_websocket(websocket),
|
26
|
+
websocket_to_stdout(websocket))
|
27
|
+
finally:
|
28
|
+
if old_settings:
|
29
|
+
termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
|
30
|
+
old_settings)
|
31
|
+
|
32
|
+
|
33
|
+
async def stdin_to_websocket(websocket):
|
34
|
+
try:
|
35
|
+
while True:
|
36
|
+
data = await asyncio.get_event_loop().run_in_executor(
|
37
|
+
None, sys.stdin.buffer.read, 1)
|
38
|
+
if not data:
|
39
|
+
break
|
40
|
+
await websocket.send(data)
|
41
|
+
except Exception as e: # pylint: disable=broad-except
|
42
|
+
print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
|
43
|
+
finally:
|
44
|
+
await websocket.close()
|
45
|
+
|
46
|
+
|
47
|
+
async def websocket_to_stdout(websocket):
|
48
|
+
try:
|
49
|
+
while True:
|
50
|
+
message = await websocket.recv()
|
51
|
+
sys.stdout.buffer.write(message)
|
52
|
+
await asyncio.get_event_loop().run_in_executor(
|
53
|
+
None, sys.stdout.buffer.flush)
|
54
|
+
except websockets.exceptions.ConnectionClosed:
|
55
|
+
print('WebSocket connection closed', file=sys.stderr)
|
56
|
+
except Exception as e: # pylint: disable=broad-except
|
57
|
+
print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
|
58
|
+
|
59
|
+
|
60
|
+
if __name__ == '__main__':
|
61
|
+
server_url = sys.argv[1].strip('/')
|
62
|
+
websocket_url = (f'ws://{server_url}/kubernetes-pod-ssh-proxy'
|
63
|
+
f'?cluster_name={sys.argv[2]}')
|
64
|
+
asyncio.run(main(websocket_url))
|
sky/usage/constants.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
LOG_URL = 'http://usage.skypilot.co:9090/loki/api/v1/push' # pylint: disable=line-too-long
|
4
4
|
|
5
5
|
USAGE_MESSAGE_SCHEMA_VERSION = 1
|
6
|
-
|
7
6
|
PRIVACY_POLICY_PATH = '~/.sky/privacy_policy'
|
8
7
|
|
9
8
|
USAGE_POLICY_MESSAGE = (
|
@@ -15,3 +14,13 @@ USAGE_POLICY_MESSAGE = (
|
|
15
14
|
|
16
15
|
USAGE_MESSAGE_REDACT_KEYS = ['setup', 'run', 'envs']
|
17
16
|
USAGE_MESSAGE_REDACT_TYPES = {str, dict}
|
17
|
+
|
18
|
+
# Env var for the usage run id. This is used by the API server to associate
|
19
|
+
# the usage run id of a request from client to the actual functions invoked.
|
20
|
+
USAGE_RUN_ID_ENV_VAR = 'SKYPILOT_USAGE_RUN_ID'
|
21
|
+
|
22
|
+
# The file stores the usage run id on a remote cluster, so that the heartbeat
|
23
|
+
# on that remote cluster can be associated with the usage run id. This file is
|
24
|
+
# initialized when the cluster is first launched in:
|
25
|
+
# sky.provision.instance_setup.start_skylet_on_head_node
|
26
|
+
USAGE_RUN_ID_FILE = '~/.sky/usage_run_id'
|