skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ingress.yml.j2
CHANGED
@@ -2,9 +2,16 @@ ingress_spec:
   apiVersion: networking.k8s.io/v1
   kind: Ingress
   metadata:
+    labels:
+      {%- for label_key, label_value in labels.items() %}
+      {{ label_key }}: {{ label_value|tojson }}
+      {%- endfor %}
     annotations:
       nginx.ingress.kubernetes.io/use-regex: "true"
       nginx.ingress.kubernetes.io/rewrite-target: /$2
+      {%- for key, value in annotations.items() %}
+      {{ key }}: {{ value|tojson }}
+      {%- endfor %}
     name: {{ ingress_name }}
     namespace: {{ namespace }}
   spec:
sky/templates/kubernetes-loadbalancer.yml.j2
CHANGED
@@ -5,6 +5,13 @@ service_spec:
     name: {{ service_name }}
     labels:
       parent: skypilot
+      {%- for label_key, label_value in labels.items() %}
+      {{ label_key }}: {{ label_value|tojson }}
+      {%- endfor %}
+    annotations:
+      {%- for key, value in annotations.items() %}
+      {{ key }}: {{ value|tojson }}
+      {%- endfor %}
   spec:
     type: LoadBalancer
     selector:
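The two Kubernetes template hunks above thread user-supplied labels and annotations through Jinja's built-in `tojson` filter. The point of `tojson` is that it JSON-quotes the value, so a label value such as "true" or "8" reaches the Kubernetes API as a string instead of being re-parsed by YAML as a boolean or integer. A minimal sketch of the effect, assuming only the jinja2 package (the label names here are made up):

    # Sketch: the tojson filter JSON-quotes string values before they are
    # embedded in the rendered YAML.
    import jinja2

    line = jinja2.Template('{{ key }}: {{ value|tojson }}')
    print(line.render(key='skypilot-managed', value='true'))  # skypilot-managed: "true"
    print(line.render(key='gpu-count', value='8'))            # gpu-count: "8"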
sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh}
RENAMED
@@ -1,6 +1,41 @@
 #!/usr/bin/env bash
 set -uo pipefail
 
+KUBE_CONTEXT=""
+KUBE_NAMESPACE=""
+
+# Parse flags
+while getopts ":c:n:" opt; do
+  case ${opt} in
+    c)
+      KUBE_CONTEXT="$OPTARG"
+      ;;
+    n)
+      KUBE_NAMESPACE="$OPTARG"
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
+      exit 1
+      ;;
+    :)
+      echo "Option -$OPTARG requires an argument." >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Shift the processed options away so that $1 becomes the pod name
+shift $((OPTIND -1))
+
+# Check if pod name is passed as an argument
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
+  exit 1
+fi
+
+POD_NAME="$1" # The first positional argument is the name of the pod
+
 # Checks if socat is installed
 if ! command -v socat > /dev/null; then
   echo "Using 'port-forward' mode to run ssh session on Kubernetes instances requires 'socat' to be installed. Please install 'socat'" >&2
@@ -18,7 +53,21 @@ fi
 # This is preferred because of socket re-use issues in kubectl port-forward,
 # see - https://github.com/kubernetes/kubernetes/issues/74551#issuecomment-769185879
 KUBECTL_OUTPUT=$(mktemp)
-
+KUBECTL_ARGS=()
+
+if [ -n "$KUBE_CONTEXT" ]; then
+  KUBECTL_ARGS+=("--context=$KUBE_CONTEXT")
+fi
+# If context is not provided, it means we are using incluster auth. In this case,
+# we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
+if [ -z "$KUBE_CONTEXT" ]; then
+  KUBECTL_ARGS+=("--kubeconfig=/dev/null")
+fi
+if [ -n "$KUBE_NAMESPACE" ]; then
+  KUBECTL_ARGS+=("--namespace=$KUBE_NAMESPACE")
+fi
+
+kubectl "${KUBECTL_ARGS[@]}" port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 &
 
 # Capture the PID for the backgrounded kubectl command
 K8S_PORT_FWD_PID=$!
@@ -49,12 +98,7 @@ while ! nc -z 127.0.0.1 "${local_port}"; do
   sleep 0.1
 done
 
-# To avoid errors when many concurrent requests are sent (see https://github.com/skypilot-org/skypilot/issues/2628),
-# we add a random delay before establishing the socat connection.
-# Empirically, this needs to be at least 1 second. We set this to be random between 1 and 2 seconds.
-sleep $(shuf -i 10-20 -n 1 | awk '{printf "%f", $1/10}')
-
 # Establishes two directional byte streams to handle stdin/stdout between
 # terminal and the jump pod.
 # socat process terminates when port-forward terminates.
-socat - tcp:127.0.0.1:"${local_port}"
+socat - tcp:127.0.0.1:"${local_port}"
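One operational note on the proxy script above: flags are parsed with getopts before the positional shift, and getopts stops scanning at the first non-option argument, so -c/-n must appear before the pod name. The script is meant to be used as an SSH ProxyCommand; a hedged Python sketch of such an invocation (all paths, context names, and host names here are hypothetical):

    import subprocess

    # Flags first: getopts stops scanning at the first positional argument.
    proxy_cmd = ('bash ~/.sky/kubernetes-port-forward-proxy-command.sh '
                 '-c my-kube-context -n my-namespace my-cluster-head')

    # ssh speaks the SSH protocol over the proxy's stdin/stdout; socat inside
    # the script bridges those streams to the pod's port 22 opened by
    # `kubectl port-forward`.
    subprocess.run([
        'ssh',
        '-o', f'ProxyCommand={proxy_cmd}',
        '-o', 'StrictHostKeyChecking=no',
        '-i', '/home/me/.ssh/sky-key',  # hypothetical identity file
        'sky@placeholder-host',         # hostname is not used for routing; the proxy decides
        'echo', 'ok',
    ])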
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -18,12 +18,20 @@ provider:
 
   region: kubernetes
 
-
+
   namespace: {{k8s_namespace}}
 
+  # The kubecontext used to connect to the Kubernetes cluster.
+  {% if k8s_context is not none %}
+  context: {{k8s_context}}
+  {% endif %}
+
   # This should be one of KubernetesPortMode
   port_mode: {{k8s_port_mode}}
 
+  # The networking mode used to ssh to pods. One of KubernetesNetworkingMode.
+  networking_mode: {{k8s_networking_mode}}
+
   # We use internal IPs since we set up a port-forward between the kubernetes
   # cluster and the local machine, or directly use NodePort to reach the
   # head node.
@@ -214,7 +222,9 @@ provider:
       - protocol: TCP
         port: 22
         targetPort: 22
-    # Service that maps to the head node of the Ray cluster
+    # Service that maps to the head node of the Ray cluster, so that the
+    # worker nodes can find the head node using
+    # {{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local
     - apiVersion: v1
       kind: Service
       metadata:
@@ -227,18 +237,12 @@ provider:
         # names.
         name: {{cluster_name_on_cloud}}-head
       spec:
+        # Create a headless service so that the head node can be reached by
+        # the worker nodes with any port number.
+        clusterIP: None
         # This selector must match the head node pod's selector below.
         selector:
           component: {{cluster_name_on_cloud}}-head
-        ports:
-          - name: client
-            protocol: TCP
-            port: 10001
-            targetPort: 10001
-          - name: dashboard
-            protocol: TCP
-            port: 8265
-            targetPort: 8265
 
 # Specify the pod type for the ray head node (as configured below).
 head_node_type: ray_head_default
@@ -261,7 +265,7 @@ available_node_types:
           skypilot-user: {{ user }}
           # Custom tags for the pods
           {%- for label_key, label_value in labels.items() %}
-          {{ label_key }}: {{ label_value }}
+          {{ label_key }}: {{ label_value|tojson }}
           {%- endfor %}
         {% if k8s_fuse_device_required %}
         annotations:
@@ -272,13 +276,28 @@ available_node_types:
         # serviceAccountName: skypilot-service-account
         serviceAccountName: {{k8s_service_account_name}}
         automountServiceAccountToken: {{k8s_automount_sa_token}}
-
         restartPolicy: Never
 
-        # Add node selector if
-        {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
+        # Add node selector if GPU/TPUs are requested:
+        {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
        nodeSelector:
+          {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
           {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
+          {% endif %}
+          {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
+          {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
+          {% endif %}
+          {% if k8s_spot_label_key is not none %}
+          {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
+          {% endif %}
+        {% endif %}
+
+        {% if k8s_spot_label_key is not none %}
+        tolerations:
+          - key: {{k8s_spot_label_key}}
+            operator: Equal
+            value: {{k8s_spot_label_value|tojson}}
+            effect: NoSchedule
         {% endif %}
 
         # This volume allocates shared memory for Ray to use for its plasma
@@ -298,10 +317,209 @@ available_node_types:
         - name: ray-node
           imagePullPolicy: IfNotPresent
           image: {{image_id}}
+          env:
+          - name: SKYPILOT_POD_NODE_TYPE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['ray-node-type']
+          {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
+          - name: {{ key }}
+            value: {{ value }}
+          {% endfor %}
           # Do not change this command - it keeps the pod alive until it is
           # explicitly killed.
           command: ["/bin/bash", "-c", "--"]
-          args:
+          args:
+            - |
+              # For backwards compatibility, we put a marker file in the pod
+              # to indicate that the pod is running with the changes introduced
+              # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
+              # TODO: Remove this marker file and it's usage in setup_commands
+              # after v0.10.0 release.
+              touch /tmp/skypilot_is_nimbus
+
+              # Helper function to conditionally use sudo
+              # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
+              prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
+              [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
+
+              STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
+
+              # STEP 1: Run apt update, install missing packages, and set up ssh.
+              (
+                (
+                  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
+                    echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
+                  PACKAGES="rsync curl netcat gcc patch pciutils fuse openssh-server";
+
+                  # Separate packages into two groups: packages that are installed first
+                  # so that curl and rsync are available sooner to unblock the following
+                  # conda installation and rsync.
+                  set -e
+                  INSTALL_FIRST="";
+                  MISSING_PACKAGES="";
+                  for pkg in $PACKAGES; do
+                    if [ "$pkg" == "netcat" ]; then
+                      if ! dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; then
+                        INSTALL_FIRST="$INSTALL_FIRST netcat-openbsd";
+                      fi
+                    elif ! dpkg -l | grep -q "^ii $pkg "; then
+                      if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
+                        INSTALL_FIRST="$INSTALL_FIRST $pkg";
+                      else
+                        MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
+                      fi
+                    fi
+                  done;
+                  if [ ! -z "$INSTALL_FIRST" ]; then
+                    echo "Installing core packages: $INSTALL_FIRST";
+                    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
+                  fi;
+                  # SSH and other packages are not necessary, so we disable set -e
+                  set +e
+
+                  if [ ! -z "$MISSING_PACKAGES" ]; then
+                    echo "Installing missing packages: $MISSING_PACKAGES";
+                    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
+                  fi;
+
+                  $(prefix_cmd) mkdir -p /var/run/sshd;
+                  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
+                  $(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
+                  cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A;
+                  $(prefix_cmd) mkdir -p ~/.ssh;
+                  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
+                  $(prefix_cmd) chmod 700 ~/.ssh;
+                  $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
+                  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
+                  $(prefix_cmd) service ssh restart;
+                  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
+                ) > /tmp/${STEPS[0]}.log 2>&1 || {
+                  echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
+                  cat /tmp/${STEPS[0]}.log
+                  exit 1
+                }
+              ) &
+
+              # STEP 2: Install conda, ray and skypilot (for dependencies); start
+              # ray cluster.
+              (
+                (
+                  set -e
+                  mkdir -p ~/.sky
+                  # Wait for `curl` package to be installed before installing conda
+                  # and ray.
+                  until dpkg -l | grep -q "^ii curl "; do
+                    sleep 0.1
+                    echo "Waiting for curl package to be installed..."
+                  done
+                  {{ conda_installation_commands }}
+                  {{ ray_installation_commands }}
+                  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
+                  touch /tmp/ray_skypilot_installation_complete
+                  echo "=== Ray and skypilot installation completed ==="
+
+                  # Disable set -e, as we have some commands that are ok to fail
+                  # after the ray start.
+                  # TODO(zhwu): this is a hack, we should fix the commands that are
+                  # ok to fail.
+                  if [ "$SKYPILOT_POD_NODE_TYPE" == "head" ]; then
+                    set +e
+                    {{ ray_head_start_command }}
+                  else
+                    # Start ray worker on the worker pod.
+                    # Wait until the head pod is available with an IP address
+                    export SKYPILOT_RAY_HEAD_IP="{{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local"
+                    export SKYPILOT_RAY_PORT={{skypilot_ray_port}}
+                    # Wait until the ray cluster is started on the head pod
+                    until dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; do
+                      sleep 0.1
+                      echo "Waiting for netcat package to be installed..."
+                    done
+                    until nc -z -w 1 ${SKYPILOT_RAY_HEAD_IP} ${SKYPILOT_RAY_PORT}; do
+                      sleep 0.1
+                    done
+
+                    set +e
+                    {{ ray_worker_start_command }}
+                  fi
+                ) > /tmp/${STEPS[1]}.log 2>&1 || {
+                  echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
+                  cat /tmp/${STEPS[1]}.log
+                  exit 1
+                }
+              ) &
+
+
+              # STEP 3: Set up environment variables; this should be relatively fast.
+              (
+                (
+                  set -e
+                  if [ $(id -u) -eq 0 ]; then
+                    echo 'alias sudo=""' >> ~/.bashrc; echo succeed;
+                  else
+                    if command -v sudo >/dev/null 2>&1; then
+                      timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || { echo 52; exit 52; };
+                    else
+                      { echo 52; exit 52; };
+                    fi;
+                  fi;
+                  printenv | while IFS='=' read -r key value; do echo "export $key=\"$value\""; done > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
+                ) > /tmp/${STEPS[2]}.log 2>&1 || {
+                  echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
+                  cat /tmp/${STEPS[2]}.log
+                  exit 1
+                }
+              ) &
+
+              function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
+
+              # Tails file and checks every 5 sec for
+              # open file handlers with write access
+              # closes if none exist
+              monitor_file() {
+                tail -f $file &
+                TAIL_PID=$!
+                while kill -0 $TAIL_PID 2> /dev/null; do
+                  # only two PIDs should be accessing the file
+                  # the log appender and log tailer
+                  if [ $(mylsof $file | wc -l) -lt 2 ]; then
+                    kill $TAIL_PID
+                    break
+                  fi
+                  # Sleep for 5 seconds before checking again. Do not make this
+                  # too short as it will consume CPU, and too long will cause
+                  # the file to be closed too late keeping the pod alive.
+                  sleep 5
+                done
+              }
+
+              log_tail() {
+                FILE_PATTERN="~/sky_logs/*/tasks/*.log"
+                while ! ls $(eval echo $FILE_PATTERN) 1> /dev/null 2>&1; do
+                  sleep 1
+                done
+
+                # Keep track of already monitored files
+                already_monitored=""
+
+                # Infinite loop to continuously check for new files
+                while true; do
+                  for file in $(eval echo $FILE_PATTERN); do
+                    if echo $already_monitored | grep -q $file; then
+                      # File is already being monitored
+                      continue
+                    fi
+
+                    # Monitor the new file
+                    monitor_file $file &
+                    already_monitored="${already_monitored} ${file}"
+                  done
+                  sleep 0.1
+                done
+              }
+              trap : TERM INT; log_tail || sleep infinity & wait
+
           ports:
           - containerPort: 22 # Used for SSH
           - containerPort: {{ray_port}} # Redis port
@@ -330,34 +548,82 @@ available_node_types:
             requests:
              cpu: {{cpus}}
               memory: {{memory}}G
-
+              {% if k8s_resource_key is not none %}
+              # Number of requested google.com/tpu must be equal to the total
+              # number of available TPU chips on the TPU slice node either it
+              # being a node from multi-host TPU slice or single-host TPU
+              # slice. Example reference:
+              # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
+              {{k8s_resource_key}}: {{accelerator_count}}
+              {% endif %}
               {% if k8s_fuse_device_required %}
               # Kubernetes resource exposed by the fuse device manager
               # https://gitlab.com/arm-research/smarter/smarter-device-manager
               smarter-devices/fuse: "1"
               {% endif %}
+            {% if k8s_resource_key is not none or k8s_fuse_device_required %}
             limits:
-
+              # Limits need to be defined for GPU/TPU requests
+              {% if k8s_resource_key is not none %}
+              {{k8s_resource_key}}: {{accelerator_count}}
+              {% endif %}
               {% if k8s_fuse_device_required %}
               smarter-devices/fuse: "1"
               {% endif %}
-
+            {% endif %}
+
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  #
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-
+  # Line 'for step in ..': check if any failure indicator exists for the setup done in pod args and print the error message. This is only a best effort, as the
+  # commands in pod args are asynchronous and we cannot guarantee the failure indicators are created before the setup commands finish.
+  - |
     mkdir -p ~/.ssh; touch ~/.ssh/config;
-    {
-    {{
+    {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
+    start_epoch=$(date +%s);
+    echo "=== Logs for asynchronous ray and skypilot installation ===";
+    if [ -f /tmp/skypilot_is_nimbus ]; then
+      echo "=== Logs for asynchronous ray and skypilot installation ===";
+      [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
+        { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+      [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
+    fi
+    end_epoch=$(date +%s);
+    echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
+    start_epoch=$(date +%s);
+    {{ skypilot_wheel_installation_commands }}
+    end_epoch=$(date +%s);
+    echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
+    start_epoch=$(date +%s);
     sudo touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
-    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf');
-
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf');
+    ulimit -n 1048576;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
+    end_epoch=$(date +%s);
+    echo "=== Setup system configs and fuse completed in $(($end_epoch - $start_epoch)) secs ===";
+    for step in $STEPS; do [ -f "/tmp/${step}.failed" ] && { echo "Error: /tmp/${step}.failed found:"; cat /tmp/${step}.log; exit 1; } || true; done;
+    {% if tpu_requested %}
+    # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from
+    # the TPU runtime, are written. These capture runtime information about the
+    # TPU execution, including any warnings, errors, or general activity of
+    # the TPU driver. By default, the /tmp/tpu_logs directory is created with
+    # 755 permissions, and the user of the provisioned pod is not necessarily
+    # a root. Hence, we need to update the write permission so the logs can be
+    # properly written.
+    # TODO(Doyoung): Investigate to see why TPU workload fails to run without
+    # execution permission, such as granting 766 to log file. Check if it's a
+    # must and see if there's a workaround to grant minimum permission.
+    sudo chmod 777 /tmp/tpu_logs;
+    {% endif %}
 
 # Format: `REMOTE_PATH : LOCAL_PATH`
 file_mounts: {
@@ -365,6 +631,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
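The large `args:` block added to kubernetes-ray.yml.j2 above replaces a sequential setup with three backgrounded subshells ("apt-ssh-setup", "runtime-setup", "env-setup"). Each step logs to /tmp/<step>.log and drops a /tmp/<step>.failed marker on error, and the later setup_commands poll the marker files rather than the processes. A hedged Python sketch of the same marker-file pattern (the step commands are stand-ins, not the template's real ones):

    import pathlib
    import subprocess

    # Stand-in commands; the real steps install apt packages, conda/ray/skypilot,
    # and environment files.
    STEPS = {
        'apt-ssh-setup': 'echo installing packages',
        'runtime-setup': 'echo installing runtime',
        'env-setup': 'echo exporting env',
    }

    procs = []
    for name, cmd in STEPS.items():
        log = open(f'/tmp/{name}.log', 'w')  # each step gets its own log file
        procs.append((name, subprocess.Popen(cmd, shell=True, stdout=log, stderr=log)))

    for name, proc in procs:
        if proc.wait() != 0:
            # Marker file: a later phase checks for its existence instead of
            # holding on to the process handle.
            pathlib.Path(f'/tmp/{name}.failed').touch()

    failed = [n for n in STEPS if pathlib.Path(f'/tmp/{n}.failed').exists()]
    if failed:
        raise SystemExit(f'steps failed: {failed}')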
sky/templates/lambda-ray.yml.j2
CHANGED
@@ -5,9 +5,29 @@ max_workers: {{num_nodes - 1}}
 upscaling_speed: {{num_nodes - 1}}
 idle_timeout_minutes: 60
 
+{%- if docker_image is not none %}
+docker:
+  image: {{docker_image}}
+  container_name: {{docker_container_name}}
+  run_options:
+    - --ulimit nofile=1048576:1048576
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
+{%- endif %}
+
 provider:
   type: external
-  module: sky.
+  module: sky.provision.lambda
   region: {{region}}
   # Disable launch config check for worker nodes as it can cause resource
   # leakage.
@@ -25,14 +45,6 @@ available_node_types:
     resources: {}
     node_config:
       InstanceType: {{instance_type}}
-  {% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      InstanceType: {{instance_type}}
-  {%- endif %}
 
 head_node_type: ray_head_default
 
@@ -42,6 +54,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -58,13 +71,16 @@ initialization_commands: []
 # current num items (num SSH connections): 1
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  #
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'rm ..': there is another installation of pip.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  -
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
     sudo systemctl disable unattended-upgrades || true;
     sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
     sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -78,34 +94,8 @@ setup_commands:
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
 
-# Command to start ray
-#
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.