skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/dag_utils.py
CHANGED
@@ -3,16 +3,16 @@ import copy
 from typing import Any, Dict, List, Optional, Tuple

 from sky import dag as dag_lib
-from sky import jobs
 from sky import sky_logging
 from sky import task as task_lib
-from sky.backends import backend_utils
+from sky.utils import cluster_utils
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import ux_utils

 logger = sky_logging.init_logger(__name__)

-# Message thrown when APIs sky.{exec,launch,
+# Message thrown when APIs sky.{exec,launch,jobs.launch}() received a string
 # instead of a Dag. CLI (cli.py) is implemented by us so should not trigger
 # this.
 _ENTRYPOINT_STRING_AS_DAG_MESSAGE = """\
@@ -31,54 +31,43 @@ The command can then be run as:

     sky.launch(task, ...)

-    sky.
+    sky.jobs.launch(task, ...)
 """.strip()


 def convert_entrypoint_to_dag(entrypoint: Any) -> 'dag_lib.Dag':
-    """
+    """Converts the entrypoint to a sky.Dag and applies the policy.

     Raises TypeError if 'entrypoint' is not a 'sky.Task' or 'sky.Dag'.
     """
     # Not suppressing stacktrace: when calling this via API user may want to
     # see their own program in the stacktrace. Our CLI impl would not trigger
     # these errors.
+    converted_dag: 'dag_lib.Dag'
     if isinstance(entrypoint, str):
         with ux_utils.print_exception_no_traceback():
             raise TypeError(_ENTRYPOINT_STRING_AS_DAG_MESSAGE)
     elif isinstance(entrypoint, dag_lib.Dag):
-        return copy.deepcopy(entrypoint)
+        converted_dag = copy.deepcopy(entrypoint)
     elif isinstance(entrypoint, task_lib.Task):
         entrypoint = copy.deepcopy(entrypoint)
         with dag_lib.Dag() as dag:
             dag.add(entrypoint)
             dag.name = entrypoint.name
-        return dag
+        converted_dag = dag
     else:
         with ux_utils.print_exception_no_traceback():
             raise TypeError(
                 'Expected a sky.Task or sky.Dag but received argument of type: '
                 f'{type(entrypoint)}')

+    return converted_dag

-def load_chain_dag_from_yaml(
-    path: str,
-    env_overrides: Optional[List[Tuple[str, str]]] = None,
-) -> dag_lib.Dag:
-    """Loads a chain DAG from a YAML file.
-
-    Has special handling for an initial section in YAML that contains only the
-    'name' field, which is the DAG name.

-    'env_overrides' is a list of (key, value) pairs that will be used to update
-    the task's 'envs' section. If it is a chain dag, the envs will be updated
-    for all tasks in the chain.
-
-    Returns:
-        A chain Dag with 1 or more tasks (an empty entrypoint would create a
-        trivial task).
-    """
-    configs = common_utils.read_yaml_all(path)
+def _load_chain_dag(
+        configs: List[Dict[str, Any]],
+        env_overrides: Optional[List[Tuple[str, str]]] = None) -> dag_lib.Dag:
+    """Loads a chain DAG from a list of YAML configs."""
     dag_name = None
     if set(configs[0].keys()) == {'name'}:
         dag_name = configs[0]['name']
@@ -86,7 +75,7 @@ def load_chain_dag_from_yaml(
     elif len(configs) == 1:
         dag_name = configs[0].get('name')

-    if
+    if not configs:
         # YAML has only `name: xxx`. Still instantiate a task.
         configs = [{'name': dag_name}]

@@ -103,12 +92,74 @@
     return dag


-def dump_chain_dag_to_yaml(dag: dag_lib.Dag, path: str):
+def load_chain_dag_from_yaml(
+    path: str,
+    env_overrides: Optional[List[Tuple[str, str]]] = None,
+) -> dag_lib.Dag:
+    """Loads a chain DAG from a YAML file.
+
+    Has special handling for an initial section in YAML that contains only the
+    'name' field, which is the DAG name.
+
+    'env_overrides' is a list of (key, value) pairs that will be used to update
+    the task's 'envs' section. If it is a chain dag, the envs will be updated
+    for all tasks in the chain.
+
+    Returns:
+        A chain Dag with 1 or more tasks (an empty entrypoint would create a
+        trivial task).
+    """
+    configs = common_utils.read_yaml_all(path)
+    return _load_chain_dag(configs, env_overrides)
+
+
+def load_chain_dag_from_yaml_str(
+    yaml_str: str,
+    env_overrides: Optional[List[Tuple[str, str]]] = None,
+) -> dag_lib.Dag:
+    """Loads a chain DAG from a YAML string.
+
+    Has special handling for an initial section in YAML that contains only the
+    'name' field, which is the DAG name.
+
+    'env_overrides' is a list of (key, value) pairs that will be used to update
+    the task's 'envs' section. If it is a chain dag, the envs will be updated
+    for all tasks in the chain.
+
+    Returns:
+        A chain Dag with 1 or more tasks (an empty entrypoint would create a
+        trivial task).
+    """
+    configs = common_utils.read_yaml_all_str(yaml_str)
+    return _load_chain_dag(configs, env_overrides)
+
+
+def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
+    """Dumps a chain DAG to a YAML string.
+
+    Args:
+        dag: the DAG to dump.
+
+    Returns:
+        The YAML string.
+    """
     assert dag.is_chain(), dag
     configs = [{'name': dag.name}]
     for task in dag.tasks:
         configs.append(task.to_yaml_config())
-    common_utils.dump_yaml(path, configs)
+    return common_utils.dump_yaml_str(configs)
+
+
+def dump_chain_dag_to_yaml(dag: dag_lib.Dag, path: str) -> None:
+    """Dumps a chain DAG to a YAML file.
+
+    Args:
+        dag: the DAG to dump.
+        path: the path to the YAML file.
+    """
+    dag_str = dump_chain_dag_to_yaml_str(dag)
+    with open(path, 'w', encoding='utf-8') as f:
+        f.write(dag_str)


 def maybe_infer_and_fill_dag_and_task_names(dag: dag_lib.Dag) -> None:
@@ -125,7 +176,7 @@ def maybe_infer_and_fill_dag_and_task_names(dag: dag_lib.Dag) -> None:
         dag.name = first_task.name

     if dag.name is None:
-        dag.name = backend_utils.generate_cluster_name()
+        dag.name = cluster_utils.generate_cluster_name()

     if len(dag.tasks) == 1:
         if first_task.name is None:
@@ -140,11 +191,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
     for task_ in dag.tasks:

         new_resources_list = []
+        default_strategy = registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default
+        assert default_strategy is not None
         for resources in list(task_.resources):
-            change_default_value: Dict[str, Any] = {}
-            if resources.job_recovery is None:
-                change_default_value[
-                    'job_recovery'] = jobs.DEFAULT_RECOVERY_STRATEGY
+            original_job_recovery = resources.job_recovery
+            job_recovery = {'strategy': default_strategy}
+            if isinstance(original_job_recovery, str):
+                job_recovery['strategy'] = original_job_recovery
+            elif isinstance(original_job_recovery, dict):
+                job_recovery.update(original_job_recovery)
+                strategy = job_recovery.get('strategy')
+                if strategy is None:
+                    job_recovery['strategy'] = default_strategy
+            change_default_value: Dict[str, Any] = {
+                'job_recovery': job_recovery
+            }

             new_resources = resources.copy(**change_default_value)
             new_resources_list.append(new_resources)
sky/utils/db_utils.py
CHANGED
@@ -4,11 +4,27 @@ import sqlite3
 import threading
 from typing import Any, Callable, Optional

+# This parameter (passed to sqlite3.connect) controls how long we will wait to
+# obtain a database lock (not necessarily during connection, but whenever it is
+# needed). It is not a connection timeout.
+# Even in WAL mode, only a single writer is allowed at a time. Other writers
+# will block until the write lock can be obtained. This behavior is described in
+# the SQLite documentation for WAL: https://www.sqlite.org/wal.html
+# Python's default timeout is 5s. In normal usage, lock contention is very low,
+# and this is more than sufficient. However, in some highly concurrent cases,
+# such as a jobs controller suddenly recovering thousands of jobs at once, we
+# can see a small number of processes that take much longer to obtain the lock.
+# In contrived highly contentious cases, around 0.1% of transactions will take
+# >30s to take the lock. We have not seen cases that take >60s. For cases up to
+# 1000x parallelism, this is thus thought to be a conservative setting.
+# For more info, see the PR description for #4552.
+_DB_TIMEOUT_S = 60
+

 @contextlib.contextmanager
 def safe_cursor(db_path: str):
     """A newly created, auto-committing, auto-closing cursor."""
-    conn = sqlite3.connect(db_path)
+    conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
     cursor = conn.cursor()
     try:
         yield cursor
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
     def __init__(self, db_path: str, create_table: Callable):
         super().__init__()
         self.db_path = db_path
-
-        # errors. This is a hack, but it works.
-        self.conn = sqlite3.connect(db_path, timeout=10)
+        self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
         self.cursor = self.conn.cursor()
         create_table(self.cursor, self.conn)
sky/utils/env_options.py
CHANGED
@@ -1,21 +1,43 @@
 """Global environment options for sky."""
 import enum
 import os
+from typing import Dict


 class Options(enum.Enum):
     """Environment variables for SkyPilot."""
-    IS_DEVELOPER = 'SKYPILOT_DEV'
-    SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG'
-    DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION'
-    MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING'
+
+    # (env var name, default value)
+    IS_DEVELOPER = ('SKYPILOT_DEV', False)
+    SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)
+    DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False)
+    MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
+    SUPPRESS_SENSITIVE_LOG = ('SKYPILOT_SUPPRESS_SENSITIVE_LOG', False)
     # Internal: this is used to skip the cloud user identity check, which is
     # used to protect cluster operations in a multi-identity scenario.
     # Currently, this is only used in the job and serve controller, as there
     # will not be multiple identities, and skipping the check can increase
     # robustness.
-    SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK'
+    SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False)
+
+    def __init__(self, env_var: str, default: bool) -> None:
+        self.env_var = env_var
+        self.default = default
+
+    def __repr__(self) -> str:
+        return self.env_var

-    def get(self):
+    def get(self) -> bool:
         """Check if an environment variable is set to True."""
-        return os.getenv(self.value, 'False').lower() in ('true', '1')
+        return os.getenv(self.env_var,
+                         str(self.default)).lower() in ('true', '1')
+
+    @property
+    def env_key(self) -> str:
+        """The environment variable key name."""
+        return self.value[0]
+
+    @classmethod
+    def all_options(cls) -> Dict[str, bool]:
+        """Returns all options as a dictionary."""
+        return {option.env_key: option.get() for option in list(Options)}
sky/utils/kubernetes/create_cluster.sh
CHANGED
@@ -12,9 +12,11 @@ IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"
 PORT_RANGE_START=30000
 PORT_RANGE_END=30100

+USER_HASH=$1
+
 # Check for GPU flag
 ENABLE_GPUS=false
-if [[ "$1" == "--gpus" ]]; then
+if [[ "$2" == "--gpus" ]]; then
   ENABLE_GPUS=true
 fi

@@ -88,45 +90,20 @@ if kind get clusters | grep -q skypilot; then
 fi

 # Generate cluster YAML
-echo "Generating /tmp/skypilot-kind.yaml"
+YAML_PATH="/tmp/skypilot-kind-$USER_HASH.yaml"
+echo "Generating $YAML_PATH"

 # Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
 if $ENABLE_GPUS; then
-  python -m sky.utils.kubernetes.generate_kind_config --path /tmp/skypilot-kind.yaml --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
+  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
 else
-  python -m sky.utils.kubernetes.generate_kind_config --path /tmp/skypilot-kind.yaml --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
+  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
 fi

-kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot
+kind create cluster --config $YAML_PATH --name skypilot

 echo "Kind cluster created."

-# Function to wait for SkyPilot GPU labeling jobs to complete
-wait_for_gpu_labeling_jobs() {
-  echo "Starting wait for SkyPilot GPU labeling jobs to complete..."
-
-  SECONDS=0
-  TIMEOUT=600  # 10 minutes in seconds
-
-  while true; do
-    TOTAL_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | wc -l)
-    COMPLETED_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | grep "1/1" | wc -l)
-
-    if [[ $COMPLETED_JOBS -eq $TOTAL_JOBS ]]; then
-      echo "All SkyPilot GPU labeling jobs completed ($TOTAL_JOBS)."
-      break
-    elif [ $SECONDS -ge $TIMEOUT ]; then
-      echo "Timeout reached while waiting for GPU labeling jobs."
-      exit 1
-    else
-      echo "Waiting for GPU labeling jobs to complete... ($COMPLETED_JOBS/$TOTAL_JOBS completed)"
-      echo "To check status, see GPU labeling pods:"
-      echo "kubectl get jobs -n kube-system -l job=sky-gpu-labeler"
-      sleep 5
-    fi
-  done
-}
-
 # Function to wait for GPU operator to be correctly installed
 wait_for_gpu_operator_installation() {
   echo "Starting wait for GPU operator installation..."
@@ -150,22 +127,6 @@ wait_for_gpu_operator_installation() {
   done
 }

-wait_for_skypilot_gpu_image_pull() {
-  echo "Pulling SkyPilot GPU image..."
-  docker pull ${IMAGE_GPU}
-  echo "Loading SkyPilot GPU image into kind cluster..."
-  kind load docker-image --name skypilot ${IMAGE_GPU}
-  echo "SkyPilot GPU image loaded into kind cluster."
-}
-
-wait_for_skypilot_cpu_image_pull() {
-  echo "Pulling SkyPilot CPU image..."
-  docker pull ${IMAGE}
-  echo "Loading SkyPilot CPU image into kind cluster..."
-  kind load docker-image --name skypilot ${IMAGE}
-  echo "SkyPilot CPU image loaded into kind cluster."
-}
-
 wait_for_nginx_ingress_controller_install() {
   echo "Starting installation of Nginx Ingress Controller..."

@@ -206,21 +167,8 @@ if $ENABLE_GPUS; then
     nvidia/gpu-operator --set driver.enabled=false
   # Wait for GPU operator installation to succeed
   wait_for_gpu_operator_installation
-
-  # Load the SkyPilot GPU image into the cluster for faster labelling
-  wait_for_skypilot_gpu_image_pull
-
-  # Label nodes with GPUs
-  echo "Labelling nodes with GPUs..."
-  python -m sky.utils.kubernetes.gpu_labeler
-
-  # Wait for all the GPU labeling jobs to complete
-  wait_for_gpu_labeling_jobs
 fi

-# Load local skypilot image on to the cluster for faster startup
-wait_for_skypilot_cpu_image_pull
-
 # Install the Nginx Ingress Controller
 wait_for_nginx_ingress_controller_install

sky/utils/kubernetes/deploy_remote_cluster.sh
ADDED
@@ -0,0 +1,243 @@
+#!/bin/bash
+# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
+set -e
+
+# Colors for nicer UX
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No color
+
+# Variables
+IPS_FILE=$1
+USER=$2
+SSH_KEY=$3
+K3S_TOKEN=mytoken # Any string can be used as the token
+CLEANUP=false
+INSTALL_GPU=false
+
+if [[ "$4" == "--cleanup" ]]; then
+  CLEANUP=true
+fi
+
+# Basic argument checks
+if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
+  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
+  >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [--cleanup]"
+  exit 1
+fi
+
+# Check if SSH key exists
+if [ ! -f "$SSH_KEY" ]; then
+  >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
+  exit 1
+fi
+
+# Check if IPs file exists
+if [ ! -f "$IPS_FILE" ]; then
+  >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
+  exit 1
+fi
+
+# Get head node and worker nodes from the IPs file
+HEAD_NODE=$(head -n 1 "$IPS_FILE")
+WORKER_NODES=$(tail -n +2 "$IPS_FILE")
+
+# Check if the IPs file is empty or not formatted correctly
+if [ -z "$HEAD_NODE" ]; then
+  >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
+  exit 1
+fi
+
+# Function to show a progress message
+progress_message() {
+  echo -e "${YELLOW}➜ $1${NC}"
+}
+
+# Step to display success
+success_message() {
+  echo -e "${GREEN}✔ $1${NC}"
+}
+
+# Function to run a command on a remote machine via SSH
+run_remote() {
+  local NODE_IP=$1
+  local CMD=$2
+  # echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
+  ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
+}
+
+# Function to uninstall k3s and clean up the state on a remote machine
+cleanup_server_node() {
+  local NODE_IP=$1
+  echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
+  run_remote "$NODE_IP" "
+    echo 'Uninstalling k3s...' &&
+    /usr/local/bin/k3s-uninstall.sh || true &&
+    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+  "
+  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
+}
+
+# Function to uninstall k3s and clean up the state on a remote machine
+cleanup_agent_node() {
+  local NODE_IP=$1
+  echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
+  run_remote "$NODE_IP" "
+    echo 'Uninstalling k3s...' &&
+    /usr/local/bin/k3s-agent-uninstall.sh || true &&
+    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+  "
+  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
+}
+
+check_gpu() {
+  local NODE_IP=$1
+  if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
+    return 0 # GPU detected
+  else
+    return 1 # No GPU detected
+  fi
+}
+
+# Pre-flight checks
+run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
+# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
+
+# If --cleanup flag is set, uninstall k3s and exit
+if [ "$CLEANUP" == "true" ]; then
+  echo -e "${YELLOW}Starting cleanup...${NC}"
+
+  # Clean up head node
+  cleanup_server_node "$HEAD_NODE"
+
+  # Clean up worker nodes
+  for NODE in $WORKER_NODES; do
+    cleanup_agent_node "$NODE"
+  done
+
+  echo -e "${GREEN}Cleanup completed successfully.${NC}"
+  exit 0
+fi
+
+# Step 1: Install k3s on the head node
+progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
+run_remote "$HEAD_NODE" "
+  curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sh - &&
+  mkdir -p ~/.kube &&
+  sudo cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
+  sudo chown \$(id -u):\$(id -g) ~/.kube/config &&
+  for i in {1..3}; do
+    if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
+      break
+    else
+      echo 'Waiting for nodes to be ready...'
+      sleep 5
+    fi
+  done
+  if [ $i -eq 3 ]; then
+    echo 'Failed to wait for nodes to be ready after 3 attempts'
+    exit 1
+  fi"
+success_message "K3s deployed on head node."
+
+# Check if head node has a GPU
+if check_gpu "$HEAD_NODE"; then
+  echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
+  INSTALL_GPU=true
+fi
+
+# Fetch the head node's internal IP (this will be passed to worker nodes)
+MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
+
+echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
+
+# Step 2: Install k3s on worker nodes and join them to the master node
+for NODE in $WORKER_NODES; do
+  progress_message "Deploying Kubernetes on worker node ($NODE)..."
+  run_remote "$NODE" "
+    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sh -"
+  success_message "Kubernetes deployed on worker node ($NODE)."
+
+  # Check if worker node has a GPU
+  if check_gpu "$NODE"; then
+    echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
+    INSTALL_GPU=true
+  fi
+done
+# Step 3: Configure local kubectl to connect to the cluster
+progress_message "Configuring local kubectl to connect to the cluster..."
+scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config ~/.kube/config
+
+# Back up the original kubeconfig file if it exists
+KUBECONFIG_FILE="$HOME/.kube/config"
+if [[ -f "$KUBECONFIG_FILE" ]]; then
+  echo "Backing up existing kubeconfig to $KUBECONFIG_FILE.bak"
+  cp "$KUBECONFIG_FILE" "$KUBECONFIG_FILE.bak"
+fi
+
+# Update kubeconfig for the local machine to use the master node's IP
+# Temporary file to hold the modified kubeconfig
+TEMP_FILE=$(mktemp)
+
+# Remove the certificate-authority-data, and replace the server with the master address
+awk '
+  BEGIN { in_cluster = 0 }
+  /^clusters:/ { in_cluster = 1 }
+  /^users:/ { in_cluster = 0 }
+  in_cluster && /^ *certificate-authority-data:/ { next }
+  in_cluster && /^ *server:/ {
+    print "    server: https://'${HEAD_NODE}:6443'"
+    print "    insecure-skip-tls-verify: true"
+    next
+  }
+  { print }
+' "$KUBECONFIG_FILE" > "$TEMP_FILE"
+
+# Replace the original kubeconfig with the modified one
+mv "$TEMP_FILE" "$KUBECONFIG_FILE"
+
+success_message "kubectl configured to connect to the cluster."
+
+echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
+
+# Install GPU operator if a GPU was detected on any node
+if [ "$INSTALL_GPU" == "true" ]; then
+  echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
+  run_remote "$HEAD_NODE" "
+    curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
+    chmod 700 get_helm.sh &&
+    ./get_helm.sh &&
+    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
+    kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
+    sudo ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
+    helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
+      --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
+      --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
+      --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
+      --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+      --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
+      --set 'toolkit.env[2].value=nvidia' &&
+    echo 'Waiting for GPU operator installation...' &&
+    while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
+      echo 'Waiting for GPU operator...'
+      sleep 5
+    done
+    echo 'GPU operator installed successfully.'"
+  success_message "GPU Operator installed."
+else
+  echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
+fi
+
+# Configure SkyPilot
+progress_message "Configuring SkyPilot..."
+sky check kubernetes
+success_message "SkyPilot configured successfully."
+
+# Display final success message
+echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
+echo "You can now interact with your Kubernetes cluster through SkyPilot: "
+echo " • List available GPUs: sky show-gpus --cloud kubernetes"
+echo " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
+echo " • Connect to pod with SSH: ssh devbox"
+echo " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"