skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,9 @@ import yaml
|
|
9
9
|
|
10
10
|
from sky.adaptors import kubernetes
|
11
11
|
from sky.provision import common
|
12
|
+
from sky.provision.kubernetes import network_utils
|
12
13
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
14
|
+
from sky.utils import kubernetes_enums
|
13
15
|
|
14
16
|
logger = logging.getLogger(__name__)
|
15
17
|
|
@@ -21,11 +23,16 @@ def bootstrap_instances(
|
|
21
23
|
region: str, cluster_name: str,
|
22
24
|
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
23
25
|
del region, cluster_name # unused
|
24
|
-
namespace = kubernetes_utils.
|
26
|
+
namespace = kubernetes_utils.get_namespace_from_config(
|
27
|
+
config.provider_config)
|
28
|
+
context = kubernetes_utils.get_context_from_config(config.provider_config)
|
25
29
|
|
26
|
-
_configure_services(namespace, config.provider_config)
|
30
|
+
_configure_services(namespace, context, config.provider_config)
|
27
31
|
|
28
|
-
|
32
|
+
networking_mode = network_utils.get_networking_mode(
|
33
|
+
config.provider_config.get('networking_mode'))
|
34
|
+
if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
35
|
+
config = _configure_ssh_jump(namespace, context, config)
|
29
36
|
|
30
37
|
requested_service_account = config.node_config['spec']['serviceAccountName']
|
31
38
|
if (requested_service_account ==
|
@@ -35,26 +42,47 @@ def bootstrap_instances(
|
|
35
42
|
# necessary roles and role bindings.
|
36
43
|
# If not, set up the roles and bindings for skypilot-service-account
|
37
44
|
# here.
|
38
|
-
_configure_autoscaler_service_account(namespace,
|
45
|
+
_configure_autoscaler_service_account(namespace, context,
|
46
|
+
config.provider_config)
|
39
47
|
_configure_autoscaler_role(namespace,
|
48
|
+
context,
|
40
49
|
config.provider_config,
|
41
50
|
role_field='autoscaler_role')
|
42
51
|
_configure_autoscaler_role_binding(
|
43
52
|
namespace,
|
53
|
+
context,
|
44
54
|
config.provider_config,
|
45
55
|
binding_field='autoscaler_role_binding')
|
46
|
-
_configure_autoscaler_cluster_role(namespace,
|
47
|
-
|
56
|
+
_configure_autoscaler_cluster_role(namespace, context,
|
57
|
+
config.provider_config)
|
58
|
+
_configure_autoscaler_cluster_role_binding(namespace, context,
|
48
59
|
config.provider_config)
|
60
|
+
# SkyPilot system namespace is required for FUSE mounting. Here we just
|
61
|
+
# create the namespace and set up the necessary permissions.
|
62
|
+
#
|
63
|
+
# We need to setup the namespace outside the
|
64
|
+
# if config.provider_config.get('fuse_device_required') block below
|
65
|
+
# because if we put in the if block, the following happens:
|
66
|
+
# 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No
|
67
|
+
# namespace is created at this point since the controller does not
|
68
|
+
# require FUSE.
|
69
|
+
# 2. User submits a job requiring FUSE.
|
70
|
+
# 3. The namespace is created here, but since the job controller is
|
71
|
+
# using DEFAULT_SERVICE_ACCOUNT_NAME, it does not have the necessary
|
72
|
+
# permissions to create a role for itself to create the FUSE manager.
|
73
|
+
# 4. The job fails to launch.
|
74
|
+
_configure_skypilot_system_namespace(config.provider_config)
|
49
75
|
if config.provider_config.get('port_mode', 'loadbalancer') == 'ingress':
|
50
76
|
logger.info('Port mode is set to ingress, setting up ingress role '
|
51
77
|
'and role binding.')
|
52
78
|
try:
|
53
79
|
_configure_autoscaler_role(namespace,
|
80
|
+
context,
|
54
81
|
config.provider_config,
|
55
82
|
role_field='autoscaler_ingress_role')
|
56
83
|
_configure_autoscaler_role_binding(
|
57
84
|
namespace,
|
85
|
+
context,
|
58
86
|
config.provider_config,
|
59
87
|
binding_field='autoscaler_ingress_role_binding')
|
60
88
|
except kubernetes.api_exception() as e:
|
@@ -69,26 +97,8 @@ def bootstrap_instances(
|
|
69
97
|
elif requested_service_account != 'default':
|
70
98
|
logger.info(f'Using service account {requested_service_account!r}, '
|
71
99
|
'skipping role and role binding setup.')
|
72
|
-
|
73
|
-
# SkyPilot system namespace is required for FUSE mounting. Here we just
|
74
|
-
# create the namespace and set up the necessary permissions.
|
75
|
-
#
|
76
|
-
# We need to setup the namespace outside the if block below because if
|
77
|
-
# we put in the if block, the following happens:
|
78
|
-
# 1. User launches job controller on Kubernetes with SERVICE_ACCOUNT. No
|
79
|
-
# namespace is created at this point since the controller does not
|
80
|
-
# require FUSE.
|
81
|
-
# 2. User submits a job requiring FUSE.
|
82
|
-
# 3. The namespace is created here, but since the job controller is using
|
83
|
-
# SERVICE_ACCOUNT, it does not have the necessary permissions to create
|
84
|
-
# a role for itself to create the FUSE device manager.
|
85
|
-
# 4. The job fails to launch.
|
86
|
-
_configure_skypilot_system_namespace(config.provider_config,
|
87
|
-
requested_service_account)
|
88
|
-
|
89
100
|
if config.provider_config.get('fuse_device_required', False):
|
90
101
|
_configure_fuse_mounting(config.provider_config)
|
91
|
-
|
92
102
|
return config
|
93
103
|
|
94
104
|
|
@@ -222,7 +232,7 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str,
|
|
222
232
|
# Look for keys containing the resource_name. For example,
|
223
233
|
# the key 'nvidia.com/gpu' contains the key 'gpu'.
|
224
234
|
matching_keys = [key for key in resources if resource_name in key.lower()]
|
225
|
-
if
|
235
|
+
if not matching_keys:
|
226
236
|
return float('inf')
|
227
237
|
if len(matching_keys) > 1:
|
228
238
|
# Should have only one match -- mostly relevant for gpu.
|
@@ -237,7 +247,8 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str,
|
|
237
247
|
|
238
248
|
|
239
249
|
def _configure_autoscaler_service_account(
|
240
|
-
namespace: str,
|
250
|
+
namespace: str, context: Optional[str],
|
251
|
+
provider_config: Dict[str, Any]) -> None:
|
241
252
|
account_field = 'autoscaler_service_account'
|
242
253
|
if account_field not in provider_config:
|
243
254
|
logger.info('_configure_autoscaler_service_account: '
|
@@ -252,9 +263,9 @@ def _configure_autoscaler_service_account(
|
|
252
263
|
|
253
264
|
name = account['metadata']['name']
|
254
265
|
field_selector = f'metadata.name={name}'
|
255
|
-
accounts = (kubernetes.core_api().list_namespaced_service_account(
|
266
|
+
accounts = (kubernetes.core_api(context).list_namespaced_service_account(
|
256
267
|
namespace, field_selector=field_selector).items)
|
257
|
-
if
|
268
|
+
if accounts:
|
258
269
|
assert len(accounts) == 1
|
259
270
|
# Nothing to check for equality and patch here,
|
260
271
|
# since the service_account.metadata.name is the only important
|
@@ -265,12 +276,14 @@ def _configure_autoscaler_service_account(
|
|
265
276
|
|
266
277
|
logger.info('_configure_autoscaler_service_account: '
|
267
278
|
f'{not_found_msg(account_field, name)}')
|
268
|
-
kubernetes.core_api().create_namespaced_service_account(
|
279
|
+
kubernetes.core_api(context).create_namespaced_service_account(
|
280
|
+
namespace, account)
|
269
281
|
logger.info('_configure_autoscaler_service_account: '
|
270
282
|
f'{created_msg(account_field, name)}')
|
271
283
|
|
272
284
|
|
273
|
-
def _configure_autoscaler_role(namespace: str,
|
285
|
+
def _configure_autoscaler_role(namespace: str, context: Optional[str],
|
286
|
+
provider_config: Dict[str, Any],
|
274
287
|
role_field: str) -> None:
|
275
288
|
""" Reads the role from the provider config, creates if it does not exist.
|
276
289
|
|
@@ -293,9 +306,9 @@ def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any],
|
|
293
306
|
|
294
307
|
name = role['metadata']['name']
|
295
308
|
field_selector = f'metadata.name={name}'
|
296
|
-
roles = (kubernetes.auth_api().list_namespaced_role(
|
309
|
+
roles = (kubernetes.auth_api(context).list_namespaced_role(
|
297
310
|
namespace, field_selector=field_selector).items)
|
298
|
-
if
|
311
|
+
if roles:
|
299
312
|
assert len(roles) == 1
|
300
313
|
existing_role = roles[0]
|
301
314
|
# Convert to k8s object to compare
|
@@ -306,17 +319,19 @@ def _configure_autoscaler_role(namespace: str, provider_config: Dict[str, Any],
|
|
306
319
|
return
|
307
320
|
logger.info('_configure_autoscaler_role: '
|
308
321
|
f'{updating_existing_msg(role_field, name)}')
|
309
|
-
kubernetes.auth_api().patch_namespaced_role(
|
322
|
+
kubernetes.auth_api(context).patch_namespaced_role(
|
323
|
+
name, namespace, role)
|
310
324
|
return
|
311
325
|
|
312
326
|
logger.info('_configure_autoscaler_role: '
|
313
327
|
f'{not_found_msg(role_field, name)}')
|
314
|
-
kubernetes.auth_api().create_namespaced_role(namespace, role)
|
328
|
+
kubernetes.auth_api(context).create_namespaced_role(namespace, role)
|
315
329
|
logger.info(f'_configure_autoscaler_role: {created_msg(role_field, name)}')
|
316
330
|
|
317
331
|
|
318
332
|
def _configure_autoscaler_role_binding(
|
319
333
|
namespace: str,
|
334
|
+
context: Optional[str],
|
320
335
|
provider_config: Dict[str, Any],
|
321
336
|
binding_field: str,
|
322
337
|
override_name: Optional[str] = None,
|
@@ -357,9 +372,9 @@ def _configure_autoscaler_role_binding(
|
|
357
372
|
name = binding['metadata']['name']
|
358
373
|
|
359
374
|
field_selector = f'metadata.name={name}'
|
360
|
-
role_bindings = (kubernetes.auth_api().list_namespaced_role_binding(
|
375
|
+
role_bindings = (kubernetes.auth_api(context).list_namespaced_role_binding(
|
361
376
|
rb_namespace, field_selector=field_selector).items)
|
362
|
-
if
|
377
|
+
if role_bindings:
|
363
378
|
assert len(role_bindings) == 1
|
364
379
|
existing_binding = role_bindings[0]
|
365
380
|
new_rb = kubernetes_utils.dict_to_k8s_object(binding, 'V1RoleBinding')
|
@@ -370,18 +385,19 @@ def _configure_autoscaler_role_binding(
|
|
370
385
|
return
|
371
386
|
logger.info('_configure_autoscaler_role_binding: '
|
372
387
|
f'{updating_existing_msg(binding_field, name)}')
|
373
|
-
kubernetes.auth_api().patch_namespaced_role_binding(
|
388
|
+
kubernetes.auth_api(context).patch_namespaced_role_binding(
|
374
389
|
name, rb_namespace, binding)
|
375
390
|
return
|
376
391
|
|
377
392
|
logger.info('_configure_autoscaler_role_binding: '
|
378
393
|
f'{not_found_msg(binding_field, name)}')
|
379
|
-
kubernetes.auth_api().create_namespaced_role_binding(
|
394
|
+
kubernetes.auth_api(context).create_namespaced_role_binding(
|
395
|
+
rb_namespace, binding)
|
380
396
|
logger.info('_configure_autoscaler_role_binding: '
|
381
397
|
f'{created_msg(binding_field, name)}')
|
382
398
|
|
383
399
|
|
384
|
-
def _configure_autoscaler_cluster_role(namespace,
|
400
|
+
def _configure_autoscaler_cluster_role(namespace, context,
|
385
401
|
provider_config: Dict[str, Any]) -> None:
|
386
402
|
role_field = 'autoscaler_cluster_role'
|
387
403
|
if role_field not in provider_config:
|
@@ -397,9 +413,9 @@ def _configure_autoscaler_cluster_role(namespace,
|
|
397
413
|
|
398
414
|
name = role['metadata']['name']
|
399
415
|
field_selector = f'metadata.name={name}'
|
400
|
-
cluster_roles = (kubernetes.auth_api().list_cluster_role(
|
416
|
+
cluster_roles = (kubernetes.auth_api(context).list_cluster_role(
|
401
417
|
field_selector=field_selector).items)
|
402
|
-
if
|
418
|
+
if cluster_roles:
|
403
419
|
assert len(cluster_roles) == 1
|
404
420
|
existing_cr = cluster_roles[0]
|
405
421
|
new_cr = kubernetes_utils.dict_to_k8s_object(role, 'V1ClusterRole')
|
@@ -409,18 +425,18 @@ def _configure_autoscaler_cluster_role(namespace,
|
|
409
425
|
return
|
410
426
|
logger.info('_configure_autoscaler_cluster_role: '
|
411
427
|
f'{updating_existing_msg(role_field, name)}')
|
412
|
-
kubernetes.auth_api().patch_cluster_role(name, role)
|
428
|
+
kubernetes.auth_api(context).patch_cluster_role(name, role)
|
413
429
|
return
|
414
430
|
|
415
431
|
logger.info('_configure_autoscaler_cluster_role: '
|
416
432
|
f'{not_found_msg(role_field, name)}')
|
417
|
-
kubernetes.auth_api().create_cluster_role(role)
|
433
|
+
kubernetes.auth_api(context).create_cluster_role(role)
|
418
434
|
logger.info(
|
419
435
|
f'_configure_autoscaler_cluster_role: {created_msg(role_field, name)}')
|
420
436
|
|
421
437
|
|
422
438
|
def _configure_autoscaler_cluster_role_binding(
|
423
|
-
namespace, provider_config: Dict[str, Any]) -> None:
|
439
|
+
namespace, context, provider_config: Dict[str, Any]) -> None:
|
424
440
|
binding_field = 'autoscaler_cluster_role_binding'
|
425
441
|
if binding_field not in provider_config:
|
426
442
|
logger.info('_configure_autoscaler_cluster_role_binding: '
|
@@ -442,9 +458,9 @@ def _configure_autoscaler_cluster_role_binding(
|
|
442
458
|
|
443
459
|
name = binding['metadata']['name']
|
444
460
|
field_selector = f'metadata.name={name}'
|
445
|
-
cr_bindings = (kubernetes.auth_api().list_cluster_role_binding(
|
461
|
+
cr_bindings = (kubernetes.auth_api(context).list_cluster_role_binding(
|
446
462
|
field_selector=field_selector).items)
|
447
|
-
if
|
463
|
+
if cr_bindings:
|
448
464
|
assert len(cr_bindings) == 1
|
449
465
|
existing_binding = cr_bindings[0]
|
450
466
|
new_binding = kubernetes_utils.dict_to_k8s_object(
|
@@ -456,17 +472,17 @@ def _configure_autoscaler_cluster_role_binding(
|
|
456
472
|
return
|
457
473
|
logger.info('_configure_autoscaler_cluster_role_binding: '
|
458
474
|
f'{updating_existing_msg(binding_field, name)}')
|
459
|
-
kubernetes.auth_api().patch_cluster_role_binding(name, binding)
|
475
|
+
kubernetes.auth_api(context).patch_cluster_role_binding(name, binding)
|
460
476
|
return
|
461
477
|
|
462
478
|
logger.info('_configure_autoscaler_cluster_role_binding: '
|
463
479
|
f'{not_found_msg(binding_field, name)}')
|
464
|
-
kubernetes.auth_api().create_cluster_role_binding(binding)
|
480
|
+
kubernetes.auth_api(context).create_cluster_role_binding(binding)
|
465
481
|
logger.info('_configure_autoscaler_cluster_role_binding: '
|
466
482
|
f'{created_msg(binding_field, name)}')
|
467
483
|
|
468
484
|
|
469
|
-
def _configure_ssh_jump(namespace, config: common.ProvisionConfig):
|
485
|
+
def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
|
470
486
|
"""Creates a SSH jump pod to connect to the cluster.
|
471
487
|
|
472
488
|
Also updates config['auth']['ssh_proxy_command'] to use the newly created
|
@@ -497,13 +513,12 @@ def _configure_ssh_jump(namespace, config: common.ProvisionConfig):
|
|
497
513
|
# service is missing, we should raise an error.
|
498
514
|
|
499
515
|
kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
|
500
|
-
ssh_key_secret_name, namespace)
|
516
|
+
ssh_key_secret_name, namespace, context)
|
501
517
|
return config
|
502
518
|
|
503
519
|
|
504
520
|
def _configure_skypilot_system_namespace(
|
505
|
-
provider_config: Dict[str,
|
506
|
-
Any], service_account: Optional[str]) -> None:
|
521
|
+
provider_config: Dict[str, Any]) -> None:
|
507
522
|
"""Creates the namespace for skypilot-system mounting if it does not exist.
|
508
523
|
|
509
524
|
Also patches the SkyPilot service account to have the necessary permissions
|
@@ -511,36 +526,33 @@ def _configure_skypilot_system_namespace(
|
|
511
526
|
"""
|
512
527
|
svc_account_namespace = provider_config['namespace']
|
513
528
|
skypilot_system_namespace = provider_config['skypilot_system_namespace']
|
514
|
-
kubernetes_utils.
|
515
|
-
|
516
|
-
|
517
|
-
#
|
518
|
-
#
|
519
|
-
#
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
binding_field='autoscaler_skypilot_system_role_binding',
|
542
|
-
override_name=override_name,
|
543
|
-
override_subject_namespace=svc_account_namespace)
|
529
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
530
|
+
kubernetes_utils.create_namespace(skypilot_system_namespace, context)
|
531
|
+
|
532
|
+
# Note - this must be run only after the service account has been
|
533
|
+
# created in the cluster (in bootstrap_instances).
|
534
|
+
# Create the role in the skypilot-system namespace if it does not exist.
|
535
|
+
_configure_autoscaler_role(skypilot_system_namespace,
|
536
|
+
context,
|
537
|
+
provider_config,
|
538
|
+
role_field='autoscaler_skypilot_system_role')
|
539
|
+
# We must create a unique role binding per-namespace that SkyPilot is
|
540
|
+
# running in, so we override the name with a unique name identifying
|
541
|
+
# the namespace. This is required for multi-tenant setups where
|
542
|
+
# different SkyPilot instances may be running in different namespaces.
|
543
|
+
override_name = provider_config['autoscaler_skypilot_system_role_binding'][
|
544
|
+
'metadata']['name'] + '-' + svc_account_namespace
|
545
|
+
|
546
|
+
# Create the role binding in the skypilot-system namespace, and have
|
547
|
+
# the subject namespace be the namespace that the SkyPilot service
|
548
|
+
# account is created in.
|
549
|
+
_configure_autoscaler_role_binding(
|
550
|
+
skypilot_system_namespace,
|
551
|
+
context,
|
552
|
+
provider_config,
|
553
|
+
binding_field='autoscaler_skypilot_system_role_binding',
|
554
|
+
override_name=override_name,
|
555
|
+
override_subject_namespace=svc_account_namespace)
|
544
556
|
|
545
557
|
|
546
558
|
def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
@@ -560,6 +572,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
|
560
572
|
logger.info('_configure_fuse_mounting: Setting up FUSE device manager.')
|
561
573
|
|
562
574
|
fuse_device_manager_namespace = provider_config['skypilot_system_namespace']
|
575
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
563
576
|
|
564
577
|
# Read the device manager YAMLs from the manifests directory
|
565
578
|
root_dir = os.path.dirname(os.path.dirname(__file__))
|
@@ -572,7 +585,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
|
572
585
|
config_map = yaml.safe_load(file)
|
573
586
|
kubernetes_utils.merge_custom_metadata(config_map['metadata'])
|
574
587
|
try:
|
575
|
-
kubernetes.core_api().create_namespaced_config_map(
|
588
|
+
kubernetes.core_api(context).create_namespaced_config_map(
|
576
589
|
fuse_device_manager_namespace, config_map)
|
577
590
|
except kubernetes.api_exception() as e:
|
578
591
|
if e.status == 409:
|
@@ -592,7 +605,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
|
592
605
|
daemonset = yaml.safe_load(file)
|
593
606
|
kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
|
594
607
|
try:
|
595
|
-
kubernetes.apps_api().create_namespaced_daemon_set(
|
608
|
+
kubernetes.apps_api(context).create_namespaced_daemon_set(
|
596
609
|
fuse_device_manager_namespace, daemonset)
|
597
610
|
except kubernetes.api_exception() as e:
|
598
611
|
if e.status == 409:
|
@@ -608,8 +621,8 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
|
608
621
|
f'in namespace {fuse_device_manager_namespace!r}')
|
609
622
|
|
610
623
|
|
611
|
-
def _configure_services(namespace: str,
|
612
|
-
|
624
|
+
def _configure_services(namespace: str, context: Optional[str],
|
625
|
+
provider_config: Dict[str, Any]) -> None:
|
613
626
|
service_field = 'services'
|
614
627
|
if service_field not in provider_config:
|
615
628
|
logger.info(f'_configure_services: {not_provided_msg(service_field)}')
|
@@ -624,9 +637,9 @@ def _configure_services(namespace: str, provider_config: Dict[str,
|
|
624
637
|
|
625
638
|
name = service['metadata']['name']
|
626
639
|
field_selector = f'metadata.name={name}'
|
627
|
-
services = (kubernetes.core_api().list_namespaced_service(
|
640
|
+
services = (kubernetes.core_api(context).list_namespaced_service(
|
628
641
|
namespace, field_selector=field_selector).items)
|
629
|
-
if
|
642
|
+
if services:
|
630
643
|
assert len(services) == 1
|
631
644
|
existing_service = services[0]
|
632
645
|
# Convert to k8s object to compare
|
@@ -638,12 +651,13 @@ def _configure_services(namespace: str, provider_config: Dict[str,
|
|
638
651
|
else:
|
639
652
|
logger.info('_configure_services: '
|
640
653
|
f'{updating_existing_msg("service", name)}')
|
641
|
-
kubernetes.core_api().patch_namespaced_service(
|
654
|
+
kubernetes.core_api(context).patch_namespaced_service(
|
642
655
|
name, namespace, service)
|
643
656
|
else:
|
644
657
|
logger.info(
|
645
658
|
f'_configure_services: {not_found_msg("service", name)}')
|
646
|
-
kubernetes.core_api().create_namespaced_service(
|
659
|
+
kubernetes.core_api(context).create_namespaced_service(
|
660
|
+
namespace, service)
|
647
661
|
logger.info(f'_configure_services: {created_msg("service", name)}')
|
648
662
|
|
649
663
|
|
@@ -0,0 +1,8 @@
|
|
1
|
+
"""Constants for Kubernetes provisioning."""
|
2
|
+
|
3
|
+
NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
|
4
|
+
'nvidia.com/gpu resource is available on the nodes and '
|
5
|
+
'the node labels for identifying GPUs '
|
6
|
+
'(e.g., skypilot.co/accelerator) are setup correctly. ')
|
7
|
+
|
8
|
+
KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'
|