skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -26,6 +26,9 @@ spec:
|
|
26
26
|
hostname: smarter-device-management
|
27
27
|
hostNetwork: true
|
28
28
|
dnsPolicy: ClusterFirstWithHostNet
|
29
|
+
tolerations:
|
30
|
+
- effect: NoSchedule
|
31
|
+
operator: Exists
|
29
32
|
containers:
|
30
33
|
- name: smarter-device-manager
|
31
34
|
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/smarter-device-manager:v1.1.2
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Kubernetes network provisioning."""
|
2
2
|
from typing import Any, Dict, List, Optional
|
3
3
|
|
4
|
+
from sky import sky_logging
|
4
5
|
from sky.adaptors import kubernetes
|
5
6
|
from sky.provision import common
|
6
7
|
from sky.provision.kubernetes import network_utils
|
@@ -8,6 +9,8 @@ from sky.provision.kubernetes import utils as kubernetes_utils
|
|
8
9
|
from sky.utils import kubernetes_enums
|
9
10
|
from sky.utils.resources_utils import port_ranges_to_set
|
10
11
|
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
13
|
+
|
11
14
|
_PATH_PREFIX = '/skypilot/{namespace}/{cluster_name_on_cloud}/{port}'
|
12
15
|
_LOADBALANCER_SERVICE_NAME = '{cluster_name_on_cloud}--skypilot-lb'
|
13
16
|
|
@@ -55,7 +58,8 @@ def _open_ports_using_loadbalancer(
|
|
55
58
|
kubernetes_utils.merge_custom_metadata(content['service_spec']['metadata'])
|
56
59
|
|
57
60
|
network_utils.create_or_replace_namespaced_service(
|
58
|
-
namespace=
|
61
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
62
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
59
63
|
service_name=service_name,
|
60
64
|
service_spec=content['service_spec'])
|
61
65
|
|
@@ -65,8 +69,9 @@ def _open_ports_using_ingress(
|
|
65
69
|
ports: List[int],
|
66
70
|
provider_config: Dict[str, Any],
|
67
71
|
) -> None:
|
72
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
68
73
|
# Check if an ingress controller exists
|
69
|
-
if not network_utils.ingress_controller_exists():
|
74
|
+
if not network_utils.ingress_controller_exists(context):
|
70
75
|
raise Exception(
|
71
76
|
'Ingress controller not found. '
|
72
77
|
'Install Nginx ingress controller first: '
|
@@ -74,13 +79,14 @@ def _open_ports_using_ingress(
|
|
74
79
|
)
|
75
80
|
|
76
81
|
# Prepare service names, ports, for template rendering
|
77
|
-
service_details = [
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
82
|
+
service_details = [
|
83
|
+
(f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
|
84
|
+
_PATH_PREFIX.format(
|
85
|
+
cluster_name_on_cloud=cluster_name_on_cloud,
|
86
|
+
port=port,
|
87
|
+
namespace=kubernetes_utils.get_kube_config_context_namespace(
|
88
|
+
context)).rstrip('/').lstrip('/')) for port in ports
|
89
|
+
]
|
84
90
|
|
85
91
|
# Generate ingress and services specs
|
86
92
|
# We batch ingress rule creation because each rule triggers a hot reload of
|
@@ -105,7 +111,9 @@ def _open_ports_using_ingress(
|
|
105
111
|
# Update metadata from config
|
106
112
|
kubernetes_utils.merge_custom_metadata(service_spec['metadata'])
|
107
113
|
network_utils.create_or_replace_namespaced_service(
|
108
|
-
namespace=
|
114
|
+
namespace=kubernetes_utils.get_namespace_from_config(
|
115
|
+
provider_config),
|
116
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
109
117
|
service_name=service_name,
|
110
118
|
service_spec=service_spec,
|
111
119
|
)
|
@@ -113,7 +121,8 @@ def _open_ports_using_ingress(
|
|
113
121
|
kubernetes_utils.merge_custom_metadata(content['ingress_spec']['metadata'])
|
114
122
|
# Create or update the single ingress for all services
|
115
123
|
network_utils.create_or_replace_namespaced_ingress(
|
116
|
-
namespace=
|
124
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
125
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
117
126
|
ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
|
118
127
|
ingress_spec=content['ingress_spec'],
|
119
128
|
)
|
@@ -163,14 +172,16 @@ def _cleanup_ports_for_ingress(
|
|
163
172
|
for port in ports:
|
164
173
|
service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
|
165
174
|
network_utils.delete_namespaced_service(
|
166
|
-
namespace=provider_config.get('namespace',
|
175
|
+
namespace=provider_config.get('namespace',
|
176
|
+
kubernetes_utils.DEFAULT_NAMESPACE),
|
167
177
|
service_name=service_name,
|
168
178
|
)
|
169
179
|
|
170
180
|
# Delete the single ingress used for all ports
|
171
181
|
ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
|
172
182
|
network_utils.delete_namespaced_ingress(
|
173
|
-
namespace=
|
183
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
184
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
174
185
|
ingress_name=ingress_name,
|
175
186
|
)
|
176
187
|
|
@@ -199,11 +210,13 @@ def query_ports(
|
|
199
210
|
return _query_ports_for_ingress(
|
200
211
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
201
212
|
ports=ports,
|
213
|
+
provider_config=provider_config,
|
202
214
|
)
|
203
215
|
elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP:
|
204
216
|
return _query_ports_for_podip(
|
205
217
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
206
218
|
ports=ports,
|
219
|
+
provider_config=provider_config,
|
207
220
|
)
|
208
221
|
else:
|
209
222
|
return {}
|
@@ -218,12 +231,23 @@ def _query_ports_for_loadbalancer(
|
|
218
231
|
ports: List[int],
|
219
232
|
provider_config: Dict[str, Any],
|
220
233
|
) -> Dict[int, List[common.Endpoint]]:
|
234
|
+
logger.debug(f'Getting loadbalancer IP for cluster {cluster_name_on_cloud}')
|
221
235
|
result: Dict[int, List[common.Endpoint]] = {}
|
222
236
|
service_name = _LOADBALANCER_SERVICE_NAME.format(
|
223
237
|
cluster_name_on_cloud=cluster_name_on_cloud)
|
238
|
+
context = provider_config.get(
|
239
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
240
|
+
namespace = provider_config.get(
|
241
|
+
'namespace',
|
242
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
224
243
|
external_ip = network_utils.get_loadbalancer_ip(
|
225
|
-
|
226
|
-
|
244
|
+
context=context,
|
245
|
+
namespace=namespace,
|
246
|
+
service_name=service_name,
|
247
|
+
# Timeout is set so that we can retry the query when the
|
248
|
+
# cluster is firstly created and the load balancer is not ready yet.
|
249
|
+
timeout=60,
|
250
|
+
)
|
227
251
|
|
228
252
|
if external_ip is None:
|
229
253
|
return {}
|
@@ -237,19 +261,24 @@ def _query_ports_for_loadbalancer(
|
|
237
261
|
def _query_ports_for_ingress(
|
238
262
|
cluster_name_on_cloud: str,
|
239
263
|
ports: List[int],
|
264
|
+
provider_config: Dict[str, Any],
|
240
265
|
) -> Dict[int, List[common.Endpoint]]:
|
241
|
-
|
266
|
+
context = provider_config.get(
|
267
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
268
|
+
ingress_details = network_utils.get_ingress_external_ip_and_ports(context)
|
242
269
|
external_ip, external_ports = ingress_details
|
243
270
|
if external_ip is None:
|
244
271
|
return {}
|
245
272
|
|
273
|
+
namespace = provider_config.get(
|
274
|
+
'namespace',
|
275
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
246
276
|
result: Dict[int, List[common.Endpoint]] = {}
|
247
277
|
for port in ports:
|
248
278
|
path_prefix = _PATH_PREFIX.format(
|
249
279
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
250
280
|
port=port,
|
251
|
-
namespace=
|
252
|
-
get_current_kube_config_context_namespace())
|
281
|
+
namespace=namespace)
|
253
282
|
|
254
283
|
http_port, https_port = external_ports \
|
255
284
|
if external_ports is not None else (None, None)
|
@@ -268,10 +297,15 @@ def _query_ports_for_ingress(
|
|
268
297
|
def _query_ports_for_podip(
|
269
298
|
cluster_name_on_cloud: str,
|
270
299
|
ports: List[int],
|
300
|
+
provider_config: Dict[str, Any],
|
271
301
|
) -> Dict[int, List[common.Endpoint]]:
|
272
|
-
|
302
|
+
context = provider_config.get(
|
303
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
304
|
+
namespace = provider_config.get(
|
305
|
+
'namespace',
|
306
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
273
307
|
pod_name = kubernetes_utils.get_head_pod_name(cluster_name_on_cloud)
|
274
|
-
pod_ip = network_utils.get_pod_ip(namespace, pod_name)
|
308
|
+
pod_ip = network_utils.get_pod_ip(context, namespace, pod_name)
|
275
309
|
|
276
310
|
result: Dict[int, List[common.Endpoint]] = {}
|
277
311
|
if pod_ip is None:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Kubernetes network provisioning utils."""
|
2
2
|
import os
|
3
|
+
import time
|
3
4
|
from typing import Dict, List, Optional, Tuple, Union
|
4
5
|
|
5
6
|
import jinja2
|
@@ -7,12 +8,15 @@ import yaml
|
|
7
8
|
|
8
9
|
import sky
|
9
10
|
from sky import exceptions
|
11
|
+
from sky import sky_logging
|
10
12
|
from sky import skypilot_config
|
11
13
|
from sky.adaptors import kubernetes
|
12
14
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
13
15
|
from sky.utils import kubernetes_enums
|
14
16
|
from sky.utils import ux_utils
|
15
17
|
|
18
|
+
logger = sky_logging.init_logger(__name__)
|
19
|
+
|
16
20
|
_INGRESS_TEMPLATE_NAME = 'kubernetes-ingress.yml.j2'
|
17
21
|
_LOADBALANCER_TEMPLATE_NAME = 'kubernetes-loadbalancer.yml.j2'
|
18
22
|
|
@@ -43,6 +47,23 @@ def get_port_mode(
|
|
43
47
|
return port_mode
|
44
48
|
|
45
49
|
|
50
|
+
def get_networking_mode(
|
51
|
+
mode_str: Optional[str] = None
|
52
|
+
) -> kubernetes_enums.KubernetesNetworkingMode:
|
53
|
+
"""Get the networking mode from the provider config."""
|
54
|
+
mode_str = mode_str or skypilot_config.get_nested(
|
55
|
+
('kubernetes', 'networking_mode'),
|
56
|
+
kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
|
57
|
+
try:
|
58
|
+
networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
|
59
|
+
mode_str)
|
60
|
+
except ValueError as e:
|
61
|
+
with ux_utils.print_exception_no_traceback():
|
62
|
+
raise ValueError(str(e) +
|
63
|
+
' Please check: ~/.sky/config.yaml.') from None
|
64
|
+
return networking_mode
|
65
|
+
|
66
|
+
|
46
67
|
def fill_loadbalancer_template(namespace: str, service_name: str,
|
47
68
|
ports: List[int], selector_key: str,
|
48
69
|
selector_value: str) -> Dict:
|
@@ -54,6 +75,10 @@ def fill_loadbalancer_template(namespace: str, service_name: str,
|
|
54
75
|
|
55
76
|
with open(template_path, 'r', encoding='utf-8') as fin:
|
56
77
|
template = fin.read()
|
78
|
+
annotations = skypilot_config.get_nested(
|
79
|
+
('kubernetes', 'custom_metadata', 'annotations'), {})
|
80
|
+
labels = skypilot_config.get_nested(
|
81
|
+
('kubernetes', 'custom_metadata', 'labels'), {})
|
57
82
|
j2_template = jinja2.Template(template)
|
58
83
|
cont = j2_template.render(
|
59
84
|
namespace=namespace,
|
@@ -61,6 +86,8 @@ def fill_loadbalancer_template(namespace: str, service_name: str,
|
|
61
86
|
ports=ports,
|
62
87
|
selector_key=selector_key,
|
63
88
|
selector_value=selector_value,
|
89
|
+
annotations=annotations,
|
90
|
+
labels=labels,
|
64
91
|
)
|
65
92
|
content = yaml.safe_load(cont)
|
66
93
|
return content
|
@@ -77,6 +104,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
|
|
77
104
|
f'Template "{_INGRESS_TEMPLATE_NAME}" does not exist.')
|
78
105
|
with open(template_path, 'r', encoding='utf-8') as fin:
|
79
106
|
template = fin.read()
|
107
|
+
annotations = skypilot_config.get_nested(
|
108
|
+
('kubernetes', 'custom_metadata', 'annotations'), {})
|
109
|
+
labels = skypilot_config.get_nested(
|
110
|
+
('kubernetes', 'custom_metadata', 'labels'), {})
|
80
111
|
j2_template = jinja2.Template(template)
|
81
112
|
cont = j2_template.render(
|
82
113
|
namespace=namespace,
|
@@ -88,6 +119,8 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
|
|
88
119
|
ingress_name=ingress_name,
|
89
120
|
selector_key=selector_key,
|
90
121
|
selector_value=selector_value,
|
122
|
+
annotations=annotations,
|
123
|
+
labels=labels,
|
91
124
|
)
|
92
125
|
content = yaml.safe_load(cont)
|
93
126
|
|
@@ -99,10 +132,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
|
|
99
132
|
|
100
133
|
|
101
134
|
def create_or_replace_namespaced_ingress(
|
102
|
-
namespace: str, ingress_name: str,
|
135
|
+
namespace: str, context: Optional[str], ingress_name: str,
|
103
136
|
ingress_spec: Dict[str, Union[str, int]]) -> None:
|
104
137
|
"""Creates an ingress resource for the specified service."""
|
105
|
-
networking_api = kubernetes.networking_api()
|
138
|
+
networking_api = kubernetes.networking_api(context)
|
106
139
|
|
107
140
|
try:
|
108
141
|
networking_api.read_namespaced_ingress(
|
@@ -123,9 +156,10 @@ def create_or_replace_namespaced_ingress(
|
|
123
156
|
_request_timeout=kubernetes.API_TIMEOUT)
|
124
157
|
|
125
158
|
|
126
|
-
def delete_namespaced_ingress(namespace: str,
|
159
|
+
def delete_namespaced_ingress(namespace: str, context: Optional[str],
|
160
|
+
ingress_name: str) -> None:
|
127
161
|
"""Deletes an ingress resource."""
|
128
|
-
networking_api = kubernetes.networking_api()
|
162
|
+
networking_api = kubernetes.networking_api(context)
|
129
163
|
try:
|
130
164
|
networking_api.delete_namespaced_ingress(
|
131
165
|
ingress_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
|
@@ -137,10 +171,10 @@ def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
|
|
137
171
|
|
138
172
|
|
139
173
|
def create_or_replace_namespaced_service(
|
140
|
-
namespace: str, service_name: str,
|
174
|
+
namespace: str, context: Optional[str], service_name: str,
|
141
175
|
service_spec: Dict[str, Union[str, int]]) -> None:
|
142
176
|
"""Creates a service resource for the specified service."""
|
143
|
-
core_api = kubernetes.core_api()
|
177
|
+
core_api = kubernetes.core_api(context)
|
144
178
|
|
145
179
|
try:
|
146
180
|
core_api.read_namespaced_service(
|
@@ -174,9 +208,10 @@ def delete_namespaced_service(namespace: str, service_name: str) -> None:
|
|
174
208
|
raise e
|
175
209
|
|
176
210
|
|
177
|
-
def ingress_controller_exists(
|
211
|
+
def ingress_controller_exists(context: Optional[str],
|
212
|
+
ingress_class_name: str = 'nginx') -> bool:
|
178
213
|
"""Checks if an ingress controller exists in the cluster."""
|
179
|
-
networking_api = kubernetes.networking_api()
|
214
|
+
networking_api = kubernetes.networking_api(context)
|
180
215
|
ingress_classes = networking_api.list_ingress_class(
|
181
216
|
_request_timeout=kubernetes.API_TIMEOUT).items
|
182
217
|
return any(
|
@@ -185,16 +220,17 @@ def ingress_controller_exists(ingress_class_name: str = 'nginx') -> bool:
|
|
185
220
|
|
186
221
|
|
187
222
|
def get_ingress_external_ip_and_ports(
|
223
|
+
context: Optional[str],
|
188
224
|
namespace: str = 'ingress-nginx'
|
189
225
|
) -> Tuple[Optional[str], Optional[Tuple[int, int]]]:
|
190
226
|
"""Returns external ip and ports for the ingress controller."""
|
191
|
-
core_api = kubernetes.core_api()
|
227
|
+
core_api = kubernetes.core_api(context)
|
192
228
|
ingress_services = [
|
193
229
|
item for item in core_api.list_namespaced_service(
|
194
230
|
namespace, _request_timeout=kubernetes.API_TIMEOUT).items
|
195
231
|
if item.metadata.name == 'ingress-nginx-controller'
|
196
232
|
]
|
197
|
-
if
|
233
|
+
if not ingress_services:
|
198
234
|
return (None, None)
|
199
235
|
|
200
236
|
ingress_service = ingress_services[0]
|
@@ -222,23 +258,36 @@ def get_ingress_external_ip_and_ports(
|
|
222
258
|
return external_ip, None
|
223
259
|
|
224
260
|
|
225
|
-
def get_loadbalancer_ip(
|
261
|
+
def get_loadbalancer_ip(context: Optional[str],
|
262
|
+
namespace: str,
|
263
|
+
service_name: str,
|
264
|
+
timeout: int = 0) -> Optional[str]:
|
226
265
|
"""Returns the IP address of the load balancer."""
|
227
|
-
core_api = kubernetes.core_api()
|
228
|
-
service = core_api.read_namespaced_service(
|
229
|
-
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
|
266
|
+
core_api = kubernetes.core_api(context)
|
230
267
|
|
231
|
-
|
232
|
-
return None
|
268
|
+
ip = None
|
233
269
|
|
234
|
-
|
235
|
-
|
236
|
-
|
270
|
+
start_time = time.time()
|
271
|
+
retry_cnt = 0
|
272
|
+
while ip is None and (retry_cnt == 0 or time.time() - start_time < timeout):
|
273
|
+
service = core_api.read_namespaced_service(
|
274
|
+
service_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
|
275
|
+
if service.status.load_balancer.ingress is not None:
|
276
|
+
ip = (service.status.load_balancer.ingress[0].ip or
|
277
|
+
service.status.load_balancer.ingress[0].hostname)
|
278
|
+
if ip is None:
|
279
|
+
retry_cnt += 1
|
280
|
+
if retry_cnt % 5 == 0:
|
281
|
+
logger.debug('Waiting for load balancer IP to be assigned'
|
282
|
+
'...')
|
283
|
+
time.sleep(1)
|
284
|
+
return ip
|
237
285
|
|
238
286
|
|
239
|
-
def get_pod_ip(
|
287
|
+
def get_pod_ip(context: Optional[str], namespace: str,
|
288
|
+
pod_name: str) -> Optional[str]:
|
240
289
|
"""Returns the IP address of the pod."""
|
241
|
-
core_api = kubernetes.core_api()
|
290
|
+
core_api = kubernetes.core_api(context)
|
242
291
|
pod = core_api.read_namespaced_pod(pod_name,
|
243
292
|
namespace,
|
244
293
|
_request_timeout=kubernetes.API_TIMEOUT)
|