skypilot-nightly 1.0.0.dev20251002__py3-none-any.whl → 1.0.0.dev20251004__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -109
- sky/backends/cloud_vm_ray_backend.py +42 -27
- sky/client/cli/command.py +1 -11
- sky/clouds/cudo.py +1 -1
- sky/clouds/kubernetes.py +7 -19
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7340bc0f0dd8ae74.js → webpack-3286453d56f3c0a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +9 -0
- sky/execution.py +24 -2
- sky/global_user_state.py +16 -0
- sky/jobs/recovery_strategy.py +45 -0
- sky/jobs/server/core.py +60 -53
- sky/jobs/state.py +21 -1
- sky/jobs/utils.py +29 -11
- sky/provision/kubernetes/config.py +0 -42
- sky/provision/kubernetes/instance.py +1 -33
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network_utils.py +0 -21
- sky/provision/kubernetes/utils.py +136 -300
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +6 -0
- sky/server/server.py +6 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -13
- sky/utils/context.py +12 -7
- sky/utils/env_options.py +4 -0
- sky/utils/kubernetes_enums.py +2 -15
- sky/utils/schemas.py +17 -6
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/METADATA +38 -37
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/RECORD +55 -56
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- /sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/top_level.txt +0 -0
|
@@ -15,7 +15,8 @@ import subprocess
|
|
|
15
15
|
import time
|
|
16
16
|
import typing
|
|
17
17
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
18
|
-
|
|
18
|
+
|
|
19
|
+
import ijson
|
|
19
20
|
|
|
20
21
|
from sky import clouds
|
|
21
22
|
from sky import exceptions
|
|
@@ -33,7 +34,6 @@ from sky.skylet import constants
|
|
|
33
34
|
from sky.utils import annotations
|
|
34
35
|
from sky.utils import common_utils
|
|
35
36
|
from sky.utils import config_utils
|
|
36
|
-
from sky.utils import directory_utils
|
|
37
37
|
from sky.utils import env_options
|
|
38
38
|
from sky.utils import kubernetes_enums
|
|
39
39
|
from sky.utils import schemas
|
|
@@ -62,6 +62,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
|
|
|
62
62
|
# and store all data that needs to be persisted in future.
|
|
63
63
|
HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
|
|
64
64
|
|
|
65
|
+
IJSON_BUFFER_SIZE = 64 * 1024 # 64KB, default from ijson
|
|
66
|
+
|
|
65
67
|
|
|
66
68
|
class KubernetesHighPerformanceNetworkType(enum.Enum):
|
|
67
69
|
"""Enum for different Kubernetes cluster types with high performance
|
|
@@ -1143,9 +1145,51 @@ def detect_accelerator_resource(
|
|
|
1143
1145
|
return has_accelerator, cluster_resources
|
|
1144
1146
|
|
|
1145
1147
|
|
|
1148
|
+
@dataclasses.dataclass
|
|
1149
|
+
class V1ObjectMeta:
|
|
1150
|
+
name: str
|
|
1151
|
+
labels: Dict[str, str]
|
|
1152
|
+
namespace: str = '' # Used for pods, not nodes
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
@dataclasses.dataclass
|
|
1156
|
+
class V1NodeAddress:
|
|
1157
|
+
type: str
|
|
1158
|
+
address: str
|
|
1159
|
+
|
|
1160
|
+
|
|
1161
|
+
@dataclasses.dataclass
|
|
1162
|
+
class V1NodeStatus:
|
|
1163
|
+
allocatable: Dict[str, str]
|
|
1164
|
+
capacity: Dict[str, str]
|
|
1165
|
+
addresses: List[V1NodeAddress]
|
|
1166
|
+
|
|
1167
|
+
|
|
1168
|
+
@dataclasses.dataclass
|
|
1169
|
+
class V1Node:
|
|
1170
|
+
metadata: V1ObjectMeta
|
|
1171
|
+
status: V1NodeStatus
|
|
1172
|
+
|
|
1173
|
+
@classmethod
|
|
1174
|
+
def from_dict(cls, data: dict) -> 'V1Node':
|
|
1175
|
+
"""Create V1Node from a dictionary."""
|
|
1176
|
+
return cls(metadata=V1ObjectMeta(
|
|
1177
|
+
name=data['metadata']['name'],
|
|
1178
|
+
labels=data['metadata'].get('labels', {}),
|
|
1179
|
+
),
|
|
1180
|
+
status=V1NodeStatus(
|
|
1181
|
+
allocatable=data['status']['allocatable'],
|
|
1182
|
+
capacity=data['status']['capacity'],
|
|
1183
|
+
addresses=[
|
|
1184
|
+
V1NodeAddress(type=addr['type'],
|
|
1185
|
+
address=addr['address'])
|
|
1186
|
+
for addr in data['status'].get('addresses', [])
|
|
1187
|
+
]))
|
|
1188
|
+
|
|
1189
|
+
|
|
1146
1190
|
@annotations.lru_cache(scope='request', maxsize=10)
|
|
1147
1191
|
@_retry_on_error(resource_type='node')
|
|
1148
|
-
def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
|
|
1192
|
+
def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
|
|
1149
1193
|
"""Gets the kubernetes nodes in the context.
|
|
1150
1194
|
|
|
1151
1195
|
If context is None, gets the nodes in the current context.
|
|
@@ -1153,15 +1197,71 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
|
|
|
1153
1197
|
if context is None:
|
|
1154
1198
|
context = get_current_kube_config_context_name()
|
|
1155
1199
|
|
|
1156
|
-
|
|
1157
|
-
|
|
1200
|
+
# Return raw urllib3.HTTPResponse object so that we can parse the json
|
|
1201
|
+
# more efficiently.
|
|
1202
|
+
response = kubernetes.core_api(context).list_node(
|
|
1203
|
+
_request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
|
|
1204
|
+
try:
|
|
1205
|
+
nodes = [
|
|
1206
|
+
V1Node.from_dict(item_dict) for item_dict in ijson.items(
|
|
1207
|
+
response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
|
|
1208
|
+
]
|
|
1209
|
+
finally:
|
|
1210
|
+
response.release_conn()
|
|
1211
|
+
|
|
1158
1212
|
return nodes
|
|
1159
1213
|
|
|
1160
1214
|
|
|
1215
|
+
@dataclasses.dataclass
|
|
1216
|
+
class V1PodStatus:
|
|
1217
|
+
phase: str
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
@dataclasses.dataclass
|
|
1221
|
+
class V1ResourceRequirements:
|
|
1222
|
+
requests: Optional[Dict[str, str]]
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
@dataclasses.dataclass
|
|
1226
|
+
class V1Container:
|
|
1227
|
+
resources: V1ResourceRequirements
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
@dataclasses.dataclass
|
|
1231
|
+
class V1PodSpec:
|
|
1232
|
+
containers: List[V1Container]
|
|
1233
|
+
node_name: Optional[str]
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
@dataclasses.dataclass
|
|
1237
|
+
class V1Pod:
|
|
1238
|
+
metadata: V1ObjectMeta
|
|
1239
|
+
status: V1PodStatus
|
|
1240
|
+
spec: V1PodSpec
|
|
1241
|
+
|
|
1242
|
+
@classmethod
|
|
1243
|
+
def from_dict(cls, data: dict) -> 'V1Pod':
|
|
1244
|
+
"""Create V1Pod from a dictionary."""
|
|
1245
|
+
return cls(metadata=V1ObjectMeta(
|
|
1246
|
+
name=data['metadata']['name'],
|
|
1247
|
+
labels=data['metadata'].get('labels', {}),
|
|
1248
|
+
namespace=data['metadata'].get('namespace'),
|
|
1249
|
+
),
|
|
1250
|
+
status=V1PodStatus(phase=data['status'].get('phase'),),
|
|
1251
|
+
spec=V1PodSpec(
|
|
1252
|
+
node_name=data['spec'].get('nodeName'),
|
|
1253
|
+
containers=[
|
|
1254
|
+
V1Container(resources=V1ResourceRequirements(
|
|
1255
|
+
requests=container.get('resources', {}).get(
|
|
1256
|
+
'requests') or None))
|
|
1257
|
+
for container in data['spec'].get('containers', [])
|
|
1258
|
+
]))
|
|
1259
|
+
|
|
1260
|
+
|
|
1161
1261
|
@_retry_on_error(resource_type='pod')
|
|
1162
1262
|
def get_all_pods_in_kubernetes_cluster(*,
|
|
1163
1263
|
context: Optional[str] = None
|
|
1164
|
-
) -> List[Any]:
|
|
1264
|
+
) -> List[V1Pod]:
|
|
1165
1265
|
"""Gets pods in all namespaces in kubernetes cluster indicated by context.
|
|
1166
1266
|
|
|
1167
1267
|
Used for computing cluster resource usage.
|
|
@@ -1169,8 +1269,18 @@ def get_all_pods_in_kubernetes_cluster(*,
|
|
|
1169
1269
|
if context is None:
|
|
1170
1270
|
context = get_current_kube_config_context_name()
|
|
1171
1271
|
|
|
1172
|
-
|
|
1173
|
-
|
|
1272
|
+
# Return raw urllib3.HTTPResponse object so that we can parse the json
|
|
1273
|
+
# more efficiently.
|
|
1274
|
+
response = kubernetes.core_api(context).list_pod_for_all_namespaces(
|
|
1275
|
+
_request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
|
|
1276
|
+
try:
|
|
1277
|
+
pods = [
|
|
1278
|
+
V1Pod.from_dict(item_dict) for item_dict in ijson.items(
|
|
1279
|
+
response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
|
|
1280
|
+
]
|
|
1281
|
+
finally:
|
|
1282
|
+
response.release_conn()
|
|
1283
|
+
|
|
1174
1284
|
return pods
|
|
1175
1285
|
|
|
1176
1286
|
|
|
@@ -1560,23 +1670,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
|
|
|
1560
1670
|
return head_service.spec.ports[0].node_port
|
|
1561
1671
|
|
|
1562
1672
|
|
|
1563
|
-
def get_external_ip(network_mode: Optional[
|
|
1564
|
-
kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
|
|
1565
|
-
if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
|
|
1566
|
-
return '127.0.0.1'
|
|
1567
|
-
# Return the IP address of the first node with an external IP
|
|
1568
|
-
nodes = kubernetes.core_api(context).list_node().items
|
|
1569
|
-
for node in nodes:
|
|
1570
|
-
if node.status.addresses:
|
|
1571
|
-
for address in node.status.addresses:
|
|
1572
|
-
if address.type == 'ExternalIP':
|
|
1573
|
-
return address.address
|
|
1574
|
-
# If no external IP is found, use the API server IP
|
|
1575
|
-
api_host = kubernetes.core_api(context).api_client.configuration.host
|
|
1576
|
-
parsed_url = urlparse(api_host)
|
|
1577
|
-
return parsed_url.hostname
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
1673
|
def check_credentials(context: Optional[str],
|
|
1581
1674
|
timeout: int = kubernetes.API_TIMEOUT,
|
|
1582
1675
|
run_optional_checks: bool = False) -> \
|
|
@@ -2288,16 +2381,14 @@ def construct_ssh_jump_command(
|
|
|
2288
2381
|
|
|
2289
2382
|
|
|
2290
2383
|
def get_ssh_proxy_command(
|
|
2291
|
-
|
|
2292
|
-
network_mode: kubernetes_enums.KubernetesNetworkingMode,
|
|
2384
|
+
pod_name: str,
|
|
2293
2385
|
private_key_path: str,
|
|
2294
2386
|
context: Optional[str],
|
|
2295
2387
|
namespace: str,
|
|
2296
2388
|
) -> str:
|
|
2297
2389
|
"""Generates the SSH proxy command to connect to the pod.
|
|
2298
2390
|
|
|
2299
|
-
Uses a
|
|
2300
|
-
if the network mode is PORTFORWARD.
|
|
2391
|
+
Uses a direct port-forwarding.
|
|
2301
2392
|
|
|
2302
2393
|
By default, establishing an SSH connection creates a communication
|
|
2303
2394
|
channel to a remote node by setting up a TCP connection. When a
|
|
@@ -2308,17 +2399,8 @@ def get_ssh_proxy_command(
|
|
|
2308
2399
|
Pods within a Kubernetes cluster have internal IP addresses that are
|
|
2309
2400
|
typically not accessible from outside the cluster. Since the default TCP
|
|
2310
2401
|
connection of SSH won't allow access to these pods, we employ a
|
|
2311
|
-
ProxyCommand to establish the required communication channel.
|
|
2312
|
-
in two different networking options: NodePort/port-forward.
|
|
2402
|
+
ProxyCommand to establish the required communication channel.
|
|
2313
2403
|
|
|
2314
|
-
With the NodePort networking mode, a NodePort service is launched. This
|
|
2315
|
-
service opens an external port on the node which redirects to the desired
|
|
2316
|
-
port to a SSH jump pod. When establishing an SSH session in this mode, the
|
|
2317
|
-
ProxyCommand makes use of this external port to create a communication
|
|
2318
|
-
channel directly to port 22, which is the default port ssh server listens
|
|
2319
|
-
on, of the jump pod.
|
|
2320
|
-
|
|
2321
|
-
With Port-forward mode, instead of directly exposing an external port,
|
|
2322
2404
|
'kubectl port-forward' sets up a tunnel between a local port
|
|
2323
2405
|
(127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
|
|
2324
2406
|
connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
|
|
@@ -2329,38 +2411,26 @@ def get_ssh_proxy_command(
|
|
|
2329
2411
|
the local machine.
|
|
2330
2412
|
|
|
2331
2413
|
Args:
|
|
2332
|
-
|
|
2333
|
-
target for SSH.
|
|
2334
|
-
service. If network_mode is PORTFORWARD, this is the pod name.
|
|
2335
|
-
network_mode: KubernetesNetworkingMode; networking mode for ssh
|
|
2336
|
-
session. It is either 'NODEPORT' or 'PORTFORWARD'
|
|
2414
|
+
pod_name: str; The Kubernetes pod name that will be used as the
|
|
2415
|
+
target for SSH.
|
|
2337
2416
|
private_key_path: str; Path to the private key to use for SSH.
|
|
2338
2417
|
This key must be authorized to access the SSH jump pod.
|
|
2339
|
-
Required for NODEPORT networking mode.
|
|
2340
2418
|
namespace: Kubernetes namespace to use.
|
|
2341
|
-
Required for NODEPORT networking mode.
|
|
2342
2419
|
"""
|
|
2343
|
-
|
|
2344
|
-
ssh_jump_ip = get_external_ip(network_mode, context)
|
|
2420
|
+
ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
|
|
2345
2421
|
assert private_key_path is not None, 'Private key path must be provided'
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
proxy_cmd_target_pod=k8s_ssh_target,
|
|
2359
|
-
# We embed both the current context and namespace to the SSH proxy
|
|
2360
|
-
# command to make sure SSH still works when the current
|
|
2361
|
-
# context/namespace is changed by the user.
|
|
2362
|
-
current_kube_context=context,
|
|
2363
|
-
current_kube_namespace=namespace)
|
|
2422
|
+
ssh_jump_proxy_command_path = create_proxy_command_script()
|
|
2423
|
+
ssh_jump_proxy_command = construct_ssh_jump_command(
|
|
2424
|
+
private_key_path,
|
|
2425
|
+
ssh_jump_ip,
|
|
2426
|
+
ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
|
|
2427
|
+
proxy_cmd_path=ssh_jump_proxy_command_path,
|
|
2428
|
+
proxy_cmd_target_pod=pod_name,
|
|
2429
|
+
# We embed both the current context and namespace to the SSH proxy
|
|
2430
|
+
# command to make sure SSH still works when the current
|
|
2431
|
+
# context/namespace is changed by the user.
|
|
2432
|
+
current_kube_context=context,
|
|
2433
|
+
current_kube_namespace=namespace)
|
|
2364
2434
|
return ssh_jump_proxy_command
|
|
2365
2435
|
|
|
2366
2436
|
|
|
@@ -2392,240 +2462,6 @@ def create_proxy_command_script() -> str:
|
|
|
2392
2462
|
return PORT_FORWARD_PROXY_CMD_PATH
|
|
2393
2463
|
|
|
2394
2464
|
|
|
2395
|
-
def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
2396
|
-
context: Optional[str],
|
|
2397
|
-
service_type: kubernetes_enums.KubernetesServiceType):
|
|
2398
|
-
"""Sets up Kubernetes service resource to access for SSH jump pod.
|
|
2399
|
-
|
|
2400
|
-
This method acts as a necessary complement to be run along with
|
|
2401
|
-
setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
|
|
2402
|
-
|
|
2403
|
-
Args:
|
|
2404
|
-
ssh_jump_name: Name to use for the SSH jump service
|
|
2405
|
-
namespace: Namespace to create the SSH jump service in
|
|
2406
|
-
service_type: Networking configuration on either to use NodePort
|
|
2407
|
-
or ClusterIP service to ssh in
|
|
2408
|
-
"""
|
|
2409
|
-
# Fill in template - ssh_key_secret and ssh_jump_image are not required for
|
|
2410
|
-
# the service spec, so we pass in empty strs.
|
|
2411
|
-
content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
|
|
2412
|
-
|
|
2413
|
-
# Add custom metadata from config
|
|
2414
|
-
merge_custom_metadata(content['service_spec']['metadata'], context)
|
|
2415
|
-
|
|
2416
|
-
# Create service
|
|
2417
|
-
try:
|
|
2418
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
2419
|
-
namespace, content['service_spec'])
|
|
2420
|
-
except kubernetes.api_exception() as e:
|
|
2421
|
-
# SSH Jump Pod service already exists.
|
|
2422
|
-
if e.status == 409:
|
|
2423
|
-
ssh_jump_service = kubernetes.core_api(
|
|
2424
|
-
context).read_namespaced_service(name=ssh_jump_name,
|
|
2425
|
-
namespace=namespace)
|
|
2426
|
-
curr_svc_type = ssh_jump_service.spec.type
|
|
2427
|
-
if service_type.value == curr_svc_type:
|
|
2428
|
-
# If the currently existing SSH Jump service's type is identical
|
|
2429
|
-
# to user's configuration for networking mode
|
|
2430
|
-
logger.debug(
|
|
2431
|
-
f'SSH Jump Service {ssh_jump_name} already exists in the '
|
|
2432
|
-
'cluster, using it.')
|
|
2433
|
-
else:
|
|
2434
|
-
# If a different type of service type for SSH Jump pod compared
|
|
2435
|
-
# to user's configuration for networking mode exists, we remove
|
|
2436
|
-
# existing servie to create a new one following user's config
|
|
2437
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2438
|
-
name=ssh_jump_name, namespace=namespace)
|
|
2439
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
2440
|
-
namespace, content['service_spec'])
|
|
2441
|
-
port_forward_mode = (
|
|
2442
|
-
kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
|
|
2443
|
-
nodeport_mode = (
|
|
2444
|
-
kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
|
|
2445
|
-
clusterip_svc = (
|
|
2446
|
-
kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
|
|
2447
|
-
nodeport_svc = (
|
|
2448
|
-
kubernetes_enums.KubernetesServiceType.NODEPORT.value)
|
|
2449
|
-
curr_network_mode = port_forward_mode \
|
|
2450
|
-
if curr_svc_type == clusterip_svc else nodeport_mode
|
|
2451
|
-
new_network_mode = nodeport_mode \
|
|
2452
|
-
if curr_svc_type == clusterip_svc else port_forward_mode
|
|
2453
|
-
new_svc_type = nodeport_svc \
|
|
2454
|
-
if curr_svc_type == clusterip_svc else clusterip_svc
|
|
2455
|
-
logger.info(
|
|
2456
|
-
f'Switching the networking mode from '
|
|
2457
|
-
f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
|
|
2458
|
-
f'following networking configuration. Deleting existing '
|
|
2459
|
-
f'\'{curr_svc_type}\' service and recreating as '
|
|
2460
|
-
f'\'{new_svc_type}\' service.')
|
|
2461
|
-
else:
|
|
2462
|
-
raise
|
|
2463
|
-
else:
|
|
2464
|
-
logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
2468
|
-
ssh_key_secret: str, namespace: str,
|
|
2469
|
-
context: Optional[str]):
|
|
2470
|
-
"""Sets up Kubernetes RBAC and pod for SSH jump host.
|
|
2471
|
-
|
|
2472
|
-
Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
|
|
2473
|
-
running inside a cluster. This function sets up the resources needed for
|
|
2474
|
-
the SSH jump pod. This includes a service account which grants the jump pod
|
|
2475
|
-
permission to watch for other SkyPilot pods and terminate itself if there
|
|
2476
|
-
are no SkyPilot pods running.
|
|
2477
|
-
|
|
2478
|
-
setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
|
|
2479
|
-
reachable.
|
|
2480
|
-
|
|
2481
|
-
Args:
|
|
2482
|
-
ssh_jump_image: Container image to use for the SSH jump pod
|
|
2483
|
-
ssh_jump_name: Name to use for the SSH jump pod
|
|
2484
|
-
ssh_key_secret: Secret name for the SSH key stored in the cluster
|
|
2485
|
-
namespace: Namespace to create the SSH jump pod in
|
|
2486
|
-
"""
|
|
2487
|
-
# Fill in template - service is created separately so service_type is not
|
|
2488
|
-
# required, so we pass in empty str.
|
|
2489
|
-
content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
|
|
2490
|
-
ssh_jump_name, '')
|
|
2491
|
-
|
|
2492
|
-
# Add custom metadata to all objects
|
|
2493
|
-
for object_type in content.keys():
|
|
2494
|
-
merge_custom_metadata(content[object_type]['metadata'], context)
|
|
2495
|
-
|
|
2496
|
-
# ServiceAccount
|
|
2497
|
-
try:
|
|
2498
|
-
kubernetes.core_api(context).create_namespaced_service_account(
|
|
2499
|
-
namespace, content['service_account'])
|
|
2500
|
-
except kubernetes.api_exception() as e:
|
|
2501
|
-
if e.status == 409:
|
|
2502
|
-
logger.info(
|
|
2503
|
-
'SSH Jump ServiceAccount already exists in the cluster, using '
|
|
2504
|
-
'it.')
|
|
2505
|
-
else:
|
|
2506
|
-
raise
|
|
2507
|
-
else:
|
|
2508
|
-
logger.info('Created SSH Jump ServiceAccount.')
|
|
2509
|
-
# Role
|
|
2510
|
-
try:
|
|
2511
|
-
kubernetes.auth_api(context).create_namespaced_role(
|
|
2512
|
-
namespace, content['role'])
|
|
2513
|
-
except kubernetes.api_exception() as e:
|
|
2514
|
-
if e.status == 409:
|
|
2515
|
-
logger.info(
|
|
2516
|
-
'SSH Jump Role already exists in the cluster, using it.')
|
|
2517
|
-
else:
|
|
2518
|
-
raise
|
|
2519
|
-
else:
|
|
2520
|
-
logger.info('Created SSH Jump Role.')
|
|
2521
|
-
# RoleBinding
|
|
2522
|
-
try:
|
|
2523
|
-
kubernetes.auth_api(context).create_namespaced_role_binding(
|
|
2524
|
-
namespace, content['role_binding'])
|
|
2525
|
-
except kubernetes.api_exception() as e:
|
|
2526
|
-
if e.status == 409:
|
|
2527
|
-
logger.info(
|
|
2528
|
-
'SSH Jump RoleBinding already exists in the cluster, using '
|
|
2529
|
-
'it.')
|
|
2530
|
-
else:
|
|
2531
|
-
raise
|
|
2532
|
-
else:
|
|
2533
|
-
logger.info('Created SSH Jump RoleBinding.')
|
|
2534
|
-
# Pod
|
|
2535
|
-
try:
|
|
2536
|
-
kubernetes.core_api(context).create_namespaced_pod(
|
|
2537
|
-
namespace, content['pod_spec'])
|
|
2538
|
-
except kubernetes.api_exception() as e:
|
|
2539
|
-
if e.status == 409:
|
|
2540
|
-
logger.info(
|
|
2541
|
-
f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
|
|
2542
|
-
'using it.')
|
|
2543
|
-
else:
|
|
2544
|
-
raise
|
|
2545
|
-
else:
|
|
2546
|
-
logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
|
|
2550
|
-
node_id: str):
|
|
2551
|
-
"""Analyzes SSH jump pod and removes if it is in a bad state
|
|
2552
|
-
|
|
2553
|
-
Prevents the existence of a dangling SSH jump pod. This could happen
|
|
2554
|
-
in case the pod main container did not start properly (or failed). In that
|
|
2555
|
-
case, jump pod lifecycle manager will not function properly to
|
|
2556
|
-
remove the pod and service automatically, and must be done manually.
|
|
2557
|
-
|
|
2558
|
-
Args:
|
|
2559
|
-
namespace: Namespace to remove the SSH jump pod and service from
|
|
2560
|
-
node_id: Name of head pod
|
|
2561
|
-
"""
|
|
2562
|
-
|
|
2563
|
-
def find(l, predicate):
|
|
2564
|
-
"""Utility function to find element in given list"""
|
|
2565
|
-
results = [x for x in l if predicate(x)]
|
|
2566
|
-
return results[0] if results else None
|
|
2567
|
-
|
|
2568
|
-
# Get the SSH jump pod name from the head pod
|
|
2569
|
-
try:
|
|
2570
|
-
pod = kubernetes.core_api(context).read_namespaced_pod(
|
|
2571
|
-
node_id, namespace)
|
|
2572
|
-
except kubernetes.api_exception() as e:
|
|
2573
|
-
if e.status == 404:
|
|
2574
|
-
logger.warning(f'Failed to get pod {node_id},'
|
|
2575
|
-
' but the pod was not found (404).')
|
|
2576
|
-
raise
|
|
2577
|
-
else:
|
|
2578
|
-
ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
|
|
2579
|
-
try:
|
|
2580
|
-
ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
|
|
2581
|
-
ssh_jump_name, namespace)
|
|
2582
|
-
cont_ready_cond = find(ssh_jump_pod.status.conditions,
|
|
2583
|
-
lambda c: c.type == 'ContainersReady')
|
|
2584
|
-
if (cont_ready_cond and cont_ready_cond.status
|
|
2585
|
-
== 'False') or ssh_jump_pod.status.phase == 'Pending':
|
|
2586
|
-
# Either the main container is not ready or the pod failed
|
|
2587
|
-
# to schedule. To be on the safe side and prevent a dangling
|
|
2588
|
-
# ssh jump pod, lets remove it and the service. Otherwise, main
|
|
2589
|
-
# container is ready and its lifecycle management script takes
|
|
2590
|
-
# care of the cleaning.
|
|
2591
|
-
kubernetes.core_api(context).delete_namespaced_pod(
|
|
2592
|
-
ssh_jump_name, namespace)
|
|
2593
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2594
|
-
ssh_jump_name, namespace)
|
|
2595
|
-
except kubernetes.api_exception() as e:
|
|
2596
|
-
# We keep the warning in debug to avoid polluting the `sky launch`
|
|
2597
|
-
# output.
|
|
2598
|
-
logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
|
|
2599
|
-
f' but got error {e}\n. Consider running `kubectl '
|
|
2600
|
-
f'delete pod {ssh_jump_name} -n {namespace}` to manually '
|
|
2601
|
-
'remove the pod if it has crashed.')
|
|
2602
|
-
# We encountered an issue while checking ssh jump pod. To be on
|
|
2603
|
-
# the safe side, lets remove its service so the port is freed
|
|
2604
|
-
try:
|
|
2605
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2606
|
-
ssh_jump_name, namespace)
|
|
2607
|
-
except kubernetes.api_exception():
|
|
2608
|
-
pass
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
|
|
2612
|
-
ssh_jump_name: str, service_type: str) -> Dict:
|
|
2613
|
-
template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
|
|
2614
|
-
'kubernetes-ssh-jump.yml.j2')
|
|
2615
|
-
if not os.path.exists(template_path):
|
|
2616
|
-
raise FileNotFoundError(
|
|
2617
|
-
'Template "kubernetes-ssh-jump.j2" does not exist.')
|
|
2618
|
-
with open(template_path, 'r', encoding='utf-8') as fin:
|
|
2619
|
-
template = fin.read()
|
|
2620
|
-
j2_template = jinja2.Template(template)
|
|
2621
|
-
cont = j2_template.render(name=ssh_jump_name,
|
|
2622
|
-
image=ssh_jump_image,
|
|
2623
|
-
secret=ssh_key_secret,
|
|
2624
|
-
service_type=service_type)
|
|
2625
|
-
content = yaml_utils.safe_load(cont)
|
|
2626
|
-
return content
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
2465
|
def check_port_forward_mode_dependencies(
|
|
2630
2466
|
raise_error: bool = True) -> Optional[List[str]]:
|
|
2631
2467
|
"""Checks if 'socat' and 'nc' are installed
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared loopback detection utilities for auth middlewares."""
|
|
2
|
+
|
|
3
|
+
import ipaddress
|
|
4
|
+
|
|
5
|
+
import fastapi
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
|
|
9
|
+
logger = sky_logging.init_logger(__name__)
|
|
10
|
+
|
|
11
|
+
COMMON_PROXY_HEADERS = [
|
|
12
|
+
'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
|
|
13
|
+
'X-Forwarded-Host', 'X-Forwarded-Proto'
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_loopback_ip(ip_str: str) -> bool:
|
|
18
|
+
"""Check if an IP address is a loopback address."""
|
|
19
|
+
try:
|
|
20
|
+
ip = ipaddress.ip_address(ip_str)
|
|
21
|
+
return ip.is_loopback
|
|
22
|
+
except ValueError:
|
|
23
|
+
return False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_loopback_request(request: fastapi.Request) -> bool:
|
|
27
|
+
"""Determine if a request is coming from localhost."""
|
|
28
|
+
if request.client is None:
|
|
29
|
+
return False
|
|
30
|
+
|
|
31
|
+
client_host = request.client.host
|
|
32
|
+
if client_host == 'localhost' or _is_loopback_ip(client_host):
|
|
33
|
+
# Additional checks: ensure no forwarding headers are present.
|
|
34
|
+
# If there are any, assume this traffic went through a proxy.
|
|
35
|
+
return not any(
|
|
36
|
+
request.headers.get(header) for header in COMMON_PROXY_HEADERS)
|
|
37
|
+
|
|
38
|
+
return False
|
sky/server/auth/oauth2_proxy.py
CHANGED
|
@@ -15,7 +15,9 @@ import starlette.middleware.base
|
|
|
15
15
|
from sky import global_user_state
|
|
16
16
|
from sky import models
|
|
17
17
|
from sky import sky_logging
|
|
18
|
+
from sky.jobs import utils as managed_job_utils
|
|
18
19
|
from sky.server.auth import authn
|
|
20
|
+
from sky.server.auth import loopback
|
|
19
21
|
from sky.users import permission
|
|
20
22
|
from sky.utils import common_utils
|
|
21
23
|
|
|
@@ -108,6 +110,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
108
110
|
# Already authenticated
|
|
109
111
|
return await call_next(request)
|
|
110
112
|
|
|
113
|
+
if managed_job_utils.is_consolidation_mode(
|
|
114
|
+
) and loopback.is_loopback_request(request):
|
|
115
|
+
return await call_next(request)
|
|
116
|
+
|
|
111
117
|
async with aiohttp.ClientSession() as session:
|
|
112
118
|
try:
|
|
113
119
|
return await self._authenticate(request, call_next, session)
|
sky/server/server.py
CHANGED
|
@@ -38,6 +38,7 @@ from sky import global_user_state
|
|
|
38
38
|
from sky import models
|
|
39
39
|
from sky import sky_logging
|
|
40
40
|
from sky.data import storage_utils
|
|
41
|
+
from sky.jobs import utils as managed_job_utils
|
|
41
42
|
from sky.jobs.server import server as jobs_rest
|
|
42
43
|
from sky.metrics import utils as metrics_utils
|
|
43
44
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
@@ -52,6 +53,7 @@ from sky.server import state
|
|
|
52
53
|
from sky.server import stream_utils
|
|
53
54
|
from sky.server import versions
|
|
54
55
|
from sky.server.auth import authn
|
|
56
|
+
from sky.server.auth import loopback
|
|
55
57
|
from sky.server.auth import oauth2_proxy
|
|
56
58
|
from sky.server.requests import executor
|
|
57
59
|
from sky.server.requests import payloads
|
|
@@ -191,6 +193,10 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
191
193
|
"""Middleware to handle HTTP Basic Auth."""
|
|
192
194
|
|
|
193
195
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
196
|
+
if managed_job_utils.is_consolidation_mode(
|
|
197
|
+
) and loopback.is_loopback_request(request):
|
|
198
|
+
return await call_next(request)
|
|
199
|
+
|
|
194
200
|
if request.url.path.startswith('/api/health'):
|
|
195
201
|
# Try to set the auth user from basic auth
|
|
196
202
|
_try_set_basic_auth_user(request)
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -48,6 +48,7 @@ install_requires = [
|
|
|
48
48
|
# (https://github.com/yaml/pyyaml/issues/601)
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
|
+
'ijson',
|
|
51
52
|
'requests',
|
|
52
53
|
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
53
54
|
# uvicorn, so we need to pin uvicorn version to avoid potential break
|
|
@@ -33,14 +33,11 @@ provider:
|
|
|
33
33
|
networking_mode: {{k8s_networking_mode}}
|
|
34
34
|
|
|
35
35
|
# We use internal IPs since we set up a port-forward between the kubernetes
|
|
36
|
-
# cluster and the local machine
|
|
37
|
-
# head node.
|
|
36
|
+
# cluster and the local machine.
|
|
38
37
|
use_internal_ips: true
|
|
39
38
|
|
|
40
39
|
timeout: {{timeout}}
|
|
41
40
|
|
|
42
|
-
ssh_jump_image: {{k8s_ssh_jump_image}}
|
|
43
|
-
|
|
44
41
|
# Namespace used to host SkyPilot system components, such as fuse device
|
|
45
42
|
# manager.
|
|
46
43
|
skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
|
|
@@ -276,8 +273,6 @@ available_node_types:
|
|
|
276
273
|
parent: skypilot
|
|
277
274
|
# component will be set for the head node pod to be the same as the head node service selector above if a
|
|
278
275
|
skypilot-cluster: {{cluster_name_on_cloud}}
|
|
279
|
-
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
|
|
280
|
-
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
|
|
281
276
|
skypilot-user: {{ user }}
|
|
282
277
|
# Custom tags for the pods
|
|
283
278
|
{%- for label_key, label_value in labels.items() %}
|
|
@@ -444,9 +439,6 @@ available_node_types:
|
|
|
444
439
|
# object store. If you do not provide this, Ray will fall back to
|
|
445
440
|
# /tmp which cause slowdowns if is not a shared memory volume.
|
|
446
441
|
volumes:
|
|
447
|
-
- name: secret-volume
|
|
448
|
-
secret:
|
|
449
|
-
secretName: {{k8s_ssh_key_secret_name}}
|
|
450
442
|
- name: dshm
|
|
451
443
|
emptyDir:
|
|
452
444
|
medium: Memory
|
|
@@ -869,7 +861,9 @@ available_node_types:
|
|
|
869
861
|
$(prefix_cmd) mkdir -p ~/.ssh;
|
|
870
862
|
$(prefix_cmd) chown -R $(whoami) ~/.ssh;
|
|
871
863
|
$(prefix_cmd) chmod 700 ~/.ssh;
|
|
872
|
-
$(prefix_cmd) cat
|
|
864
|
+
$(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
|
|
865
|
+
skypilot:ssh_public_key_content
|
|
866
|
+
SKYPILOT_SSH_KEY_EOF
|
|
873
867
|
$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
|
|
874
868
|
$(prefix_cmd) service ssh restart;
|
|
875
869
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
|
@@ -1105,9 +1099,6 @@ available_node_types:
|
|
|
1105
1099
|
# object store. If you do not provide this, Ray will fall back to
|
|
1106
1100
|
# /tmp which cause slowdowns if is not a shared memory volume.
|
|
1107
1101
|
volumeMounts:
|
|
1108
|
-
- name: secret-volume
|
|
1109
|
-
readOnly: true
|
|
1110
|
-
mountPath: "/etc/secret-volume"
|
|
1111
1102
|
- mountPath: /dev/shm
|
|
1112
1103
|
name: dshm
|
|
1113
1104
|
{% if k8s_enable_gpudirect_tcpx %}
|