skypilot-nightly 1.0.0.dev20251001__py3-none-any.whl → 1.0.0.dev20251003__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic by the registry diff service.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -109
- sky/client/cli/command.py +2 -3
- sky/client/cli/table_utils.py +222 -1
- sky/clouds/cudo.py +1 -1
- sky/clouds/kubernetes.py +7 -19
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-4f0c389a4ce5fd9c.js → webpack-3286453d56f3c0a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +9 -0
- sky/global_user_state.py +16 -0
- sky/jobs/server/core.py +60 -53
- sky/jobs/state.py +21 -1
- sky/jobs/utils.py +29 -11
- sky/provision/kubernetes/config.py +0 -42
- sky/provision/kubernetes/instance.py +1 -33
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network_utils.py +0 -21
- sky/provision/kubernetes/utils.py +68 -322
- sky/schemas/api/responses.py +21 -0
- sky/server/requests/serializers/decoders.py +8 -0
- sky/server/requests/serializers/encoders.py +6 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -13
- sky/utils/env_options.py +4 -0
- sky/utils/kubernetes_enums.py +2 -15
- sky/utils/schemas.py +17 -6
- sky/volumes/client/sdk.py +3 -2
- sky/volumes/server/core.py +3 -2
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/RECORD +53 -56
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- sky/volumes/utils.py +0 -224
- /sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/top_level.txt +0 -0

sky/provision/kubernetes/utils.py CHANGED
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -14,7 +15,6 @@ import subprocess
 import time
 import typing
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse

 from sky import clouds
 from sky import exceptions
@@ -32,7 +32,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -1559,23 +1558,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port


-def get_external_ip(network_mode: Optional[
-    kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -2287,16 +2269,14 @@ def construct_ssh_jump_command(


 def get_ssh_proxy_command(
-
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
     context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.

-    Uses a
-    if the network mode is PORTFORWARD.
+    Uses a direct port-forwarding.

     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2307,17 +2287,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel.
-    in two different networking options: NodePort/port-forward.
-
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
+    ProxyCommand to establish the required communication channel.

-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
|
|
|
2328
2299
|
the local machine.
|
|
2329
2300
|
|
|
2330
2301
|
Args:
|
|
2331
|
-
|
|
2332
|
-
target for SSH.
|
|
2333
|
-
service. If network_mode is PORTFORWARD, this is the pod name.
|
|
2334
|
-
network_mode: KubernetesNetworkingMode; networking mode for ssh
|
|
2335
|
-
session. It is either 'NODEPORT' or 'PORTFORWARD'
|
|
2302
|
+
pod_name: str; The Kubernetes pod name that will be used as the
|
|
2303
|
+
target for SSH.
|
|
2336
2304
|
private_key_path: str; Path to the private key to use for SSH.
|
|
2337
2305
|
This key must be authorized to access the SSH jump pod.
|
|
2338
|
-
Required for NODEPORT networking mode.
|
|
2339
2306
|
namespace: Kubernetes namespace to use.
|
|
2340
|
-
Required for NODEPORT networking mode.
|
|
2341
2307
|
"""
|
|
2342
|
-
|
|
2343
|
-
ssh_jump_ip = get_external_ip(network_mode, context)
|
|
2308
|
+
ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
|
|
2344
2309
|
assert private_key_path is not None, 'Private key path must be provided'
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
proxy_cmd_target_pod=k8s_ssh_target,
|
|
2358
|
-
# We embed both the current context and namespace to the SSH proxy
|
|
2359
|
-
# command to make sure SSH still works when the current
|
|
2360
|
-
# context/namespace is changed by the user.
|
|
2361
|
-
current_kube_context=context,
|
|
2362
|
-
current_kube_namespace=namespace)
|
|
2310
|
+
ssh_jump_proxy_command_path = create_proxy_command_script()
|
|
2311
|
+
ssh_jump_proxy_command = construct_ssh_jump_command(
|
|
2312
|
+
private_key_path,
|
|
2313
|
+
ssh_jump_ip,
|
|
2314
|
+
ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
|
|
2315
|
+
proxy_cmd_path=ssh_jump_proxy_command_path,
|
|
2316
|
+
proxy_cmd_target_pod=pod_name,
|
|
2317
|
+
# We embed both the current context and namespace to the SSH proxy
|
|
2318
|
+
# command to make sure SSH still works when the current
|
|
2319
|
+
# context/namespace is changed by the user.
|
|
2320
|
+
current_kube_context=context,
|
|
2321
|
+
current_kube_namespace=namespace)
|
|
2363
2322
|
return ssh_jump_proxy_command
|
|
2364
2323
|
|
|
2365
2324
|
|
|
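To make the new flow concrete: the docstring above describes a tunnel where 'kubectl port-forward' maps 127.0.0.1:23100 to port 22 of the pod and 'socat' bridges SSH's stdio to that local endpoint. A minimal sketch of that chain, assuming kubectl and socat are on PATH; the helper name and the hard-coded port are illustrative, not SkyPilot's actual implementation:

# Sketch only: the port-forward ProxyCommand chain described in the
# docstring above. Helper name and fixed port are assumptions.
import shlex


def build_port_forward_proxy_command(pod_name: str, namespace: str,
                                     context: str,
                                     local_port: int = 23100) -> str:
    """Builds an SSH ProxyCommand that tunnels to port 22 of a pod."""
    # 1) kubectl port-forward maps 127.0.0.1:<local_port> -> <pod>:22.
    port_forward = (f'kubectl --context={context} --namespace={namespace} '
                    f'port-forward pod/{pod_name} {local_port}:22')
    # 2) socat relays SSH's stdin/stdout to the local end of the tunnel.
    relay = f'socat - tcp:127.0.0.1:{local_port}'
    # A real wrapper script starts (1), waits for the port to open, then
    # execs (2); the '&' here is a simplification.
    return f'{port_forward} & {relay}'


# Usage: ssh -o ProxyCommand="$CMD" <user>@<pod>
print(shlex.quote(build_port_forward_proxy_command('sky-head', 'default',
                                                   'my-context')))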
@@ -2391,240 +2350,6 @@ def create_proxy_command_script() -> str:
     return PORT_FORWARD_PROXY_CMD_PATH


-def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
-                       context: Optional[str],
-                       service_type: kubernetes_enums.KubernetesServiceType):
-    """Sets up Kubernetes service resource to access for SSH jump pod.
-
-    This method acts as a necessary complement to be run along with
-    setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
-    Args:
-        ssh_jump_name: Name to use for the SSH jump service
-        namespace: Namespace to create the SSH jump service in
-        service_type: Networking configuration on either to use NodePort
-            or ClusterIP service to ssh in
-    """
-    # Fill in template - ssh_key_secret and ssh_jump_image are not required for
-    # the service spec, so we pass in empty strs.
-    content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
-    # Add custom metadata from config
-    merge_custom_metadata(content['service_spec']['metadata'], context)
-
-    # Create service
-    try:
-        kubernetes.core_api(context).create_namespaced_service(
-            namespace, content['service_spec'])
-    except kubernetes.api_exception() as e:
-        # SSH Jump Pod service already exists.
-        if e.status == 409:
-            ssh_jump_service = kubernetes.core_api(
-                context).read_namespaced_service(name=ssh_jump_name,
-                                                 namespace=namespace)
-            curr_svc_type = ssh_jump_service.spec.type
-            if service_type.value == curr_svc_type:
-                # If the currently existing SSH Jump service's type is identical
-                # to user's configuration for networking mode
-                logger.debug(
-                    f'SSH Jump Service {ssh_jump_name} already exists in the '
-                    'cluster, using it.')
-            else:
-                # If a different type of service type for SSH Jump pod compared
-                # to user's configuration for networking mode exists, we remove
-                # existing servie to create a new one following user's config
-                kubernetes.core_api(context).delete_namespaced_service(
-                    name=ssh_jump_name, namespace=namespace)
-                kubernetes.core_api(context).create_namespaced_service(
-                    namespace, content['service_spec'])
-                port_forward_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
-                nodeport_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
-                clusterip_svc = (
-                    kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
-                nodeport_svc = (
-                    kubernetes_enums.KubernetesServiceType.NODEPORT.value)
-                curr_network_mode = port_forward_mode \
-                    if curr_svc_type == clusterip_svc else nodeport_mode
-                new_network_mode = nodeport_mode \
-                    if curr_svc_type == clusterip_svc else port_forward_mode
-                new_svc_type = nodeport_svc \
-                    if curr_svc_type == clusterip_svc else clusterip_svc
-                logger.info(
-                    f'Switching the networking mode from '
-                    f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
-                    f'following networking configuration. Deleting existing '
-                    f'\'{curr_svc_type}\' service and recreating as '
-                    f'\'{new_svc_type}\' service.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
-def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
-                       ssh_key_secret: str, namespace: str,
-                       context: Optional[str]):
-    """Sets up Kubernetes RBAC and pod for SSH jump host.
-
-    Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
-    running inside a cluster. This function sets up the resources needed for
-    the SSH jump pod. This includes a service account which grants the jump pod
-    permission to watch for other SkyPilot pods and terminate itself if there
-    are no SkyPilot pods running.
-
-    setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
-    reachable.
-
-    Args:
-        ssh_jump_image: Container image to use for the SSH jump pod
-        ssh_jump_name: Name to use for the SSH jump pod
-        ssh_key_secret: Secret name for the SSH key stored in the cluster
-        namespace: Namespace to create the SSH jump pod in
-    """
-    # Fill in template - service is created separately so service_type is not
-    # required, so we pass in empty str.
-    content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
-                                     ssh_jump_name, '')
-
-    # Add custom metadata to all objects
-    for object_type in content.keys():
-        merge_custom_metadata(content[object_type]['metadata'], context)
-
-    # ServiceAccount
-    try:
-        kubernetes.core_api(context).create_namespaced_service_account(
-            namespace, content['service_account'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump ServiceAccount already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump ServiceAccount.')
-    # Role
-    try:
-        kubernetes.auth_api(context).create_namespaced_role(
-            namespace, content['role'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump Role already exists in the cluster, using it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump Role.')
-    # RoleBinding
-    try:
-        kubernetes.auth_api(context).create_namespaced_role_binding(
-            namespace, content['role_binding'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump RoleBinding already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump RoleBinding.')
-    # Pod
-    try:
-        kubernetes.core_api(context).create_namespaced_pod(
-            namespace, content['pod_spec'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
-                'using it.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes SSH jump pod and removes if it is in a bad state
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    in case the pod main container did not start properly (or failed). In that
-    case, jump pod lifecycle manager will not function properly to
-    remove the pod and service automatically, and must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
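All of the removed setup functions above share one idempotency pattern: issue the create call and treat an HTTP 409 (AlreadyExists) from the API server as "resource already present, reuse it". A standalone sketch of that pattern with the official kubernetes Python client; client configuration is elided and the function name is illustrative:

# Sketch of the create-or-reuse pattern used throughout the removed
# functions: try to create, and treat HTTP 409 (AlreadyExists) as success.
from kubernetes import client
from kubernetes.client.rest import ApiException


def create_service_if_absent(core_api: client.CoreV1Api, namespace: str,
                             body: client.V1Service) -> None:
    try:
        core_api.create_namespaced_service(namespace, body)
    except ApiException as e:
        if e.status == 409:
            # Service already exists; reuse it instead of failing.
            return
        raise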
@@ -3117,14 +2842,6 @@ def get_kubernetes_node_info(
         information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise

     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
@@ -3132,6 +2849,46 @@ def get_kubernetes_node_info(
     else:
         label_keys = lf.get_label_keys()

+    # Check if all nodes have no accelerators to avoid fetching pods
+    any_node_has_accelerators = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            any_node_has_accelerators = True
+            break
+
+    # Get the pods to get the real-time resource usage
+    pods = None
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    if any_node_has_accelerators:
+        try:
+            pods = get_all_pods_in_kubernetes_cluster(context=context)
+            # Pre-compute allocated accelerator count per node
+            for pod in pods:
+                if pod.status.phase in ['Running', 'Pending']:
+                    # Skip pods that should not count against GPU count
+                    if should_exclude_pod_from_gpu_allocation(pod):
+                        logger.debug(f'Excluding low priority pod '
+                                     f'{pod.metadata.name} from GPU allocation '
+                                     f'calculations')
+                        continue
+                    # Iterate over all the containers in the pod and sum the
+                    # GPU requests
+                    pod_allocated_qty = 0
+                    for container in pod.spec.containers:
+                        if container.resources.requests:
+                            pod_allocated_qty += get_node_accelerator_count(
+                                context, container.resources.requests)
+                    if pod_allocated_qty > 0:
+                        allocated_qty_by_node[
+                            pod.spec.node_name] += pod_allocated_qty
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                pass
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False

@@ -3161,32 +2918,21 @@ def get_kubernetes_node_info(
                 node_ip = address.address
                 break

-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip)
+            continue

         if pods is None:
             accelerators_available = -1
-
         else:
-
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty

         # Exclude multi-host TPUs from being processed.
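The reworked get_kubernetes_node_info above replaces a per-node rescan of every pod with a single aggregation pass, and skips listing pods altogether when no node advertises accelerators; the per-node lookup then becomes a dictionary access. A reduced sketch of that aggregation, using plain tuples in place of pod objects (the field layout is illustrative):

# Sketch of the pre-aggregation introduced above: one pass over all pods
# builds a per-node allocation map, instead of rescanning pods per node.
import collections
from typing import Dict, List, Tuple

# (node_name, phase, gpus_requested) triples standing in for pod objects.
Pod = Tuple[str, str, int]


def allocated_gpus_by_node(pods: List[Pod]) -> Dict[str, int]:
    allocated: Dict[str, int] = collections.defaultdict(int)
    for node_name, phase, gpus in pods:
        # Only Running/Pending pods count against the node's free GPUs.
        if phase in ('Running', 'Pending') and gpus > 0:
            allocated[node_name] += gpus
    return allocated


pods = [('node-a', 'Running', 2), ('node-a', 'Pending', 1),
        ('node-b', 'Succeeded', 4)]
print(allocated_gpus_by_node(pods))  # {'node-a': 3}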

sky/schemas/api/responses.py CHANGED
@@ -198,3 +198,24 @@ class ManagedJobRecord(ResponseBaseModel):
     current_cluster_name: Optional[str] = None
     job_id_on_pool_cluster: Optional[int] = None
     accelerators: Optional[Dict[str, int]] = None
+
+
+class VolumeRecord(ResponseBaseModel):
+    """A single volume record."""
+    name: str
+    type: str
+    launched_at: int
+    cloud: str
+    region: str
+    zone: Optional[str] = None
+    size: str
+    config: Dict[str, Any]
+    name_on_cloud: str
+    user_hash: str
+    user_name: str
+    workspace: str
+    last_attached_at: Optional[int] = None
+    last_use: Optional[str] = None
+    status: Optional[str] = None
+    usedby_pods: List[str]
+    usedby_clusters: List[str]

sky/server/requests/serializers/decoders.py CHANGED
@@ -195,6 +195,14 @@ def decode_storage_ls(
     ]


+@register_decoders('volume_list')
+def decode_volume_list(
+        return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
+    return [
+        responses.VolumeRecord(**volume_info) for volume_info in return_value
+    ]
+
+
 @register_decoders('job_status')
 def decode_job_status(
     return_value: Dict[str, Optional[str]]

sky/server/requests/serializers/encoders.py CHANGED
@@ -211,6 +211,12 @@ def encode_storage_ls(
     return [storage_info.model_dump() for storage_info in return_value]


+@register_encoder('volume_list')
+def encode_volume_list(
+        return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
+    return [volume_info.model_dump() for volume_info in return_value]
+
+
 @register_encoder('job_status')
 def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
     for job_id in return_value.keys():
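Together, the encoder and decoder above round-trip VolumeRecord through plain dicts across the client/server boundary: model_dump() on the server side, keyword construction on the client side. A self-contained sketch of the same round trip with a trimmed-down pydantic model (only a few of the new fields are reproduced):

# Sketch of the VolumeRecord encode/decode round trip: model -> dict on
# the server, dict -> model on the client. Field set is trimmed.
from typing import Optional

import pydantic


class VolumeRecordSketch(pydantic.BaseModel):
    name: str
    type: str
    size: str
    status: Optional[str] = None


record = VolumeRecordSketch(name='vol-1', type='k8s-pvc', size='100Gi')
payload = record.model_dump()             # what encode_volume_list emits
restored = VolumeRecordSketch(**payload)  # what decode_volume_list rebuilds
assert restored == record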

sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -33,14 +33,11 @@ provider:
   networking_mode: {{k8s_networking_mode}}

   # We use internal IPs since we set up a port-forward between the kubernetes
-  # cluster and the local machine
-  # head node.
+  # cluster and the local machine.
   use_internal_ips: true

   timeout: {{timeout}}

-  ssh_jump_image: {{k8s_ssh_jump_image}}
-
   # Namespace used to host SkyPilot system components, such as fuse device
   # manager.
   skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -276,8 +273,6 @@ available_node_types:
         parent: skypilot
         # component will be set for the head node pod to be the same as the head node service selector above if a
         skypilot-cluster: {{cluster_name_on_cloud}}
-        # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
-        skypilot-ssh-jump: {{k8s_ssh_jump_name}}
         skypilot-user: {{ user }}
         # Custom tags for the pods
         {%- for label_key, label_value in labels.items() %}
@@ -444,9 +439,6 @@ available_node_types:
           # object store. If you do not provide this, Ray will fall back to
           # /tmp which cause slowdowns if is not a shared memory volume.
           volumes:
-            - name: secret-volume
-              secret:
-                secretName: {{k8s_ssh_key_secret_name}}
             - name: dshm
               emptyDir:
                 medium: Memory
@@ -869,7 +861,9 @@ available_node_types:
             $(prefix_cmd) mkdir -p ~/.ssh;
             $(prefix_cmd) chown -R $(whoami) ~/.ssh;
             $(prefix_cmd) chmod 700 ~/.ssh;
-            $(prefix_cmd) cat
+            $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+            skypilot:ssh_public_key_content
+            SKYPILOT_SSH_KEY_EOF
             $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
             $(prefix_cmd) service ssh restart;
             $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
@@ -1105,9 +1099,6 @@ available_node_types:
           # object store. If you do not provide this, Ray will fall back to
           # /tmp which cause slowdowns if is not a shared memory volume.
           volumeMounts:
-            - name: secret-volume
-              readOnly: true
-              mountPath: "/etc/secret-volume"
            - mountPath: /dev/shm
              name: dshm
            {% if k8s_enable_gpudirect_tcpx %}

sky/utils/env_options.py CHANGED
@@ -27,6 +27,10 @@ class Options(enum.Enum):
     # Internal: This is used for testing to enable grpc for communication
     # between the API server and the Skylet.
     ENABLE_GRPC = ('SKYPILOT_ENABLE_GRPC', False)
+    # Allow all contexts for Kubernetes if allowed_contexts is not set in
+    # config.
+    ALLOW_ALL_KUBERNETES_CONTEXTS = ('SKYPILOT_ALLOW_ALL_KUBERNETES_CONTEXTS',
+                                     False)

     def __init__(self, env_var: str, default: bool) -> None:
         super().__init__()
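Each Options member above pairs an environment variable name with a default, so ALLOW_ALL_KUBERNETES_CONTEXTS can be flipped without a config change. A minimal sketch of how such an option is typically read; the parsing rule in get() is an assumption, not SkyPilot's exact logic:

# Sketch of reading a boolean env option defined as (env_var, default).
# The truthiness parsing here is an assumption for illustration.
import enum
import os


class Options(enum.Enum):
    ALLOW_ALL_KUBERNETES_CONTEXTS = ('SKYPILOT_ALLOW_ALL_KUBERNETES_CONTEXTS',
                                     False)

    def __init__(self, env_var: str, default: bool) -> None:
        self.env_var = env_var
        self.default = default

    def get(self) -> bool:
        if self.env_var in os.environ:
            return os.environ[self.env_var].lower() in ('true', '1')
        return self.default


if Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get():
    print('allowed_contexts unset: all Kubernetes contexts are allowed')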

sky/utils/kubernetes_enums.py CHANGED
@@ -2,26 +2,13 @@
 import enum


+# TODO(kevin): Remove this enum in v0.13.0.
 class KubernetesNetworkingMode(enum.Enum):
-    """Enum for the different types of networking modes for accessing
-    jump pods.
+    """Enum for the different types of networking modes for accessing pods.
     """
     NODEPORT = 'nodeport'
     PORTFORWARD = 'portforward'

-    @classmethod
-    def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
-        """Returns the enum value for the given string."""
-        if mode.lower() == cls.NODEPORT.value:
-            return cls.NODEPORT
-        elif mode.lower() == cls.PORTFORWARD.value:
-            return cls.PORTFORWARD
-        else:
-            raise ValueError(f'Unsupported kubernetes networking mode: '
-                             f'{mode}. The mode must be either '
-                             f'\'{cls.PORTFORWARD.value}\' or '
-                             f'\'{cls.NODEPORT.value}\'. ')
-

 class KubernetesServiceType(enum.Enum):
     """Enum for the different types of services."""

sky/utils/schemas.py CHANGED
@@ -1071,6 +1071,7 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
 }

 _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
+    # TODO(kevin): Remove 'networking' in v0.13.0.
     'networking': {
         'type': 'string',
         'case_insensitive_enum': [
@@ -1331,10 +1332,15 @@ def get_config_schema():
             'additionalProperties': False,
             'properties': {
                 'allowed_contexts': {
-                    '
-
+                    'oneOf': [{
+                        'type': 'array',
+                        'items': {
+                            'type': 'string',
+                        },
+                    }, {
                         'type': 'string',
-
+                        'pattern': '^all$'
+                    }]
                 },
                 'context_configs': {
                     'type': 'object',
@@ -1656,10 +1662,15 @@
             'required': [],
             'properties': {
                 'allowed_contexts': {
-                    '
-
+                    'oneOf': [{
+                        'type': 'array',
+                        'items': {
+                            'type': 'string',
+                        },
+                    }, {
                         'type': 'string',
-
+                        'pattern': '^all$'
+                    }]
                 },
                 'disabled': {
                     'type': 'boolean'
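The oneOf above lets allowed_contexts be either a list of context names or the literal string 'all'. A quick exercise of that fragment with the jsonschema package; SkyPilot wires this schema into its own config validator, so the standalone check here is only for illustration:

# Sketch: exercising the new allowed_contexts schema fragment from above.
import jsonschema

ALLOWED_CONTEXTS_SCHEMA = {
    'oneOf': [{
        'type': 'array',
        'items': {
            'type': 'string',
        },
    }, {
        'type': 'string',
        'pattern': '^all$'
    }]
}

jsonschema.validate(['ctx-a', 'ctx-b'], ALLOWED_CONTEXTS_SCHEMA)  # accepted
jsonschema.validate('all', ALLOWED_CONTEXTS_SCHEMA)               # accepted
try:
    jsonschema.validate('some-context', ALLOWED_CONTEXTS_SCHEMA)
except jsonschema.ValidationError:
    print('bare strings other than "all" are rejected')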