skypilot-nightly 1.0.0.dev20251001__py3-none-any.whl → 1.0.0.dev20251003__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -109
  3. sky/client/cli/command.py +2 -3
  4. sky/client/cli/table_utils.py +222 -1
  5. sky/clouds/cudo.py +1 -1
  6. sky/clouds/kubernetes.py +7 -19
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{webpack-4f0c389a4ce5fd9c.js → webpack-3286453d56f3c0a0.js} +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/config.html +1 -1
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/infra/[context].html +1 -1
  19. sky/dashboard/out/infra.html +1 -1
  20. sky/dashboard/out/jobs/[job].html +1 -1
  21. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  22. sky/dashboard/out/jobs.html +1 -1
  23. sky/dashboard/out/users.html +1 -1
  24. sky/dashboard/out/volumes.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/data/storage_utils.py +9 -0
  29. sky/global_user_state.py +16 -0
  30. sky/jobs/server/core.py +60 -53
  31. sky/jobs/state.py +21 -1
  32. sky/jobs/utils.py +29 -11
  33. sky/provision/kubernetes/config.py +0 -42
  34. sky/provision/kubernetes/instance.py +1 -33
  35. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  36. sky/provision/kubernetes/network_utils.py +0 -21
  37. sky/provision/kubernetes/utils.py +68 -322
  38. sky/schemas/api/responses.py +21 -0
  39. sky/server/requests/serializers/decoders.py +8 -0
  40. sky/server/requests/serializers/encoders.py +6 -0
  41. sky/templates/kubernetes-ray.yml.j2 +4 -13
  42. sky/utils/env_options.py +4 -0
  43. sky/utils/kubernetes_enums.py +2 -15
  44. sky/utils/schemas.py +17 -6
  45. sky/volumes/client/sdk.py +3 -2
  46. sky/volumes/server/core.py +3 -2
  47. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/METADATA +37 -37
  48. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/RECORD +53 -56
  49. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
  50. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
  51. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  52. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  53. sky/volumes/utils.py +0 -224
  54. /sky/dashboard/out/_next/static/{m3YT2i5s6v4SsIdYc8WZa → Haazh5IQz6F8Wyiqxcaj8}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20251001.dist-info → skypilot_nightly-1.0.0.dev20251003.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  """Kubernetes utilities for SkyPilot."""
2
+ import collections
2
3
  import copy
3
4
  import dataclasses
4
5
  import datetime
@@ -14,7 +15,6 @@ import subprocess
14
15
  import time
15
16
  import typing
16
17
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
17
- from urllib.parse import urlparse
18
18
 
19
19
  from sky import clouds
20
20
  from sky import exceptions
@@ -32,7 +32,6 @@ from sky.skylet import constants
32
32
  from sky.utils import annotations
33
33
  from sky.utils import common_utils
34
34
  from sky.utils import config_utils
35
- from sky.utils import directory_utils
36
35
  from sky.utils import env_options
37
36
  from sky.utils import kubernetes_enums
38
37
  from sky.utils import schemas
@@ -1559,23 +1558,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
1559
1558
  return head_service.spec.ports[0].node_port
1560
1559
 
1561
1560
 
1562
- def get_external_ip(network_mode: Optional[
1563
- kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
1564
- if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
1565
- return '127.0.0.1'
1566
- # Return the IP address of the first node with an external IP
1567
- nodes = kubernetes.core_api(context).list_node().items
1568
- for node in nodes:
1569
- if node.status.addresses:
1570
- for address in node.status.addresses:
1571
- if address.type == 'ExternalIP':
1572
- return address.address
1573
- # If no external IP is found, use the API server IP
1574
- api_host = kubernetes.core_api(context).api_client.configuration.host
1575
- parsed_url = urlparse(api_host)
1576
- return parsed_url.hostname
1577
-
1578
-
1579
1561
  def check_credentials(context: Optional[str],
1580
1562
  timeout: int = kubernetes.API_TIMEOUT,
1581
1563
  run_optional_checks: bool = False) -> \
@@ -2287,16 +2269,14 @@ def construct_ssh_jump_command(
2287
2269
 
2288
2270
 
2289
2271
  def get_ssh_proxy_command(
2290
- k8s_ssh_target: str,
2291
- network_mode: kubernetes_enums.KubernetesNetworkingMode,
2272
+ pod_name: str,
2292
2273
  private_key_path: str,
2293
2274
  context: Optional[str],
2294
2275
  namespace: str,
2295
2276
  ) -> str:
2296
2277
  """Generates the SSH proxy command to connect to the pod.
2297
2278
 
2298
- Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
2299
- if the network mode is PORTFORWARD.
2279
+ Uses a direct port-forwarding.
2300
2280
 
2301
2281
  By default, establishing an SSH connection creates a communication
2302
2282
  channel to a remote node by setting up a TCP connection. When a
@@ -2307,17 +2287,8 @@ def get_ssh_proxy_command(
2307
2287
  Pods within a Kubernetes cluster have internal IP addresses that are
2308
2288
  typically not accessible from outside the cluster. Since the default TCP
2309
2289
  connection of SSH won't allow access to these pods, we employ a
2310
- ProxyCommand to establish the required communication channel. We offer this
2311
- in two different networking options: NodePort/port-forward.
2312
-
2313
- With the NodePort networking mode, a NodePort service is launched. This
2314
- service opens an external port on the node which redirects to the desired
2315
- port to a SSH jump pod. When establishing an SSH session in this mode, the
2316
- ProxyCommand makes use of this external port to create a communication
2317
- channel directly to port 22, which is the default port ssh server listens
2318
- on, of the jump pod.
2290
+ ProxyCommand to establish the required communication channel.
2319
2291
 
2320
- With Port-forward mode, instead of directly exposing an external port,
2321
2292
  'kubectl port-forward' sets up a tunnel between a local port
2322
2293
  (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
2323
2294
  connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2328,38 +2299,26 @@ def get_ssh_proxy_command(
2328
2299
  the local machine.
2329
2300
 
2330
2301
  Args:
2331
- k8s_ssh_target: str; The Kubernetes object that will be used as the
2332
- target for SSH. If network_mode is NODEPORT, this is the name of the
2333
- service. If network_mode is PORTFORWARD, this is the pod name.
2334
- network_mode: KubernetesNetworkingMode; networking mode for ssh
2335
- session. It is either 'NODEPORT' or 'PORTFORWARD'
2302
+ pod_name: str; The Kubernetes pod name that will be used as the
2303
+ target for SSH.
2336
2304
  private_key_path: str; Path to the private key to use for SSH.
2337
2305
  This key must be authorized to access the SSH jump pod.
2338
- Required for NODEPORT networking mode.
2339
2306
  namespace: Kubernetes namespace to use.
2340
- Required for NODEPORT networking mode.
2341
2307
  """
2342
- # Fetch IP to connect to for the jump svc
2343
- ssh_jump_ip = get_external_ip(network_mode, context)
2308
+ ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
2344
2309
  assert private_key_path is not None, 'Private key path must be provided'
2345
- if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
2346
- assert namespace is not None, 'Namespace must be provided for NodePort'
2347
- ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
2348
- ssh_jump_proxy_command = construct_ssh_jump_command(
2349
- private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
2350
- else:
2351
- ssh_jump_proxy_command_path = create_proxy_command_script()
2352
- ssh_jump_proxy_command = construct_ssh_jump_command(
2353
- private_key_path,
2354
- ssh_jump_ip,
2355
- ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
2356
- proxy_cmd_path=ssh_jump_proxy_command_path,
2357
- proxy_cmd_target_pod=k8s_ssh_target,
2358
- # We embed both the current context and namespace to the SSH proxy
2359
- # command to make sure SSH still works when the current
2360
- # context/namespace is changed by the user.
2361
- current_kube_context=context,
2362
- current_kube_namespace=namespace)
2310
+ ssh_jump_proxy_command_path = create_proxy_command_script()
2311
+ ssh_jump_proxy_command = construct_ssh_jump_command(
2312
+ private_key_path,
2313
+ ssh_jump_ip,
2314
+ ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
2315
+ proxy_cmd_path=ssh_jump_proxy_command_path,
2316
+ proxy_cmd_target_pod=pod_name,
2317
+ # We embed both the current context and namespace to the SSH proxy
2318
+ # command to make sure SSH still works when the current
2319
+ # context/namespace is changed by the user.
2320
+ current_kube_context=context,
2321
+ current_kube_namespace=namespace)
2363
2322
  return ssh_jump_proxy_command
2364
2323
 
2365
2324
 
@@ -2391,240 +2350,6 @@ def create_proxy_command_script() -> str:
2391
2350
  return PORT_FORWARD_PROXY_CMD_PATH
2392
2351
 
2393
2352
 
2394
- def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
2395
- context: Optional[str],
2396
- service_type: kubernetes_enums.KubernetesServiceType):
2397
- """Sets up Kubernetes service resource to access for SSH jump pod.
2398
-
2399
- This method acts as a necessary complement to be run along with
2400
- setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
2401
-
2402
- Args:
2403
- ssh_jump_name: Name to use for the SSH jump service
2404
- namespace: Namespace to create the SSH jump service in
2405
- service_type: Networking configuration on either to use NodePort
2406
- or ClusterIP service to ssh in
2407
- """
2408
- # Fill in template - ssh_key_secret and ssh_jump_image are not required for
2409
- # the service spec, so we pass in empty strs.
2410
- content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
2411
-
2412
- # Add custom metadata from config
2413
- merge_custom_metadata(content['service_spec']['metadata'], context)
2414
-
2415
- # Create service
2416
- try:
2417
- kubernetes.core_api(context).create_namespaced_service(
2418
- namespace, content['service_spec'])
2419
- except kubernetes.api_exception() as e:
2420
- # SSH Jump Pod service already exists.
2421
- if e.status == 409:
2422
- ssh_jump_service = kubernetes.core_api(
2423
- context).read_namespaced_service(name=ssh_jump_name,
2424
- namespace=namespace)
2425
- curr_svc_type = ssh_jump_service.spec.type
2426
- if service_type.value == curr_svc_type:
2427
- # If the currently existing SSH Jump service's type is identical
2428
- # to user's configuration for networking mode
2429
- logger.debug(
2430
- f'SSH Jump Service {ssh_jump_name} already exists in the '
2431
- 'cluster, using it.')
2432
- else:
2433
- # If a different type of service type for SSH Jump pod compared
2434
- # to user's configuration for networking mode exists, we remove
2435
- # existing servie to create a new one following user's config
2436
- kubernetes.core_api(context).delete_namespaced_service(
2437
- name=ssh_jump_name, namespace=namespace)
2438
- kubernetes.core_api(context).create_namespaced_service(
2439
- namespace, content['service_spec'])
2440
- port_forward_mode = (
2441
- kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
2442
- nodeport_mode = (
2443
- kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
2444
- clusterip_svc = (
2445
- kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
2446
- nodeport_svc = (
2447
- kubernetes_enums.KubernetesServiceType.NODEPORT.value)
2448
- curr_network_mode = port_forward_mode \
2449
- if curr_svc_type == clusterip_svc else nodeport_mode
2450
- new_network_mode = nodeport_mode \
2451
- if curr_svc_type == clusterip_svc else port_forward_mode
2452
- new_svc_type = nodeport_svc \
2453
- if curr_svc_type == clusterip_svc else clusterip_svc
2454
- logger.info(
2455
- f'Switching the networking mode from '
2456
- f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
2457
- f'following networking configuration. Deleting existing '
2458
- f'\'{curr_svc_type}\' service and recreating as '
2459
- f'\'{new_svc_type}\' service.')
2460
- else:
2461
- raise
2462
- else:
2463
- logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
2464
-
2465
-
2466
- def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
2467
- ssh_key_secret: str, namespace: str,
2468
- context: Optional[str]):
2469
- """Sets up Kubernetes RBAC and pod for SSH jump host.
2470
-
2471
- Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
2472
- running inside a cluster. This function sets up the resources needed for
2473
- the SSH jump pod. This includes a service account which grants the jump pod
2474
- permission to watch for other SkyPilot pods and terminate itself if there
2475
- are no SkyPilot pods running.
2476
-
2477
- setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
2478
- reachable.
2479
-
2480
- Args:
2481
- ssh_jump_image: Container image to use for the SSH jump pod
2482
- ssh_jump_name: Name to use for the SSH jump pod
2483
- ssh_key_secret: Secret name for the SSH key stored in the cluster
2484
- namespace: Namespace to create the SSH jump pod in
2485
- """
2486
- # Fill in template - service is created separately so service_type is not
2487
- # required, so we pass in empty str.
2488
- content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
2489
- ssh_jump_name, '')
2490
-
2491
- # Add custom metadata to all objects
2492
- for object_type in content.keys():
2493
- merge_custom_metadata(content[object_type]['metadata'], context)
2494
-
2495
- # ServiceAccount
2496
- try:
2497
- kubernetes.core_api(context).create_namespaced_service_account(
2498
- namespace, content['service_account'])
2499
- except kubernetes.api_exception() as e:
2500
- if e.status == 409:
2501
- logger.info(
2502
- 'SSH Jump ServiceAccount already exists in the cluster, using '
2503
- 'it.')
2504
- else:
2505
- raise
2506
- else:
2507
- logger.info('Created SSH Jump ServiceAccount.')
2508
- # Role
2509
- try:
2510
- kubernetes.auth_api(context).create_namespaced_role(
2511
- namespace, content['role'])
2512
- except kubernetes.api_exception() as e:
2513
- if e.status == 409:
2514
- logger.info(
2515
- 'SSH Jump Role already exists in the cluster, using it.')
2516
- else:
2517
- raise
2518
- else:
2519
- logger.info('Created SSH Jump Role.')
2520
- # RoleBinding
2521
- try:
2522
- kubernetes.auth_api(context).create_namespaced_role_binding(
2523
- namespace, content['role_binding'])
2524
- except kubernetes.api_exception() as e:
2525
- if e.status == 409:
2526
- logger.info(
2527
- 'SSH Jump RoleBinding already exists in the cluster, using '
2528
- 'it.')
2529
- else:
2530
- raise
2531
- else:
2532
- logger.info('Created SSH Jump RoleBinding.')
2533
- # Pod
2534
- try:
2535
- kubernetes.core_api(context).create_namespaced_pod(
2536
- namespace, content['pod_spec'])
2537
- except kubernetes.api_exception() as e:
2538
- if e.status == 409:
2539
- logger.info(
2540
- f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
2541
- 'using it.')
2542
- else:
2543
- raise
2544
- else:
2545
- logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
2546
-
2547
-
2548
- def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
2549
- node_id: str):
2550
- """Analyzes SSH jump pod and removes if it is in a bad state
2551
-
2552
- Prevents the existence of a dangling SSH jump pod. This could happen
2553
- in case the pod main container did not start properly (or failed). In that
2554
- case, jump pod lifecycle manager will not function properly to
2555
- remove the pod and service automatically, and must be done manually.
2556
-
2557
- Args:
2558
- namespace: Namespace to remove the SSH jump pod and service from
2559
- node_id: Name of head pod
2560
- """
2561
-
2562
- def find(l, predicate):
2563
- """Utility function to find element in given list"""
2564
- results = [x for x in l if predicate(x)]
2565
- return results[0] if results else None
2566
-
2567
- # Get the SSH jump pod name from the head pod
2568
- try:
2569
- pod = kubernetes.core_api(context).read_namespaced_pod(
2570
- node_id, namespace)
2571
- except kubernetes.api_exception() as e:
2572
- if e.status == 404:
2573
- logger.warning(f'Failed to get pod {node_id},'
2574
- ' but the pod was not found (404).')
2575
- raise
2576
- else:
2577
- ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
2578
- try:
2579
- ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
2580
- ssh_jump_name, namespace)
2581
- cont_ready_cond = find(ssh_jump_pod.status.conditions,
2582
- lambda c: c.type == 'ContainersReady')
2583
- if (cont_ready_cond and cont_ready_cond.status
2584
- == 'False') or ssh_jump_pod.status.phase == 'Pending':
2585
- # Either the main container is not ready or the pod failed
2586
- # to schedule. To be on the safe side and prevent a dangling
2587
- # ssh jump pod, lets remove it and the service. Otherwise, main
2588
- # container is ready and its lifecycle management script takes
2589
- # care of the cleaning.
2590
- kubernetes.core_api(context).delete_namespaced_pod(
2591
- ssh_jump_name, namespace)
2592
- kubernetes.core_api(context).delete_namespaced_service(
2593
- ssh_jump_name, namespace)
2594
- except kubernetes.api_exception() as e:
2595
- # We keep the warning in debug to avoid polluting the `sky launch`
2596
- # output.
2597
- logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
2598
- f' but got error {e}\n. Consider running `kubectl '
2599
- f'delete pod {ssh_jump_name} -n {namespace}` to manually '
2600
- 'remove the pod if it has crashed.')
2601
- # We encountered an issue while checking ssh jump pod. To be on
2602
- # the safe side, lets remove its service so the port is freed
2603
- try:
2604
- kubernetes.core_api(context).delete_namespaced_service(
2605
- ssh_jump_name, namespace)
2606
- except kubernetes.api_exception():
2607
- pass
2608
-
2609
-
2610
- def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
2611
- ssh_jump_name: str, service_type: str) -> Dict:
2612
- template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
2613
- 'kubernetes-ssh-jump.yml.j2')
2614
- if not os.path.exists(template_path):
2615
- raise FileNotFoundError(
2616
- 'Template "kubernetes-ssh-jump.j2" does not exist.')
2617
- with open(template_path, 'r', encoding='utf-8') as fin:
2618
- template = fin.read()
2619
- j2_template = jinja2.Template(template)
2620
- cont = j2_template.render(name=ssh_jump_name,
2621
- image=ssh_jump_image,
2622
- secret=ssh_key_secret,
2623
- service_type=service_type)
2624
- content = yaml_utils.safe_load(cont)
2625
- return content
2626
-
2627
-
2628
2353
  def check_port_forward_mode_dependencies(
2629
2354
  raise_error: bool = True) -> Optional[List[str]]:
2630
2355
  """Checks if 'socat' and 'nc' are installed
@@ -3117,14 +2842,6 @@ def get_kubernetes_node_info(
3117
2842
  information.
3118
2843
  """
3119
2844
  nodes = get_kubernetes_nodes(context=context)
3120
- # Get the pods to get the real-time resource usage
3121
- try:
3122
- pods = get_all_pods_in_kubernetes_cluster(context=context)
3123
- except kubernetes.api_exception() as e:
3124
- if e.status == 403:
3125
- pods = None
3126
- else:
3127
- raise
3128
2845
 
3129
2846
  lf, _ = detect_gpu_label_formatter(context)
3130
2847
  if not lf:
@@ -3132,6 +2849,46 @@ def get_kubernetes_node_info(
3132
2849
  else:
3133
2850
  label_keys = lf.get_label_keys()
3134
2851
 
2852
+ # Check if all nodes have no accelerators to avoid fetching pods
2853
+ any_node_has_accelerators = False
2854
+ for node in nodes:
2855
+ accelerator_count = get_node_accelerator_count(context,
2856
+ node.status.allocatable)
2857
+ if accelerator_count > 0:
2858
+ any_node_has_accelerators = True
2859
+ break
2860
+
2861
+ # Get the pods to get the real-time resource usage
2862
+ pods = None
2863
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
2864
+ if any_node_has_accelerators:
2865
+ try:
2866
+ pods = get_all_pods_in_kubernetes_cluster(context=context)
2867
+ # Pre-compute allocated accelerator count per node
2868
+ for pod in pods:
2869
+ if pod.status.phase in ['Running', 'Pending']:
2870
+ # Skip pods that should not count against GPU count
2871
+ if should_exclude_pod_from_gpu_allocation(pod):
2872
+ logger.debug(f'Excluding low priority pod '
2873
+ f'{pod.metadata.name} from GPU allocation '
2874
+ f'calculations')
2875
+ continue
2876
+ # Iterate over all the containers in the pod and sum the
2877
+ # GPU requests
2878
+ pod_allocated_qty = 0
2879
+ for container in pod.spec.containers:
2880
+ if container.resources.requests:
2881
+ pod_allocated_qty += get_node_accelerator_count(
2882
+ context, container.resources.requests)
2883
+ if pod_allocated_qty > 0:
2884
+ allocated_qty_by_node[
2885
+ pod.spec.node_name] += pod_allocated_qty
2886
+ except kubernetes.api_exception() as e:
2887
+ if e.status == 403:
2888
+ pass
2889
+ else:
2890
+ raise
2891
+
3135
2892
  node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
3136
2893
  has_multi_host_tpu = False
3137
2894
 
@@ -3161,32 +2918,21 @@ def get_kubernetes_node_info(
3161
2918
  node_ip = address.address
3162
2919
  break
3163
2920
 
3164
- allocated_qty = 0
3165
2921
  accelerator_count = get_node_accelerator_count(context,
3166
2922
  node.status.allocatable)
2923
+ if accelerator_count == 0:
2924
+ node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
2925
+ name=node.metadata.name,
2926
+ accelerator_type=accelerator_name,
2927
+ total={'accelerator_count': 0},
2928
+ free={'accelerators_available': 0},
2929
+ ip_address=node_ip)
2930
+ continue
3167
2931
 
3168
2932
  if pods is None:
3169
2933
  accelerators_available = -1
3170
-
3171
2934
  else:
3172
- for pod in pods:
3173
- # Get all the pods running on the node
3174
- if (pod.spec.node_name == node.metadata.name and
3175
- pod.status.phase in ['Running', 'Pending']):
3176
- # Skip pods that should not count against GPU count
3177
- if should_exclude_pod_from_gpu_allocation(pod):
3178
- logger.debug(
3179
- f'Excluding low priority pod '
3180
- f'{pod.metadata.name} from GPU allocation '
3181
- f'calculations on node {node.metadata.name}')
3182
- continue
3183
- # Iterate over all the containers in the pod and sum the
3184
- # GPU requests
3185
- for container in pod.spec.containers:
3186
- if container.resources.requests:
3187
- allocated_qty += get_node_accelerator_count(
3188
- context, container.resources.requests)
3189
-
2935
+ allocated_qty = allocated_qty_by_node[node.metadata.name]
3190
2936
  accelerators_available = accelerator_count - allocated_qty
3191
2937
 
3192
2938
  # Exclude multi-host TPUs from being processed.
@@ -198,3 +198,24 @@ class ManagedJobRecord(ResponseBaseModel):
198
198
  current_cluster_name: Optional[str] = None
199
199
  job_id_on_pool_cluster: Optional[int] = None
200
200
  accelerators: Optional[Dict[str, int]] = None
201
+
202
+
203
+ class VolumeRecord(ResponseBaseModel):
204
+ """A single volume record."""
205
+ name: str
206
+ type: str
207
+ launched_at: int
208
+ cloud: str
209
+ region: str
210
+ zone: Optional[str] = None
211
+ size: str
212
+ config: Dict[str, Any]
213
+ name_on_cloud: str
214
+ user_hash: str
215
+ user_name: str
216
+ workspace: str
217
+ last_attached_at: Optional[int] = None
218
+ last_use: Optional[str] = None
219
+ status: Optional[str] = None
220
+ usedby_pods: List[str]
221
+ usedby_clusters: List[str]
@@ -195,6 +195,14 @@ def decode_storage_ls(
195
195
  ]
196
196
 
197
197
 
198
+ @register_decoders('volume_list')
199
+ def decode_volume_list(
200
+ return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
201
+ return [
202
+ responses.VolumeRecord(**volume_info) for volume_info in return_value
203
+ ]
204
+
205
+
198
206
  @register_decoders('job_status')
199
207
  def decode_job_status(
200
208
  return_value: Dict[str, Optional[str]]
@@ -211,6 +211,12 @@ def encode_storage_ls(
211
211
  return [storage_info.model_dump() for storage_info in return_value]
212
212
 
213
213
 
214
+ @register_encoder('volume_list')
215
+ def encode_volume_list(
216
+ return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
217
+ return [volume_info.model_dump() for volume_info in return_value]
218
+
219
+
214
220
  @register_encoder('job_status')
215
221
  def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
216
222
  for job_id in return_value.keys():
@@ -33,14 +33,11 @@ provider:
33
33
  networking_mode: {{k8s_networking_mode}}
34
34
 
35
35
  # We use internal IPs since we set up a port-forward between the kubernetes
36
- # cluster and the local machine, or directly use NodePort to reach the
37
- # head node.
36
+ # cluster and the local machine.
38
37
  use_internal_ips: true
39
38
 
40
39
  timeout: {{timeout}}
41
40
 
42
- ssh_jump_image: {{k8s_ssh_jump_image}}
43
-
44
41
  # Namespace used to host SkyPilot system components, such as fuse device
45
42
  # manager.
46
43
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -276,8 +273,6 @@ available_node_types:
276
273
  parent: skypilot
277
274
  # component will be set for the head node pod to be the same as the head node service selector above if a
278
275
  skypilot-cluster: {{cluster_name_on_cloud}}
279
- # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
280
- skypilot-ssh-jump: {{k8s_ssh_jump_name}}
281
276
  skypilot-user: {{ user }}
282
277
  # Custom tags for the pods
283
278
  {%- for label_key, label_value in labels.items() %}
@@ -444,9 +439,6 @@ available_node_types:
444
439
  # object store. If you do not provide this, Ray will fall back to
445
440
  # /tmp which cause slowdowns if is not a shared memory volume.
446
441
  volumes:
447
- - name: secret-volume
448
- secret:
449
- secretName: {{k8s_ssh_key_secret_name}}
450
442
  - name: dshm
451
443
  emptyDir:
452
444
  medium: Memory
@@ -869,7 +861,9 @@ available_node_types:
869
861
  $(prefix_cmd) mkdir -p ~/.ssh;
870
862
  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
871
863
  $(prefix_cmd) chmod 700 ~/.ssh;
872
- $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
864
+ $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
865
+ skypilot:ssh_public_key_content
866
+ SKYPILOT_SSH_KEY_EOF
873
867
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
874
868
  $(prefix_cmd) service ssh restart;
875
869
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
@@ -1105,9 +1099,6 @@ available_node_types:
1105
1099
  # object store. If you do not provide this, Ray will fall back to
1106
1100
  # /tmp which cause slowdowns if is not a shared memory volume.
1107
1101
  volumeMounts:
1108
- - name: secret-volume
1109
- readOnly: true
1110
- mountPath: "/etc/secret-volume"
1111
1102
  - mountPath: /dev/shm
1112
1103
  name: dshm
1113
1104
  {% if k8s_enable_gpudirect_tcpx %}
sky/utils/env_options.py CHANGED
@@ -27,6 +27,10 @@ class Options(enum.Enum):
27
27
  # Internal: This is used for testing to enable grpc for communication
28
28
  # between the API server and the Skylet.
29
29
  ENABLE_GRPC = ('SKYPILOT_ENABLE_GRPC', False)
30
+ # Allow all contexts for Kubernetes if allowed_contexts is not set in
31
+ # config.
32
+ ALLOW_ALL_KUBERNETES_CONTEXTS = ('SKYPILOT_ALLOW_ALL_KUBERNETES_CONTEXTS',
33
+ False)
30
34
 
31
35
  def __init__(self, env_var: str, default: bool) -> None:
32
36
  super().__init__()
@@ -2,26 +2,13 @@
2
2
  import enum
3
3
 
4
4
 
5
+ # TODO(kevin): Remove this enum in v0.13.0.
5
6
  class KubernetesNetworkingMode(enum.Enum):
6
- """Enum for the different types of networking modes for accessing
7
- jump pods.
7
+ """Enum for the different types of networking modes for accessing pods.
8
8
  """
9
9
  NODEPORT = 'nodeport'
10
10
  PORTFORWARD = 'portforward'
11
11
 
12
- @classmethod
13
- def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
14
- """Returns the enum value for the given string."""
15
- if mode.lower() == cls.NODEPORT.value:
16
- return cls.NODEPORT
17
- elif mode.lower() == cls.PORTFORWARD.value:
18
- return cls.PORTFORWARD
19
- else:
20
- raise ValueError(f'Unsupported kubernetes networking mode: '
21
- f'{mode}. The mode must be either '
22
- f'\'{cls.PORTFORWARD.value}\' or '
23
- f'\'{cls.NODEPORT.value}\'. ')
24
-
25
12
 
26
13
  class KubernetesServiceType(enum.Enum):
27
14
  """Enum for the different types of services."""
sky/utils/schemas.py CHANGED
@@ -1071,6 +1071,7 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
1071
1071
  }
1072
1072
 
1073
1073
  _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1074
+ # TODO(kevin): Remove 'networking' in v0.13.0.
1074
1075
  'networking': {
1075
1076
  'type': 'string',
1076
1077
  'case_insensitive_enum': [
@@ -1331,10 +1332,15 @@ def get_config_schema():
1331
1332
  'additionalProperties': False,
1332
1333
  'properties': {
1333
1334
  'allowed_contexts': {
1334
- 'type': 'array',
1335
- 'items': {
1335
+ 'oneOf': [{
1336
+ 'type': 'array',
1337
+ 'items': {
1338
+ 'type': 'string',
1339
+ },
1340
+ }, {
1336
1341
  'type': 'string',
1337
- },
1342
+ 'pattern': '^all$'
1343
+ }]
1338
1344
  },
1339
1345
  'context_configs': {
1340
1346
  'type': 'object',
@@ -1656,10 +1662,15 @@ def get_config_schema():
1656
1662
  'required': [],
1657
1663
  'properties': {
1658
1664
  'allowed_contexts': {
1659
- 'type': 'array',
1660
- 'items': {
1665
+ 'oneOf': [{
1666
+ 'type': 'array',
1667
+ 'items': {
1668
+ 'type': 'string',
1669
+ },
1670
+ }, {
1661
1671
  'type': 'string',
1662
- },
1672
+ 'pattern': '^all$'
1673
+ }]
1663
1674
  },
1664
1675
  'disabled': {
1665
1676
  'type': 'boolean'