skypilot-nightly 1.0.0.dev20251002__py3-none-any.whl → 1.0.0.dev20251004__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -109
  3. sky/backends/cloud_vm_ray_backend.py +42 -27
  4. sky/client/cli/command.py +1 -11
  5. sky/clouds/cudo.py +1 -1
  6. sky/clouds/kubernetes.py +7 -19
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{webpack-7340bc0f0dd8ae74.js → webpack-3286453d56f3c0a0.js} +1 -1
  13. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  14. sky/dashboard/out/clusters/[cluster].html +1 -1
  15. sky/dashboard/out/clusters.html +1 -1
  16. sky/dashboard/out/config.html +1 -1
  17. sky/dashboard/out/index.html +1 -1
  18. sky/dashboard/out/infra/[context].html +1 -1
  19. sky/dashboard/out/infra.html +1 -1
  20. sky/dashboard/out/jobs/[job].html +1 -1
  21. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  22. sky/dashboard/out/jobs.html +1 -1
  23. sky/dashboard/out/users.html +1 -1
  24. sky/dashboard/out/volumes.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/data/storage_utils.py +9 -0
  29. sky/execution.py +24 -2
  30. sky/global_user_state.py +16 -0
  31. sky/jobs/recovery_strategy.py +45 -0
  32. sky/jobs/server/core.py +60 -53
  33. sky/jobs/state.py +21 -1
  34. sky/jobs/utils.py +29 -11
  35. sky/provision/kubernetes/config.py +0 -42
  36. sky/provision/kubernetes/instance.py +1 -33
  37. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  38. sky/provision/kubernetes/network_utils.py +0 -21
  39. sky/provision/kubernetes/utils.py +136 -300
  40. sky/server/auth/loopback.py +38 -0
  41. sky/server/auth/oauth2_proxy.py +6 -0
  42. sky/server/server.py +6 -0
  43. sky/setup_files/dependencies.py +1 -0
  44. sky/templates/kubernetes-ray.yml.j2 +4 -13
  45. sky/utils/context.py +12 -7
  46. sky/utils/env_options.py +4 -0
  47. sky/utils/kubernetes_enums.py +2 -15
  48. sky/utils/schemas.py +17 -6
  49. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/METADATA +38 -37
  50. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/RECORD +55 -56
  51. sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
  52. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
  53. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  54. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  55. /sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -15,7 +15,8 @@ import subprocess
 import time
 import typing
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse
+
+import ijson
 
 from sky import clouds
 from sky import exceptions
@@ -33,7 +34,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -62,6 +62,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 
+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
 
 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -1143,9 +1145,51 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources
 
 
+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+
+
+@dataclasses.dataclass
+class V1Node:
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ]))
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.
 
     If context is None, gets the nodes in the current context.
@@ -1153,15 +1197,71 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()
 
-    nodes = kubernetes.core_api(context).list_node(
-        _request_timeout=kubernetes.API_TIMEOUT).items
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        nodes = [
+            V1Node.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return nodes
 
 
+@dataclasses.dataclass
+class V1PodStatus:
+    phase: str
+
+
+@dataclasses.dataclass
+class V1ResourceRequirements:
+    requests: Optional[Dict[str, str]]
+
+
+@dataclasses.dataclass
+class V1Container:
+    resources: V1ResourceRequirements
+
+
+@dataclasses.dataclass
+class V1PodSpec:
+    containers: List[V1Container]
+    node_name: Optional[str]
+
+
+@dataclasses.dataclass
+class V1Pod:
+    metadata: V1ObjectMeta
+    status: V1PodStatus
+    spec: V1PodSpec
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Pod':
+        """Create V1Pod from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+            namespace=data['metadata'].get('namespace'),
+        ),
+                   status=V1PodStatus(phase=data['status'].get('phase'),),
+                   spec=V1PodSpec(
+                       node_name=data['spec'].get('nodeName'),
+                       containers=[
+                           V1Container(resources=V1ResourceRequirements(
+                               requests=container.get('resources', {}).get(
+                                   'requests') or None))
+                           for container in data['spec'].get('containers', [])
+                       ]))
+
+
 @_retry_on_error(resource_type='pod')
 def get_all_pods_in_kubernetes_cluster(*,
                                        context: Optional[str] = None
-                                      ) -> List[Any]:
+                                      ) -> List[V1Pod]:
     """Gets pods in all namespaces in kubernetes cluster indicated by context.
 
     Used for computing cluster resource usage.
@@ -1169,8 +1269,18 @@ def get_all_pods_in_kubernetes_cluster(*,
     if context is None:
         context = get_current_kube_config_context_name()
 
-    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT).items
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        pods = [
+            V1Pod.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return pods
 
 
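The hunks above swap the Kubernetes client's fully deserialized model objects for lightweight dataclasses plus ijson streaming: the raw HTTP body is parsed incrementally, one list item at a time, instead of being buffered and turned into heavyweight client models all at once. A minimal sketch of the same pattern, assuming only that the source is a file-like object with `read()` (which is what a raw `urllib3.HTTPResponse` obtained via `_preload_content=False` provides); the payload below is fabricated for illustration:

```python
import io
import json

import ijson

# Fake Kubernetes-style list payload standing in for the raw response body.
body = json.dumps({
    'items': [{
        'metadata': {
            'name': f'node-{i}',
            'labels': {}
        }
    } for i in range(3)]
}).encode()

# The prefix 'items.item' yields each element of the top-level "items"
# array without materializing the whole document in memory.
for item in ijson.items(io.BytesIO(body), 'items.item', buf_size=64 * 1024):
    print(item['metadata']['name'])
```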
@@ -1560,23 +1670,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port
 
 
-def get_external_ip(network_mode: Optional[
-    kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -2288,16 +2381,14 @@ def construct_ssh_jump_command(
 
 
 def get_ssh_proxy_command(
-    k8s_ssh_target: str,
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
     context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.
 
-    Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
-    if the network mode is PORTFORWARD.
+    Uses a direct port-forwarding.
 
     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2308,17 +2399,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel. We offer this
-    in two different networking options: NodePort/port-forward.
+    ProxyCommand to establish the required communication channel.
 
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
-
-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2329,38 +2411,26 @@ def get_ssh_proxy_command(
     the local machine.
 
     Args:
-        k8s_ssh_target: str; The Kubernetes object that will be used as the
-            target for SSH. If network_mode is NODEPORT, this is the name of the
-            service. If network_mode is PORTFORWARD, this is the pod name.
-        network_mode: KubernetesNetworkingMode; networking mode for ssh
-            session. It is either 'NODEPORT' or 'PORTFORWARD'
+        pod_name: str; The Kubernetes pod name that will be used as the
+            target for SSH.
         private_key_path: str; Path to the private key to use for SSH.
             This key must be authorized to access the SSH jump pod.
-            Required for NODEPORT networking mode.
         namespace: Kubernetes namespace to use.
-            Required for NODEPORT networking mode.
     """
-    # Fetch IP to connect to for the jump svc
-    ssh_jump_ip = get_external_ip(network_mode, context)
+    ssh_jump_ip = '127.0.0.1'  # Local end of the port-forward tunnel
     assert private_key_path is not None, 'Private key path must be provided'
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        assert namespace is not None, 'Namespace must be provided for NodePort'
-        ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
-    else:
-        ssh_jump_proxy_command_path = create_proxy_command_script()
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path,
-            ssh_jump_ip,
-            ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
-            proxy_cmd_path=ssh_jump_proxy_command_path,
-            proxy_cmd_target_pod=k8s_ssh_target,
-            # We embed both the current context and namespace to the SSH proxy
-            # command to make sure SSH still works when the current
-            # context/namespace is changed by the user.
-            current_kube_context=context,
-            current_kube_namespace=namespace)
+    ssh_jump_proxy_command_path = create_proxy_command_script()
+    ssh_jump_proxy_command = construct_ssh_jump_command(
+        private_key_path,
+        ssh_jump_ip,
+        ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+        proxy_cmd_path=ssh_jump_proxy_command_path,
+        proxy_cmd_target_pod=pod_name,
+        # We embed both the current context and namespace to the SSH proxy
+        # command to make sure SSH still works when the current
+        # context/namespace is changed by the user.
+        current_kube_context=context,
+        current_kube_namespace=namespace)
    return ssh_jump_proxy_command
 
 
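For intuition, the ProxyCommand described in the docstring above conceptually chains 'kubectl port-forward' with 'socat'. The following is a hypothetical, simplified sketch only; SkyPilot's actual logic lives in the generated proxy script returned by create_proxy_command_script(), and the pod name, namespace, and port below are illustrative:

```python
# Hypothetical sketch: build an SSH ProxyCommand string that pipes the SSH
# client's stdio to pod port 22 through a kubectl port-forward tunnel.
def port_forward_proxy_command(pod_name: str,
                               namespace: str,
                               local_port: int = 23100) -> str:
    # kubectl forwards 127.0.0.1:<local_port> -> <pod>:22 in the background;
    # socat then pipes stdin/stdout into the local end of that tunnel.
    forward = (f'kubectl -n {namespace} port-forward pod/{pod_name} '
               f'{local_port}:22 > /dev/null 2>&1 &')
    pipe = f'sleep 1 && socat - tcp:127.0.0.1:{local_port}'
    return f'{forward} {pipe}'


print(port_forward_proxy_command('sky-head-pod', 'default'))
```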
@@ -2392,240 +2462,6 @@ def create_proxy_command_script() -> str:
     return PORT_FORWARD_PROXY_CMD_PATH
 
 
-def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
-                       context: Optional[str],
-                       service_type: kubernetes_enums.KubernetesServiceType):
-    """Sets up Kubernetes service resource to access for SSH jump pod.
-
-    This method acts as a necessary complement to be run along with
-    setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
-    Args:
-        ssh_jump_name: Name to use for the SSH jump service
-        namespace: Namespace to create the SSH jump service in
-        service_type: Networking configuration on either to use NodePort
-            or ClusterIP service to ssh in
-    """
-    # Fill in template - ssh_key_secret and ssh_jump_image are not required for
-    # the service spec, so we pass in empty strs.
-    content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
-    # Add custom metadata from config
-    merge_custom_metadata(content['service_spec']['metadata'], context)
-
-    # Create service
-    try:
-        kubernetes.core_api(context).create_namespaced_service(
-            namespace, content['service_spec'])
-    except kubernetes.api_exception() as e:
-        # SSH Jump Pod service already exists.
-        if e.status == 409:
-            ssh_jump_service = kubernetes.core_api(
-                context).read_namespaced_service(name=ssh_jump_name,
-                                                 namespace=namespace)
-            curr_svc_type = ssh_jump_service.spec.type
-            if service_type.value == curr_svc_type:
-                # If the currently existing SSH Jump service's type is identical
-                # to user's configuration for networking mode
-                logger.debug(
-                    f'SSH Jump Service {ssh_jump_name} already exists in the '
-                    'cluster, using it.')
-            else:
-                # If a different type of service type for SSH Jump pod compared
-                # to user's configuration for networking mode exists, we remove
-                # existing servie to create a new one following user's config
-                kubernetes.core_api(context).delete_namespaced_service(
-                    name=ssh_jump_name, namespace=namespace)
-                kubernetes.core_api(context).create_namespaced_service(
-                    namespace, content['service_spec'])
-                port_forward_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
-                nodeport_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
-                clusterip_svc = (
-                    kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
-                nodeport_svc = (
-                    kubernetes_enums.KubernetesServiceType.NODEPORT.value)
-                curr_network_mode = port_forward_mode \
-                    if curr_svc_type == clusterip_svc else nodeport_mode
-                new_network_mode = nodeport_mode \
-                    if curr_svc_type == clusterip_svc else port_forward_mode
-                new_svc_type = nodeport_svc \
-                    if curr_svc_type == clusterip_svc else clusterip_svc
-                logger.info(
-                    f'Switching the networking mode from '
-                    f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
-                    f'following networking configuration. Deleting existing '
-                    f'\'{curr_svc_type}\' service and recreating as '
-                    f'\'{new_svc_type}\' service.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
-def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
-                       ssh_key_secret: str, namespace: str,
-                       context: Optional[str]):
-    """Sets up Kubernetes RBAC and pod for SSH jump host.
-
-    Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
-    running inside a cluster. This function sets up the resources needed for
-    the SSH jump pod. This includes a service account which grants the jump pod
-    permission to watch for other SkyPilot pods and terminate itself if there
-    are no SkyPilot pods running.
-
-    setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
-    reachable.
-
-    Args:
-        ssh_jump_image: Container image to use for the SSH jump pod
-        ssh_jump_name: Name to use for the SSH jump pod
-        ssh_key_secret: Secret name for the SSH key stored in the cluster
-        namespace: Namespace to create the SSH jump pod in
-    """
-    # Fill in template - service is created separately so service_type is not
-    # required, so we pass in empty str.
-    content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
-                                     ssh_jump_name, '')
-
-    # Add custom metadata to all objects
-    for object_type in content.keys():
-        merge_custom_metadata(content[object_type]['metadata'], context)
-
-    # ServiceAccount
-    try:
-        kubernetes.core_api(context).create_namespaced_service_account(
-            namespace, content['service_account'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump ServiceAccount already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump ServiceAccount.')
-    # Role
-    try:
-        kubernetes.auth_api(context).create_namespaced_role(
-            namespace, content['role'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump Role already exists in the cluster, using it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump Role.')
-    # RoleBinding
-    try:
-        kubernetes.auth_api(context).create_namespaced_role_binding(
-            namespace, content['role_binding'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump RoleBinding already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump RoleBinding.')
-    # Pod
-    try:
-        kubernetes.core_api(context).create_namespaced_pod(
-            namespace, content['pod_spec'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
-                'using it.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes SSH jump pod and removes if it is in a bad state
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    in case the pod main container did not start properly (or failed). In that
-    case, jump pod lifecycle manager will not function properly to
-    remove the pod and service automatically, and must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
sky/server/auth/loopback.py ADDED
@@ -0,0 +1,38 @@
+"""Shared loopback detection utilities for auth middlewares."""
+
+import ipaddress
+
+import fastapi
+
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+COMMON_PROXY_HEADERS = [
+    'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
+    'X-Forwarded-Host', 'X-Forwarded-Proto'
+]
+
+
+def _is_loopback_ip(ip_str: str) -> bool:
+    """Check if an IP address is a loopback address."""
+    try:
+        ip = ipaddress.ip_address(ip_str)
+        return ip.is_loopback
+    except ValueError:
+        return False
+
+
+def is_loopback_request(request: fastapi.Request) -> bool:
+    """Determine if a request is coming from localhost."""
+    if request.client is None:
+        return False
+
+    client_host = request.client.host
+    if client_host == 'localhost' or _is_loopback_ip(client_host):
+        # Additional checks: ensure no forwarding headers are present.
+        # If there are any, assume this traffic went through a proxy.
+        return not any(
+            request.headers.get(header) for header in COMMON_PROXY_HEADERS)
+
+    return False
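The loopback check above leans on the standard library: ipaddress classifies the whole 127.0.0.0/8 block and IPv6 ::1 as loopback, while a hostname like 'localhost' is not an IP literal (it raises ValueError) and is special-cased separately. A quick standalone illustration:

```python
import ipaddress

for candidate in ('127.0.0.1', '127.8.8.8', '::1', '10.0.0.1', 'localhost'):
    try:
        print(candidate, ipaddress.ip_address(candidate).is_loopback)
    except ValueError:
        # Hostnames such as 'localhost' are not IP literals.
        print(candidate, 'not an IP literal')
```

This is also why is_loopback_request() rejects requests carrying common proxy-forwarding headers: proxied traffic terminating on the same host would otherwise appear to originate from loopback.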
sky/server/auth/oauth2_proxy.py CHANGED
@@ -15,7 +15,9 @@ import starlette.middleware.base
 from sky import global_user_state
 from sky import models
 from sky import sky_logging
+from sky.jobs import utils as managed_job_utils
 from sky.server.auth import authn
+from sky.server.auth import loopback
 from sky.users import permission
 from sky.utils import common_utils
 
@@ -108,6 +110,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
             # Already authenticated
             return await call_next(request)
 
+        if managed_job_utils.is_consolidation_mode(
+        ) and loopback.is_loopback_request(request):
+            return await call_next(request)
+
         async with aiohttp.ClientSession() as session:
             try:
                 return await self._authenticate(request, call_next, session)
sky/server/server.py CHANGED
@@ -38,6 +38,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.data import storage_utils
+from sky.jobs import utils as managed_job_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -52,6 +53,7 @@ from sky.server import state
 from sky.server import stream_utils
 from sky.server import versions
 from sky.server.auth import authn
+from sky.server.auth import loopback
 from sky.server.auth import oauth2_proxy
 from sky.server.requests import executor
 from sky.server.requests import payloads
@@ -191,6 +193,10 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle HTTP Basic Auth."""
 
     async def dispatch(self, request: fastapi.Request, call_next):
+        if managed_job_utils.is_consolidation_mode(
+        ) and loopback.is_loopback_request(request):
+            return await call_next(request)
+
         if request.url.path.startswith('/api/health'):
             # Try to set the auth user from basic auth
             _try_set_basic_auth_user(request)
sky/setup_files/dependencies.py CHANGED
@@ -48,6 +48,7 @@ install_requires = [
     # (https://github.com/yaml/pyyaml/issues/601)
     # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
     'pyyaml > 3.13, != 5.4.*',
+    'ijson',
     'requests',
     # SkyPilot inherits from uvicorn.Server to customize the behavior of
     # uvicorn, so we need to pin uvicorn version to avoid potential break
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -33,14 +33,11 @@ provider:
   networking_mode: {{k8s_networking_mode}}
 
   # We use internal IPs since we set up a port-forward between the kubernetes
-  # cluster and the local machine, or directly use NodePort to reach the
-  # head node.
+  # cluster and the local machine.
   use_internal_ips: true
 
   timeout: {{timeout}}
 
-  ssh_jump_image: {{k8s_ssh_jump_image}}
-
   # Namespace used to host SkyPilot system components, such as fuse device
   # manager.
   skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -276,8 +273,6 @@ available_node_types:
         parent: skypilot
         # component will be set for the head node pod to be the same as the head node service selector above if a
         skypilot-cluster: {{cluster_name_on_cloud}}
-        # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
-        skypilot-ssh-jump: {{k8s_ssh_jump_name}}
         skypilot-user: {{ user }}
         # Custom tags for the pods
         {%- for label_key, label_value in labels.items() %}
@@ -444,9 +439,6 @@ available_node_types:
         # object store. If you do not provide this, Ray will fall back to
         # /tmp which cause slowdowns if is not a shared memory volume.
         volumes:
-        - name: secret-volume
-          secret:
-            secretName: {{k8s_ssh_key_secret_name}}
        - name: dshm
          emptyDir:
            medium: Memory
@@ -869,7 +861,9 @@ available_node_types:
            $(prefix_cmd) mkdir -p ~/.ssh;
            $(prefix_cmd) chown -R $(whoami) ~/.ssh;
            $(prefix_cmd) chmod 700 ~/.ssh;
-            $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
+            $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
+            skypilot:ssh_public_key_content
+            SKYPILOT_SSH_KEY_EOF
            $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
            $(prefix_cmd) service ssh restart;
            $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
@@ -1105,9 +1099,6 @@ available_node_types:
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumeMounts:
-        - name: secret-volume
-          readOnly: true
-          mountPath: "/etc/secret-volume"
        - mountPath: /dev/shm
          name: dshm
        {% if k8s_enable_gpudirect_tcpx %}