skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Utilities for handling interactive SSH authentication."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import fcntl
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
import termios
|
|
8
|
+
import tty
|
|
9
|
+
import typing
|
|
10
|
+
|
|
11
|
+
from sky import sky_logging
|
|
12
|
+
from sky.adaptors import common as adaptors_common
|
|
13
|
+
from sky.client import service_account_auth
|
|
14
|
+
from sky.server import common as server_common
|
|
15
|
+
from sky.utils import rich_utils
|
|
16
|
+
|
|
17
|
+
if typing.TYPE_CHECKING:
|
|
18
|
+
import websockets
|
|
19
|
+
else:
|
|
20
|
+
websockets = adaptors_common.LazyImport('websockets')
|
|
21
|
+
|
|
22
|
+
logger = sky_logging.init_logger(__name__)
|
|
23
|
+
|
|
24
|
+
SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# TODO(kevin): Refactor to share code with websocket_proxy.py.
|
|
28
|
+
async def _handle_interactive_auth_websocket(session_id: str) -> None:
    """Handle interactive SSH authentication via websocket.

    This establishes a websocket connection to the API server and bridges
    the user's terminal I/O bidirectionally with the PTY on the server,
    allowing interactive authentication (e.g., 2FA).

    The local terminal is switched to raw mode for the duration of the
    session (only when stdin is a tty) and restored in the ``finally``
    clause, whether the session ends normally or with an error.

    Args:
        session_id: The session identifier from the <sky-interactive> signal.

    Raises:
        Exception: Any error raised while connecting or bridging is logged
            and re-raised unchanged.
    """
    # Local import: only needed to escape the session id for the query string.
    import urllib.parse  # pylint: disable=import-outside-toplevel

    # Get HTTP server URL and convert to websocket URL.
    server_url = server_common.get_server_url()
    # maxsplit=1: only the scheme separator matters; a later '://' in the URL
    # must not break the two-name unpacking.
    server_proto, server_fqdn = server_url.split('://', 1)
    websocket_proto = 'wss' if server_proto == 'https' else 'ws'
    # The marker regex accepts any character except '"', so percent-encode the
    # id to keep characters such as '&', '+', '#' or spaces from being
    # misinterpreted as query-string syntax.
    quoted_session_id = urllib.parse.quote(session_id, safe='')
    ws_url = (f'{websocket_proto}://{server_fqdn}'
              f'/ssh-interactive-auth?session_id={quoted_session_id}')

    logger.info('Starting interactive SSH authentication...')

    headers = {}
    # Add service account auth if available
    headers.update(service_account_auth.get_service_account_headers())
    # Add cookie auth with URL-aware filtering
    headers.update(server_common.get_cookie_header_for_url(ws_url))

    # Set terminal to raw mode if stdin is a tty
    old_settings = None
    if os.isatty(sys.stdin.fileno()):
        old_settings = termios.tcgetattr(sys.stdin.fileno())
        tty.setraw(sys.stdin.fileno())

    stdin_dup_fd = None
    stdout_dup_fd = None
    try:
        # Duplicate stdin/stdout fds before passing to asyncio.
        # When asyncio's loop.connect_read/write_pipe() is called,
        # it creates a transport that takes ownership of the file passed to it.
        # By duplicating the fds, we give asyncio independent copies that it can
        # safely close, while preserving the original sys.stdin/stdout.
        stdin_dup_fd = os.dup(sys.stdin.fileno())
        stdout_dup_fd = os.dup(sys.stdout.fileno())

        async with websockets.connect(ws_url,
                                      additional_headers=headers,
                                      ping_interval=None) as ws:
            loop = asyncio.get_running_loop()

            stdin_reader = asyncio.StreamReader()
            stdin_protocol = asyncio.StreamReaderProtocol(stdin_reader)
            stdin_dup_file = os.fdopen(stdin_dup_fd, 'rb', buffering=0)
            stdin_dup_fd = None  # File object now owns the FD
            await loop.connect_read_pipe(lambda: stdin_protocol, stdin_dup_file)

            stdout_dup_file = os.fdopen(stdout_dup_fd, 'wb', buffering=0)
            stdout_dup_fd = None  # File object now owns the FD
            stdout_transport, stdout_protocol = await loop.connect_write_pipe(
                asyncio.streams.FlowControlMixin,
                stdout_dup_file)  # type: ignore
            stdout_writer = asyncio.StreamWriter(stdout_transport,
                                                 stdout_protocol, None, loop)

            async def stdin_to_websocket():
                """Forward stdin to websocket."""
                try:
                    while True:
                        data = await stdin_reader.read(4096)
                        if not data:
                            # EOF on stdin: nothing more to forward.
                            break
                        await ws.send(data)
                except asyncio.CancelledError:
                    # Task was cancelled - auth complete
                    pass
                except Exception as e:  # pylint: disable=broad-except
                    logger.debug(f'Error in stdin_to_websocket: {e}')

            async def websocket_to_stdout():
                """Forward websocket to stdout."""
                try:
                    async for message in ws:
                        stdout_writer.write(message)
                        await stdout_writer.drain()
                except Exception as e:  # pylint: disable=broad-except
                    logger.debug(f'Error in websocket_to_stdout: {e}')

            # Run both directions concurrently
            # Use tasks so we can cancel stdin reader when websocket closes
            stdin_task = asyncio.create_task(stdin_to_websocket())
            stdout_task = asyncio.create_task(websocket_to_stdout())

            # Wait for websocket to close (auth complete)
            await stdout_task
            # Cancel stdin reader so it doesn't consume the next keystroke
            stdin_task.cancel()
            try:
                await stdin_task
            except asyncio.CancelledError:
                pass
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to handle interactive authentication: {e}')
        raise
    finally:
        # Restore terminal settings if they were changed
        if old_settings is not None:
            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
                              old_settings)
            # Flush any buffered input from stdin
            termios.tcflush(sys.stdin.fileno(), termios.TCIFLUSH)
            # Ensure stdout is in blocking mode (can be non-blocking after
            # asyncio transport operations)
            flags = fcntl.fcntl(sys.stdout.fileno(), fcntl.F_GETFL)
            fcntl.fcntl(sys.stdout.fileno(), fcntl.F_SETFL,
                        flags & ~os.O_NONBLOCK)

        # Only fds that were never wrapped by os.fdopen() are still ours to
        # close; the wrapped ones were reset to None above.
        for fd in [stdin_dup_fd, stdout_dup_fd]:
            if fd is not None:
                try:
                    os.close(fd)
                except OSError:
                    # Already closed by asyncio or never opened
                    pass
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def handle_interactive_auth(line: str) -> typing.Optional[str]:
    """Run interactive SSH authentication if `line` carries a marker (sync).

    Args:
        line: The log line to check for a <sky-interactive> marker.

    Returns:
        `line` unchanged when it contains no marker; None when a marker was
        found, meaning the line was consumed by the interactive auth
        handshake.
    """
    found = SKY_INTERACTIVE_PATTERN.search(line)
    if found is None:
        return line

    # Temporarily stop any spinners to allow terminal I/O
    with rich_utils.safe_logger():
        asyncio.run(_handle_interactive_auth_websocket(found.group(1)))
    return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
async def handle_interactive_auth_async(line: str) -> typing.Optional[str]:
    """Run interactive SSH authentication if `line` carries a marker (async).

    Args:
        line: The log line to check for a <sky-interactive> marker.

    Returns:
        `line` unchanged when it contains no marker; None when a marker was
        found, meaning the line was consumed by the interactive auth
        handshake.
    """
    found = SKY_INTERACTIVE_PATTERN.search(line)
    if found is None:
        return line

    with rich_utils.safe_logger():
        await _handle_interactive_auth_websocket(found.group(1))
    return None
|
sky/client/sdk.py
CHANGED
|
@@ -30,6 +30,7 @@ from sky import sky_logging
|
|
|
30
30
|
from sky import skypilot_config
|
|
31
31
|
from sky.adaptors import common as adaptors_common
|
|
32
32
|
from sky.client import common as client_common
|
|
33
|
+
from sky.client import interactive_utils
|
|
33
34
|
from sky.client import oauth as oauth_lib
|
|
34
35
|
from sky.jobs import scheduler
|
|
35
36
|
from sky.jobs import utils as managed_job_utils
|
|
@@ -42,6 +43,7 @@ from sky.server.requests import request_names
|
|
|
42
43
|
from sky.server.requests import requests as requests_lib
|
|
43
44
|
from sky.skylet import autostop_lib
|
|
44
45
|
from sky.skylet import constants
|
|
46
|
+
from sky.ssh_node_pools import utils as ssh_utils
|
|
45
47
|
from sky.usage import usage_lib
|
|
46
48
|
from sky.utils import admin_policy_utils
|
|
47
49
|
from sky.utils import annotations
|
|
@@ -57,7 +59,6 @@ from sky.utils import status_lib
|
|
|
57
59
|
from sky.utils import subprocess_utils
|
|
58
60
|
from sky.utils import ux_utils
|
|
59
61
|
from sky.utils import yaml_utils
|
|
60
|
-
from sky.utils.kubernetes import ssh_utils
|
|
61
62
|
|
|
62
63
|
if typing.TYPE_CHECKING:
|
|
63
64
|
import base64
|
|
@@ -157,9 +158,16 @@ def stream_response(request_id: Optional[server_common.RequestId[T]],
|
|
|
157
158
|
retry_context = rest.get_retry_context()
|
|
158
159
|
try:
|
|
159
160
|
line_count = 0
|
|
161
|
+
|
|
160
162
|
for line in rich_utils.decode_rich_status(response):
|
|
161
163
|
if line is not None:
|
|
162
164
|
line_count += 1
|
|
165
|
+
|
|
166
|
+
line = interactive_utils.handle_interactive_auth(line)
|
|
167
|
+
if line is None:
|
|
168
|
+
# Line was consumed by interactive auth handler
|
|
169
|
+
continue
|
|
170
|
+
|
|
163
171
|
if retry_context is None:
|
|
164
172
|
print(line, flush=True, end='', file=output_stream)
|
|
165
173
|
elif line_count > retry_context.line_processed:
|
|
@@ -675,7 +683,7 @@ def _launch(
|
|
|
675
683
|
clusters = get(status_request_id)
|
|
676
684
|
cluster_user_hash = common_utils.get_user_hash()
|
|
677
685
|
cluster_user_hash_str = ''
|
|
678
|
-
current_user = common_utils.
|
|
686
|
+
current_user = common_utils.get_local_user_name()
|
|
679
687
|
cluster_user_name = current_user
|
|
680
688
|
if not clusters:
|
|
681
689
|
# Show the optimize log before the prompt if the cluster does not
|
|
@@ -2744,3 +2752,57 @@ def api_logout() -> None:
|
|
|
2744
2752
|
_clear_api_server_config()
|
|
2745
2753
|
logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
|
|
2746
2754
|
f'{colorama.Style.RESET_ALL}')
|
|
2755
|
+
|
|
2756
|
+
|
|
2757
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(24)
@annotations.client_api
def realtime_slurm_gpu_availability(
        name_filter: Optional[str] = None,
        quantity_filter: Optional[int] = None) -> server_common.RequestId:
    """Requests the real-time Slurm GPU availability from the API server.

    Args:
        name_filter: Optional name filter for GPUs.
        quantity_filter: Optional quantity filter for GPUs.

    Returns:
        The request ID of the Slurm GPU availability request.
    """
    request_body = payloads.SlurmGpuAvailabilityRequestBody(
        name_filter=name_filter, quantity_filter=quantity_filter)
    # Round-trip through the model's JSON dump so the payload holds only
    # JSON-serializable values.
    payload = json.loads(request_body.model_dump_json())
    response = server_common.make_authenticated_request(
        'POST', '/slurm_gpu_availability', json=payload)
    return server_common.get_request_id(response)
|
|
2783
|
+
|
|
2784
|
+
|
|
2785
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(24)
@annotations.client_api
def slurm_node_info(
        slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
    """Requests the resource information for all nodes in the Slurm cluster.

    Args:
        slurm_cluster_name: Forwarded as-is in the request body; presumably
            selects which Slurm cluster to query (how None is treated is
            decided server-side - TODO confirm).

    Returns:
        The request ID of the Slurm node info request.

    Request Returns:
        List[Dict[str, Any]]: A list of dictionaries, each containing info
            for a single Slurm node (node_name, partition, node_state,
            gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
    """
    request_body = payloads.SlurmNodeInfoRequestBody(
        slurm_cluster_name=slurm_cluster_name)
    payload = json.loads(request_body.model_dump_json())
    # NOTE(review): this is a GET request that carries a JSON body; confirm
    # the server route and any intermediate proxies accept that.
    response = server_common.make_authenticated_request('GET',
                                                        '/slurm_node_info',
                                                        json=payload)
    return server_common.get_request_id(response)
|
sky/client/sdk_async.py
CHANGED
|
@@ -23,6 +23,7 @@ from sky import catalog
|
|
|
23
23
|
from sky import exceptions
|
|
24
24
|
from sky import sky_logging
|
|
25
25
|
from sky.client import common as client_common
|
|
26
|
+
from sky.client import interactive_utils
|
|
26
27
|
from sky.client import sdk
|
|
27
28
|
from sky.schemas.api import responses
|
|
28
29
|
from sky.server import common as server_common
|
|
@@ -167,9 +168,17 @@ async def stream_response_async(request_id: Optional[str],
|
|
|
167
168
|
retry_context = rest.get_retry_context()
|
|
168
169
|
try:
|
|
169
170
|
line_count = 0
|
|
171
|
+
|
|
170
172
|
async for line in rich_utils.decode_rich_status_async(response):
|
|
171
173
|
if line is not None:
|
|
172
174
|
line_count += 1
|
|
175
|
+
|
|
176
|
+
line = await interactive_utils.handle_interactive_auth_async(
|
|
177
|
+
line)
|
|
178
|
+
if line is None:
|
|
179
|
+
# Line was consumed by interactive auth handler
|
|
180
|
+
continue
|
|
181
|
+
|
|
173
182
|
if retry_context is None:
|
|
174
183
|
print(line, flush=True, end='', file=output_stream)
|
|
175
184
|
elif line_count > retry_context.line_processed:
|
sky/clouds/__init__.py
CHANGED
|
@@ -31,6 +31,7 @@ from sky.clouds.runpod import RunPod
|
|
|
31
31
|
from sky.clouds.scp import SCP
|
|
32
32
|
from sky.clouds.seeweb import Seeweb
|
|
33
33
|
from sky.clouds.shadeform import Shadeform
|
|
34
|
+
from sky.clouds.slurm import Slurm
|
|
34
35
|
from sky.clouds.ssh import SSH
|
|
35
36
|
from sky.clouds.vast import Vast
|
|
36
37
|
from sky.clouds.vsphere import Vsphere
|
|
@@ -48,6 +49,7 @@ __all__ = [
|
|
|
48
49
|
'Paperspace',
|
|
49
50
|
'PrimeIntellect',
|
|
50
51
|
'SCP',
|
|
52
|
+
'Slurm',
|
|
51
53
|
'RunPod',
|
|
52
54
|
'Shadeform',
|
|
53
55
|
'Vast',
|
sky/clouds/aws.py
CHANGED
|
@@ -12,6 +12,7 @@ import typing
|
|
|
12
12
|
from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
|
|
13
13
|
Tuple, TypeVar, Union)
|
|
14
14
|
|
|
15
|
+
import colorama
|
|
15
16
|
from typing_extensions import ParamSpec
|
|
16
17
|
|
|
17
18
|
from sky import catalog
|
|
@@ -758,6 +759,36 @@ class AWS(clouds.Cloud):
|
|
|
758
759
|
max_efa_interfaces = 0
|
|
759
760
|
enable_efa = False
|
|
760
761
|
|
|
762
|
+
use_internal_ips = skypilot_config.get_effective_region_config(
|
|
763
|
+
cloud='aws',
|
|
764
|
+
region=region_name,
|
|
765
|
+
keys=('use_internal_ips',),
|
|
766
|
+
default_value=False)
|
|
767
|
+
if max_efa_interfaces > 1 and not use_internal_ips:
|
|
768
|
+
logger.warning(
|
|
769
|
+
f'{colorama.Fore.YELLOW}'
|
|
770
|
+
f'Instance type {resources.instance_type} supports up to '
|
|
771
|
+
f'{max_efa_interfaces} EFA interfaces, but '
|
|
772
|
+
'`use_internal_ips` is not enabled.\nLaunching with the '
|
|
773
|
+
'current configuration will use only 1 EFA interface.\n'
|
|
774
|
+
f'To use all {max_efa_interfaces} EFA interfaces, enable '
|
|
775
|
+
'internal IPs by adding one of the following '
|
|
776
|
+
'configurations to SkyPilot config:\n'
|
|
777
|
+
'Option 1 (with SSM):\n'
|
|
778
|
+
' aws:\n'
|
|
779
|
+
' use_internal_ips: true\n'
|
|
780
|
+
' use_ssm: true\n'
|
|
781
|
+
'Option 2 (with SSH proxy):\n'
|
|
782
|
+
' aws:\n'
|
|
783
|
+
' use_internal_ips: true\n'
|
|
784
|
+
' ssh_proxy_command: ssh -W %h:%p -i <ssh key path> '
|
|
785
|
+
'-o StrictHostKeyChecking=no <user>@<jump server public'
|
|
786
|
+
' ip>\n'
|
|
787
|
+
'Refer to '
|
|
788
|
+
'https://docs.skypilot.co/en/latest/reference/config.html'
|
|
789
|
+
'#aws-use-internal-ips for more details.'
|
|
790
|
+
f'{colorama.Style.RESET_ALL}')
|
|
791
|
+
|
|
761
792
|
docker_run_options = []
|
|
762
793
|
if resources.extract_docker_image() is not None:
|
|
763
794
|
image_id_to_use = None
|
|
@@ -1005,8 +1036,10 @@ class AWS(clouds.Cloud):
|
|
|
1005
1036
|
hints = 'AWS SSO is set.'
|
|
1006
1037
|
if static_credential_exists:
|
|
1007
1038
|
hints += (
|
|
1008
|
-
' To ensure
|
|
1009
|
-
'
|
|
1039
|
+
' To ensure S3 mounting and other features work correctly '
|
|
1040
|
+
'on Kubernetes and other clouds, '
|
|
1041
|
+
'please use SkyPilot with static AWS credentials '
|
|
1042
|
+
'(e.g., ~/.aws/credentials) by unsetting '
|
|
1010
1043
|
'the AWS_PROFILE environment variable.')
|
|
1011
1044
|
else:
|
|
1012
1045
|
hints += single_cloud_hint
|
|
@@ -1081,6 +1114,31 @@ class AWS(clouds.Cloud):
|
|
|
1081
1114
|
return identity_type
|
|
1082
1115
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
|
1083
1116
|
|
|
1117
|
+
@classmethod
def should_use_env_auth_for_s3(cls) -> bool:
    """Tells whether S3 access should rely on environment-based auth.

    With non-static AWS credentials (SSO, IAM role, container role) the
    credentials should not be embedded into the rclone config. Instead,
    env_auth=true should be used so that rclone goes through the AWS SDK
    credential chain, which properly handles temporary credentials and
    IAM roles.

    Returns:
        True if environment-based auth should be used; False for static
        credentials that can be embedded, or when no identity type could
        be determined.
    """
    current_type = cls._current_identity_type()
    if current_type is None:
        return False
    # Identity kinds backed by temporary credentials: these must not be
    # written into config files and rely on the AWS SDK credential chain.
    env_auth_identity_types = {
        AWSIdentityType.SSO,
        AWSIdentityType.IAM_ROLE,
        AWSIdentityType.CONTAINER_ROLE,
    }
    return current_type in env_auth_identity_types
|
|
1141
|
+
|
|
1084
1142
|
@classmethod
|
|
1085
1143
|
@aws_profile_aware_lru_cache(scope='request',
|
|
1086
1144
|
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
sky/clouds/azure.py
CHANGED
|
@@ -97,6 +97,8 @@ class Azure(clouds.Cloud):
|
|
|
97
97
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
|
|
98
98
|
f'High availability controllers are not supported on {cls._REPR}.'
|
|
99
99
|
),
|
|
100
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
101
|
+
(f'Custom network tier is not supported on {cls._REPR}.'),
|
|
100
102
|
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK: (
|
|
101
103
|
f'Customized multiple network interfaces are not supported on {cls._REPR}.'
|
|
102
104
|
),
|
sky/clouds/cloud.py
CHANGED
|
@@ -182,6 +182,13 @@ class Cloud:
|
|
|
182
182
|
"""
|
|
183
183
|
return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE
|
|
184
184
|
|
|
185
|
+
@classmethod
def uses_ray(cls) -> bool:
    """Report whether this cloud uses Ray for distributed execution.

    Returns:
        Always True in this implementation.

    NOTE(review): presumably clouds that do not run Ray override this
    to return False -- confirm against the subclasses.
    """
    return True
|
|
191
|
+
|
|
185
192
|
#### Regions/Zones ####
|
|
186
193
|
|
|
187
194
|
@classmethod
|
sky/clouds/kubernetes.py
CHANGED
|
@@ -766,6 +766,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
766
766
|
'ha_recovery_log_path':
|
|
767
767
|
constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(''),
|
|
768
768
|
'sky_python_cmd': constants.SKY_PYTHON_CMD,
|
|
769
|
+
'sky_unset_pythonpath_and_set_cwd':
|
|
770
|
+
constants.SKY_UNSET_PYTHONPATH_AND_SET_CWD,
|
|
769
771
|
'k8s_high_availability_storage_class_name':
|
|
770
772
|
(k8s_ha_storage_class_name),
|
|
771
773
|
'avoid_label_keys': avoid_label_keys,
|
sky/clouds/runpod.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
|
7
7
|
|
|
8
8
|
from sky import catalog
|
|
9
9
|
from sky import clouds
|
|
10
|
+
from sky.utils import common_utils
|
|
10
11
|
from sky.utils import registry
|
|
11
12
|
from sky.utils import resources_utils
|
|
12
13
|
|
|
@@ -312,18 +313,48 @@ class RunPod(clouds.Cloud):
|
|
|
312
313
|
# If that happens to be set to None, then ValueError is raised.
|
|
313
314
|
return False, dependency_error_msg
|
|
314
315
|
|
|
316
|
+
hint_msg = (
|
|
317
|
+
'Credentials can be set up by running: \n'
|
|
318
|
+
' $ pip install runpod \n'
|
|
319
|
+
' $ runpod config\n'
|
|
320
|
+
' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long
|
|
321
|
+
)
|
|
322
|
+
|
|
315
323
|
valid, error = cls._check_runpod_credentials()
|
|
316
324
|
if not valid:
|
|
317
|
-
return False, (
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
)
|
|
325
|
+
return False, (f'{error} \n {hint_msg}')
|
|
326
|
+
|
|
327
|
+
# Validate credentials by making an actual API call
|
|
328
|
+
valid, error = cls._validate_api_key()
|
|
329
|
+
if not valid:
|
|
330
|
+
return False, (f'{error} \n {hint_msg}')
|
|
324
331
|
|
|
325
332
|
return True, None
|
|
326
333
|
|
|
334
|
+
@classmethod
def _validate_api_key(cls) -> Tuple[bool, Optional[str]]:
    """Check the RunPod API key by issuing a real API request.

    The key is exercised by listing instances through the RunPod
    provisioner utilities.

    Returns:
        (True, None) when the listing call succeeds; otherwise
        (False, message), where the message distinguishes between an
        authentication/permission failure, another RunPod query failure,
        and any other unexpected exception.
    """
    # Imported lazily to avoid circular imports and to make sure runpod is
    # configured before it is used.
    # pylint: disable=import-outside-toplevel
    from sky.provision.runpod import utils as runpod_utils
    try:
        # Listing instances only works with a usable API key.
        runpod_utils.list_instances()
    except Exception as exc:  # pylint: disable=broad-except
        from sky.adaptors import runpod
        details = common_utils.format_exception(exc, use_bracket=True)

        # Anything that is not a RunPod query error is reported as
        # unexpected.
        if not isinstance(exc, runpod.runpod.error.QueryError):
            return False, ('An unexpected error occurred during RunPod API '
                           f'key validation. {details}')

        # Query errors are classified by scanning the lower-cased message
        # for markers that indicate an authentication problem.
        lowered = str(exc).lower()
        auth_markers = ['unauthorized', 'forbidden', '401', '403']
        for marker in auth_markers:
            if marker in lowered:
                return False, (
                    'RunPod API key is invalid or lacks required '
                    f'permissions. {details}')
        return False, (f'Failed to verify RunPod API key. {details}')
    return True, None
|
|
357
|
+
|
|
327
358
|
@classmethod
|
|
328
359
|
def _check_runpod_credentials(cls, profile: str = 'default'):
|
|
329
360
|
"""Checks if the credentials file exists and is valid."""
|