skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/catalog/slurm_catalog.py
CHANGED
@@ -112,9 +112,6 @@ def list_accelerators_realtime(
     else:
         slurm_cluster = region_filter
 
-    partition_filter = slurm_utils.get_cluster_default_partition(slurm_cluster)
-
-    # Call the helper function to get node info
     slurm_nodes_info = slurm_utils.slurm_node_info(
         slurm_cluster_name=slurm_cluster)
 
@@ -126,8 +123,6 @@ def list_accelerators_realtime(
         filters_applied.append(f'gpu_name={name_filter!r}')
     if quantity_filter:
         filters_applied.append(f'quantity>={quantity_filter}')
-    if region_filter:
-        filters_applied.append(f'cluster={region_filter!r}')
     if filters_applied:
         err_msg += f' with filters ({", ".join(filters_applied)})'
     err_msg += '.'
@@ -214,8 +209,6 @@ def list_accelerators_realtime(
         filters_applied.append(f'gpu_name={name_filter!r}')
     if quantity_filter:
         filters_applied.append(f'quantity>={quantity_filter}')
-    if partition_filter:
-        filters_applied.append(f'partition={partition_filter!r}')
     if filters_applied:
         err_msg += f' with filters ({", ".join(filters_applied)})'
     err_msg += '.'
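The removals above mean the Slurm catalog error message now reports only the GPU name and quantity filters. A minimal standalone sketch of the resulting message assembly (the base message string below is hypothetical; only the filter lines mirror the diff):

def build_err_msg(name_filter=None, quantity_filter=None) -> str:
    # Hypothetical base message; the real prefix lives outside these hunks.
    err_msg = 'No matching GPUs found'
    filters_applied = []
    if name_filter:
        filters_applied.append(f'gpu_name={name_filter!r}')
    if quantity_filter:
        filters_applied.append(f'quantity>={quantity_filter}')
    if filters_applied:
        err_msg += f' with filters ({", ".join(filters_applied)})'
    return err_msg + '.'

print(build_err_msg('H100', 4))
# -> No matching GPUs found with filters (gpu_name='H100', quantity>=4).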
sky/catalog/vast_catalog.py
CHANGED
@@ -7,7 +7,10 @@ query instance types and pricing information for Vast.ai.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
+import pandas as pd
+
 from sky.catalog import common
+from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -16,6 +19,17 @@ if typing.TYPE_CHECKING:
 _df = common.read_catalog('vast/vms.csv')
 
 
+def _apply_datacenter_filter(df: pd.DataFrame,
+                             datacenter_only: bool) -> pd.DataFrame:
+    """Filter dataframe by hosting_type if datacenter_only is True.
+
+    hosting_type: 0 = Consumer hosted, 1 = Datacenter hosted
+    """
+    if not datacenter_only or 'HostingType' not in df.columns:
+        return df
+    return df[df['HostingType'] >= 1]
+
+
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
 
@@ -48,13 +62,16 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
                               region: Optional[str] = None,
-                              zone: Optional[str] = None
+                              zone: Optional[str] = None,
+                              datacenter_only: bool = False) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_cpus_mem_impl(df, cpus, memory, region,
                                                       zone)
 
 
@@ -70,12 +87,19 @@ def get_instance_type_for_accelerator(
     memory: Optional[str] = None,
     use_spot: bool = False,
     region: Optional[str] = None,
-    zone: Optional[str] = None
-) -> Tuple[Optional[List[str]], List[str]]:
+    zone: Optional[str] = None,
+    datacenter_only: bool = False) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types that have the given accelerator.
+
+    Args:
+        datacenter_only: If True, only return instances hosted in datacenters
+            (hosting_type >= 1).
+    """
     if zone is not None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Vast does not support zones.')
-    return common.get_instance_type_for_accelerator_impl(df=_df,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_accelerator_impl(df=df,
                                                          acc_name=acc_name,
                                                          acc_count=acc_count,
                                                          cpus=cpus,
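The new `_apply_datacenter_filter` helper narrows the Vast catalog to datacenter-hosted offers when `datacenter_only=True`. A self-contained sketch of its behavior on a toy frame (the two-row catalog is made up; the helper body matches the diff):

import pandas as pd

def _apply_datacenter_filter(df: pd.DataFrame,
                             datacenter_only: bool) -> pd.DataFrame:
    # HostingType: 0 = consumer hosted, 1 = datacenter hosted.
    if not datacenter_only or 'HostingType' not in df.columns:
        return df
    return df[df['HostingType'] >= 1]

catalog = pd.DataFrame({
    'InstanceType': ['consumer-4090', 'dc-h100'],
    'HostingType': [0, 1],
})
print(_apply_datacenter_filter(catalog, True)['InstanceType'].tolist())
# -> ['dc-h100']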
sky/check.py
CHANGED
@@ -528,8 +528,9 @@ def _print_checked_cloud(
         # `dict` reasons for K8s and SSH will be printed in detail in
         # _format_enabled_cloud. Skip here unless the cloud is disabled.
         if not isinstance(reason, str):
-            if not ok and isinstance(
-                    cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+            if not ok and isinstance(
+                    cloud_tuple[1],
+                    (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
                 if reason is not None:
                     reason_str = _format_context_details(cloud_tuple[1],
                                                          show_details=True,
@@ -555,7 +556,9 @@ def _print_checked_cloud(
         capability_string = f'[{", ".join(enabled_capabilities)}]'
         if verbose and cloud is not cloudflare and cloud is not coreweave:
             activated_account = cloud.get_active_user_identity_str()
-            if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+            if isinstance(
+                    cloud_tuple[1],
+                    (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
                 detail_string = _format_context_details(cloud_tuple[1],
                                                         show_details=True,
                                                         ctx2text=ctx2text)
@@ -653,11 +656,11 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
                 'configuration.'))
         else:
             # Default case - not set up
-            text_suffix = (': ' + _red_color('disabled. ') +
+            text_suffix = (': ' + _red_color('disabled. ') + _dim_color(
+                'Reason: Not set up. Use '
+                '`sky ssh up --infra '
+                f'{common_utils.removeprefix(context, "ssh-")}` '
+                'to set up.'))
         contexts_formatted.append(
             f'\n  {symbol}{cleaned_context}{text_suffix}')
     if isinstance(cloud_type, sky_clouds.SSH):
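The rewritten hint calls `common_utils.removeprefix(context, "ssh-")` rather than `str.lstrip`. The distinction matters because `lstrip` removes a character set, not a prefix, and can eat into the context name; a quick sketch on Python 3.9+ (assuming `common_utils.removeprefix` behaves like `str.removeprefix`):

context = 'ssh-shiny-cluster'
print(context.lstrip('ssh-'))        # 'iny-cluster': strips any of s/h/-
print(context.removeprefix('ssh-'))  # 'shiny-cluster': strips the prefix only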
sky/client/cli/command.py
CHANGED
@@ -216,45 +216,27 @@ def _get_cluster_records_and_set_ssh_config(
                 f'\"{escaped_executable_path} '
                 f'{escaped_websocket_proxy_path} '
                 f'{server_common.get_server_url()} '
-                f'{handle.cluster_name} kubernetes-pod-ssh-proxy\"')
+                f'{handle.cluster_name} '
+                f'kubernetes-pod-ssh-proxy\"')
             credentials['ssh_proxy_command'] = proxy_command
         elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
-            escaped_key_path = shlex.quote(
-                cluster_utils.SSHConfigHelper.generate_local_key_file(
-                    handle.cluster_name, credentials))
-            controller_host = handle.cached_external_ips[0]
-
-            # Build jump proxy: ssh to worker via controller/login node
-            proxy_command = (f'ssh -tt -i {escaped_key_path} '
-                             '-o StrictHostKeyChecking=no '
-                             '-o UserKnownHostsFile=/dev/null '
-                             '-o IdentitiesOnly=yes '
-                             '-W %h:%p '
-                             f'{handle.ssh_user}@{controller_host}')
-            original_proxy = credentials.get('ssh_proxy_command')
-            if original_proxy:
-                proxy_command += (
-                    f' -o ProxyCommand={shlex.quote(original_proxy)}')
+            # Replace the proxy command to proxy through the SkyPilot API
+            # server with websocket.
+            escaped_executable_path = shlex.quote(sys.executable)
+            escaped_websocket_proxy_path = shlex.quote(
+                f'{directory_utils.get_sky_dir()}/templates/websocket_proxy.py')
+            # %w is a placeholder for the node index, substituted per-node
+            # in cluster_utils.SSHConfigHelper.add_cluster().
+            proxy_command = (f'{escaped_executable_path} '
+                             f'{escaped_websocket_proxy_path} '
+                             f'{server_common.get_server_url()} '
+                             f'{handle.cluster_name} '
+                             f'slurm-job-ssh-proxy %w')
             credentials['ssh_proxy_command'] = proxy_command
 
-            # For Slurm, use the worker's internal IP as the SSH target
-            ips = handle.cached_internal_ips
-
             cluster_utils.SSHConfigHelper.add_cluster(
                 handle.cluster_name,
+                handle.cluster_name_on_cloud,
                 ips,
                 credentials,
                 handle.cached_external_ssh_ports,
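Per the new comment, `%w` in the Slurm proxy command is a node-index placeholder substituted per node by `cluster_utils.SSHConfigHelper.add_cluster()`. A toy sketch of that substitution when emitting per-host SSH config entries (host aliases and the server URL are invented for illustration; the real helper writes full `~/.ssh/config` blocks):

proxy_template = ('python websocket_proxy.py https://api.example.com '
                  'my-cluster slurm-job-ssh-proxy %w')
for node_idx, host_alias in enumerate(['my-cluster', 'my-cluster-worker1']):
    print(f'Host {host_alias}')
    print(f'  ProxyCommand {proxy_template.replace("%w", str(node_idx))}')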
@@ -3471,7 +3453,12 @@
             click.echo(f'  {name} ({first})')
 
     if failures:
-        click.echo('Cluster(s) failed. See details above.')
+        failure_str = 'Cluster(s) failed. See details above.'
+        if down:
+            failure_str += (
+                ' If you want to ignore the errors and remove the '
+                'cluster(s) from the status table, use `sky down --purge`.')
+        click.echo(failure_str)
 
 
 @cli.command(cls=_DocumentedCodeCommand)
@@ -3898,8 +3885,10 @@
             contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
             cloud_str: str = 'Kubernetes',
             context_title_str: str = 'CONTEXT') -> str:
-        node_table = log_utils.create_table(
-            [context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
+        node_table = log_utils.create_table([
+            context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
+            'GPU UTILIZATION'
+        ])
 
         no_permissions_str = '<no permissions>'
         hints = []
@@ -3916,6 +3905,44 @@
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
+
+                # Format CPU and memory: "X of Y free" or just "Y" if
+                # free is unknown
+                cpu_str = '-'
+                if node_info.cpu_count is not None:
+                    cpu_total_str = common_utils.format_float(
+                        node_info.cpu_count, precision=0)
+
+                    # Check if we have free CPU info (use hasattr to
+                    # check if field exists, then access directly)
+                    cpu_free = None
+                    if hasattr(node_info, 'cpu_free'):
+                        cpu_free = node_info.cpu_free
+                    if cpu_free is not None:
+                        cpu_free_str = common_utils.format_float(cpu_free,
+                                                                 precision=0)
+                        cpu_str = f'{cpu_free_str} of {cpu_total_str} free'
+                    else:
+                        cpu_str = cpu_total_str
+
+                memory_str = '-'
+                if node_info.memory_gb is not None:
+                    memory_total_str = common_utils.format_float(
+                        node_info.memory_gb, precision=0)
+
+                    # Check if we have free memory info (use hasattr
+                    # to check if field exists, then access directly)
+                    memory_free_gb = None
+                    if hasattr(node_info, 'memory_free_gb'):
+                        memory_free_gb = node_info.memory_free_gb
+                    if memory_free_gb is not None:
+                        memory_free_str = common_utils.format_float(
+                            memory_free_gb, precision=0)
+                        memory_str = (
+                            f'{memory_free_str} of {memory_total_str} free')
+                    else:
+                        memory_str = memory_total_str
+
                 utilization_str = (
                     f'{available} of '
                     f'{node_info.total["accelerator_count"]} free')
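The added block renders capacity as "free of total" when the free amount is known and falls back to the total alone (or '-') otherwise. A compact sketch of that contract, using plain f-strings in place of `common_utils.format_float(..., precision=0)`:

from typing import Optional

def fmt_capacity(total: Optional[float], free: Optional[float]) -> str:
    if total is None:
        return '-'
    total_str = f'{total:.0f}'
    if free is None:
        return total_str
    return f'{free:.0f} of {total_str} free'

print(fmt_capacity(32, 18.0))    # -> 18 of 32 free
print(fmt_capacity(32, None))    # -> 32
print(fmt_capacity(None, None))  # -> -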
@@ -3924,8 +3951,11 @@
                 node_is_ready = getattr(node_info, 'is_ready', True)
                 if not node_is_ready:
                     utilization_str += ' (Node NotReady)'
-                node_table.add_row(
-                    [context_name, node_name, acc_type, utilization_str])
+
+                node_table.add_row([
+                    context_name, node_name, cpu_str, memory_str, acc_type,
+                    utilization_str
+                ])
 
         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
@@ -3936,7 +3966,7 @@
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
 
-    def _format_slurm_node_info() -> str:
+    def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
         node_table = log_utils.create_table([
             'CLUSTER',
             'NODE',
|
|
|
3946
3976
|
'UTILIZATION',
|
|
3947
3977
|
])
|
|
3948
3978
|
|
|
3949
|
-
|
|
3950
|
-
|
|
3979
|
+
request_ids = [(cluster_name,
|
|
3980
|
+
sdk.slurm_node_info(slurm_cluster_name=cluster_name))
|
|
3981
|
+
for cluster_name in slurm_cluster_names]
|
|
3951
3982
|
|
|
3952
|
-
|
|
3953
|
-
|
|
3954
|
-
nodes_info = sdk.stream_and_get(
|
|
3955
|
-
sdk.slurm_node_info(slurm_cluster_name=cluster_name))
|
|
3983
|
+
for cluster_name, request_id in request_ids:
|
|
3984
|
+
nodes_info = sdk.stream_and_get(request_id)
|
|
3956
3985
|
|
|
3957
3986
|
for node_info in nodes_info:
|
|
3958
3987
|
node_table.add_row([
|
|
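The rewrite submits all `slurm_node_info` requests before streaming any result, so the per-cluster queries can be processed concurrently instead of serially. A generic, self-contained sketch of the submit-then-collect pattern, with toy stand-ins for `sdk.slurm_node_info` / `sdk.stream_and_get`:

import concurrent.futures
import time

def slurm_node_info(cluster: str) -> str:
    time.sleep(0.1)  # stand-in for the real per-cluster query
    return f'{cluster}: 4 nodes'

with concurrent.futures.ThreadPoolExecutor() as pool:
    # All queries are in flight before the first result is awaited.
    request_ids = [(c, pool.submit(slurm_node_info, c))
                   for c in ['slurm-a', 'slurm-b']]
    for cluster, request_id in request_ids:
        print(request_id.result())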
@@ -4122,7 +4151,8 @@
                 yield from slurm_realtime_table.get_string()
                 yield '\n'
             if show_node_info:
-                yield _format_slurm_node_info()
+                cluster_names = [cluster for cluster, _ in slurm_realtime_infos]
+                yield _format_slurm_node_info(cluster_names)
 
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
@@ -4705,6 +4735,13 @@ def volumes_ls(verbose: bool):
               is_flag=True,
               required=False,
               help='Delete all volumes.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Forcibly delete the volume from the volumes table even '
+                    'if the deletion API fails.'))
 @click.option('--yes',
               '-y',
               default=False,
@@ -4713,7 +4750,12 @@ def volumes_ls(verbose: bool):
               help='Skip confirmation prompt.')
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool):  # pylint: disable=redefined-builtin
+def volumes_delete(
+        names: List[str],
+        all: bool,  # pylint: disable=redefined-builtin
+        purge: bool,
+        yes: bool,
+        async_call: bool):
     """Delete volumes.
 
     Examples:
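Click wires the new `--purge/-p` flag straight into the `purge: bool` parameter added to `volumes_delete`. A minimal standalone sketch of that wiring (the real command carries more options and dispatches through the SkyPilot SDK):

import click

@click.command()
@click.argument('names', required=False, type=str, nargs=-1)
@click.option('--purge', '-p', default=False, is_flag=True,
              help=('Forcibly delete the volume from the volumes table even '
                    'if the deletion API fails.'))
def volumes_delete(names, purge):
    click.echo(f'deleting {list(names)} (purge={purge})')

if __name__ == '__main__':
    volumes_delete()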
@@ -4728,6 +4770,9 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
       \b
       # Delete all volumes.
       sky volumes delete -a
+      \b
+      # Forcibly delete a volume.
+      sky volumes delete pvc1 -p
     """
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
@@ -4754,8 +4799,8 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
                   show_default=True)
 
     try:
-        _async_call_or_wait(volumes_sdk.delete(names), async_call,
-                            'sky.volumes.delete')
+        _async_call_or_wait(volumes_sdk.delete(names, purge=purge),
+                            async_call, 'sky.volumes.delete')
     except Exception as e:  # pylint: disable=broad-except
         logger.error(f'{colorama.Fore.RED}Error deleting volumes {names}: '
                      f'{str(e)}{colorama.Style.RESET_ALL}')
@@ -5427,9 +5472,14 @@
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
 @click.argument('pool_names', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              'show_all',
+              is_flag=True,
+              default=False,
+              help='Show all workers.')
 @usage_lib.entrypoint
-def jobs_pool_status(verbose: bool, pool_names: List[str]):
+def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
     """Show statuses of pools.
 
     Show detailed statuses of one or more pools. If POOL_NAME is not
@@ -5442,7 +5492,7 @@ def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
     pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
     _, msg = _handle_services_request(pool_status_request_id,
                                       service_names=pool_names_to_query,
-                                      show_all=verbose,
+                                      show_all=verbose or show_all,
                                       show_endpoint=False,
                                       pool=True,
                                       is_called_by_user=True)
@@ -6745,9 +6795,11 @@ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
         if not verbose:
             r_id = common_utils.truncate_long_string(r_id, 36)
         req_status = requests.RequestStatus(request.status)
-        row = [r_id, request.user_name, request.name]
+        user_display = status_utils.get_user_display_name(
+            request.user_name or '-', request.user_id)
+        row = [r_id, user_display, request.name]
         if verbose:
-            row.append(request.cluster_name)
+            row.append(request.cluster_name or '-')
         row.extend([
             log_utils.readable_time_duration(request.created_at),
             req_status.colored_str()
sky/client/interactive_utils.py
ADDED
@@ -0,0 +1,190 @@
+"""Utilities for handling interactive SSH authentication."""
+import asyncio
+import fcntl
+import os
+import re
+import sys
+import termios
+import tty
+import typing
+
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.client import service_account_auth
+from sky.server import common as server_common
+from sky.utils import rich_utils
+
+if typing.TYPE_CHECKING:
+    import websockets
+else:
+    websockets = adaptors_common.LazyImport('websockets')
+
+logger = sky_logging.init_logger(__name__)
+
+SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')
+
+
+# TODO(kevin): Refactor to share code with websocket_proxy.py.
+async def _handle_interactive_auth_websocket(session_id: str) -> None:
+    """Handle interactive SSH authentication via websocket.
+
+    This establishes a websocket connection to the API server and bridges
+    the user's terminal I/O bidirectionally with the PTY on the server,
+    allowing interactive authentication (e.g., 2FA).
+
+    Args:
+        session_id: The session identifier from the <sky-interactive> signal.
+    """
+    # Get HTTP server URL and convert to websocket URL
+    server_url = server_common.get_server_url()
+    server_proto, server_fqdn = server_url.split('://')
+    websocket_proto = 'wss' if server_proto == 'https' else 'ws'
+    ws_url = (f'{websocket_proto}://{server_fqdn}'
+              f'/ssh-interactive-auth?session_id={session_id}')
+
+    logger.info('Starting interactive SSH authentication...')
+
+    headers = {}
+    # Add service account auth if available
+    headers.update(service_account_auth.get_service_account_headers())
+    # Add cookie auth with URL-aware filtering
+    headers.update(server_common.get_cookie_header_for_url(ws_url))
+
+    # Set terminal to raw mode if stdin is a tty
+    old_settings = None
+    if os.isatty(sys.stdin.fileno()):
+        old_settings = termios.tcgetattr(sys.stdin.fileno())
+        tty.setraw(sys.stdin.fileno())
+
+    stdin_dup_fd = None
+    stdout_dup_fd = None
+    try:
+        # Duplicate stdin/stdout fds before passing to asyncio.
+        # When asyncio's loop.connect_read/write_pipe() is called,
+        # it creates a transport that takes ownership of the file passed to it.
+        # By duplicating the fds, we give asyncio independent copies that it can
+        # safely close, while preserving the original sys.stdin/stdout.
+        stdin_dup_fd = os.dup(sys.stdin.fileno())
+        stdout_dup_fd = os.dup(sys.stdout.fileno())
+
+        async with websockets.connect(ws_url,
+                                      additional_headers=headers,
+                                      ping_interval=None) as ws:
+            loop = asyncio.get_running_loop()
+
+            stdin_reader = asyncio.StreamReader()
+            stdin_protocol = asyncio.StreamReaderProtocol(stdin_reader)
+            stdin_dup_file = os.fdopen(stdin_dup_fd, 'rb', buffering=0)
+            stdin_dup_fd = None  # File object now owns the FD
+            await loop.connect_read_pipe(lambda: stdin_protocol, stdin_dup_file)
+
+            stdout_dup_file = os.fdopen(stdout_dup_fd, 'wb', buffering=0)
+            stdout_dup_fd = None  # File object now owns the FD
+            stdout_transport, stdout_protocol = await loop.connect_write_pipe(
+                asyncio.streams.FlowControlMixin,
+                stdout_dup_file)  # type: ignore
+            stdout_writer = asyncio.StreamWriter(stdout_transport,
+                                                 stdout_protocol, None, loop)
+
+            async def stdin_to_websocket():
+                """Forward stdin to websocket."""
+                try:
+                    while True:
+                        data = await stdin_reader.read(4096)
+                        if not data:
+                            break
+                        await ws.send(data)
+                except asyncio.CancelledError:
+                    # Task was cancelled - auth complete
+                    pass
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.debug(f'Error in stdin_to_websocket: {e}')
+
+            async def websocket_to_stdout():
+                """Forward websocket to stdout."""
+                try:
+                    async for message in ws:
+                        stdout_writer.write(message)
+                        await stdout_writer.drain()
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.debug(f'Error in websocket_to_stdout: {e}')
+
+            # Run both directions concurrently
+            # Use tasks so we can cancel stdin reader when websocket closes
+            stdin_task = asyncio.create_task(stdin_to_websocket())
+            stdout_task = asyncio.create_task(websocket_to_stdout())
+
+            # Wait for websocket to close (auth complete)
+            await stdout_task
+            # Cancel stdin reader so it doesn't consume the next keystroke
+            stdin_task.cancel()
+            try:
+                await stdin_task
+            except asyncio.CancelledError:
+                pass
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to handle interactive authentication: {e}')
+        raise
+    finally:
+        # Restore terminal settings if they were changed
+        if old_settings:
+            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
+                              old_settings)
+            # Flush any buffered input from stdin
+            termios.tcflush(sys.stdin.fileno(), termios.TCIFLUSH)
+        # Ensure stdout is in blocking mode (can be non-blocking after
+        # asyncio transport operations)
+        flags = fcntl.fcntl(sys.stdout.fileno(), fcntl.F_GETFL)
+        fcntl.fcntl(sys.stdout.fileno(), fcntl.F_SETFL,
+                    flags & ~os.O_NONBLOCK)
+
+        for fd in [stdin_dup_fd, stdout_dup_fd]:
+            if fd is not None:
+                try:
+                    os.close(fd)
+                except OSError:
+                    # Already closed by asyncio or never opened
+                    pass
+
+
+def handle_interactive_auth(line: str) -> typing.Optional[str]:
+    """Handle interactive SSH authentication signals (sync version).
+
+    Args:
+        line: The log line to check for interactive auth markers.
+
+    Returns:
+        The line with the marker removed, or None if this was an interactive
+        auth signal (meaning the line was consumed).
+    """
+    match = SKY_INTERACTIVE_PATTERN.search(line)
+    if not match:
+        return line
+
+    session_id = match.group(1)
+    # Temporarily stop any spinners to allow terminal I/O
+    with rich_utils.safe_logger():
+        asyncio.run(_handle_interactive_auth_websocket(session_id))
+
+    return None
+
+
+async def handle_interactive_auth_async(line: str) -> typing.Optional[str]:
+    """Handle interactive SSH authentication signals (async version).
+
+    Args:
+        line: The log line to check for interactive auth markers.
+
+    Returns:
+        The line with the marker removed, or None if this was an interactive
+        auth signal (meaning the line was consumed).
+    """
+    match = SKY_INTERACTIVE_PATTERN.search(line)
+    if not match:
+        return line
+
+    session_id = match.group(1)
+    with rich_utils.safe_logger():
+        await _handle_interactive_auth_websocket(session_id)
+
+    return None
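The client recognizes an interactive-auth request by scanning each streamed log line for the `<sky-interactive .../>` marker defined above; matching lines are consumed (the websocket bridge runs) instead of being printed. A small demo of the marker regex, copied from the new module:

import re

SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')

line = 'setup... <sky-interactive session="abc123"/>'
match = SKY_INTERACTIVE_PATTERN.search(line)
print(match.group(1) if match else None)  # -> abc123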
sky/client/sdk.py
CHANGED
@@ -30,6 +30,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
+from sky.client import interactive_utils
 from sky.client import oauth as oauth_lib
 from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
@@ -157,9 +158,16 @@ def stream_response(request_id: Optional[server_common.RequestId[T]],
     retry_context = rest.get_retry_context()
     try:
         line_count = 0
+
         for line in rich_utils.decode_rich_status(response):
             if line is not None:
                 line_count += 1
+
+                line = interactive_utils.handle_interactive_auth(line)
+                if line is None:
+                    # Line was consumed by interactive auth handler
+                    continue
+
                 if retry_context is None:
                     print(line, flush=True, end='', file=output_stream)
                 elif line_count > retry_context.line_processed:
sky/client/sdk_async.py
CHANGED
@@ -23,6 +23,7 @@ from sky import catalog
 from sky import exceptions
 from sky import sky_logging
 from sky.client import common as client_common
+from sky.client import interactive_utils
 from sky.client import sdk
 from sky.schemas.api import responses
 from sky.server import common as server_common
@@ -167,9 +168,17 @@ async def stream_response_async(request_id: Optional[str],
     retry_context = rest.get_retry_context()
     try:
         line_count = 0
+
         async for line in rich_utils.decode_rich_status_async(response):
             if line is not None:
                 line_count += 1
+
+                line = await interactive_utils.handle_interactive_auth_async(
+                    line)
+                if line is None:
+                    # Line was consumed by interactive auth handler
+                    continue
+
                 if retry_context is None:
                     print(line, flush=True, end='', file=output_stream)
                 elif line_count > retry_context.line_processed: