skypilot-nightly 1.0.0.dev20250627__py3-none-any.whl → 1.0.0.dev20250630__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +14 -0
- sky/adaptors/nebius.py +2 -2
- sky/authentication.py +12 -5
- sky/backends/backend_utils.py +92 -26
- sky/check.py +5 -2
- sky/client/cli/command.py +39 -8
- sky/client/sdk.py +217 -167
- sky/client/service_account_auth.py +47 -0
- sky/clouds/aws.py +10 -4
- sky/clouds/azure.py +5 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/gcp.py +31 -18
- sky/clouds/kubernetes.py +54 -34
- sky/clouds/nebius.py +8 -2
- sky/clouds/ssh.py +5 -2
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +22 -7
- sky/clouds/utils/oci_utils.py +62 -14
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/NdypbqMxaYucRGfopkKXa/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-1b39779691bb4030.js +1 -0
- sky/dashboard/out/_next/static/chunks/{141-fa5a20cbf401b351.js → 1141-726e5a3f00b67185.js} +2 -2
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +1 -0
- sky/dashboard/out/_next/static/chunks/1691.44e378727a41f3b5.js +21 -0
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +6 -0
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/{875.52c962183328b3f2.js → 2875.c24c6d57dc82e436.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +11 -0
- sky/dashboard/out/_next/static/chunks/3698-52ad1ca228faa776.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.b3cc2bc1d49d2c3c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +1 -0
- sky/dashboard/out/_next/static/chunks/{947-6620842ef80ae879.js → 3947-b059261d6fa88a1f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{697.6460bf72e760addd.js → 4697.f5421144224da9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.4c849b1e05c8e9ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{491.b3d264269613fe09.js → 5491.918ffed0ba7a5294.js} +1 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +8 -0
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +39 -0
- sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-6ff4e45dfb49d11d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +1 -0
- sky/dashboard/out/_next/static/chunks/{25.76c246239df93d50.js → 9025.a7c44babfe56ce09.js} +2 -2
- sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9470-21d059a1dfa03f61.js +1 -0
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{framework-87d061ee6ed71b28.js → framework-efc06c2733009cd3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +1 -0
- sky/dashboard/out/_next/static/chunks/{main-e0e2335212e72357.js → main-c0a4f1ea606d48d2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-9a3ce3170d2edcec.js → _app-a37b06ddb64521fd.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9744c271a1642f76.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-cd43fb3c122eedde.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-06bde99155fa6292.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-d427db53e54de9ce.js +1 -0
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +8 -3
- sky/global_user_state.py +257 -9
- sky/jobs/client/sdk.py +20 -25
- sky/models.py +16 -0
- sky/optimizer.py +46 -0
- sky/provision/__init__.py +14 -6
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/instance.py +24 -18
- sky/provision/kubernetes/network.py +15 -9
- sky/provision/kubernetes/network_utils.py +42 -23
- sky/provision/kubernetes/utils.py +73 -35
- sky/provision/kubernetes/volume.py +77 -15
- sky/provision/nebius/utils.py +10 -4
- sky/resources.py +10 -4
- sky/serve/client/sdk.py +28 -34
- sky/server/common.py +51 -3
- sky/server/constants.py +3 -0
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +19 -0
- sky/server/rest.py +6 -15
- sky/server/server.py +121 -6
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +32 -4
- sky/task.py +12 -0
- sky/users/permission.py +29 -0
- sky/users/server.py +384 -5
- sky/users/token_service.py +196 -0
- sky/utils/common_utils.py +4 -5
- sky/utils/config_utils.py +41 -0
- sky/utils/controller_utils.py +5 -1
- sky/utils/log_utils.py +68 -0
- sky/utils/resource_checker.py +153 -0
- sky/utils/resources_utils.py +12 -4
- sky/utils/schemas.py +87 -60
- sky/utils/subprocess_utils.py +2 -6
- sky/volumes/server/core.py +103 -78
- sky/volumes/utils.py +22 -5
- sky/workspaces/core.py +9 -117
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/RECORD +133 -128
- sky/dashboard/out/_next/static/HudU4f4Xsy-cP51JvXSZ-/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +0 -1
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +0 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +0 -39
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +0 -1
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +0 -1
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +0 -16
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +0 -1
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +0 -1
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +0 -8
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +0 -6
- sky/dashboard/out/_next/static/chunks/937.3759f538f11a0953.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +0 -1
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +0 -1
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +0 -1
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-f119a5630a1efd61.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5b59bce9eb208d84.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +0 -1
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +0 -3
- /sky/dashboard/out/_next/static/{HudU4f4Xsy-cP51JvXSZ- → NdypbqMxaYucRGfopkKXa}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{804-4c9fc53aa74bc191.js → 804-9f5e98ce84d46bdd.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
|
|
1
1
|
"""Kubernetes pvc provisioning."""
|
2
2
|
from typing import Any, Dict, List, Optional, Tuple
|
3
3
|
|
4
|
+
from sky import global_user_state
|
4
5
|
from sky import models
|
5
6
|
from sky import sky_logging
|
6
7
|
from sky.adaptors import kubernetes
|
7
8
|
from sky.provision.kubernetes import config as config_lib
|
9
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
8
10
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
9
11
|
from sky.volumes import volume as volume_lib
|
10
12
|
|
@@ -45,17 +47,26 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
|
|
45
47
|
access_mode = pvc.spec.access_modes[0]
|
46
48
|
if access_mode not in once_modes:
|
47
49
|
continue
|
48
|
-
|
49
|
-
if
|
50
|
+
usedby_pods, _ = _get_volume_usedby(context, namespace, pvc_name)
|
51
|
+
if usedby_pods:
|
50
52
|
raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
|
51
53
|
f'mode {access_mode} is already '
|
52
|
-
f'in use by {
|
54
|
+
f'in use by Pods {usedby_pods}.')
|
53
55
|
|
54
56
|
|
55
57
|
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
56
58
|
"""Creates or registers a volume."""
|
57
59
|
context, namespace = _get_context_namespace(config)
|
58
60
|
pvc_spec = _get_pvc_spec(namespace, config)
|
61
|
+
# Check if the storage class exists
|
62
|
+
storage_class_name = pvc_spec['spec'].get('storageClassName')
|
63
|
+
if storage_class_name is not None:
|
64
|
+
try:
|
65
|
+
kubernetes.storage_api(context).read_storage_class(
|
66
|
+
name=storage_class_name)
|
67
|
+
except kubernetes.api_exception() as e:
|
68
|
+
raise config_lib.KubernetesError(
|
69
|
+
f'Check storage class {storage_class_name} error: {e}')
|
59
70
|
create_persistent_volume_claim(namespace, context, pvc_spec)
|
60
71
|
return config
|
61
72
|
|
@@ -76,22 +87,73 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
76
87
|
return config
|
77
88
|
|
78
89
|
|
79
|
-
def _get_volume_usedby(
|
80
|
-
|
81
|
-
|
82
|
-
|
90
|
+
def _get_volume_usedby(
|
91
|
+
context: Optional[str],
|
92
|
+
namespace: str,
|
93
|
+
pvc_name: str,
|
94
|
+
) -> Tuple[List[str], List[str]]:
|
95
|
+
"""Gets the usedby resources of a volume.
|
96
|
+
|
97
|
+
This function returns the pods and clusters that are using the volume.
|
98
|
+
The usedby_pods is accurate, which also includes the Pods that are not
|
99
|
+
managed by SkyPilot.
|
100
|
+
|
101
|
+
Args:
|
102
|
+
context: Kubernetes context
|
103
|
+
namespace: Kubernetes namespace
|
104
|
+
pvc_name: PVC name
|
105
|
+
|
106
|
+
Returns:
|
107
|
+
usedby_pods: List of pods using the volume. These may include pods
|
108
|
+
not created by SkyPilot.
|
109
|
+
usedby_clusters: List of clusters using the volume.
|
110
|
+
"""
|
111
|
+
usedby_pods = []
|
112
|
+
usedby_clusters = []
|
113
|
+
field_selector = ','.join([
|
114
|
+
f'status.phase!={phase}'
|
115
|
+
for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
|
116
|
+
])
|
117
|
+
cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
|
83
118
|
# Get all pods in the namespace
|
84
|
-
pods = kubernetes.core_api(context).list_namespaced_pod(
|
119
|
+
pods = kubernetes.core_api(context).list_namespaced_pod(
|
120
|
+
namespace=namespace, field_selector=field_selector)
|
85
121
|
for pod in pods.items:
|
86
|
-
if pod.spec.volumes is
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
122
|
+
if pod.spec.volumes is None:
|
123
|
+
continue
|
124
|
+
for volume in pod.spec.volumes:
|
125
|
+
if volume.persistent_volume_claim is None:
|
126
|
+
continue
|
127
|
+
if volume.persistent_volume_claim.claim_name == pvc_name:
|
128
|
+
usedby_pods.append(pod.metadata.name)
|
129
|
+
# Get the real cluster name
|
130
|
+
cluster_name_on_cloud = pod.metadata.labels.get(
|
131
|
+
k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
|
132
|
+
if cluster_name_on_cloud is None:
|
133
|
+
continue
|
134
|
+
cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
|
135
|
+
if cluster_name is not None:
|
136
|
+
usedby_clusters.append(cluster_name)
|
137
|
+
if usedby_pods:
|
138
|
+
logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
|
139
|
+
f' and clusters {usedby_clusters}')
|
140
|
+
return usedby_pods, usedby_clusters
|
141
|
+
|
142
|
+
|
143
|
+
def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
|
144
|
+
"""Gets the map from cluster name on cloud to cluster name."""
|
145
|
+
clusters = global_user_state.get_clusters()
|
146
|
+
cloud_to_name_map = {}
|
147
|
+
for cluster in clusters:
|
148
|
+
handle = cluster['handle']
|
149
|
+
if handle is None:
|
150
|
+
continue
|
151
|
+
cloud_to_name_map[handle.cluster_name_on_cloud] = cluster['name']
|
152
|
+
return cloud_to_name_map
|
92
153
|
|
93
154
|
|
94
|
-
def get_volume_usedby(
|
155
|
+
def get_volume_usedby(
|
156
|
+
config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
|
95
157
|
"""Gets the usedby resources of a volume."""
|
96
158
|
context, namespace = _get_context_namespace(config)
|
97
159
|
pvc_name = config.name_on_cloud
|
sky/provision/nebius/utils.py
CHANGED
@@ -40,8 +40,11 @@ def get_project_by_region(region: str) -> str:
|
|
40
40
|
parent_id=nebius.get_tenant_id())).wait()
|
41
41
|
|
42
42
|
# Check is there project if in config
|
43
|
-
project_id = skypilot_config.
|
44
|
-
|
43
|
+
project_id = skypilot_config.get_effective_region_config(
|
44
|
+
cloud='nebius',
|
45
|
+
region=None,
|
46
|
+
keys=(region, 'project_id'),
|
47
|
+
default_value=None)
|
45
48
|
if project_id is not None:
|
46
49
|
return project_id
|
47
50
|
for project in projects.items:
|
@@ -184,8 +187,11 @@ def launch(cluster_name_on_cloud: str,
|
|
184
187
|
# https://docs.nebius.com/compute/clusters/gpu
|
185
188
|
if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
|
186
189
|
if preset == '8gpu-128vcpu-1600gb':
|
187
|
-
fabric = skypilot_config.
|
188
|
-
|
190
|
+
fabric = skypilot_config.get_effective_region_config(
|
191
|
+
cloud='nebius',
|
192
|
+
region=None,
|
193
|
+
keys=(region, 'fabric'),
|
194
|
+
default_value=None)
|
189
195
|
|
190
196
|
# Auto-select fabric if network_tier=best and no fabric configured
|
191
197
|
if (fabric is None and
|
sky/resources.py
CHANGED
@@ -1064,8 +1064,11 @@ class Resources:
|
|
1064
1064
|
regions = [r for r in regions if r.name in self._image_id]
|
1065
1065
|
|
1066
1066
|
# Filter the regions by the skypilot_config
|
1067
|
-
ssh_proxy_command_config = skypilot_config.
|
1068
|
-
|
1067
|
+
ssh_proxy_command_config = skypilot_config.get_effective_region_config(
|
1068
|
+
cloud=str(self._cloud).lower(),
|
1069
|
+
region=None,
|
1070
|
+
keys=('ssh_proxy_command',),
|
1071
|
+
default_value=None)
|
1069
1072
|
if (isinstance(ssh_proxy_command_config, str) or
|
1070
1073
|
ssh_proxy_command_config is None):
|
1071
1074
|
# All regions are valid as the regions are not specified for the
|
@@ -1550,8 +1553,11 @@ class Resources:
|
|
1550
1553
|
# to each cloud if any cloud supports reservations for spot.
|
1551
1554
|
return {}
|
1552
1555
|
specific_reservations = set(
|
1553
|
-
skypilot_config.
|
1554
|
-
|
1556
|
+
skypilot_config.get_effective_region_config(
|
1557
|
+
cloud=str(self.cloud).lower(),
|
1558
|
+
region=self.region,
|
1559
|
+
keys=('specific_reservations',),
|
1560
|
+
default_value=set()))
|
1555
1561
|
|
1556
1562
|
if isinstance(self.cloud, clouds.DummyCloud):
|
1557
1563
|
return self.cloud.get_reservations_available_resources(
|
sky/serve/client/sdk.py
CHANGED
@@ -74,12 +74,11 @@ def up(
|
|
74
74
|
task=dag_str,
|
75
75
|
service_name=service_name,
|
76
76
|
)
|
77
|
-
response =
|
78
|
-
|
77
|
+
response = server_common.make_authenticated_request(
|
78
|
+
'POST',
|
79
|
+
'/serve/up',
|
79
80
|
json=json.loads(body.model_dump_json()),
|
80
|
-
timeout=(5, None)
|
81
|
-
cookies=server_common.get_api_cookie_jar(),
|
82
|
-
)
|
81
|
+
timeout=(5, None))
|
83
82
|
return server_common.get_request_id(response)
|
84
83
|
|
85
84
|
|
@@ -136,12 +135,11 @@ def update(
|
|
136
135
|
mode=mode,
|
137
136
|
)
|
138
137
|
|
139
|
-
response =
|
140
|
-
|
138
|
+
response = server_common.make_authenticated_request(
|
139
|
+
'POST',
|
140
|
+
'/serve/update',
|
141
141
|
json=json.loads(body.model_dump_json()),
|
142
|
-
timeout=(5, None)
|
143
|
-
cookies=server_common.get_api_cookie_jar(),
|
144
|
-
)
|
142
|
+
timeout=(5, None))
|
145
143
|
return server_common.get_request_id(response)
|
146
144
|
|
147
145
|
|
@@ -178,12 +176,11 @@ def down(
|
|
178
176
|
all=all,
|
179
177
|
purge=purge,
|
180
178
|
)
|
181
|
-
response =
|
182
|
-
|
179
|
+
response = server_common.make_authenticated_request(
|
180
|
+
'POST',
|
181
|
+
'/serve/down',
|
183
182
|
json=json.loads(body.model_dump_json()),
|
184
|
-
timeout=(5, None)
|
185
|
-
cookies=server_common.get_api_cookie_jar(),
|
186
|
-
)
|
183
|
+
timeout=(5, None))
|
187
184
|
return server_common.get_request_id(response)
|
188
185
|
|
189
186
|
|
@@ -213,12 +210,11 @@ def terminate_replica(service_name: str, replica_id: int,
|
|
213
210
|
replica_id=replica_id,
|
214
211
|
purge=purge,
|
215
212
|
)
|
216
|
-
response =
|
217
|
-
|
213
|
+
response = server_common.make_authenticated_request(
|
214
|
+
'POST',
|
215
|
+
'/serve/terminate-replica',
|
218
216
|
json=json.loads(body.model_dump_json()),
|
219
|
-
timeout=(5, None)
|
220
|
-
cookies=server_common.get_api_cookie_jar(),
|
221
|
-
)
|
217
|
+
timeout=(5, None))
|
222
218
|
return server_common.get_request_id(response)
|
223
219
|
|
224
220
|
|
@@ -286,12 +282,11 @@ def status(
|
|
286
282
|
exceptions.ClusterNotUpError: if the sky serve controller is not up.
|
287
283
|
"""
|
288
284
|
body = payloads.ServeStatusBody(service_names=service_names,)
|
289
|
-
response =
|
290
|
-
|
285
|
+
response = server_common.make_authenticated_request(
|
286
|
+
'POST',
|
287
|
+
'/serve/status',
|
291
288
|
json=json.loads(body.model_dump_json()),
|
292
|
-
timeout=(5, None)
|
293
|
-
cookies=server_common.get_api_cookie_jar(),
|
294
|
-
)
|
289
|
+
timeout=(5, None))
|
295
290
|
return server_common.get_request_id(response)
|
296
291
|
|
297
292
|
|
@@ -373,13 +368,12 @@ def tail_logs(service_name: str,
|
|
373
368
|
replica_id=replica_id,
|
374
369
|
follow=follow,
|
375
370
|
)
|
376
|
-
response =
|
377
|
-
|
371
|
+
response = server_common.make_authenticated_request(
|
372
|
+
'POST',
|
373
|
+
'/serve/logs',
|
378
374
|
json=json.loads(body.model_dump_json()),
|
379
375
|
timeout=(5, None),
|
380
|
-
stream=True
|
381
|
-
cookies=server_common.get_api_cookie_jar(),
|
382
|
-
)
|
376
|
+
stream=True)
|
383
377
|
request_id = server_common.get_request_id(response)
|
384
378
|
return sdk.stream_response(request_id=request_id,
|
385
379
|
response=response,
|
@@ -436,11 +430,11 @@ def sync_down_logs(service_name: str,
|
|
436
430
|
targets=targets,
|
437
431
|
replica_ids=replica_ids,
|
438
432
|
)
|
439
|
-
response =
|
440
|
-
|
433
|
+
response = server_common.make_authenticated_request(
|
434
|
+
'POST',
|
435
|
+
'/serve/sync-down-logs',
|
441
436
|
json=json.loads(body.model_dump_json()),
|
442
|
-
timeout=(5, None)
|
443
|
-
)
|
437
|
+
timeout=(5, None))
|
444
438
|
remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
|
445
439
|
|
446
440
|
# Download from API server paths to the client's local_dir
|
sky/server/common.py
CHANGED
@@ -27,6 +27,7 @@ from sky import exceptions
|
|
27
27
|
from sky import sky_logging
|
28
28
|
from sky import skypilot_config
|
29
29
|
from sky.adaptors import common as adaptors_common
|
30
|
+
from sky.client import service_account_auth
|
30
31
|
from sky.data import data_utils
|
31
32
|
from sky.server import constants as server_constants
|
32
33
|
from sky.server import rest
|
@@ -185,6 +186,53 @@ def get_cookies_from_response(
|
|
185
186
|
return cookies
|
186
187
|
|
187
188
|
|
189
|
+
def make_authenticated_request(method: str,
|
190
|
+
path: str,
|
191
|
+
server_url: Optional[str] = None,
|
192
|
+
retry: bool = True,
|
193
|
+
**kwargs) -> 'requests.Response':
|
194
|
+
"""Make an authenticated HTTP request to the API server.
|
195
|
+
|
196
|
+
Automatically handles service account token authentication or cookie-based
|
197
|
+
authentication based on what's available.
|
198
|
+
|
199
|
+
Args:
|
200
|
+
method: HTTP method (GET, POST, etc.)
|
201
|
+
path: API path (e.g., '/api/v1/status')
|
202
|
+
server_url: Server URL, defaults to configured server
|
203
|
+
**kwargs: Additional arguments to pass to requests
|
204
|
+
|
205
|
+
Returns:
|
206
|
+
requests.Response object
|
207
|
+
"""
|
208
|
+
if server_url is None:
|
209
|
+
server_url = get_server_url()
|
210
|
+
|
211
|
+
# Prepare headers and URL for service account authentication
|
212
|
+
headers = service_account_auth.get_service_account_headers()
|
213
|
+
|
214
|
+
# Merge with existing headers
|
215
|
+
if 'headers' in kwargs:
|
216
|
+
headers.update(kwargs['headers'])
|
217
|
+
kwargs['headers'] = headers
|
218
|
+
|
219
|
+
# Always use the same URL regardless of authentication type
|
220
|
+
# OAuth2 proxy will handle authentication based on headers
|
221
|
+
url = f'{server_url}/{path}' if not path.startswith(
|
222
|
+
'/') else f'{server_url}{path}'
|
223
|
+
|
224
|
+
# Use cookie authentication if no Bearer token present
|
225
|
+
if not headers.get('Authorization') and 'cookies' not in kwargs:
|
226
|
+
kwargs['cookies'] = get_api_cookie_jar()
|
227
|
+
|
228
|
+
# Make the request
|
229
|
+
if retry:
|
230
|
+
return rest.request(method, url, **kwargs)
|
231
|
+
else:
|
232
|
+
assert method == 'GET', 'Only GET requests can be done without retry'
|
233
|
+
return rest.request_without_retry(method, url, **kwargs)
|
234
|
+
|
235
|
+
|
188
236
|
@annotations.lru_cache(scope='global')
|
189
237
|
def get_server_url(host: Optional[str] = None) -> str:
|
190
238
|
endpoint = DEFAULT_SERVER_URL
|
@@ -243,9 +291,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
243
291
|
server_url = endpoint if endpoint is not None else get_server_url()
|
244
292
|
while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
|
245
293
|
try:
|
246
|
-
response =
|
247
|
-
|
248
|
-
|
294
|
+
response = make_authenticated_request('GET',
|
295
|
+
'/api/health',
|
296
|
+
timeout=2.5)
|
249
297
|
except requests.exceptions.Timeout:
|
250
298
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
251
299
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
sky/server/constants.py
CHANGED
@@ -36,3 +36,6 @@ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
|
|
36
36
|
# The path to the dashboard build output
|
37
37
|
DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
|
38
38
|
'out')
|
39
|
+
|
40
|
+
# The interval (seconds) for the event to be restarted in the background.
|
41
|
+
DAEMON_RESTART_INTERVAL_SECONDS = 20
|
sky/server/requests/executor.py
CHANGED
@@ -268,6 +268,10 @@ def override_request_env_and_config(
|
|
268
268
|
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
269
269
|
name=request_body.env_vars[constants.USER_ENV_VAR])
|
270
270
|
global_user_state.add_or_update_user(user)
|
271
|
+
# Refetch the user to get the latest user info, including the created_at
|
272
|
+
# field.
|
273
|
+
user = global_user_state.get_user(user.id)
|
274
|
+
|
271
275
|
# Force color to be enabled.
|
272
276
|
os.environ['CLICOLOR_FORCE'] = '1'
|
273
277
|
server_common.reload_for_new_request(
|
sky/server/requests/payloads.py
CHANGED
@@ -358,6 +358,39 @@ class UserImportBody(RequestBody):
|
|
358
358
|
csv_content: str
|
359
359
|
|
360
360
|
|
361
|
+
class ServiceAccountTokenCreateBody(RequestBody):
|
362
|
+
"""The request body for creating a service account token."""
|
363
|
+
token_name: str
|
364
|
+
expires_in_days: Optional[int] = None
|
365
|
+
|
366
|
+
|
367
|
+
class ServiceAccountTokenDeleteBody(RequestBody):
|
368
|
+
"""The request body for deleting a service account token."""
|
369
|
+
token_id: str
|
370
|
+
|
371
|
+
|
372
|
+
class UpdateRoleBody(RequestBody):
|
373
|
+
"""The request body for updating a user role."""
|
374
|
+
role: str
|
375
|
+
|
376
|
+
|
377
|
+
class ServiceAccountTokenRoleBody(RequestBody):
|
378
|
+
"""The request body for getting a service account token role."""
|
379
|
+
token_id: str
|
380
|
+
|
381
|
+
|
382
|
+
class ServiceAccountTokenUpdateRoleBody(RequestBody):
|
383
|
+
"""The request body for updating a service account token role."""
|
384
|
+
token_id: str
|
385
|
+
role: str
|
386
|
+
|
387
|
+
|
388
|
+
class ServiceAccountTokenRotateBody(RequestBody):
|
389
|
+
"""The request body for rotating a service account token."""
|
390
|
+
token_id: str
|
391
|
+
expires_in_days: Optional[int] = None
|
392
|
+
|
393
|
+
|
361
394
|
class DownloadBody(RequestBody):
|
362
395
|
"""The request body for the download endpoint."""
|
363
396
|
folder_paths: List[str]
|
sky/server/requests/requests.py
CHANGED
@@ -375,10 +375,29 @@ def managed_job_status_refresh_event():
|
|
375
375
|
|
376
376
|
@dataclasses.dataclass
|
377
377
|
class InternalRequestDaemon:
|
378
|
+
"""Internal daemon that runs an event in the background."""
|
379
|
+
|
378
380
|
id: str
|
379
381
|
name: str
|
380
382
|
event_fn: Callable[[], None]
|
381
383
|
|
384
|
+
def run_event(self):
|
385
|
+
"""Run the event."""
|
386
|
+
while True:
|
387
|
+
with ux_utils.enable_traceback():
|
388
|
+
try:
|
389
|
+
self.event_fn()
|
390
|
+
break
|
391
|
+
except Exception: # pylint: disable=broad-except
|
392
|
+
# It is OK to fail to run the event, as the event is not
|
393
|
+
# critical, but we should log the error.
|
394
|
+
logger.exception(
|
395
|
+
f'Error running {self.name} event. '
|
396
|
+
f'Restarting in '
|
397
|
+
f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
|
398
|
+
'seconds...')
|
399
|
+
time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
|
400
|
+
|
382
401
|
|
383
402
|
# Register the events to run in the background.
|
384
403
|
INTERNAL_REQUEST_DAEMONS = [
|
sky/server/rest.py
CHANGED
@@ -129,25 +129,16 @@ def handle_server_unavailable(response: 'requests.Response') -> None:
|
|
129
129
|
|
130
130
|
|
131
131
|
@retry_on_server_unavailable()
|
132
|
-
def
|
133
|
-
"""Send a
|
132
|
+
def request(method, url, **kwargs) -> 'requests.Response':
|
133
|
+
"""Send a request to the API server, retry on server temporarily
|
134
134
|
unavailable."""
|
135
|
-
response = requests.
|
135
|
+
response = requests.request(method, url, **kwargs)
|
136
136
|
handle_server_unavailable(response)
|
137
137
|
return response
|
138
138
|
|
139
139
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
unavailable."""
|
144
|
-
response = requests.get(url, params=params, **kwargs)
|
145
|
-
handle_server_unavailable(response)
|
146
|
-
return response
|
147
|
-
|
148
|
-
|
149
|
-
def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
|
150
|
-
"""Send a GET request to the API server without retry."""
|
151
|
-
response = requests.get(url, params=params, **kwargs)
|
140
|
+
def request_without_retry(method, url, **kwargs) -> 'requests.Response':
|
141
|
+
"""Send a request to the API server without retry."""
|
142
|
+
response = requests.request(method, url, **kwargs)
|
152
143
|
handle_server_unavailable(response)
|
153
144
|
return response
|
sky/server/server.py
CHANGED
@@ -119,8 +119,11 @@ def _basic_auth_401_response(content: str):
|
|
119
119
|
# TODO(hailong): Remove this function and use request.state.auth_user instead.
|
120
120
|
async def _override_user_info_in_request_body(request: fastapi.Request,
|
121
121
|
auth_user: Optional[models.User]):
|
122
|
+
if auth_user is None:
|
123
|
+
return
|
124
|
+
|
122
125
|
body = await request.body()
|
123
|
-
if
|
126
|
+
if body:
|
124
127
|
try:
|
125
128
|
original_json = await request.json()
|
126
129
|
except json.JSONDecodeError as e:
|
@@ -228,14 +231,17 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
228
231
|
|
229
232
|
async def dispatch(self, request: fastapi.Request, call_next):
|
230
233
|
if request.url.path.startswith('/api/'):
|
231
|
-
# Try to set the auth user from
|
232
|
-
# following endpoint handlers can leverage the auth_user info
|
234
|
+
# Try to set the auth user from basic auth
|
233
235
|
_try_set_basic_auth_user(request)
|
234
236
|
return await call_next(request)
|
235
237
|
|
236
238
|
auth_header = request.headers.get('authorization')
|
237
|
-
if not auth_header
|
238
|
-
return _basic_auth_401_response('
|
239
|
+
if not auth_header:
|
240
|
+
return _basic_auth_401_response('Authentication required')
|
241
|
+
|
242
|
+
# Only handle basic auth
|
243
|
+
if not auth_header.lower().startswith('basic '):
|
244
|
+
return _basic_auth_401_response('Invalid authentication method')
|
239
245
|
|
240
246
|
# Check username and password
|
241
247
|
encoded = auth_header.split(' ', 1)[1]
|
@@ -267,6 +273,111 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
267
273
|
return await call_next(request)
|
268
274
|
|
269
275
|
|
276
|
+
class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
277
|
+
"""Middleware to handle Bearer Token Auth (Service Accounts)."""
|
278
|
+
|
279
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
280
|
+
# Only process requests with Bearer token authorization header
|
281
|
+
auth_header = request.headers.get('authorization')
|
282
|
+
if not auth_header or not auth_header.lower().startswith('bearer '):
|
283
|
+
# No Bearer token, continue with normal processing (OAuth2 cookies,
|
284
|
+
# etc.)
|
285
|
+
return await call_next(request)
|
286
|
+
|
287
|
+
# Extract token
|
288
|
+
sa_token = auth_header.split(' ', 1)[1]
|
289
|
+
|
290
|
+
# Handle SkyPilot service account tokens
|
291
|
+
if sa_token.startswith('sky_'):
|
292
|
+
return await self._handle_service_account_token(
|
293
|
+
request, sa_token, call_next)
|
294
|
+
|
295
|
+
# Handle other Bearer tokens (OAuth2 access tokens, etc.)
|
296
|
+
# These requests bypassed OAuth2 proxy, so let the application decide
|
297
|
+
# how to handle them
|
298
|
+
# For now, we'll let them continue through normal processing
|
299
|
+
logger.debug(
|
300
|
+
'Non-SkyPilot Bearer token detected, continuing with normal '
|
301
|
+
'processing')
|
302
|
+
return await call_next(request)
|
303
|
+
|
304
|
+
async def _handle_service_account_token(self, request: fastapi.Request,
|
305
|
+
sa_token: str, call_next):
|
306
|
+
"""Handle SkyPilot service account tokens."""
|
307
|
+
# Check if service account tokens are enabled
|
308
|
+
sa_enabled = os.environ.get(constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
|
309
|
+
'false').lower()
|
310
|
+
if sa_enabled != 'true':
|
311
|
+
return fastapi.responses.JSONResponse(
|
312
|
+
status_code=401,
|
313
|
+
content={'detail': 'Service account authentication disabled'})
|
314
|
+
|
315
|
+
try:
|
316
|
+
# Import here to avoid circular imports
|
317
|
+
# pylint: disable=import-outside-toplevel
|
318
|
+
from sky.users.token_service import token_service
|
319
|
+
|
320
|
+
# Verify and decode JWT token
|
321
|
+
payload = token_service.verify_token(sa_token)
|
322
|
+
|
323
|
+
if payload is None:
|
324
|
+
logger.warning('Service account token verification failed')
|
325
|
+
return fastapi.responses.JSONResponse(
|
326
|
+
status_code=401,
|
327
|
+
content={
|
328
|
+
'detail': 'Invalid or expired service account token'
|
329
|
+
})
|
330
|
+
|
331
|
+
# Extract user information from JWT payload
|
332
|
+
user_id = payload.get('sub')
|
333
|
+
user_name = payload.get('name')
|
334
|
+
token_id = payload.get('token_id')
|
335
|
+
|
336
|
+
if not user_id or not token_id:
|
337
|
+
logger.warning(
|
338
|
+
'Invalid token payload: missing user_id or token_id')
|
339
|
+
return fastapi.responses.JSONResponse(
|
340
|
+
status_code=401,
|
341
|
+
content={'detail': 'Invalid token payload'})
|
342
|
+
|
343
|
+
# Verify user still exists in database
|
344
|
+
user_info = global_user_state.get_user(user_id)
|
345
|
+
if user_info is None:
|
346
|
+
logger.warning(
|
347
|
+
f'Service account user {user_id} no longer exists')
|
348
|
+
return fastapi.responses.JSONResponse(
|
349
|
+
status_code=401,
|
350
|
+
content={'detail': 'Service account user no longer exists'})
|
351
|
+
|
352
|
+
# Update last used timestamp for token tracking
|
353
|
+
try:
|
354
|
+
global_user_state.update_service_account_token_last_used(
|
355
|
+
token_id)
|
356
|
+
except Exception as e: # pylint: disable=broad-except
|
357
|
+
logger.debug(f'Failed to update token last used time: {e}')
|
358
|
+
|
359
|
+
# Set the authenticated user
|
360
|
+
auth_user = models.User(id=user_id,
|
361
|
+
name=user_name or user_info.name)
|
362
|
+
request.state.auth_user = auth_user
|
363
|
+
|
364
|
+
# Override user info in request body for service account requests
|
365
|
+
await _override_user_info_in_request_body(request, auth_user)
|
366
|
+
|
367
|
+
logger.debug(f'Authenticated service account: {user_id}')
|
368
|
+
|
369
|
+
except Exception as e: # pylint: disable=broad-except
|
370
|
+
logger.error(f'Service account authentication failed: {e}',
|
371
|
+
exc_info=True)
|
372
|
+
return fastapi.responses.JSONResponse(
|
373
|
+
status_code=401,
|
374
|
+
content={
|
375
|
+
'detail': f'Service account authentication failed: {str(e)}'
|
376
|
+
})
|
377
|
+
|
378
|
+
return await call_next(request)
|
379
|
+
|
380
|
+
|
270
381
|
class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
271
382
|
"""Middleware to handle auth proxy."""
|
272
383
|
|
@@ -330,7 +441,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
330
441
|
request_id=event.id,
|
331
442
|
request_name=event.name,
|
332
443
|
request_body=payloads.RequestBody(),
|
333
|
-
func=event.
|
444
|
+
func=event.run_event,
|
334
445
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
335
446
|
is_skypilot_system=True,
|
336
447
|
)
|
@@ -424,6 +535,9 @@ app.add_middleware(
|
|
424
535
|
enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
|
425
536
|
if str(enable_basic_auth).lower() == 'true':
|
426
537
|
app.add_middleware(BasicAuthMiddleware)
|
538
|
+
# Bearer token middleware should always be present to handle service account
|
539
|
+
# authentication
|
540
|
+
app.add_middleware(BearerTokenMiddleware)
|
427
541
|
app.add_middleware(AuthProxyMiddleware)
|
428
542
|
app.add_middleware(RequestIDMiddleware)
|
429
543
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
@@ -1339,6 +1453,7 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
1339
1453
|
- commit: str; The commit hash of SkyPilot used for API server.
|
1340
1454
|
"""
|
1341
1455
|
user = request.state.auth_user
|
1456
|
+
logger.info(f'Health endpoint: request.state.auth_user = {user}')
|
1342
1457
|
return {
|
1343
1458
|
'status': common.ApiServerStatus.HEALTHY.value,
|
1344
1459
|
'api_version': server_constants.API_VERSION,
|