skypilot-nightly 1.0.0.dev20250627__py3-none-any.whl → 1.0.0.dev20250630__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +14 -0
- sky/adaptors/nebius.py +2 -2
- sky/authentication.py +12 -5
- sky/backends/backend_utils.py +92 -26
- sky/check.py +5 -2
- sky/client/cli/command.py +39 -8
- sky/client/sdk.py +217 -167
- sky/client/service_account_auth.py +47 -0
- sky/clouds/aws.py +10 -4
- sky/clouds/azure.py +5 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/gcp.py +31 -18
- sky/clouds/kubernetes.py +54 -34
- sky/clouds/nebius.py +8 -2
- sky/clouds/ssh.py +5 -2
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +22 -7
- sky/clouds/utils/oci_utils.py +62 -14
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/NdypbqMxaYucRGfopkKXa/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-1b39779691bb4030.js +1 -0
- sky/dashboard/out/_next/static/chunks/{141-fa5a20cbf401b351.js → 1141-726e5a3f00b67185.js} +2 -2
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +1 -0
- sky/dashboard/out/_next/static/chunks/1691.44e378727a41f3b5.js +21 -0
- sky/dashboard/out/_next/static/chunks/1871-80dea41717729fa5.js +6 -0
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/{875.52c962183328b3f2.js → 2875.c24c6d57dc82e436.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +11 -0
- sky/dashboard/out/_next/static/chunks/3698-52ad1ca228faa776.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.b3cc2bc1d49d2c3c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +1 -0
- sky/dashboard/out/_next/static/chunks/{947-6620842ef80ae879.js → 3947-b059261d6fa88a1f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{697.6460bf72e760addd.js → 4697.f5421144224da9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.4c849b1e05c8e9ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{491.b3d264269613fe09.js → 5491.918ffed0ba7a5294.js} +1 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +8 -0
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +39 -0
- sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-6ff4e45dfb49d11d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +1 -0
- sky/dashboard/out/_next/static/chunks/{25.76c246239df93d50.js → 9025.a7c44babfe56ce09.js} +2 -2
- sky/dashboard/out/_next/static/chunks/938-044ad21de8b4626b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9470-21d059a1dfa03f61.js +1 -0
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{framework-87d061ee6ed71b28.js → framework-efc06c2733009cd3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +1 -0
- sky/dashboard/out/_next/static/chunks/{main-e0e2335212e72357.js → main-c0a4f1ea606d48d2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-9a3ce3170d2edcec.js → _app-a37b06ddb64521fd.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b8e1114e6d38218c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9744c271a1642f76.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-cd43fb3c122eedde.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-06bde99155fa6292.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-d427db53e54de9ce.js +1 -0
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +8 -3
- sky/global_user_state.py +257 -9
- sky/jobs/client/sdk.py +20 -25
- sky/models.py +16 -0
- sky/optimizer.py +46 -0
- sky/provision/__init__.py +14 -6
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/instance.py +24 -18
- sky/provision/kubernetes/network.py +15 -9
- sky/provision/kubernetes/network_utils.py +42 -23
- sky/provision/kubernetes/utils.py +73 -35
- sky/provision/kubernetes/volume.py +77 -15
- sky/provision/nebius/utils.py +10 -4
- sky/resources.py +10 -4
- sky/serve/client/sdk.py +28 -34
- sky/server/common.py +51 -3
- sky/server/constants.py +3 -0
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +19 -0
- sky/server/rest.py +6 -15
- sky/server/server.py +121 -6
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +32 -4
- sky/task.py +12 -0
- sky/users/permission.py +29 -0
- sky/users/server.py +384 -5
- sky/users/token_service.py +196 -0
- sky/utils/common_utils.py +4 -5
- sky/utils/config_utils.py +41 -0
- sky/utils/controller_utils.py +5 -1
- sky/utils/log_utils.py +68 -0
- sky/utils/resource_checker.py +153 -0
- sky/utils/resources_utils.py +12 -4
- sky/utils/schemas.py +87 -60
- sky/utils/subprocess_utils.py +2 -6
- sky/volumes/server/core.py +103 -78
- sky/volumes/utils.py +22 -5
- sky/workspaces/core.py +9 -117
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/RECORD +133 -128
- sky/dashboard/out/_next/static/HudU4f4Xsy-cP51JvXSZ-/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +0 -1
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +0 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +0 -39
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +0 -1
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +0 -1
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +0 -16
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +0 -1
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +0 -1
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +0 -8
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +0 -6
- sky/dashboard/out/_next/static/chunks/937.3759f538f11a0953.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +0 -1
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +0 -1
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +0 -1
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-f119a5630a1efd61.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5b59bce9eb208d84.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +0 -1
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +0 -3
- /sky/dashboard/out/_next/static/{HudU4f4Xsy-cP51JvXSZ- → NdypbqMxaYucRGfopkKXa}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{804-4c9fc53aa74bc191.js → 804-9f5e98ce84d46bdd.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250630.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
"""JWT-based service account token management for SkyPilot."""
|
2
|
+
|
3
|
+
import contextlib
|
4
|
+
import datetime
|
5
|
+
import hashlib
|
6
|
+
import os
|
7
|
+
import secrets
|
8
|
+
from typing import Any, Dict, Generator, Optional
|
9
|
+
|
10
|
+
import filelock
|
11
|
+
import jwt
|
12
|
+
|
13
|
+
from sky import global_user_state
|
14
|
+
from sky import sky_logging
|
15
|
+
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
17
|
+
|
18
|
+
# JWT Configuration
# JWT_ALGORITHM is the algorithm passed to jwt.encode / jwt.decode.
# JWT_ISSUER is written to the token's 'i' claim and compared against it on
# verification.
# JWT_SECRET_DB_KEY is the key used with global_user_state.get_system_config /
# set_system_config to persist the signing secret.
|
19
|
+
JWT_ALGORITHM = 'HS256'
|
20
|
+
JWT_ISSUER = 'sky' # Shortened for compact tokens
|
21
|
+
JWT_SECRET_DB_KEY = 'jwt_secret'
|
22
|
+
|
23
|
+
# File lock for JWT secret initialization
# The path is expanded under the current user's home directory; the timeout
# value is handed to filelock.FileLock together with the path.
|
24
|
+
JWT_SECRET_LOCK_PATH = os.path.expanduser('~/.sky/.jwt_secret_init.lock')
|
25
|
+
JWT_SECRET_LOCK_TIMEOUT_SECONDS = 20
|
26
|
+
|
27
|
+
|
28
|
+
@contextlib.contextmanager
def _jwt_secret_lock() -> Generator[None, None, None]:
    """Hold the JWT secret initialization file lock for the ``with`` body.

    The lock is a ``filelock.FileLock`` on ``JWT_SECRET_LOCK_PATH``, created
    with ``JWT_SECRET_LOCK_TIMEOUT_SECONDS`` as its second argument.

    Raises:
        RuntimeError: If ``filelock.Timeout`` is raised; the original
            exception is chained as the cause.
    """
    init_lock = filelock.FileLock(JWT_SECRET_LOCK_PATH,
                                  JWT_SECRET_LOCK_TIMEOUT_SECONDS)
    try:
        with init_lock:
            yield
    except filelock.Timeout as timeout_error:
        message = (f'Failed to initialize JWT secret due to a timeout '
                   f'when trying to acquire the lock at '
                   f'{JWT_SECRET_LOCK_PATH}. '
                   'Please try again or manually remove the lock '
                   f'file if you believe it is stale.')
        raise RuntimeError(message) from timeout_error
|
41
|
+
|
42
|
+
|
43
|
+
class TokenService:
    """Service for managing JWT-based service account tokens.

    Tokens are JWTs signed with ``self.secret_key`` using ``JWT_ALGORITHM``
    and handed out with a ``sky_`` prefix. Claims use single-character names
    to keep the token short:

        i: issuer, t: issued-at, u: service account user ID,
        k: token ID, y: token type ('sa'), e: expiration (optional)
    """

    def __init__(self):
        # Loads (or creates) the signing secret; this takes the JWT secret
        # file lock and talks to the database.
        self.secret_key = self._get_or_generate_secret()

    def _get_or_generate_secret(self) -> str:
        """Get JWT secret from database or generate a new one.

        Runs under ``_jwt_secret_lock()``. Database failures are logged and
        tolerated on a best-effort basis: if the secret cannot be stored, the
        freshly generated secret is still returned and only lives in memory.

        Returns:
            The secret used to sign and verify tokens.
        """
        with _jwt_secret_lock():
            # Try to get from database (persistent across deployments)
            try:
                db_secret = global_user_state.get_system_config(
                    JWT_SECRET_DB_KEY)
                if db_secret:
                    logger.debug('Retrieved existing JWT secret from database')
                    return db_secret
            except Exception as e:  # pylint: disable=broad-except
                logger.debug(f'Failed to get JWT secret from database: {e}')

            # Generate a new secret and store in database
            new_secret = secrets.token_urlsafe(64)
            try:
                global_user_state.set_system_config(JWT_SECRET_DB_KEY,
                                                    new_secret)
                logger.info(
                    'Generated new JWT secret and stored in database. '
                    'This secret will persist across API server restarts.')
            except Exception as e:  # pylint: disable=broad-except
                logger.warning(
                    f'Failed to store new JWT secret in database: {e}. '
                    f'Using in-memory secret (tokens will not persist '
                    f'across restarts).')

            return new_secret

    def create_token(self,
                     creator_user_id: str,
                     service_account_user_id: str,
                     token_name: str,
                     expires_in_days: Optional[int] = None) -> Dict[str, Any]:
        """Create a new JWT service account token.

        Args:
            creator_user_id: The creator's user hash
            service_account_user_id: The service account's own user ID
            token_name: Descriptive name for the token
            expires_in_days: Optional expiration in days. ``None`` or ``0``
                produces a token without an expiration claim.

        Returns:
            Dict containing token info including the JWT token: ``token_id``,
            ``token`` (with ``sky_`` prefix), ``token_hash`` (SHA-256 hex
            digest of the full token), ``creator_user_id``,
            ``service_account_user_id``, ``token_name``, ``created_at`` and
            ``expires_at`` (Unix seconds, or ``None``).
        """
        now = datetime.datetime.now(datetime.timezone.utc)
        issued_at = int(now.timestamp())
        token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT

        # Build minimal JWT payload with single-character field names for
        # compactness
        payload = {
            'i': JWT_ISSUER,  # Issuer (use constant)
            't': issued_at,  # Issued at (shortened from 'iat')
            # Service account user ID (shortened from 'sub')
            'u': service_account_user_id,
            'k': token_id,  # Token ID (shortened from 'token_id')
            'y': 'sa',  # Type: service account (shortened from 'type')
        }

        # Add expiration if specified
        expires_at = None
        if expires_in_days:
            exp_time = now + datetime.timedelta(days=expires_in_days)
            expires_at = int(exp_time.timestamp())
            payload['e'] = expires_at  # Expiration (shortened from 'exp')

        # Generate JWT
        jwt_token = jwt.encode(payload,
                               self.secret_key,
                               algorithm=JWT_ALGORITHM)

        # Create token with SkyPilot prefix
        full_token = f'sky_{jwt_token}'

        # Generate hash for database storage (we still hash the full token)
        token_hash = hashlib.sha256(full_token.encode()).hexdigest()

        return {
            'token_id': token_id,
            'token': full_token,
            'token_hash': token_hash,
            'creator_user_id': creator_user_id,
            'service_account_user_id': service_account_user_id,
            'token_name': token_name,
            'created_at': issued_at,
            'expires_at': expires_at,
        }

    def verify_token(self, token: str) -> Optional[Dict[str, Any]]:
        """Verify and decode a JWT token.

        Checks the ``sky_`` prefix, the signature, the issuer ('i'), the
        token type ('y') and, when present, the expiration ('e').

        Args:
            token: The full token (with sky_ prefix)

        Returns:
            Decoded token payload with standard claim names (``iss``,
            ``iat``, ``sub``, ``token_id``, ``type`` and optionally ``exp``),
            or None if the token is invalid or expired.
        """
        if not token.startswith('sky_'):
            return None

        # Remove the sky_ prefix
        jwt_token = token[len('sky_'):]

        try:
            # Decode and verify JWT (without issuer verification)
            payload = jwt.decode(jwt_token,
                                 self.secret_key,
                                 algorithms=[JWT_ALGORITHM])
        except jwt.ExpiredSignatureError:
            logger.warning('Token has expired')
            return None
        except jwt.InvalidTokenError as e:
            logger.warning(f'Invalid token: {e}')
            return None

        # Manually verify issuer using our shortened field name
        token_issuer = payload.get('i')
        if token_issuer != JWT_ISSUER:
            logger.warning(f'Invalid token issuer: {token_issuer}')
            return None

        # Verify token type
        token_type = payload.get('y')
        if token_type != 'sa':
            logger.warning(f'Invalid token type: {token_type}')
            return None

        # Manually verify expiration. The library only validates the
        # registered 'exp' claim, and this token stores its expiration under
        # the shortened name 'e', so it has to be enforced here just like the
        # issuer above.
        expires_at = payload.get('e')
        if expires_at is not None:
            if not isinstance(expires_at, (int, float)):
                logger.warning(f'Invalid token expiration: {expires_at!r}')
                return None
            now_ts = datetime.datetime.now(datetime.timezone.utc).timestamp()
            if expires_at <= now_ts:
                logger.warning('Token has expired')
                return None

        # Convert shortened field names back to standard names for
        # compatibility
        normalized_payload = {
            'iss': payload.get('i'),  # issuer
            'iat': payload.get('t'),  # issued at
            'sub': payload.get('u'),  # subject (service account user ID)
            'token_id': payload.get('k'),  # token ID
            'type': 'service_account',  # expand shortened type
        }

        # Add expiration if present
        if 'e' in payload:
            normalized_payload['exp'] = payload['e']

        return normalized_payload
|
193
|
+
|
194
|
+
|
195
|
+
# Singleton instance
# Constructing it here runs TokenService.__init__ when this module is
# imported, which acquires the JWT secret file lock and loads (or generates)
# the signing secret.
|
196
|
+
token_service = TokenService()
|
sky/utils/common_utils.py
CHANGED
@@ -71,11 +71,10 @@ def get_usage_run_id() -> str:
|
|
71
71
|
def is_valid_user_hash(user_hash: Optional[str]) -> bool:
    """Return whether ``user_hash`` has the shape of a valid user hash.

    Args:
        user_hash: Candidate user hash, or None.

    Returns:
        True if ``user_hash`` is a non-empty string that starts with an ASCII
        letter or digit and otherwise contains only ASCII letters, digits and
        hyphens; False for None or anything else.
    """
    if user_hash is None:
        return False
    # Must start with an alphanumeric character, followed by alphanumeric
    # characters and hyphens. This covers both old hex format (e.g., "abc123",
    # which may begin with a digit) and new service account format
    # (e.g., "sa-abc123-token-xyz").
    # fullmatch is used instead of match with a trailing '$' because '$' also
    # matches just before a final newline, which would accept "abc123\n".
    return re.fullmatch(r'[a-zA-Z0-9][a-zA-Z0-9-]*', user_hash) is not None
|
79
78
|
|
80
79
|
|
81
80
|
def generate_user_hash() -> str:
|
sky/utils/config_utils.py
CHANGED
@@ -226,3 +226,44 @@ def merge_k8s_configs(
|
|
226
226
|
base_config[key].extend(value)
|
227
227
|
else:
|
228
228
|
base_config[key] = value
|
229
|
+
|
230
|
+
|
231
|
+
def get_cloud_config_value_from_dict(
        dict_config: Dict[str, Any],
        cloud: str,
        keys: Tuple[str, ...],
        region: Optional[str] = None,
        default_value: Optional[Any] = None,
        override_configs: Optional[Dict[str, Any]] = None) -> Any:
    """Look up a nested per-cloud config value, honoring region overrides.

    Lookup order:
        1. When ``region`` is given and the cloud has a region-level section
           (only 'kubernetes', under 'context_configs'), read
           <cloud>/<region_key>/<region>/keys.
        2. Read the cloud-level value <cloud>/keys, falling back to
           ``default_value``.
        3. For kubernetes, when both values are dicts, the region-level dict
           is merged into the cloud-level dict with ``merge_k8s_configs`` and
           the cloud-level dict is returned. Otherwise the region-level value
           is returned when it is not None, else the cloud-level value.

    Args:
        dict_config: Raw config dictionary to read from.
        cloud: Cloud name used as the top-level key.
        keys: Nested key path below the cloud (or region) section.
        region: Optional region / context name.
        default_value: Value used when the cloud-level key is missing.
        override_configs: Passed through to ``Config.get_nested``.

    Returns:
        The resolved config value.
    """
    config = Config(dict_config)
    is_kubernetes = cloud == 'kubernetes'
    # Only kubernetes defines a region-level section.
    region_key = 'context_configs' if is_kubernetes else None

    region_value = None
    if region is not None and region_key is not None:
        region_path = (cloud, region_key, region) + keys
        region_value = config.get_nested(keys=region_path,
                                         default_value=None,
                                         override_configs=override_configs)

    cloud_path = (cloud,) + keys
    cloud_value = config.get_nested(keys=cloud_path,
                                    default_value=default_value,
                                    override_configs=override_configs)

    both_are_dicts = (isinstance(cloud_value, dict) and
                      isinstance(region_value, dict))
    if is_kubernetes and both_are_dicts:
        merge_k8s_configs(cloud_value, region_value)
        return cloud_value

    if region_value is not None:
        return region_value
    return cloud_value
|
sky/utils/controller_utils.py
CHANGED
@@ -733,7 +733,11 @@ def _setup_proxy_command_on_controller(
|
|
733
733
|
config = config_utils.Config.from_dict(user_config)
|
734
734
|
proxy_command_key = (str(controller_launched_cloud).lower(),
|
735
735
|
'ssh_proxy_command')
|
736
|
-
ssh_proxy_command =
|
736
|
+
ssh_proxy_command = skypilot_config.get_effective_region_config(
|
737
|
+
cloud=str(controller_launched_cloud).lower(),
|
738
|
+
region=None,
|
739
|
+
keys=('ssh_proxy_command',),
|
740
|
+
default_value=None)
|
737
741
|
if isinstance(ssh_proxy_command, str):
|
738
742
|
config.set_nested(proxy_command_key, None)
|
739
743
|
elif isinstance(ssh_proxy_command, dict):
|
sky/utils/log_utils.py
CHANGED
@@ -573,6 +573,74 @@ def readable_time_duration(start: Optional[float],
|
|
573
573
|
return diff
|
574
574
|
|
575
575
|
|
576
|
+
def human_duration(start: int, end: Optional[int] = None) -> str:
    """Calculates the time elapsed between two timestamps and returns
    it as a human-readable string, similar to Kubernetes' duration format.

    The precision shrinks as the duration grows: seconds below 2 minutes,
    minutes (plus seconds below 10 minutes) below 3 hours, hours (plus
    minutes below 8 hours) below 2 days, days (plus hours below 8 days)
    below 2 years, and years (plus days below 8 years) beyond that.

    Args:
        start: The start time as a Unix timestamp (seconds since epoch).
        end: The end time as a Unix timestamp (seconds since epoch).
            If None, current time is used.

    Returns:
        A string representing the duration, e.g., "2d3h", "15m", "30s".
        Returns "0s" for zero, negative durations, or if the timestamp
        is invalid.
    """
    if not start or start <= 0:
        return '0s'

    if end is None:
        end = int(time.time())
    # Work in whole seconds so float timestamps never leak fractional
    # seconds into the output.
    total_seconds = int(end - start)

    if total_seconds <= 0:
        return '0s'
    if total_seconds < 60 * 2:
        return f'{total_seconds}s'

    minutes, seconds = divmod(total_seconds, 60)
    if minutes < 10:
        return f'{minutes}m{seconds}s' if seconds else f'{minutes}m'
    if minutes < 60 * 3:
        return f'{minutes}m'

    hours, leftover_minutes = divmod(minutes, 60)
    if hours < 8:
        if leftover_minutes:
            return f'{hours}h{leftover_minutes}m'
        return f'{hours}h'
    if hours < 48:
        return f'{hours}h'

    days, leftover_hours = divmod(hours, 24)
    if hours < 24 * 8:
        return f'{days}d{leftover_hours}h' if leftover_hours else f'{days}d'
    if hours < 24 * 365 * 2:
        return f'{days}d'

    years, leftover_days = divmod(days, 365)
    if hours < 24 * 365 * 8 and leftover_days:
        return f'{years}y{leftover_days}d'
    return f'{years}y'
|
642
|
+
|
643
|
+
|
576
644
|
def follow_logs(
|
577
645
|
file: TextIO,
|
578
646
|
*,
|
@@ -0,0 +1,153 @@
|
|
1
|
+
"""Resource checking utilities for finding active clusters and managed jobs."""
|
2
|
+
|
3
|
+
import concurrent.futures
|
4
|
+
from typing import Any, Callable, Dict, List, Tuple
|
5
|
+
|
6
|
+
from sky import exceptions
|
7
|
+
from sky import global_user_state
|
8
|
+
from sky import sky_logging
|
9
|
+
from sky.skylet import constants
|
10
|
+
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
def check_no_active_resources_for_users(
        user_operations: List[Tuple[str, str]]) -> None:
    """Ensure none of the given users own active clusters or managed jobs.

    Args:
        user_operations: (user_id, operation) pairs; operation is
            'update' or 'delete'.

    Raises:
        ValueError: If any user has active clusters or managed jobs.
            The message lists every offending user.
    """
    if user_operations:

        def _owned_by(user_id: str):
            """Build a predicate matching resources owned by ``user_id``."""

            def _matches(resource):
                return resource.get('user_hash') == user_id

            return _matches

        _check_active_resources(user_operations, _owned_by, 'user')
|
33
|
+
|
34
|
+
|
35
|
+
def check_no_active_resources_for_workspaces(
        workspace_operations: List[Tuple[str, str]]) -> None:
    """Ensure none of the given workspaces hold active clusters or jobs.

    Args:
        workspace_operations: (workspace_name, operation) pairs; operation
            is 'update' or 'delete'.

    Raises:
        ValueError: If any workspace has active clusters or managed jobs.
            The message lists every offending workspace.
    """
    if workspace_operations:

        def _in_workspace(workspace_name: str):
            """Build a predicate matching resources in ``workspace_name``."""

            def _matches(resource):
                # Resources without a 'workspace' entry count as belonging
                # to the default workspace.
                resource_workspace = resource.get(
                    'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
                return resource_workspace == workspace_name

            return _matches

        _check_active_resources(workspace_operations, _in_workspace,
                                'workspace')
|
57
|
+
|
58
|
+
|
59
|
+
def _check_active_resources(resource_operations: List[Tuple[str, str]],
                            filter_factory: Callable[[str],
                                                     Callable[[Dict[str, Any]],
                                                              bool]],
                            resource_type: str) -> None:
    """Reject operations on entities that still have clusters or jobs.

    All clusters and all unfinished managed jobs are fetched once (in
    parallel) and then matched against every requested entity.

    Args:
        resource_operations: (resource_name, operation) pairs; operation is
            'update' or 'delete'.
        filter_factory: Given a resource_name, returns a predicate applied to
            each cluster / managed job record.
        resource_type: Label used in the error message ('user' or
            'workspace').

    Raises:
        ValueError: If any entity has active clusters or managed jobs. A
            single message covers every offending entity.
    """

    def _fetch_managed_jobs():
        # pylint: disable=import-outside-toplevel
        from sky.jobs.server import core as managed_jobs_core
        try:
            return managed_jobs_core.queue(refresh=False,
                                           skip_finished=True,
                                           all_users=True)
        except exceptions.ClusterNotUpError:
            logger.warning('All jobs should be finished.')
            return []

    # Run the two lookups concurrently and wait for both.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pending_clusters = pool.submit(global_user_state.get_clusters)
        pending_jobs = pool.submit(_fetch_managed_jobs)

        all_clusters = pending_clusters.result()
        all_managed_jobs = pending_jobs.result()

    # One entry per entity that still has something running.
    problems = []
    for resource_name, operation in resource_operations:
        matches = filter_factory(resource_name)

        cluster_names = [
            cluster['name'] for cluster in all_clusters if matches(cluster)
        ]
        job_ids = [str(job['job_id']) for job in all_managed_jobs if matches(job)]

        details = []
        if cluster_names:
            joined_clusters = ', '.join(cluster_names)
            details.append(
                f'{len(cluster_names)} active cluster(s): {joined_clusters}')
        if job_ids:
            joined_jobs = ', '.join(job_ids)
            details.append(f'{len(job_ids)} active managed job(s): '
                           f'{joined_jobs}')

        if not details:
            continue
        summary = ' and '.join(details)
        problems.append(f'Cannot {operation} {resource_type} {resource_name!r} '
                        f'because it has {summary}.')

    if not problems:
        return

    if len(problems) == 1:
        # Single entity: keep the message on one line.
        full_message = problems[0] + ' Please terminate these resources first.'
    else:
        # Several entities: bullet list, one line each.
        bullets = '\n'.join(f'• {msg}' for msg in problems)
        full_message = (f'Cannot proceed due to active resources in '
                        f'{len(problems)} {resource_type}(s):\n'
                        f'{bullets}'
                        '\nPlease terminate these resources first.')
    raise ValueError(full_message)
|
sky/utils/resources_utils.py
CHANGED
@@ -273,10 +273,18 @@ def need_to_query_reservations() -> bool:
|
|
273
273
|
clouds that do not use reservations.
|
274
274
|
"""
|
275
275
|
for cloud_str in registry.CLOUD_REGISTRY.keys():
|
276
|
-
cloud_specific_reservations =
|
277
|
-
(
|
278
|
-
|
279
|
-
|
276
|
+
cloud_specific_reservations = (
|
277
|
+
skypilot_config.get_effective_region_config(
|
278
|
+
cloud=cloud_str,
|
279
|
+
region=None,
|
280
|
+
keys=('specific_reservations',),
|
281
|
+
default_value=None))
|
282
|
+
cloud_prioritize_reservations = (
|
283
|
+
skypilot_config.get_effective_region_config(
|
284
|
+
cloud=cloud_str,
|
285
|
+
region=None,
|
286
|
+
keys=('prioritize_reservations',),
|
287
|
+
default_value=False))
|
280
288
|
if (cloud_specific_reservations is not None or
|
281
289
|
cloud_prioritize_reservations):
|
282
290
|
return True
|