skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +24 -2
- sky/backends/backend_utils.py +152 -59
- sky/backends/cloud_vm_ray_backend.py +56 -3
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +17 -6
- sky/client/common.py +5 -4
- sky/client/sdk.py +5 -0
- sky/client/sdk_async.py +8 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +8 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +58 -10
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/utils.py +48 -26
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/server/common.py +1 -2
- sky/server/daemons.py +6 -0
- sky/server/requests/executor.py +2 -1
- sky/server/requests/payloads.py +4 -1
- sky/server/server.py +67 -58
- sky/setup_files/dependencies.py +25 -8
- sky/setup_files/setup.py +2 -0
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +6 -0
- sky/utils/timeline.py +24 -93
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
@@ -21,6 +21,7 @@ import uuid
 import zipfile
 
 import aiofiles
+import anyio
 import fastapi
 from fastapi.middleware import cors
 import starlette.middleware.base
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
     client_file_mounts_dir = (
         common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
         'file_mounts')
-    client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
+    await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
 
     # Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
     # characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
         zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
     else:
         chunk_dir = client_file_mounts_dir / upload_id
-        chunk_dir.mkdir(parents=True, exist_ok=True)
+        await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
         zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
 
     try:
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
             await zip_file.write(data)
 
         logger.info(f'Uploaded zip file: {zip_file_path}')
-        unzip_file(zip_file_path, client_file_mounts_dir)
+        await unzip_file(zip_file_path, client_file_mounts_dir)
         if total_chunks > 1:
-            shutil.rmtree(chunk_dir)
+            await context_utils.to_thread(shutil.rmtree, chunk_dir)
         return payloads.UploadZipFileResponse(
             status=responses.UploadStatus.COMPLETED.value)
 
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
     return False
 
 
-def unzip_file(zip_file_path: pathlib.Path,
-               client_file_mounts_dir: pathlib.Path) -> None:
-    """Unzips a zip file."""
-    try:
-        with zipfile.ZipFile(zip_file_path, 'r') as zipf:
-            for member in zipf.infolist():
-                # Determine the new path
-                original_path = os.path.normpath(member.filename)
-                new_path = client_file_mounts_dir / original_path.lstrip('/')
-
-                if (member.external_attr >> 28) == 0xA:
-                    # Symlink. Read the target path and create a symlink.
-                    new_path.parent.mkdir(parents=True, exist_ok=True)
-                    target = zipf.read(member).decode()
-                    assert not os.path.isabs(target), target
-                    # Since target is a relative path, we need to check that it
-                    # is under `client_file_mounts_dir` for security.
-                    full_target_path = (new_path.parent / target).resolve()
-                    if not _is_relative_to(full_target_path,
-                                           client_file_mounts_dir):
-                        raise ValueError(f'Symlink target {target} leads to a '
-                                         'file not in userspace. Aborted.')
-
-                    if new_path.exists() or new_path.is_symlink():
-                        new_path.unlink(missing_ok=True)
-                    new_path.symlink_to(
-                        target,
-                        target_is_directory=member.filename.endswith('/'))
-                    continue
-
-                # Handle directories
-                if member.filename.endswith('/'):
-                    new_path.mkdir(parents=True, exist_ok=True)
-                    continue
-
-                # Handle files
-                new_path.parent.mkdir(parents=True, exist_ok=True)
-                with zipf.open(member) as member_file, new_path.open('wb') as f:
-                    # Use shutil.copyfileobj to copy files in chunks, so it does
-                    # not load the entire file into memory.
-                    shutil.copyfileobj(member_file, f)
-    except zipfile.BadZipFile as e:
-        logger.error(f'Bad zip file: {zip_file_path}')
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail=f'Invalid zip file: {common_utils.format_exception(e)}')
-    except Exception as e:
-        logger.error(f'Error unzipping file: {zip_file_path}')
-        raise fastapi.HTTPException(
-            status_code=500,
-            detail=(f'Error unzipping file: '
-                    f'{common_utils.format_exception(e)}'))
+async def unzip_file(zip_file_path: pathlib.Path,
+                     client_file_mounts_dir: pathlib.Path) -> None:
+    """Unzips a zip file without blocking the event loop."""

-
-
+    def _do_unzip() -> None:
+        try:
+            with zipfile.ZipFile(zip_file_path, 'r') as zipf:
+                for member in zipf.infolist():
+                    # Determine the new path
+                    original_path = os.path.normpath(member.filename)
+                    new_path = client_file_mounts_dir / original_path.lstrip(
+                        '/')
+
+                    if (member.external_attr >> 28) == 0xA:
+                        # Symlink. Read the target path and create a symlink.
+                        new_path.parent.mkdir(parents=True, exist_ok=True)
+                        target = zipf.read(member).decode()
+                        assert not os.path.isabs(target), target
+                        # Since target is a relative path, we need to check that
+                        # it is under `client_file_mounts_dir` for security.
+                        full_target_path = (new_path.parent / target).resolve()
+                        if not _is_relative_to(full_target_path,
+                                               client_file_mounts_dir):
+                            raise ValueError(
+                                f'Symlink target {target} leads to a '
+                                'file not in userspace. Aborted.')
+
+                        if new_path.exists() or new_path.is_symlink():
+                            new_path.unlink(missing_ok=True)
+                        new_path.symlink_to(
+                            target,
+                            target_is_directory=member.filename.endswith('/'))
+                        continue
+
+                    # Handle directories
+                    if member.filename.endswith('/'):
+                        new_path.mkdir(parents=True, exist_ok=True)
+                        continue
+
+                    # Handle files
+                    new_path.parent.mkdir(parents=True, exist_ok=True)
+                    with zipf.open(member) as member_file, new_path.open(
+                            'wb') as f:
+                        # Use shutil.copyfileobj to copy files in chunks,
+                        # so it does not load the entire file into memory.
+                        shutil.copyfileobj(member_file, f)
+        except zipfile.BadZipFile as e:
+            logger.error(f'Bad zip file: {zip_file_path}')
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Invalid zip file: {common_utils.format_exception(e)}')
+        except Exception as e:
+            logger.error(f'Error unzipping file: {zip_file_path}')
+            raise fastapi.HTTPException(
+                status_code=500,
+                detail=(f'Error unzipping file: '
+                        f'{common_utils.format_exception(e)}'))
+        finally:
+            # Cleanup the temporary file regardless of
+            # success/failure handling above
+            zip_file_path.unlink(missing_ok=True)
+
+    await context_utils.to_thread(_do_unzip)


 @app.post('/launch')
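The pattern behind these changes is worth spelling out: upload_zip_file is an async FastAPI handler, so every blocking filesystem call (directory creation, unzipping, chunk cleanup) is now pushed off the event loop, either through anyio.Path or through SkyPilot's context_utils.to_thread helper. A minimal sketch of the same idea, using the stdlib asyncio.to_thread as a stand-in for that helper and a plain extractall instead of the member-by-member, symlink-checked extraction above; the function and path names here are made up for illustration:

# Sketch only: keep blocking filesystem work off the event loop.
import asyncio
import pathlib
import shutil
import zipfile


def _extract(zip_path: pathlib.Path, dest: pathlib.Path) -> None:
    # Blocking work: runs in a worker thread, not on the event loop.
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(dest)
    zip_path.unlink(missing_ok=True)


async def handle_upload(zip_path: pathlib.Path, dest: pathlib.Path) -> None:
    # Directory creation and extraction are both offloaded to threads,
    # mirroring the anyio.Path / to_thread calls in the diff above.
    await asyncio.to_thread(dest.mkdir, parents=True, exist_ok=True)
    await asyncio.to_thread(_extract, zip_path, dest)


# Example invocation (requires an actual upload.zip on disk):
# asyncio.run(handle_upload(pathlib.Path('upload.zip'), pathlib.Path('out')))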
sky/setup_files/dependencies.py
CHANGED
@@ -8,8 +8,12 @@ This file is imported by setup.py, so:
 import sys
 from typing import Dict, List
 
+clouds_with_ray = ['ibm', 'docker', 'scp']
+
 install_requires = [
     'wheel<0.46.0',  # https://github.com/skypilot-org/skypilot/issues/5153
+    'setuptools',  # TODO: match version to pyproject.toml once #5153 is fixed
+    'pip',
     'cachetools',
     # NOTE: ray requires click>=7.0.
     # click 8.2.0 has a bug in parsing the command line arguments:
@@ -71,6 +75,7 @@ install_requires = [
     'types-paramiko',
     'alembic',
     'aiohttp',
+    'anyio',
 ]
 
 # See requirements-dev.txt for the version of grpc and protobuf
@@ -92,6 +97,7 @@ server_dependencies = [
     'passlib',
     'pyjwt',
     'aiohttp',
+    'anyio',
     GRPC,
     PROTOBUF,
 ]
@@ -143,7 +149,7 @@ extras_require: Dict[str, List[str]] = {
         'azure-storage-blob>=12.23.1',
         'msgraph-sdk',
         'msrestazure',
-    ]
+    ],
     # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
     # parameter for stopping instances. Reference:
     # https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
@@ -164,7 +170,7 @@ extras_require: Dict[str, List[str]] = {
     'lambda': [],  # No dependencies needed for lambda
     'cloudflare': aws_dependencies,
     'scp': local_ray,
-    'oci': ['oci']
+    'oci': ['oci'],
     # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333  # pylint: disable=line-too-long
     'kubernetes': [
         'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
@@ -195,10 +201,21 @@ extras_require: Dict[str, List[str]] = {
     'server': server_dependencies,
 }
 
-#
+# Calculate which clouds should be included in the [all] installation.
+clouds_for_all = set(extras_require)
+clouds_for_all.remove('remote')
+
 if sys.version_info < (3, 10):
-
-
-
-
-
+    # Nebius needs python3.10. If python 3.9 [all] will not install nebius
+    clouds_for_all.remove('nebius')
+
+if sys.version_info >= (3, 12):
+    # The version of ray we use does not work with >= 3.12, so avoid clouds
+    # that require ray.
+    clouds_for_all -= set(clouds_with_ray)
+    # vast requires setuptools==51.1.1 which will not work with python >= 3.12
+    # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
+    clouds_for_all.remove('vast')
+
+extras_require['all'] = list(
+    set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
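The new block replaces the old hard-coded [all] extra with a computed union of the per-cloud extras, dropping clouds that cannot be installed on the running interpreter (nebius below Python 3.10; ray-dependent clouds and vast on Python 3.12+). A toy sketch of the same computation with made-up extras, just to show what ends up in [all] on a given interpreter:

# Toy illustration of the [all] computation; the dependency lists here are
# invented for the example and do not reflect SkyPilot's real extras.
import sys

extras_require = {
    'aws': ['boto3'],
    'ibm': ['ibm-cloud-sdk-core', 'ray'],
    'nebius': ['nebius'],
    'vast': ['vastai-sdk'],
    'remote': ['grpcio'],
}
clouds_with_ray = ['ibm', 'docker', 'scp']

clouds_for_all = set(extras_require)
clouds_for_all.remove('remote')
if sys.version_info < (3, 10):
    clouds_for_all.remove('nebius')
if sys.version_info >= (3, 12):
    clouds_for_all -= set(clouds_with_ray)
    clouds_for_all.remove('vast')

all_extra = sorted(set().union(*[extras_require[c] for c in clouds_for_all]))
print(all_extra)  # e.g. on Python 3.12: ['boto3', 'nebius']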
sky/setup_files/setup.py
CHANGED
@@ -178,6 +178,8 @@ setuptools.setup(
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+        'Programming Language :: Python :: 3.13',
         'License :: OSI Approved :: Apache Software License',
         'Operating System :: OS Independent',
         'Topic :: Software Development :: Libraries :: Python Modules',
sky/sky_logging.py
CHANGED
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
 _DATE_FORMAT = '%m-%d %H:%M:%S'
 _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
 
+_DEBUG_LOG_DIR = os.path.expanduser(
+    os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
+
 DEBUG = logging.DEBUG
 INFO = logging.INFO
 WARNING = logging.WARNING
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
     log_path = os.path.expanduser(os.path.join(log_dir, file_name))
 
     return log_path
+
+
+@contextlib.contextmanager
+def add_debug_log_handler(request_id: str):
+    if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
+        yield
+        return
+
+    os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
+    log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
+    try:
+        debug_log_handler = logging.FileHandler(log_path)
+        debug_log_handler.setFormatter(FORMATTER)
+        debug_log_handler.setLevel(logging.DEBUG)
+        _root_logger.addHandler(debug_log_handler)
+        # sky.provision sets up its own logger/handler with propagate=False,
+        # so add it there too.
+        provision_logger = logging.getLogger('sky.provision')
+        provision_logger.addHandler(debug_log_handler)
+        provision_logger.setLevel(logging.DEBUG)
+        yield
+    finally:
+        _root_logger.removeHandler(debug_log_handler)
+        provision_logger.removeHandler(debug_log_handler)
+        debug_log_handler.close()
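add_debug_log_handler attaches a per-request DEBUG FileHandler to the root logger (and to sky.provision, which does not propagate) only when the SKYPILOT_SERVER_ENABLE_REQUEST_DEBUG_LOGGING environment variable is 'true', and detaches it again when the block exits. A hypothetical call site, just to show the intended shape; the wrapper function below is not a SkyPilot API:

# Hypothetical usage sketch: handle_request is a placeholder, not SkyPilot code.
from sky import sky_logging

logger = sky_logging.init_logger(__name__)


def handle_request(request_id: str) -> None:
    with sky_logging.add_debug_log_handler(request_id):
        # Anything logged while the handler is attached (including the
        # sky.provision loggers) is also written to the per-request file
        # under the request_debug log directory.
        logger.debug('processing request %s', request_id)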
sky/skylet/constants.py
CHANGED
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
 
 # Prefix for SkyPilot environment variables
 SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
+SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
 
 # The name for the environment variable that stores the unique ID of the
 # current task. This will stay the same across multiple recoveries of the
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
 # Path to the generated cluster config yamls and ssh configs.
 SKY_USER_FILE_PATH = '~/.sky/generated'
 
+# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
 
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
 SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
 ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
 
+# Enable debug logging for requests.
+ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
+    f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
+
 SKYPILOT_DEFAULT_WORKSPACE = 'default'
 
 # BEGIN constants used for service catalog.
sky/templates/aws-ray.yml.j2
CHANGED
sky/utils/annotations.py
CHANGED
@@ -7,7 +7,7 @@ from typing_extensions import ParamSpec
 
 # Whether the current process is a SkyPilot API server process.
 is_on_api_server = True
-
+_FUNCTIONS_NEED_RELOAD_CACHE = []
 
 T = TypeVar('T')
 P = ParamSpec('P')
@@ -50,7 +50,13 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
         else:
             cached_func = functools.lru_cache(*lru_cache_args,
                                               **lru_cache_kwargs)(func)
-
+            _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
             return cached_func
 
     return decorator
+
+
+def clear_request_level_cache():
+    """Clear the request-level cache."""
+    for func in _FUNCTIONS_NEED_RELOAD_CACHE:
+        func.cache_clear()
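Taken together, these changes give request-scoped lru_cache results a registry so the server can drop them wholesale. A small sketch of how that is meant to be used, based only on the signatures above; clearing at the end of a request is an assumption here, and the cached function is invented for the example:

# Usage sketch for the request-scoped cache registry (assumed call pattern).
from sky.utils import annotations


@annotations.lru_cache(scope='request', maxsize=1)
def expensive_lookup(key: str) -> str:
    # Stand-in for a slow config or cloud-metadata read.
    return key.upper()


expensive_lookup('a')                     # computed and cached
expensive_lookup('a')                     # served from the per-request cache
annotations.clear_request_level_cache()   # presumably run between requests
expensive_lookup('a')                     # recomputed after the cache is cleared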
sky/utils/cluster_utils.py
CHANGED
@@ -11,7 +11,7 @@ import uuid
 from sky.skylet import constants
 from sky.utils import command_runner
 from sky.utils import common_utils
-from sky.utils import timeline
+from sky.utils import lock_events
 
 # The cluster yaml used to create the current cluster where the module is
 # called.
@@ -107,7 +107,7 @@ class SSHConfigHelper(object):
         return auth_config['ssh_private_key']
 
     @classmethod
-    @timeline.FileLockEvent(ssh_conf_lock_path)
+    @lock_events.FileLockEvent(ssh_conf_lock_path)
     def add_cluster(
         cls,
         cluster_name: str,
@@ -334,7 +334,7 @@ class SSHConfigHelper(object):
             cluster_name: Cluster name.
         """
 
-        with timeline.FileLockEvent(
+        with lock_events.FileLockEvent(
                 cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
             cluster_config_path = os.path.expanduser(
                 cls.ssh_cluster_path.format(cluster_name))
sky/utils/db/migration_utils.py
CHANGED
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '006'
+GLOBAL_USER_STATE_VERSION = '007'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
sky/utils/kubernetes_enums.py
CHANGED
sky/utils/lock_events.py
ADDED
@@ -0,0 +1,94 @@
+"""Lock events."""
+
+import functools
+import os
+from typing import Optional, Union
+
+import filelock
+
+from sky.utils import locks
+from sky.utils import timeline
+
+
+class DistributedLockEvent:
+    """Serve both as a distributed lock and event for the lock."""
+
+    def __init__(self, lock_id: str, timeout: Optional[float] = None):
+        self._lock_id = lock_id
+        self._lock = locks.get_lock(lock_id, timeout)
+        self._hold_lock_event = timeline.Event(
+            f'[DistributedLock.hold]:{lock_id}')
+
+    def acquire(self):
+        was_locked = self._lock.is_locked
+        with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
+            self._lock.acquire()
+        if not was_locked and self._lock.is_locked:
+            # start holding the lock after initial acquiring
+            self._hold_lock_event.begin()
+
+    def release(self):
+        was_locked = self._lock.is_locked
+        self._lock.release()
+        if was_locked and not self._lock.is_locked:
+            # stop holding the lock after initial releasing
+            self._hold_lock_event.end()
+
+    def __enter__(self):
+        self.acquire()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.release()
+
+    def __call__(self, f):
+
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+            with self:
+                return f(*args, **kwargs)
+
+        return wrapper
+
+
+class FileLockEvent:
+    """Serve both as a file lock and event for the lock."""
+
+    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
+        self._lockfile = lockfile
+        os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
+                    exist_ok=True)
+        self._lock = filelock.FileLock(self._lockfile, timeout)
+        self._hold_lock_event = timeline.Event(
+            f'[FileLock.hold]:{self._lockfile}')
+
+    def acquire(self):
+        was_locked = self._lock.is_locked
+        with timeline.Event(f'[FileLock.acquire]:{self._lockfile}'):
+            self._lock.acquire()
+        if not was_locked and self._lock.is_locked:
+            # start holding the lock after initial acquiring
+            self._hold_lock_event.begin()
+
+    def release(self):
+        was_locked = self._lock.is_locked
+        self._lock.release()
+        if was_locked and not self._lock.is_locked:
+            # stop holding the lock after initial releasing
+            self._hold_lock_event.end()
+
+    def __enter__(self):
+        self.acquire()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.release()
+
+    def __call__(self, f):
+        # Make this class callable as a decorator.
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+            with self:
+                return f(*args, **kwargs)
+
+        return wrapper
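FileLockEvent and DistributedLockEvent moved here from sky/utils/timeline.py; each instance is both a lock and a pair of timeline events covering the acquire and hold phases. A short usage sketch, with an arbitrary lock file path chosen for the example:

# Usage sketch: FileLockEvent works as a decorator and as a context manager.
from sky.utils import lock_events

_LOCK_PATH = '/tmp/skypilot_example.lock'  # arbitrary path for the example


@lock_events.FileLockEvent(_LOCK_PATH)
def update_shared_file() -> None:
    # Body runs while the file lock is held; acquire and hold durations are
    # recorded as timeline events when timeline recording is enabled.
    pass


def another_writer() -> None:
    with lock_events.FileLockEvent(_LOCK_PATH, timeout=10):
        pass  # Critical section guarded by the same lock file.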
sky/utils/schemas.py
CHANGED
@@ -1410,6 +1410,9 @@ def get_config_schema():
                 **_NETWORK_CONFIG_SCHEMA, 'tenant_id': {
                     'type': 'string',
                 },
+                'domain': {
+                    'type': 'string',
+                },
                 'region_configs': {
                     'type': 'object',
                     'required': [],
@@ -1668,6 +1671,9 @@ def get_config_schema():
                 'tenant_id': {
                     'type': 'string',
                 },
+                'domain': {
+                    'type': 'string',
+                },
                 'disabled': {
                     'type': 'boolean'
                 },
sky/utils/timeline.py
CHANGED
@@ -4,7 +4,6 @@ The timeline follows the trace event format defined here:
 https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
 """  # pylint: disable=line-too-long
 import atexit
-import functools
 import json
 import os
 import threading
@@ -12,14 +11,15 @@ import time
 import traceback
 from typing import Callable, Optional, Union
 
-import filelock
-
 from sky.utils import common_utils
-from sky.utils import locks
 
 _events = []
 
 
+def _get_events_file_path():
+    return os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
+
+
 class Event:
     """Record an event.
 
@@ -29,6 +29,10 @@ class Event:
     """
 
     def __init__(self, name: str, message: Optional[str] = None):
+        self._skipped = False
+        if not _get_events_file_path():
+            self._skipped = True
+            return
         self._name = name
         self._message = message
         # See the module doc for the event format.
@@ -45,6 +49,8 @@
             self._event['args'] = {'message': self._message}
 
     def begin(self):
+        if self._skipped:
+            return
         event_begin = self._event.copy()
         event_begin.update({
             'ph': 'B',
@@ -56,6 +62,8 @@
         _events.append(event_begin)
 
     def end(self):
+        if self._skipped:
+            return
         event_end = self._event.copy()
         event_end.update({
             'ph': 'E',
@@ -77,103 +85,26 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
     return common_utils.make_decorator(Event, name_or_fn, message=message)
 
 
-class DistributedLockEvent:
-    """Serve both as a distributed lock and event for the lock."""
-
-    def __init__(self, lock_id: str, timeout: Optional[float] = None):
-        self._lock_id = lock_id
-        self._lock = locks.get_lock(lock_id, timeout)
-        self._hold_lock_event = Event(f'[DistributedLock.hold]:{lock_id}')
-
-    def acquire(self):
-        was_locked = self._lock.is_locked
-        with Event(f'[DistributedLock.acquire]:{self._lock_id}'):
-            self._lock.acquire()
-        if not was_locked and self._lock.is_locked:
-            # start holding the lock after initial acquiring
-            self._hold_lock_event.begin()
-
-    def release(self):
-        was_locked = self._lock.is_locked
-        self._lock.release()
-        if was_locked and not self._lock.is_locked:
-            # stop holding the lock after initial releasing
-            self._hold_lock_event.end()
-
-    def __enter__(self):
-        self.acquire()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.release()
-
-    def __call__(self, f):
-
-        @functools.wraps(f)
-        def wrapper(*args, **kwargs):
-            with self:
-                return f(*args, **kwargs)
-
-        return wrapper
-
-
-class FileLockEvent:
-    """Serve both as a file lock and event for the lock."""
-
-    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
-        self._lockfile = lockfile
-        os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
-                    exist_ok=True)
-        self._lock = filelock.FileLock(self._lockfile, timeout)
-        self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
-
-    def acquire(self):
-        was_locked = self._lock.is_locked
-        with Event(f'[FileLock.acquire]:{self._lockfile}'):
-            self._lock.acquire()
-        if not was_locked and self._lock.is_locked:
-            # start holding the lock after initial acquiring
-            self._hold_lock_event.begin()
-
-    def release(self):
-        was_locked = self._lock.is_locked
-        self._lock.release()
-        if was_locked and not self._lock.is_locked:
-            # stop holding the lock after initial releasing
-            self._hold_lock_event.end()
-
-    def __enter__(self):
-        self.acquire()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.release()
-
-    def __call__(self, f):
-        # Make this class callable as a decorator.
-        @functools.wraps(f)
-        def wrapper(*args, **kwargs):
-            with self:
-                return f(*args, **kwargs)
-
-        return wrapper
-
-
 def save_timeline():
-
-    if not
+    events_file_path = _get_events_file_path()
+    if not events_file_path:
         return
+    global _events
+    events_to_write = _events
+    _events = []
     json_output = {
-        'traceEvents':
+        'traceEvents': events_to_write,
         'displayTimeUnit': 'ms',
         'otherData': {
-            'log_dir': os.path.dirname(os.path.abspath(
+            'log_dir': os.path.dirname(os.path.abspath(events_file_path)),
         }
     }
-    os.makedirs(os.path.dirname(os.path.abspath(
-
+    os.makedirs(os.path.dirname(os.path.abspath(events_file_path)),
+                exist_ok=True)
+    with open(events_file_path, 'w', encoding='utf-8') as f:
         json.dump(json_output, f)
+    del events_to_write
 
 
-if
+if _get_events_file_path():
     atexit.register(save_timeline)
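The net effect of the timeline.py changes: event recording is now opt-in via the SKYPILOT_TIMELINE_FILE_PATH environment variable, Event construction short-circuits into a no-op when it is unset, and save_timeline writes (and frees) the collected events at interpreter exit. A small sketch of the gating behavior; the output path and the function below are arbitrary examples:

# Sketch of the opt-in timeline recording.
import os

# The env var must be set before sky.utils.timeline is imported, since the
# atexit hook registration is decided at import time.
os.environ['SKYPILOT_TIMELINE_FILE_PATH'] = '/tmp/sky_timeline.json'

from sky.utils import timeline  # noqa: E402


def provision_cluster() -> None:
    with timeline.Event('[example]:provision', message='doing work'):
        pass  # Timed work goes here.


provision_cluster()
# save_timeline() runs via the atexit hook and writes the trace to the file
# above; without the env var, Event() construction skips recording entirely.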