skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (59) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +24 -2
  3. sky/backends/backend_utils.py +152 -59
  4. sky/backends/cloud_vm_ray_backend.py +56 -3
  5. sky/backends/wheel_utils.py +35 -8
  6. sky/client/cli/command.py +17 -6
  7. sky/client/common.py +5 -4
  8. sky/client/sdk.py +5 -0
  9. sky/client/sdk_async.py +8 -2
  10. sky/clouds/aws.py +118 -1
  11. sky/core.py +8 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +58 -10
  30. sky/provision/aws/config.py +78 -3
  31. sky/provision/aws/instance.py +45 -6
  32. sky/provision/docker_utils.py +1 -1
  33. sky/provision/kubernetes/utils.py +48 -26
  34. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  35. sky/server/common.py +1 -2
  36. sky/server/daemons.py +6 -0
  37. sky/server/requests/executor.py +2 -1
  38. sky/server/requests/payloads.py +4 -1
  39. sky/server/server.py +67 -58
  40. sky/setup_files/dependencies.py +25 -8
  41. sky/setup_files/setup.py +2 -0
  42. sky/sky_logging.py +28 -0
  43. sky/skylet/constants.py +6 -0
  44. sky/templates/aws-ray.yml.j2 +1 -0
  45. sky/utils/annotations.py +8 -2
  46. sky/utils/cluster_utils.py +3 -3
  47. sky/utils/db/migration_utils.py +1 -1
  48. sky/utils/kubernetes_enums.py +1 -0
  49. sky/utils/lock_events.py +94 -0
  50. sky/utils/schemas.py +6 -0
  51. sky/utils/timeline.py +24 -93
  52. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
  53. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
  54. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -21,6 +21,7 @@ import uuid
21
21
  import zipfile
22
22
 
23
23
  import aiofiles
24
+ import anyio
24
25
  import fastapi
25
26
  from fastapi.middleware import cors
26
27
  import starlette.middleware.base
@@ -847,7 +848,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
847
848
  client_file_mounts_dir = (
848
849
  common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
849
850
  'file_mounts')
850
- client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
851
+ await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
851
852
 
852
853
  # Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
853
854
  # characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
@@ -870,7 +871,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
870
871
  zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
871
872
  else:
872
873
  chunk_dir = client_file_mounts_dir / upload_id
873
- chunk_dir.mkdir(parents=True, exist_ok=True)
874
+ await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
874
875
  zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
875
876
 
876
877
  try:
@@ -916,9 +917,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
916
917
  await zip_file.write(data)
917
918
 
918
919
  logger.info(f'Uploaded zip file: {zip_file_path}')
919
- unzip_file(zip_file_path, client_file_mounts_dir)
920
+ await unzip_file(zip_file_path, client_file_mounts_dir)
920
921
  if total_chunks > 1:
921
- shutil.rmtree(chunk_dir)
922
+ await context_utils.to_thread(shutil.rmtree, chunk_dir)
922
923
  return payloads.UploadZipFileResponse(
923
924
  status=responses.UploadStatus.COMPLETED.value)
924
925
 
@@ -933,61 +934,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
933
934
  return False
934
935
 
935
936
 
936
- def unzip_file(zip_file_path: pathlib.Path,
937
- client_file_mounts_dir: pathlib.Path) -> None:
938
- """Unzips a zip file."""
939
- try:
940
- with zipfile.ZipFile(zip_file_path, 'r') as zipf:
941
- for member in zipf.infolist():
942
- # Determine the new path
943
- original_path = os.path.normpath(member.filename)
944
- new_path = client_file_mounts_dir / original_path.lstrip('/')
945
-
946
- if (member.external_attr >> 28) == 0xA:
947
- # Symlink. Read the target path and create a symlink.
948
- new_path.parent.mkdir(parents=True, exist_ok=True)
949
- target = zipf.read(member).decode()
950
- assert not os.path.isabs(target), target
951
- # Since target is a relative path, we need to check that it
952
- # is under `client_file_mounts_dir` for security.
953
- full_target_path = (new_path.parent / target).resolve()
954
- if not _is_relative_to(full_target_path,
955
- client_file_mounts_dir):
956
- raise ValueError(f'Symlink target {target} leads to a '
957
- 'file not in userspace. Aborted.')
958
-
959
- if new_path.exists() or new_path.is_symlink():
960
- new_path.unlink(missing_ok=True)
961
- new_path.symlink_to(
962
- target,
963
- target_is_directory=member.filename.endswith('/'))
964
- continue
965
-
966
- # Handle directories
967
- if member.filename.endswith('/'):
968
- new_path.mkdir(parents=True, exist_ok=True)
969
- continue
970
-
971
- # Handle files
972
- new_path.parent.mkdir(parents=True, exist_ok=True)
973
- with zipf.open(member) as member_file, new_path.open('wb') as f:
974
- # Use shutil.copyfileobj to copy files in chunks, so it does
975
- # not load the entire file into memory.
976
- shutil.copyfileobj(member_file, f)
977
- except zipfile.BadZipFile as e:
978
- logger.error(f'Bad zip file: {zip_file_path}')
979
- raise fastapi.HTTPException(
980
- status_code=400,
981
- detail=f'Invalid zip file: {common_utils.format_exception(e)}')
982
- except Exception as e:
983
- logger.error(f'Error unzipping file: {zip_file_path}')
984
- raise fastapi.HTTPException(
985
- status_code=500,
986
- detail=(f'Error unzipping file: '
987
- f'{common_utils.format_exception(e)}'))
937
+ async def unzip_file(zip_file_path: pathlib.Path,
938
+ client_file_mounts_dir: pathlib.Path) -> None:
939
+ """Unzips a zip file without blocking the event loop."""
988
940
 
989
- # Cleanup the temporary file
990
- zip_file_path.unlink()
941
+ def _do_unzip() -> None:
942
+ try:
943
+ with zipfile.ZipFile(zip_file_path, 'r') as zipf:
944
+ for member in zipf.infolist():
945
+ # Determine the new path
946
+ original_path = os.path.normpath(member.filename)
947
+ new_path = client_file_mounts_dir / original_path.lstrip(
948
+ '/')
949
+
950
+ if (member.external_attr >> 28) == 0xA:
951
+ # Symlink. Read the target path and create a symlink.
952
+ new_path.parent.mkdir(parents=True, exist_ok=True)
953
+ target = zipf.read(member).decode()
954
+ assert not os.path.isabs(target), target
955
+ # Since target is a relative path, we need to check that
956
+ # it is under `client_file_mounts_dir` for security.
957
+ full_target_path = (new_path.parent / target).resolve()
958
+ if not _is_relative_to(full_target_path,
959
+ client_file_mounts_dir):
960
+ raise ValueError(
961
+ f'Symlink target {target} leads to a '
962
+ 'file not in userspace. Aborted.')
963
+
964
+ if new_path.exists() or new_path.is_symlink():
965
+ new_path.unlink(missing_ok=True)
966
+ new_path.symlink_to(
967
+ target,
968
+ target_is_directory=member.filename.endswith('/'))
969
+ continue
970
+
971
+ # Handle directories
972
+ if member.filename.endswith('/'):
973
+ new_path.mkdir(parents=True, exist_ok=True)
974
+ continue
975
+
976
+ # Handle files
977
+ new_path.parent.mkdir(parents=True, exist_ok=True)
978
+ with zipf.open(member) as member_file, new_path.open(
979
+ 'wb') as f:
980
+ # Use shutil.copyfileobj to copy files in chunks,
981
+ # so it does not load the entire file into memory.
982
+ shutil.copyfileobj(member_file, f)
983
+ except zipfile.BadZipFile as e:
984
+ logger.error(f'Bad zip file: {zip_file_path}')
985
+ raise fastapi.HTTPException(
986
+ status_code=400,
987
+ detail=f'Invalid zip file: {common_utils.format_exception(e)}')
988
+ except Exception as e:
989
+ logger.error(f'Error unzipping file: {zip_file_path}')
990
+ raise fastapi.HTTPException(
991
+ status_code=500,
992
+ detail=(f'Error unzipping file: '
993
+ f'{common_utils.format_exception(e)}'))
994
+ finally:
995
+ # Clean up the temporary file regardless of
996
+ # success/failure handling above
997
+ zip_file_path.unlink(missing_ok=True)
998
+
999
+ await context_utils.to_thread(_do_unzip)
991
1000
 
992
1001
 
993
1002
  @app.post('/launch')
@@ -8,8 +8,12 @@ This file is imported by setup.py, so:
8
8
  import sys
9
9
  from typing import Dict, List
10
10
 
11
+ clouds_with_ray = ['ibm', 'docker', 'scp']
12
+
11
13
  install_requires = [
12
14
  'wheel<0.46.0', # https://github.com/skypilot-org/skypilot/issues/5153
15
+ 'setuptools', # TODO: match version to pyproject.toml once #5153 is fixed
16
+ 'pip',
13
17
  'cachetools',
14
18
  # NOTE: ray requires click>=7.0.
15
19
  # click 8.2.0 has a bug in parsing the command line arguments:
@@ -71,6 +75,7 @@ install_requires = [
71
75
  'types-paramiko',
72
76
  'alembic',
73
77
  'aiohttp',
78
+ 'anyio',
74
79
  ]
75
80
 
76
81
  # See requirements-dev.txt for the version of grpc and protobuf
@@ -92,6 +97,7 @@ server_dependencies = [
92
97
  'passlib',
93
98
  'pyjwt',
94
99
  'aiohttp',
100
+ 'anyio',
95
101
  GRPC,
96
102
  PROTOBUF,
97
103
  ]
@@ -143,7 +149,7 @@ extras_require: Dict[str, List[str]] = {
143
149
  'azure-storage-blob>=12.23.1',
144
150
  'msgraph-sdk',
145
151
  'msrestazure',
146
- ] + local_ray,
152
+ ],
147
153
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
148
154
  # parameter for stopping instances. Reference:
149
155
  # https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
@@ -164,7 +170,7 @@ extras_require: Dict[str, List[str]] = {
164
170
  'lambda': [], # No dependencies needed for lambda
165
171
  'cloudflare': aws_dependencies,
166
172
  'scp': local_ray,
167
- 'oci': ['oci'] + local_ray,
173
+ 'oci': ['oci'],
168
174
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
169
175
  'kubernetes': [
170
176
  'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
@@ -195,10 +201,21 @@ extras_require: Dict[str, List[str]] = {
195
201
  'server': server_dependencies,
196
202
  }
197
203
 
198
- # Nebius needs python3.10. If python 3.9 [all] will not install nebius
204
+ # Calculate which clouds should be included in the [all] installation.
205
+ clouds_for_all = set(extras_require)
206
+ clouds_for_all.remove('remote')
207
+
199
208
  if sys.version_info < (3, 10):
200
- filtered_keys = [k for k in extras_require if k != 'nebius']
201
- extras_require['all'] = sum(
202
- [v for k, v in extras_require.items() if k != 'nebius'], [])
203
- else:
204
- extras_require['all'] = sum(extras_require.values(), [])
209
+ # Nebius needs Python >= 3.10; on Python 3.9, [all] will not install nebius
210
+ clouds_for_all.remove('nebius')
211
+
212
+ if sys.version_info >= (3, 12):
213
+ # The version of ray we use does not work with >= 3.12, so avoid clouds
214
+ # that require ray.
215
+ clouds_for_all -= set(clouds_with_ray)
216
+ # vast requires setuptools==51.1.1 which will not work with python >= 3.12
217
+ # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
218
+ clouds_for_all.remove('vast')
219
+
220
+ extras_require['all'] = list(
221
+ set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
sky/setup_files/setup.py CHANGED
@@ -178,6 +178,8 @@ setuptools.setup(
178
178
  'Programming Language :: Python :: 3.9',
179
179
  'Programming Language :: Python :: 3.10',
180
180
  'Programming Language :: Python :: 3.11',
181
+ 'Programming Language :: Python :: 3.12',
182
+ 'Programming Language :: Python :: 3.13',
181
183
  'License :: OSI Approved :: Apache Software License',
182
184
  'Operating System :: OS Independent',
183
185
  'Topic :: Software Development :: Libraries :: Python Modules',
sky/sky_logging.py CHANGED
@@ -19,6 +19,9 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
19
19
  _DATE_FORMAT = '%m-%d %H:%M:%S'
20
20
  _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
21
21
 
22
+ _DEBUG_LOG_DIR = os.path.expanduser(
23
+ os.path.join(constants.SKY_LOGS_DIRECTORY, 'request_debug'))
24
+
22
25
  DEBUG = logging.DEBUG
23
26
  INFO = logging.INFO
24
27
  WARNING = logging.WARNING
@@ -254,3 +257,28 @@ def generate_tmp_logging_file_path(file_name: str) -> str:
254
257
  log_path = os.path.expanduser(os.path.join(log_dir, file_name))
255
258
 
256
259
  return log_path
260
+
261
+
262
+ @contextlib.contextmanager
263
+ def add_debug_log_handler(request_id: str):
264
+ if os.getenv(constants.ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING) != 'true':
265
+ yield
266
+ return
267
+
268
+ os.makedirs(_DEBUG_LOG_DIR, exist_ok=True)
269
+ log_path = os.path.join(_DEBUG_LOG_DIR, f'{request_id}.log')
270
+ try:
271
+ debug_log_handler = logging.FileHandler(log_path)
272
+ debug_log_handler.setFormatter(FORMATTER)
273
+ debug_log_handler.setLevel(logging.DEBUG)
274
+ _root_logger.addHandler(debug_log_handler)
275
+ # sky.provision sets up its own logger/handler with propagate=False,
276
+ # so add it there too.
277
+ provision_logger = logging.getLogger('sky.provision')
278
+ provision_logger.addHandler(debug_log_handler)
279
+ provision_logger.setLevel(logging.DEBUG)
280
+ yield
281
+ finally:
282
+ _root_logger.removeHandler(debug_log_handler)
283
+ provision_logger.removeHandler(debug_log_handler)
284
+ debug_log_handler.close()
sky/skylet/constants.py CHANGED
@@ -70,6 +70,7 @@ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
70
70
 
71
71
  # Prefix for SkyPilot environment variables
72
72
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
73
+ SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
73
74
 
74
75
  # The name for the environment variable that stores the unique ID of the
75
76
  # current task. This will stay the same across multiple recoveries of the
@@ -417,6 +418,7 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
417
418
  # Path to the generated cluster config yamls and ssh configs.
418
419
  SKY_USER_FILE_PATH = '~/.sky/generated'
419
420
 
421
+ # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
420
422
  # Environment variable that is set to 'true' if this is a skypilot server.
421
423
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
422
424
 
@@ -436,6 +438,10 @@ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
436
438
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
437
439
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
438
440
 
441
+ # Enable debug logging for requests.
442
+ ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
443
+ f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
444
+
439
445
  SKYPILOT_DEFAULT_WORKSPACE = 'default'
440
446
 
441
447
  # BEGIN constants used for service catalog.
@@ -48,6 +48,7 @@ provider:
48
48
  # The upper-level SkyPilot code makes sure there will not be resource
49
49
  # leakage.
50
50
  disable_launch_config_check: true
51
+ max_efa_interfaces: {{max_efa_interfaces}}
51
52
 
52
53
  auth:
53
54
  ssh_user: {{ssh_user}}
sky/utils/annotations.py CHANGED
@@ -7,7 +7,7 @@ from typing_extensions import ParamSpec
7
7
 
8
8
  # Whether the current process is a SkyPilot API server process.
9
9
  is_on_api_server = True
10
- FUNCTIONS_NEED_RELOAD_CACHE = []
10
+ _FUNCTIONS_NEED_RELOAD_CACHE = []
11
11
 
12
12
  T = TypeVar('T')
13
13
  P = ParamSpec('P')
@@ -50,7 +50,13 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
50
50
  else:
51
51
  cached_func = functools.lru_cache(*lru_cache_args,
52
52
  **lru_cache_kwargs)(func)
53
- FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
53
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
54
54
  return cached_func
55
55
 
56
56
  return decorator
57
+
58
+
59
+ def clear_request_level_cache():
60
+ """Clear the request-level cache."""
61
+ for func in _FUNCTIONS_NEED_RELOAD_CACHE:
62
+ func.cache_clear()
@@ -11,7 +11,7 @@ import uuid
11
11
  from sky.skylet import constants
12
12
  from sky.utils import command_runner
13
13
  from sky.utils import common_utils
14
- from sky.utils import timeline
14
+ from sky.utils import lock_events
15
15
 
16
16
  # The cluster yaml used to create the current cluster where the module is
17
17
  # called.
@@ -107,7 +107,7 @@ class SSHConfigHelper(object):
107
107
  return auth_config['ssh_private_key']
108
108
 
109
109
  @classmethod
110
- @timeline.FileLockEvent(ssh_conf_lock_path)
110
+ @lock_events.FileLockEvent(ssh_conf_lock_path)
111
111
  def add_cluster(
112
112
  cls,
113
113
  cluster_name: str,
@@ -334,7 +334,7 @@ class SSHConfigHelper(object):
334
334
  cluster_name: Cluster name.
335
335
  """
336
336
 
337
- with timeline.FileLockEvent(
337
+ with lock_events.FileLockEvent(
338
338
  cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
339
339
  cluster_config_path = os.path.expanduser(
340
340
  cls.ssh_cluster_path.format(cluster_name))
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
19
19
  DB_INIT_LOCK_TIMEOUT_SECONDS = 10
20
20
 
21
21
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
22
- GLOBAL_USER_STATE_VERSION = '006'
22
+ GLOBAL_USER_STATE_VERSION = '007'
23
23
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
24
24
 
25
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
@@ -42,4 +42,5 @@ class KubernetesAutoscalerType(enum.Enum):
42
42
  """Enum for the different types of cluster autoscalers for Kubernetes."""
43
43
  GKE = 'gke'
44
44
  KARPENTER = 'karpenter'
45
+ COREWEAVE = 'coreweave'
45
46
  GENERIC = 'generic'
@@ -0,0 +1,94 @@
1
+ """Lock events."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Optional, Union
6
+
7
+ import filelock
8
+
9
+ from sky.utils import locks
10
+ from sky.utils import timeline
11
+
12
+
13
+ class DistributedLockEvent:
14
+ """Serve both as a distributed lock and event for the lock."""
15
+
16
+ def __init__(self, lock_id: str, timeout: Optional[float] = None):
17
+ self._lock_id = lock_id
18
+ self._lock = locks.get_lock(lock_id, timeout)
19
+ self._hold_lock_event = timeline.Event(
20
+ f'[DistributedLock.hold]:{lock_id}')
21
+
22
+ def acquire(self):
23
+ was_locked = self._lock.is_locked
24
+ with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
25
+ self._lock.acquire()
26
+ if not was_locked and self._lock.is_locked:
27
+ # start holding the lock after initial acquiring
28
+ self._hold_lock_event.begin()
29
+
30
+ def release(self):
31
+ was_locked = self._lock.is_locked
32
+ self._lock.release()
33
+ if was_locked and not self._lock.is_locked:
34
+ # stop holding the lock after initial releasing
35
+ self._hold_lock_event.end()
36
+
37
+ def __enter__(self):
38
+ self.acquire()
39
+ return self
40
+
41
+ def __exit__(self, exc_type, exc_val, exc_tb):
42
+ self.release()
43
+
44
+ def __call__(self, f):
45
+
46
+ @functools.wraps(f)
47
+ def wrapper(*args, **kwargs):
48
+ with self:
49
+ return f(*args, **kwargs)
50
+
51
+ return wrapper
52
+
53
+
54
+ class FileLockEvent:
55
+ """Serve both as a file lock and event for the lock."""
56
+
57
+ def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
58
+ self._lockfile = lockfile
59
+ os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
60
+ exist_ok=True)
61
+ self._lock = filelock.FileLock(self._lockfile, timeout)
62
+ self._hold_lock_event = timeline.Event(
63
+ f'[FileLock.hold]:{self._lockfile}')
64
+
65
+ def acquire(self):
66
+ was_locked = self._lock.is_locked
67
+ with timeline.Event(f'[FileLock.acquire]:{self._lockfile}'):
68
+ self._lock.acquire()
69
+ if not was_locked and self._lock.is_locked:
70
+ # start holding the lock after initial acquiring
71
+ self._hold_lock_event.begin()
72
+
73
+ def release(self):
74
+ was_locked = self._lock.is_locked
75
+ self._lock.release()
76
+ if was_locked and not self._lock.is_locked:
77
+ # stop holding the lock after initial releasing
78
+ self._hold_lock_event.end()
79
+
80
+ def __enter__(self):
81
+ self.acquire()
82
+ return self
83
+
84
+ def __exit__(self, exc_type, exc_val, exc_tb):
85
+ self.release()
86
+
87
+ def __call__(self, f):
88
+ # Make this class callable as a decorator.
89
+ @functools.wraps(f)
90
+ def wrapper(*args, **kwargs):
91
+ with self:
92
+ return f(*args, **kwargs)
93
+
94
+ return wrapper
sky/utils/schemas.py CHANGED
@@ -1410,6 +1410,9 @@ def get_config_schema():
1410
1410
  **_NETWORK_CONFIG_SCHEMA, 'tenant_id': {
1411
1411
  'type': 'string',
1412
1412
  },
1413
+ 'domain': {
1414
+ 'type': 'string',
1415
+ },
1413
1416
  'region_configs': {
1414
1417
  'type': 'object',
1415
1418
  'required': [],
@@ -1668,6 +1671,9 @@ def get_config_schema():
1668
1671
  'tenant_id': {
1669
1672
  'type': 'string',
1670
1673
  },
1674
+ 'domain': {
1675
+ 'type': 'string',
1676
+ },
1671
1677
  'disabled': {
1672
1678
  'type': 'boolean'
1673
1679
  },
sky/utils/timeline.py CHANGED
@@ -4,7 +4,6 @@ The timeline follows the trace event format defined here:
4
4
  https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
5
5
  """ # pylint: disable=line-too-long
6
6
  import atexit
7
- import functools
8
7
  import json
9
8
  import os
10
9
  import threading
@@ -12,14 +11,15 @@ import time
12
11
  import traceback
13
12
  from typing import Callable, Optional, Union
14
13
 
15
- import filelock
16
-
17
14
  from sky.utils import common_utils
18
- from sky.utils import locks
19
15
 
20
16
  _events = []
21
17
 
22
18
 
19
+ def _get_events_file_path():
20
+ return os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
21
+
22
+
23
23
  class Event:
24
24
  """Record an event.
25
25
 
@@ -29,6 +29,10 @@ class Event:
29
29
  """
30
30
 
31
31
  def __init__(self, name: str, message: Optional[str] = None):
32
+ self._skipped = False
33
+ if not _get_events_file_path():
34
+ self._skipped = True
35
+ return
32
36
  self._name = name
33
37
  self._message = message
34
38
  # See the module doc for the event format.
@@ -45,6 +49,8 @@ class Event:
45
49
  self._event['args'] = {'message': self._message}
46
50
 
47
51
  def begin(self):
52
+ if self._skipped:
53
+ return
48
54
  event_begin = self._event.copy()
49
55
  event_begin.update({
50
56
  'ph': 'B',
@@ -56,6 +62,8 @@ class Event:
56
62
  _events.append(event_begin)
57
63
 
58
64
  def end(self):
65
+ if self._skipped:
66
+ return
59
67
  event_end = self._event.copy()
60
68
  event_end.update({
61
69
  'ph': 'E',
@@ -77,103 +85,26 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
77
85
  return common_utils.make_decorator(Event, name_or_fn, message=message)
78
86
 
79
87
 
80
- class DistributedLockEvent:
81
- """Serve both as a distributed lock and event for the lock."""
82
-
83
- def __init__(self, lock_id: str, timeout: Optional[float] = None):
84
- self._lock_id = lock_id
85
- self._lock = locks.get_lock(lock_id, timeout)
86
- self._hold_lock_event = Event(f'[DistributedLock.hold]:{lock_id}')
87
-
88
- def acquire(self):
89
- was_locked = self._lock.is_locked
90
- with Event(f'[DistributedLock.acquire]:{self._lock_id}'):
91
- self._lock.acquire()
92
- if not was_locked and self._lock.is_locked:
93
- # start holding the lock after initial acquiring
94
- self._hold_lock_event.begin()
95
-
96
- def release(self):
97
- was_locked = self._lock.is_locked
98
- self._lock.release()
99
- if was_locked and not self._lock.is_locked:
100
- # stop holding the lock after initial releasing
101
- self._hold_lock_event.end()
102
-
103
- def __enter__(self):
104
- self.acquire()
105
- return self
106
-
107
- def __exit__(self, exc_type, exc_val, exc_tb):
108
- self.release()
109
-
110
- def __call__(self, f):
111
-
112
- @functools.wraps(f)
113
- def wrapper(*args, **kwargs):
114
- with self:
115
- return f(*args, **kwargs)
116
-
117
- return wrapper
118
-
119
-
120
- class FileLockEvent:
121
- """Serve both as a file lock and event for the lock."""
122
-
123
- def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
124
- self._lockfile = lockfile
125
- os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
126
- exist_ok=True)
127
- self._lock = filelock.FileLock(self._lockfile, timeout)
128
- self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
129
-
130
- def acquire(self):
131
- was_locked = self._lock.is_locked
132
- with Event(f'[FileLock.acquire]:{self._lockfile}'):
133
- self._lock.acquire()
134
- if not was_locked and self._lock.is_locked:
135
- # start holding the lock after initial acquiring
136
- self._hold_lock_event.begin()
137
-
138
- def release(self):
139
- was_locked = self._lock.is_locked
140
- self._lock.release()
141
- if was_locked and not self._lock.is_locked:
142
- # stop holding the lock after initial releasing
143
- self._hold_lock_event.end()
144
-
145
- def __enter__(self):
146
- self.acquire()
147
- return self
148
-
149
- def __exit__(self, exc_type, exc_val, exc_tb):
150
- self.release()
151
-
152
- def __call__(self, f):
153
- # Make this class callable as a decorator.
154
- @functools.wraps(f)
155
- def wrapper(*args, **kwargs):
156
- with self:
157
- return f(*args, **kwargs)
158
-
159
- return wrapper
160
-
161
-
162
88
  def save_timeline():
163
- file_path = os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
164
- if not file_path:
89
+ events_file_path = _get_events_file_path()
90
+ if not events_file_path:
165
91
  return
92
+ global _events
93
+ events_to_write = _events
94
+ _events = []
166
95
  json_output = {
167
- 'traceEvents': _events,
96
+ 'traceEvents': events_to_write,
168
97
  'displayTimeUnit': 'ms',
169
98
  'otherData': {
170
- 'log_dir': os.path.dirname(os.path.abspath(file_path)),
99
+ 'log_dir': os.path.dirname(os.path.abspath(events_file_path)),
171
100
  }
172
101
  }
173
- os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)
174
- with open(file_path, 'w', encoding='utf-8') as f:
102
+ os.makedirs(os.path.dirname(os.path.abspath(events_file_path)),
103
+ exist_ok=True)
104
+ with open(events_file_path, 'w', encoding='utf-8') as f:
175
105
  json.dump(json_output, f)
106
+ del events_to_write
176
107
 
177
108
 
178
- if os.environ.get('SKYPILOT_TIMELINE_FILE_PATH'):
109
+ if _get_events_file_path():
179
110
  atexit.register(save_timeline)