skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250730__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/client/cli/command.py +0 -7
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/_r2LwCFLjlWjZDUIJQG_V/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/state.py +35 -7
- sky/jobs/utils.py +35 -17
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/resources.py +8 -2
- sky/server/server.py +6 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/RECORD +91 -90
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → _r2LwCFLjlWjZDUIJQG_V}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
|
@@ -11,6 +11,7 @@ import json
|
|
|
11
11
|
import os
|
|
12
12
|
import pickle
|
|
13
13
|
import re
|
|
14
|
+
import threading
|
|
14
15
|
import time
|
|
15
16
|
import typing
|
|
16
17
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
|
|
|
47
48
|
_ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
|
|
48
49
|
|
|
49
50
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
51
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
50
52
|
|
|
51
53
|
Base = declarative.declarative_base()
|
|
52
54
|
|
|
@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
241
243
|
migration_utils.GLOBAL_USER_STATE_VERSION)
|
|
242
244
|
|
|
243
245
|
|
|
246
|
+
# We wrap the sqlalchemy engine initialization in a thread
|
|
247
|
+
# lock to ensure that multiple threads do not initialize the
|
|
248
|
+
# engine which could result in a rare race condition where
|
|
249
|
+
# a session has already been created with _SQLALCHEMY_ENGINE = e1,
|
|
250
|
+
# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
|
|
251
|
+
# which could result in e1 being garbage collected unexpectedly.
|
|
244
252
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
245
253
|
global _SQLALCHEMY_ENGINE
|
|
246
254
|
|
|
247
255
|
if _SQLALCHEMY_ENGINE is not None:
|
|
248
256
|
return _SQLALCHEMY_ENGINE
|
|
257
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
258
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
259
|
+
return _SQLALCHEMY_ENGINE
|
|
260
|
+
# get an engine to the db
|
|
261
|
+
engine = migration_utils.get_engine('state')
|
|
249
262
|
|
|
250
|
-
|
|
251
|
-
|
|
263
|
+
# run migrations if needed
|
|
264
|
+
create_table(engine)
|
|
252
265
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
# return engine
|
|
257
|
-
_SQLALCHEMY_ENGINE = engine
|
|
258
|
-
return _SQLALCHEMY_ENGINE
|
|
266
|
+
# return engine
|
|
267
|
+
_SQLALCHEMY_ENGINE = engine
|
|
268
|
+
return _SQLALCHEMY_ENGINE
|
|
259
269
|
|
|
260
270
|
|
|
261
271
|
def _init_db(func):
|
sky/jobs/state.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import enum
|
|
5
5
|
import functools
|
|
6
6
|
import json
|
|
7
|
+
import threading
|
|
7
8
|
import time
|
|
8
9
|
import typing
|
|
9
10
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
|
|
|
33
34
|
logger = sky_logging.init_logger(__name__)
|
|
34
35
|
|
|
35
36
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
37
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
36
38
|
|
|
37
39
|
Base = declarative.declarative_base()
|
|
38
40
|
|
|
@@ -131,21 +133,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
131
133
|
migration_utils.SPOT_JOBS_VERSION)
|
|
132
134
|
|
|
133
135
|
|
|
136
|
+
# We wrap the sqlalchemy engine initialization in a thread
|
|
137
|
+
# lock to ensure that multiple threads do not initialize the
|
|
138
|
+
# engine which could result in a rare race condition where
|
|
139
|
+
# a session has already been created with _SQLALCHEMY_ENGINE = e1,
|
|
140
|
+
# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
|
|
141
|
+
# which could result in e1 being garbage collected unexpectedly.
|
|
134
142
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
135
143
|
global _SQLALCHEMY_ENGINE
|
|
136
144
|
|
|
137
145
|
if _SQLALCHEMY_ENGINE is not None:
|
|
138
146
|
return _SQLALCHEMY_ENGINE
|
|
139
147
|
|
|
140
|
-
|
|
141
|
-
|
|
148
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
149
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
150
|
+
return _SQLALCHEMY_ENGINE
|
|
151
|
+
# get an engine to the db
|
|
152
|
+
engine = migration_utils.get_engine('spot_jobs')
|
|
142
153
|
|
|
143
|
-
|
|
144
|
-
|
|
154
|
+
# run migrations if needed
|
|
155
|
+
create_table(engine)
|
|
145
156
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
157
|
+
# return engine
|
|
158
|
+
_SQLALCHEMY_ENGINE = engine
|
|
159
|
+
return _SQLALCHEMY_ENGINE
|
|
149
160
|
|
|
150
161
|
|
|
151
162
|
def _init_db(func):
|
|
@@ -1045,6 +1056,23 @@ def _get_all_task_ids_statuses(
|
|
|
1045
1056
|
return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
|
|
1046
1057
|
|
|
1047
1058
|
|
|
1059
|
+
@_init_db
|
|
1060
|
+
def get_all_task_ids_names_statuses_logs(
|
|
1061
|
+
job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
|
|
1062
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1063
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1064
|
+
id_names = session.execute(
|
|
1065
|
+
sqlalchemy.select(
|
|
1066
|
+
spot_table.c.task_id,
|
|
1067
|
+
spot_table.c.task_name,
|
|
1068
|
+
spot_table.c.status,
|
|
1069
|
+
spot_table.c.local_log_file,
|
|
1070
|
+
).where(spot_table.c.spot_job_id == job_id).order_by(
|
|
1071
|
+
spot_table.c.task_id.asc())).fetchall()
|
|
1072
|
+
return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
|
|
1073
|
+
for row in id_names]
|
|
1074
|
+
|
|
1075
|
+
|
|
1048
1076
|
@_init_db
|
|
1049
1077
|
def get_job_status_with_task_id(job_id: int,
|
|
1050
1078
|
task_id: int) -> Optional[ManagedJobStatus]:
|
sky/jobs/utils.py
CHANGED
|
@@ -716,23 +716,41 @@ def stream_logs_by_id(job_id: int,
|
|
|
716
716
|
if managed_job_status.is_failed():
|
|
717
717
|
job_msg = ('\nFailure reason: '
|
|
718
718
|
f'{managed_job_state.get_failure_reason(job_id)}')
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
719
|
+
log_file_exists = False
|
|
720
|
+
task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
|
|
721
|
+
job_id)
|
|
722
|
+
num_tasks = len(task_info)
|
|
723
|
+
for task_id, task_name, task_status, log_file in task_info:
|
|
724
|
+
if log_file:
|
|
725
|
+
log_file_exists = True
|
|
726
|
+
task_str = (f'Task {task_name}({task_id})'
|
|
727
|
+
if task_name else f'Task {task_id}')
|
|
728
|
+
if num_tasks > 1:
|
|
729
|
+
print(f'=== {task_str} ===')
|
|
730
|
+
with open(os.path.expanduser(log_file),
|
|
731
|
+
'r',
|
|
732
|
+
encoding='utf-8') as f:
|
|
733
|
+
# Stream the logs to the console without reading the
|
|
734
|
+
# whole file into memory.
|
|
735
|
+
start_streaming = False
|
|
736
|
+
read_from: Union[TextIO, Deque[str]] = f
|
|
737
|
+
if tail is not None:
|
|
738
|
+
assert tail > 0
|
|
739
|
+
# Read only the last 'tail' lines using deque
|
|
740
|
+
read_from = collections.deque(f, maxlen=tail)
|
|
741
|
+
for line in read_from:
|
|
742
|
+
if log_lib.LOG_FILE_START_STREAMING_AT in line:
|
|
743
|
+
start_streaming = True
|
|
744
|
+
if start_streaming:
|
|
745
|
+
print(line, end='', flush=True)
|
|
746
|
+
if num_tasks > 1:
|
|
747
|
+
# Add the "Task finished" message for terminal states
|
|
748
|
+
if task_status.is_terminal():
|
|
749
|
+
print(ux_utils.finishing_message(
|
|
750
|
+
f'{task_str} finished '
|
|
751
|
+
f'(status: {task_status.value}).'),
|
|
752
|
+
flush=True)
|
|
753
|
+
if log_file_exists:
|
|
736
754
|
# Add the "Job finished" message for terminal states
|
|
737
755
|
if managed_job_status.is_terminal():
|
|
738
756
|
print(ux_utils.finishing_message(
|
sky/logs/aws.py
CHANGED
|
@@ -9,6 +9,8 @@ from sky.skylet import constants
|
|
|
9
9
|
from sky.utils import common_utils
|
|
10
10
|
from sky.utils import resources_utils
|
|
11
11
|
|
|
12
|
+
EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
|
|
13
|
+
|
|
12
14
|
|
|
13
15
|
class _CloudwatchLoggingConfig(pydantic.BaseModel):
|
|
14
16
|
"""Configuration for AWS CloudWatch logging agent."""
|
|
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
109
111
|
# Check if we're running on EC2 with an IAM role or if
|
|
110
112
|
# AWS credentials are available in the environment
|
|
111
113
|
pre_cmd = (
|
|
112
|
-
'if ! curl -s -m 1
|
|
113
|
-
'
|
|
114
|
+
f'if ! curl -s -m 1 {EC2_MD_URL}'
|
|
115
|
+
'latest/meta-data/iam/security-credentials/ > /dev/null; '
|
|
114
116
|
'then '
|
|
115
117
|
# failed EC2 check, look for env vars
|
|
116
118
|
'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
|
sky/provision/kubernetes/utils.py
CHANGED
|
@@ -3179,10 +3179,12 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
|
|
|
3179
3179
|
return pods
|
|
3180
3180
|
|
|
3181
3181
|
|
|
3182
|
-
def is_tpu_on_gke(accelerator: str) -> bool:
|
|
3182
|
+
def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
|
|
3183
3183
|
"""Determines if the given accelerator is a TPU supported on GKE."""
|
|
3184
|
-
|
|
3185
|
-
|
|
3184
|
+
if normalize:
|
|
3185
|
+
normalized, _ = normalize_tpu_accelerator_name(accelerator)
|
|
3186
|
+
return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
|
|
3187
|
+
return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
|
|
3186
3188
|
|
|
3187
3189
|
|
|
3188
3190
|
def get_node_accelerator_count(context: Optional[str],
|
|
@@ -3384,7 +3386,7 @@ def process_skypilot_pods(
|
|
|
3384
3386
|
|
|
3385
3387
|
def _gpu_resource_key_helper(context: Optional[str]) -> str:
|
|
3386
3388
|
"""Helper function to get the GPU resource key."""
|
|
3387
|
-
gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['
|
|
3389
|
+
gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
|
|
3388
3390
|
try:
|
|
3389
3391
|
nodes = kubernetes.core_api(context).list_node().items
|
|
3390
3392
|
for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
|
sky/provision/vast/instance.py
CHANGED
|
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
97
97
|
region=region,
|
|
98
98
|
disk_size=config.node_config['DiskSize'],
|
|
99
99
|
preemptible=config.node_config['Preemptible'],
|
|
100
|
-
image_name=config.node_config['ImageId'])
|
|
100
|
+
image_name=config.node_config['ImageId'],
|
|
101
|
+
ports=config.ports_to_open_on_launch)
|
|
101
102
|
except Exception as e: # pylint: disable=broad-except
|
|
102
103
|
logger.warning(f'run_instances error: {e}')
|
|
103
104
|
raise
|
sky/provision/vast/utils.py
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# python sdk.
|
|
6
6
|
#
|
|
7
7
|
"""Vast library wrapper for SkyPilot."""
|
|
8
|
-
from typing import Any, Dict, List
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
9
|
|
|
10
10
|
from sky import sky_logging
|
|
11
11
|
from sky.adaptors import vast
|
|
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
37
|
-
image_name: str, preemptible: bool) -> str:
|
|
37
|
+
image_name: str, ports: Optional[List[int]],
|
|
38
|
+
preemptible: bool) -> str:
|
|
38
39
|
"""Launches an instance with the given parameters.
|
|
39
40
|
|
|
40
41
|
Converts the instance_type to the Vast GPU name, finds the specs for the
|
|
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
58
59
|
The disk size {xx} GB is not exactly matched the requested
|
|
59
60
|
size {yy} GB. It is possible to charge extra cost on disk.
|
|
60
61
|
|
|
62
|
+
* `ports`: This is a feature flag to expose ports to the internet.
|
|
63
|
+
|
|
61
64
|
* `geolocation`: Geolocation on Vast can be as specific as the
|
|
62
65
|
host chooses to be. They can say, for instance, "Yutakachō,
|
|
63
66
|
Shinagawa District, Tokyo, JP." Such a specific geolocation
|
|
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
79
82
|
|
|
80
83
|
* Vast instance types are an invention for skypilot. Refer to
|
|
81
84
|
catalog/vast_catalog.py for the current construction
|
|
82
|
-
of the type.
|
|
83
|
-
|
|
84
|
-
"""
|
|
85
|
+
of the type."""
|
|
85
86
|
cpu_ram = float(instance_type.split('-')[-1]) / 1024
|
|
86
87
|
gpu_name = instance_type.split('-')[1].replace('_', ' ')
|
|
87
88
|
num_gpus = int(instance_type.split('-')[0].replace('x', ''))
|
|
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
104
105
|
|
|
105
106
|
instance_touse = instance_list[0]
|
|
106
107
|
|
|
108
|
+
port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
|
|
109
|
+
|
|
107
110
|
launch_params = {
|
|
108
111
|
'id': instance_touse['id'],
|
|
109
112
|
'direct': True,
|
|
110
113
|
'ssh': True,
|
|
111
|
-
'env': '-e __SOURCE=skypilot',
|
|
114
|
+
'env': f'-e __SOURCE=skypilot {port_map}',
|
|
112
115
|
'onstart_cmd': ';'.join([
|
|
113
116
|
'touch ~/.no_auto_tmux',
|
|
114
117
|
f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
|
sky/resources.py
CHANGED
|
@@ -797,8 +797,13 @@ class Resources:
|
|
|
797
797
|
|
|
798
798
|
acc, _ = list(accelerators.items())[0]
|
|
799
799
|
if 'tpu' in acc.lower():
|
|
800
|
+
# TODO(syang): GCP TPU names are supported on both GCP and
|
|
801
|
+
# kubernetes (GKE), but this logic automatically assumes
|
|
802
|
+
# GCP TPUs can only be used on GCP.
|
|
803
|
+
# Fix the logic such that GCP TPU names can failover between
|
|
804
|
+
# GCP and kubernetes.
|
|
800
805
|
if self.cloud is None:
|
|
801
|
-
if kubernetes_utils.is_tpu_on_gke(acc):
|
|
806
|
+
if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
|
|
802
807
|
self._cloud = clouds.Kubernetes()
|
|
803
808
|
else:
|
|
804
809
|
self._cloud = clouds.GCP()
|
|
@@ -813,7 +818,8 @@ class Resources:
|
|
|
813
818
|
|
|
814
819
|
use_tpu_vm = accelerator_args.get('tpu_vm', True)
|
|
815
820
|
if (self.cloud.is_same_cloud(clouds.GCP()) and
|
|
816
|
-
not kubernetes_utils.is_tpu_on_gke(acc)):
|
|
821
|
+
not kubernetes_utils.is_tpu_on_gke(acc,
|
|
822
|
+
normalize=False)):
|
|
817
823
|
if 'runtime_version' not in accelerator_args:
|
|
818
824
|
|
|
819
825
|
def _get_default_runtime_version() -> str:
|
sky/server/server.py
CHANGED
|
@@ -882,10 +882,15 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
|
|
|
882
882
|
upload_ids_to_cleanup[(upload_id,
|
|
883
883
|
user_hash)] = (datetime.datetime.now() +
|
|
884
884
|
_DEFAULT_UPLOAD_EXPIRATION_TIME)
|
|
885
|
+
# For anonymous access, use the user hash from client
|
|
886
|
+
user_id = user_hash
|
|
887
|
+
if request.state.auth_user is not None:
|
|
888
|
+
# Otherwise, the authenticated identity should be used.
|
|
889
|
+
user_id = request.state.auth_user.id
|
|
885
890
|
|
|
886
891
|
# TODO(SKY-1271): We need to double check security of uploading zip file.
|
|
887
892
|
client_file_mounts_dir = (
|
|
888
|
-
common.API_SERVER_CLIENT_DIR.expanduser().resolve() /
|
|
893
|
+
common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
|
|
889
894
|
'file_mounts')
|
|
890
895
|
client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
|
|
891
896
|
|