skypilot-nightly 1.0.0.dev20250728__py3-none-any.whl → 1.0.0.dev20250730__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/catalog/kubernetes_catalog.py +2 -2
- sky/client/cli/command.py +0 -7
- sky/client/common.py +12 -9
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/_r2LwCFLjlWjZDUIJQG_V/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/server/core.py +4 -1
- sky/jobs/state.py +35 -7
- sky/jobs/utils.py +35 -17
- sky/logs/agent.py +0 -14
- sky/logs/aws.py +4 -30
- sky/provision/kubernetes/instance.py +4 -3
- sky/provision/kubernetes/utils.py +56 -31
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/resources.py +8 -2
- sky/serve/server/core.py +21 -2
- sky/serve/service.py +22 -2
- sky/server/server.py +7 -2
- sky/templates/sky-serve-controller.yaml.j2 +3 -0
- sky/utils/kubernetes/gpu_labeler.py +2 -2
- sky/utils/schemas.py +5 -1
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/RECORD +101 -100
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- sky/dashboard/out/_next/static/ucBqsWPN0A5D2kXj8-FqQ/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{ucBqsWPN0A5D2kXj8-FqQ → _r2LwCFLjlWjZDUIJQG_V}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED

@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 Base = declarative.declarative_base()
 
@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.GLOBAL_USER_STATE_VERSION)
 
 
+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
 
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
-
-    # get an engine to the db
-    engine = migration_utils.get_engine('state')
-
-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')
+
+        # run migrations if needed
+        create_table(engine)
+
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
 
 
 def _init_db(func):
sky/jobs/server/core.py
CHANGED

@@ -59,7 +59,10 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     # as uploading to the controller is only a local copy.
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
-    if not managed_job_utils.is_consolidation_mode() and storage_clouds:
+    force_disable_cloud_bucket = skypilot_config.get_nested(
+        ('jobs', 'force_disable_cloud_bucket'), False)
+    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
+            not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
sky/jobs/state.py
CHANGED

@@ -4,6 +4,7 @@
 import enum
 import functools
 import json
+import threading
 import time
 import typing
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 Base = declarative.declarative_base()
 
@@ -131,21 +133,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.SPOT_JOBS_VERSION)
 
 
+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
 
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
 
-    # get an engine to the db
-    engine = migration_utils.get_engine('spot_jobs')
-
-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('spot_jobs')
+
+        # run migrations if needed
+        create_table(engine)
+
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
 
 
 def _init_db(func):
@@ -1045,6 +1056,23 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_all_task_ids_names_statuses_logs(
+        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        id_names = session.execute(
+            sqlalchemy.select(
+                spot_table.c.task_id,
+                spot_table.c.task_name,
+                spot_table.c.status,
+                spot_table.c.local_log_file,
+            ).where(spot_table.c.spot_job_id == job_id).order_by(
+                spot_table.c.task_id.asc())).fetchall()
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+                for row in id_names]
+
+
 @_init_db
 def get_job_status_with_task_id(job_id: int,
                                 task_id: int) -> Optional[ManagedJobStatus]:
sky/jobs/utils.py
CHANGED

@@ -716,23 +716,41 @@ def stream_logs_by_id(job_id: int,
         if managed_job_status.is_failed():
             job_msg = ('\nFailure reason: '
                        f'{managed_job_state.get_failure_reason(job_id)}')
-        [17 lines of the previous implementation; not rendered in the source diff]
+        log_file_exists = False
+        task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+            job_id)
+        num_tasks = len(task_info)
+        for task_id, task_name, task_status, log_file in task_info:
+            if log_file:
+                log_file_exists = True
+                task_str = (f'Task {task_name}({task_id})'
+                            if task_name else f'Task {task_id}')
+                if num_tasks > 1:
+                    print(f'=== {task_str} ===')
+                with open(os.path.expanduser(log_file),
+                          'r',
+                          encoding='utf-8') as f:
+                    # Stream the logs to the console without reading the
+                    # whole file into memory.
+                    start_streaming = False
+                    read_from: Union[TextIO, Deque[str]] = f
+                    if tail is not None:
+                        assert tail > 0
+                        # Read only the last 'tail' lines using deque
+                        read_from = collections.deque(f, maxlen=tail)
+                    for line in read_from:
+                        if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                            start_streaming = True
+                        if start_streaming:
+                            print(line, end='', flush=True)
+                if num_tasks > 1:
+                    # Add the "Task finished" message for terminal states
+                    if task_status.is_terminal():
+                        print(ux_utils.finishing_message(
+                            f'{task_str} finished '
+                            f'(status: {task_status.value}).'),
+                              flush=True)
+        if log_file_exists:
             # Add the "Job finished" message for terminal states
             if managed_job_status.is_terminal():
                 print(ux_utils.finishing_message(
sky/logs/agent.py
CHANGED

@@ -67,20 +67,6 @@ class FluentbitAgent(LoggingAgent):
         }
         return common_utils.dump_yaml_str(cfg_dict)
 
-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This method can be overridden by subclasses to add fallback outputs
-        in case the primary output fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        return cfg_dict
-
     @abc.abstractmethod
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
sky/logs/aws.py
CHANGED

@@ -9,6 +9,8 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils
 
+EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+
 
 class _CloudwatchLoggingConfig(pydantic.BaseModel):
     """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
         # Check if we're running on EC2 with an IAM role or if
         # AWS credentials are available in the environment
         pre_cmd = (
-            'if ! curl -s -m 1
-            '
+            f'if ! curl -s -m 1 {EC2_MD_URL}'
+            'latest/meta-data/iam/security-credentials/ > /dev/null; '
             'then '
             # failed EC2 check, look for env vars
             'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
@@ -211,36 +213,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
             }
         }
 
-        # Add fallback outputs for graceful failure handling
-        cfg_dict = self.add_fallback_outputs(cfg_dict)
-
         return common_utils.dump_yaml_str(cfg_dict)
 
-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This adds a local file output as a fallback in case
-        CloudWatch logging fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        # Add a local file output as a fallback
-        fallback_output = {
-            'name': 'file',
-            'match': '*',
-            'path': '/tmp/skypilot_logs_fallback.log',
-            'format': 'out_file',
-        }
-
-        # Add the fallback output to the configuration
-        cfg_dict['pipeline']['outputs'].append(fallback_output)
-
-        return cfg_dict
-
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
         """Get the Fluent Bit output configuration for CloudWatch.
sky/provision/kubernetes/instance.py
CHANGED

@@ -210,7 +210,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
             # case we will need to update this logic.
             # TODO(Doyoung): Update the error message raised
             # with the multi-host TPU support.
-            gpu_resource_key = kubernetes_utils.get_gpu_resource_key()  # pylint: disable=line-too-long
+            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
             if 'Insufficient google.com/tpu' in event_message:
                 extra_msg = (
                     f'Verify if '
@@ -797,7 +797,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     limits = pod_spec['spec']['containers'][0].get('resources',
                                                    {}).get('limits')
     if limits is not None:
-        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
+        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
+                                0) > 0
 
     # TPU pods provisioned on GKE use the default containerd runtime.
     # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
@@ -900,7 +901,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         # to the non-DWS case.
         if needs_gpus:
             gpu_toleration = {
-                'key': kubernetes_utils.get_gpu_resource_key(),
+                'key': kubernetes_utils.get_gpu_resource_key(context),
                 'operator': 'Exists',
                 'effect': 'NoSchedule'
             }
sky/provision/kubernetes/utils.py
CHANGED

@@ -147,12 +147,14 @@ MEMORY_SIZE_UNITS = {
 # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
 # nodes. These keys are typically used in the node's status.allocatable
 # or status.capacity fields to indicate the available resources on the node.
-GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}
 TPU_RESOURCE_KEY = 'google.com/tpu'
 
 NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    f'one of {SUPPORTED_GPU_RESOURCE_KEYS["amd"]}, '
+    f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]} or '
+    f'{TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
     '(e.g., skypilot.co/accelerator) are setup correctly. ')
 
@@ -391,6 +393,8 @@ def get_gke_accelerator_name(accelerator: str) -> str:
         return 'nvidia-h200-141gb'
     elif accelerator.startswith('tpu-'):
         return accelerator
+    elif accelerator.startswith('amd-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())
 
@@ -1098,10 +1102,10 @@ def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
     """Checks if the Kubernetes cluster has GPU/TPU resource.
 
-    Two types of accelerator resources are available which are each checked
-    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    Three types of accelerator resources are available which are each checked
+    with amd.com/gpu, nvidia.com/gpu and google.com/tpu. If amd.com/gpu or nvidia.com/gpu resource is
     missing, that typically means that the Kubernetes cluster does not have
-    GPUs or the nvidia GPU operator and/or device drivers are not installed.
+    GPUs or the amd/nvidia GPU operator and/or device drivers are not installed.
 
     Returns:
         bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
@@ -1112,7 +1116,7 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context=context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_accelerator = (get_gpu_resource_key() in cluster_resources or
+    has_accelerator = (get_gpu_resource_key(context) in cluster_resources or
                        TPU_RESOURCE_KEY in cluster_resources)
 
     return has_accelerator, cluster_resources
@@ -1262,8 +1266,8 @@ def check_instance_fits(context: Optional[str],
     else:
         # Check if any of the GPU nodes have sufficient number of GPUs.
         gpu_nodes = [
-            node for node in gpu_nodes if
-            get_node_accelerator_count(node.status.allocatable) >= acc_count
+            node for node in gpu_nodes if get_node_accelerator_count(
+                context, node.status.allocatable) >= acc_count
         ]
         if not gpu_nodes:
             return False, (
@@ -1325,14 +1329,14 @@ def get_accelerator_label_key_values(
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
         - The cluster does not have GPU/TPU resources
-            (nvidia.com/gpu, google.com/tpu)
+            (amd.com/gpu, nvidia.com/gpu, google.com/tpu)
         - The cluster has GPU/TPU resources, but no node in the cluster has
           an accelerator label.
         - The cluster has a node with an invalid accelerator label value.
         - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
-    # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
+    # TODO(romilb): This assumes the accelerator is a amd/nvidia GPU. We
     # need to support TPUs and other accelerators as well.
     # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling
     # is configured in config.yaml since the cluster may be scaling up from
@@ -1496,12 +1500,15 @@ def get_accelerator_label_key_values(
                 f'`sky ssh up --infra {context_display_name}`. {suffix}')
         else:
             msg = (
-                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'Could not detect GPU/TPU resources ({SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
                 f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
                 ' contains GPUs, please ensure GPU drivers are installed on '
                 'the node. Check if the GPUs are setup correctly by running '
                 '`kubectl describe nodes` and looking for the '
-                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
+                f'{TPU_RESOURCE_KEY!r} resource. '
                 'Please refer to the documentation on how to set up GPUs.'
                 f'{suffix}')
         raise exceptions.ResourcesUnavailableError(msg)
@@ -2861,7 +2868,7 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
     nodes = get_kubernetes_nodes(context=context)
    nodes_with_accelerator = []
     for node in nodes:
-        if get_gpu_resource_key() in node.status.capacity:
+        if get_gpu_resource_key(context) in node.status.capacity:
             nodes_with_accelerator.append(node)
 
     label_formatter, _ = detect_gpu_label_formatter(context)
@@ -2950,7 +2957,8 @@ def get_kubernetes_node_info(
             break
 
         allocated_qty = 0
-        accelerator_count = get_node_accelerator_count(node.status.allocatable)
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
 
         if pods is None:
             accelerators_available = -1
@@ -2965,7 +2973,7 @@ def get_kubernetes_node_info(
             for container in pod.spec.containers:
                 if container.resources.requests:
                     allocated_qty += get_node_accelerator_count(
-                        container.resources.requests)
+                        context, container.resources.requests)
 
         accelerators_available = accelerator_count - allocated_qty
 
@@ -3171,13 +3179,16 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods
 
 
-def is_tpu_on_gke(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
-    normalized, _ = normalize_tpu_accelerator_name(accelerator)
-    return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    if normalize:
+        normalized, _ = normalize_tpu_accelerator_name(accelerator)
+        return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
 
-def get_node_accelerator_count(attribute_dict: dict) -> int:
+def get_node_accelerator_count(context: Optional[str],
+                               attribute_dict: dict) -> int:
     """Retrieves the count of accelerators from a node's resource dictionary.
 
     This method checks the node's allocatable resources or the accelerators
@@ -3192,7 +3203,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
-    gpu_resource_name = get_gpu_resource_key()
+    gpu_resource_name = get_gpu_resource_key(context)
     assert not (gpu_resource_name in attribute_dict and
                 TPU_RESOURCE_KEY in attribute_dict)
     if gpu_resource_name in attribute_dict:
@@ -3318,7 +3329,7 @@ def process_skypilot_pods(
                 unit='G')
             gpu_count = parse_cpu_or_gpu_resource(
                 pod.spec.containers[0].resources.requests.get(
-                    get_gpu_resource_key(), '0'))
+                    get_gpu_resource_key(context), '0'))
             gpu_name = None
             if gpu_count > 0:
                 label_formatter, _ = (detect_gpu_label_formatter(context))
@@ -3373,19 +3384,33 @@
     return list(clusters.values()), jobs_controllers, serve_controllers
 
 
-def get_gpu_resource_key() -> str:
-    """[docstring; 5 removed lines not rendered in the source diff]
+def _gpu_resource_key_helper(context: Optional[str]) -> str:
+    """Helper function to get the GPU resource key."""
+    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
+    try:
+        nodes = kubernetes.core_api(context).list_node().items
+        for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
+            if any(gpu_key in node.status.capacity for node in nodes):
+                return gpu_key
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(f'Failed to load kube config or query nodes: {e}. '
+                       'Falling back to default GPU resource key.')
+    return gpu_resource_key
+
+
+@annotations.lru_cache(scope='request')
+def get_gpu_resource_key(context: Optional[str] = None) -> str:
+    """Get the GPU resource name to use in Kubernetes.
+
+    The function auto-detects the GPU resource key by querying the Kubernetes node API.
+    If detection fails, it falls back to a default value.
+    An environment variable can override the detected or default value.
+
     Returns:
         str: The selected GPU resource name.
     """
-    [2 removed lines not rendered in the source diff]
-    # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
-    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+    gpu_resource_key = _gpu_resource_key_helper(context)
+    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=gpu_resource_key)
 
 
 def get_kubeconfig_paths() -> List[str]:
sky/provision/vast/instance.py
CHANGED

@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                           region=region,
                           disk_size=config.node_config['DiskSize'],
                           preemptible=config.node_config['Preemptible'],
-                          image_name=config.node_config['ImageId'])
+                          image_name=config.node_config['ImageId'],
+                          ports=config.ports_to_open_on_launch)
     except Exception as e:  # pylint: disable=broad-except
         logger.warning(f'run_instances error: {e}')
         raise
sky/provision/vast/utils.py
CHANGED

@@ -5,7 +5,7 @@
 # python sdk.
 #
 """Vast library wrapper for SkyPilot."""
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from sky import sky_logging
 from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
 
 
 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]],
+           preemptible: bool) -> str:
     """Launches an instance with the given parameters.
 
     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
       The disk size {xx} GB is not exactly matched the requested
       size {yy} GB. It is possible to charge extra cost on disk.
 
+    * `ports`: This is a feature flag to expose ports to the internet.
+
     * `geolocation`: Geolocation on Vast can be as specific as the
       host chooses to be. They can say, for instance, "Yutakachō,
      Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
 
    * Vast instance types are an invention for skypilot. Refer to
      catalog/vast_catalog.py for the current construction
-      of the type.
-
-    """
+      of the type."""
     cpu_ram = float(instance_type.split('-')[-1]) / 1024
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
 
     instance_touse = instance_list[0]
 
+    port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
     launch_params = {
         'id': instance_touse['id'],
         'direct': True,
         'ssh': True,
-        'env': '-e __SOURCE=skypilot',
+        'env': f'-e __SOURCE=skypilot {port_map}',
         'onstart_cmd': ';'.join([
             'touch ~/.no_auto_tmux',
             f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
sky/resources.py
CHANGED

@@ -797,8 +797,13 @@ class Resources:
 
         acc, _ = list(accelerators.items())[0]
         if 'tpu' in acc.lower():
+            # TODO(syang): GCP TPU names are supported on both GCP and
+            # kubernetes (GKE), but this logic automatically assumes
+            # GCP TPUs can only be used on GCP.
+            # Fix the logic such that GCP TPU names can failover between
+            # GCP and kubernetes.
             if self.cloud is None:
-                if kubernetes_utils.is_tpu_on_gke(acc):
+                if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
                     self._cloud = clouds.Kubernetes()
                 else:
                     self._cloud = clouds.GCP()
@@ -813,7 +818,8 @@ class Resources:
 
         use_tpu_vm = accelerator_args.get('tpu_vm', True)
         if (self.cloud.is_same_cloud(clouds.GCP()) and
-                not kubernetes_utils.is_tpu_on_gke(acc)):
+                not kubernetes_utils.is_tpu_on_gke(acc,
+                                                   normalize=False)):
             if 'runtime_version' not in accelerator_args:
 
                 def _get_default_runtime_version() -> str:
sky/serve/server/core.py
CHANGED

@@ -18,6 +18,7 @@ from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
+from sky.data import storage as storage_lib
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -151,8 +152,25 @@ def up(
 
     with rich_utils.safe_status(
             ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
+        # Handle file mounts using two-hop approach when cloud storage
+        # unavailable
+        storage_clouds = (
+            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+        force_disable_cloud_bucket = skypilot_config.get_nested(
+            ('serve', 'force_disable_cloud_bucket'), False)
+        if storage_clouds and not force_disable_cloud_bucket:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task, task_type='serve')
+            local_to_controller_file_mounts = {}
+        else:
+            # Fall back to two-hop file_mount uploading when no cloud storage
+            if task.storage_mounts:
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+            local_to_controller_file_mounts = (
+                controller_utils.translate_local_file_mounts_to_two_hop(task))
 
     tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
         service_name, task)
@@ -183,6 +201,7 @@ def up(
         'service_name': service_name,
         'controller_log_file': controller_log_file,
         'remote_user_config_path': remote_config_yaml_path,
+        'local_to_controller_file_mounts': local_to_controller_file_mounts,
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         **tls_template_vars,