skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ import urllib.request
|
|
|
7
7
|
from sky.utils import directory_utils
|
|
8
8
|
|
|
9
9
|
# Replaced with the current commit when building the wheels.
|
|
10
|
-
_SKYPILOT_COMMIT_SHA = '
|
|
10
|
+
_SKYPILOT_COMMIT_SHA = '2205c51a08e8dc517375ba0f653557257c6f6751'
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def _get_git_commit():
|
|
@@ -37,7 +37,7 @@ def _get_git_commit():
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
__commit__ = _get_git_commit()
|
|
40
|
-
__version__ = '1.0.0.
|
|
40
|
+
__version__ = '1.0.0.dev20251101'
|
|
41
41
|
__root_dir__ = directory_utils.get_sky_dir()
|
|
42
42
|
|
|
43
43
|
|
sky/adaptors/aws.py
CHANGED
|
@@ -34,6 +34,7 @@ import time
|
|
|
34
34
|
import typing
|
|
35
35
|
from typing import Callable, Literal, Optional, TypeVar
|
|
36
36
|
|
|
37
|
+
from sky import skypilot_config
|
|
37
38
|
from sky.adaptors import common
|
|
38
39
|
from sky.utils import annotations
|
|
39
40
|
from sky.utils import common_utils
|
|
@@ -119,12 +120,27 @@ def _create_aws_object(creation_fn_or_cls: Callable[[], T],
|
|
|
119
120
|
f'{common_utils.format_exception(e)}.')
|
|
120
121
|
|
|
121
122
|
|
|
123
|
+
def get_workspace_profile() -> Optional[str]:
    """Return the AWS profile configured for the active workspace, if any.

    Looks up the ``'aws'`` entry via ``skypilot_config.get_workspace_cloud``
    and reads its ``'profile'`` key.

    Returns:
        The value stored under ``'profile'``, or None when the key is absent.
    """
    aws_workspace_config = skypilot_config.get_workspace_cloud('aws')
    return aws_workspace_config.get('profile', None)
|
|
126
|
+
|
|
127
|
+
|
|
122
128
|
# The LRU cache needs to be thread-local to avoid multiple threads sharing the
|
|
123
129
|
# same session object, which is not guaranteed to be thread-safe.
|
|
124
130
|
@_thread_local_lru_cache()
|
|
125
|
-
def session(check_credentials: bool = True):
|
|
126
|
-
"""Create an AWS session.
|
|
127
|
-
|
|
131
|
+
def session(check_credentials: bool = True, profile: Optional[str] = None):
|
|
132
|
+
"""Create an AWS session.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
check_credentials: Whether to check if credentials are available.
|
|
136
|
+
profile: AWS profile name to use. If None, uses default credentials.
|
|
137
|
+
"""
|
|
138
|
+
if profile is not None:
|
|
139
|
+
logger.debug(f'Using AWS profile \'{profile}\'.')
|
|
140
|
+
s = _create_aws_object(
|
|
141
|
+
lambda: boto3.session.Session(profile_name=profile), 'session')
|
|
142
|
+
else:
|
|
143
|
+
s = _create_aws_object(boto3.session.Session, 'session')
|
|
128
144
|
if check_credentials and s.get_credentials() is None:
|
|
129
145
|
# s.get_credentials() can be None if there are actually no credentials,
|
|
130
146
|
# or if we fail to get credentials from IMDS (e.g. due to throttling).
|
|
@@ -180,13 +196,14 @@ def resource(service_name: str, **kwargs):
|
|
|
180
196
|
kwargs['config'] = config
|
|
181
197
|
|
|
182
198
|
check_credentials = kwargs.pop('check_credentials', True)
|
|
199
|
+
profile = get_workspace_profile()
|
|
183
200
|
|
|
184
201
|
# Need to use the client retrieved from the per-thread session to avoid
|
|
185
202
|
# thread-safety issues (Directly creating the client with boto3.resource()
|
|
186
203
|
# is not thread-safe). Reference: https://stackoverflow.com/a/59635814
|
|
187
204
|
return _create_aws_object(
|
|
188
|
-
lambda: session(check_credentials=check_credentials).
|
|
189
|
-
|
|
205
|
+
lambda: session(check_credentials=check_credentials, profile=profile).
|
|
206
|
+
resource(service_name, **kwargs), 'resource')
|
|
190
207
|
|
|
191
208
|
|
|
192
209
|
# New typing overloads can be added as needed.
|
|
@@ -221,14 +238,15 @@ def client(service_name: str, **kwargs):
|
|
|
221
238
|
_assert_kwargs_builtin_type(kwargs)
|
|
222
239
|
|
|
223
240
|
check_credentials = kwargs.pop('check_credentials', True)
|
|
241
|
+
profile = get_workspace_profile()
|
|
224
242
|
|
|
225
243
|
# Need to use the client retrieved from the per-thread session to avoid
|
|
226
244
|
# thread-safety issues (Directly creating the client with boto3.client() is
|
|
227
245
|
# not thread-safe). Reference: https://stackoverflow.com/a/59635814
|
|
228
246
|
|
|
229
247
|
return _create_aws_object(
|
|
230
|
-
lambda: session(check_credentials=check_credentials).
|
|
231
|
-
|
|
248
|
+
lambda: session(check_credentials=check_credentials, profile=profile).
|
|
249
|
+
client(service_name, **kwargs), 'client')
|
|
232
250
|
|
|
233
251
|
|
|
234
252
|
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""CoreWeave cloud adaptor."""
|
|
2
|
+
|
|
3
|
+
import configparser
|
|
4
|
+
import contextlib
|
|
5
|
+
import os
|
|
6
|
+
import threading
|
|
7
|
+
from typing import Dict, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from sky import exceptions
|
|
10
|
+
from sky import sky_logging
|
|
11
|
+
from sky.adaptors import common
|
|
12
|
+
from sky.clouds import cloud
|
|
13
|
+
from sky.utils import annotations
|
|
14
|
+
from sky.utils import ux_utils
|
|
15
|
+
|
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
|
17
|
+
|
|
18
|
+
# Profile name used both for the boto3 session (see session()) and for the
# `[cw]` / `[profile cw]` section lookups in the files below.
COREWEAVE_PROFILE_NAME = 'cw'
# AWS-style shared-credentials and config files dedicated to CoreWeave.
# The paths are kept unexpanded ('~'); callers in this module expand them
# with os.path.expanduser() before touching the filesystem.
COREWEAVE_CREDENTIALS_PATH = '~/.coreweave/cw.credentials'
COREWEAVE_CONFIG_PATH = '~/.coreweave/cw.config'
# Display name; also used in the NotSupportedError raised by
# check_credentials().
NAME = 'CoreWeave'
DEFAULT_REGION = 'US-EAST-01A'
# Endpoint returned by get_endpoint() when the config file does not provide
# an `endpoint_url`.
_DEFAULT_ENDPOINT = 'https://cwobject.com'
# Prefix for each command line in the hint text built by
# check_storage_credentials().
_INDENT_PREFIX = ' '

# The trailing space after the first sentence matters: the two literals are
# concatenated, and without it the message reads "...CoreWeave.Try pip ...".
_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for CoreWeave. '
                         'Try pip install "skypilot[coreweave]"')
|
|
28
|
+
|
|
29
|
+
# boto3 / botocore are wrapped in common.LazyImport and share one error
# message that points the user at the `skypilot[coreweave]` extra.
boto3 = common.LazyImport(
    'boto3', import_error_message=_IMPORT_ERROR_MESSAGE)
botocore = common.LazyImport(
    'botocore', import_error_message=_IMPORT_ERROR_MESSAGE)

# Passed to common.load_lazy_modules() by botocore_exceptions().
_LAZY_MODULES = (boto3, botocore)
# Reentrant lock held by session() while the boto3 session is constructed.
_session_creation_lock = threading.RLock()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@contextlib.contextmanager
def _load_cw_credentials_env():
    """Temporarily point the AWS SDK at the CoreWeave credential files.

    On entry, ``AWS_SHARED_CREDENTIALS_FILE`` and ``AWS_CONFIG_FILE`` are set
    to ``COREWEAVE_CREDENTIALS_PATH`` and ``COREWEAVE_CONFIG_PATH``. On exit,
    whether the body finished normally or raised, each variable is restored
    to the value it had before entry, or removed if it was previously unset.

    Note that this changes ``os.environ`` for the whole process while the
    context is active.
    """
    overrides = {
        'AWS_SHARED_CREDENTIALS_FILE': COREWEAVE_CREDENTIALS_PATH,
        'AWS_CONFIG_FILE': COREWEAVE_CONFIG_PATH,
    }
    # Snapshot before overriding so the previous state can be put back.
    previous = {name: os.environ.get(name) for name in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for name, old_value in previous.items():
            if old_value is None:
                # pop() instead of `del`: if the variable was already
                # removed inside the context, `del` would raise KeyError
                # here and hide whatever exception is propagating.
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_coreweave_credentials(boto3_session):
    """Resolve the CoreWeave credentials held by a boto3 session.

    The lookup runs while the AWS credential/config environment variables
    are redirected to the CoreWeave files (see _load_cw_credentials_env).

    Args:
        boto3_session: The boto3 session object to query.

    Returns:
        The result of ``get_frozen_credentials()`` on the session's
        credentials (a botocore ReadOnlyCredentials object).

    Raises:
        ValueError: If the session reports no credentials.
    """
    with _load_cw_credentials_env():
        creds = boto3_session.get_credentials()
        if creds is not None:
            return creds.get_frozen_credentials()
        with ux_utils.print_exception_no_traceback():
            raise ValueError('CoreWeave credentials not found. Run '
                             '`sky check` to verify credentials are '
                             'correctly set up.')
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@annotations.lru_cache(scope='global')
def session():
    """Return a boto3 session bound to the CoreWeave profile."""
    # boto3 session construction is not thread-safe
    # (https://github.com/boto/boto3/issues/1592), so it is serialized with
    # a reentrant lock. The resulting session object is treated as safe to
    # share, which is why the result can be cached by lru_cache().
    with _session_creation_lock, _load_cw_credentials_env():
        return boto3.session.Session(profile_name=COREWEAVE_PROFILE_NAME)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@annotations.lru_cache(scope='global')
def resource(resource_name: str, **kwargs):
    """Create a boto3 resource that talks to CoreWeave.

    Args:
        resource_name: CoreWeave resource name (e.g., 's3').
        kwargs: Extra keyword arguments forwarded to ``Session.resource``.
    """
    # The resource is built from the cached session() object rather than
    # from the module-level boto3.resource() helper, which is not
    # thread-safe. Reference: https://stackoverflow.com/a/59635814
    cw_session = session()
    frozen_creds = get_coreweave_credentials(cw_session)
    endpoint_url = get_endpoint()

    resource_options = dict(
        endpoint_url=endpoint_url,
        aws_access_key_id=frozen_creds.access_key,
        aws_secret_access_key=frozen_creds.secret_key,
        region_name='auto',
        config=botocore.config.Config(s3={'addressing_style': 'virtual'}),
    )
    return cw_session.resource(resource_name, **resource_options, **kwargs)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@annotations.lru_cache(scope='global')
def client(service_name: str):
    """Create a boto3 client for a CoreWeave service.

    Args:
        service_name: CoreWeave service name (e.g., 's3').
    """
    # The client is built from the cached session() object rather than from
    # the module-level boto3.client() helper, which is not thread-safe.
    # Reference: https://stackoverflow.com/a/59635814
    cw_session = session()
    frozen_creds = get_coreweave_credentials(cw_session)
    endpoint_url = get_endpoint()

    virtual_addressing = botocore.config.Config(
        s3={'addressing_style': 'virtual'})
    return cw_session.client(
        service_name,
        endpoint_url=endpoint_url,
        aws_access_key_id=frozen_creds.access_key,
        aws_secret_access_key=frozen_creds.secret_key,
        region_name='auto',
        config=virtual_addressing,
    )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def botocore_exceptions():
    """Return the ``botocore.exceptions`` module.

    The import happens inside the function so that botocore is only needed
    when this accessor is actually called.
    """
    # pylint: disable=import-outside-toplevel
    from botocore import exceptions as botocore_exc_module
    return botocore_exc_module
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def get_endpoint():
    """Return the CoreWeave object-storage endpoint URL.

    Reads ``endpoint_url`` from the ``[profile cw]`` section of
    COREWEAVE_CONFIG_PATH, an AWS-style config file of the form:

        [profile cw]
        endpoint_url = https://cwobject.com
        s3 =
            addressing_style = virtual

    Returns:
        str: The endpoint URL from the config file, or the default endpoint
        if the file doesn't exist, cannot be parsed, has no such
        section/option, or the configured value is empty.
    """
    config_path = os.path.expanduser(COREWEAVE_CONFIG_PATH)
    if not os.path.isfile(config_path):
        return _DEFAULT_ENDPOINT

    # RawConfigParser: AWS config values are not '%'-interpolated, so a URL
    # containing '%' must be returned verbatim instead of being rejected as
    # an interpolation error.
    parser = configparser.RawConfigParser()
    try:
        # Explicit utf-8, matching the other readers of this file in the
        # module, instead of depending on the locale's default encoding.
        parser.read(config_path, encoding='utf-8')
        endpoint = parser.get(f'profile {COREWEAVE_PROFILE_NAME}',
                              'endpoint_url',
                              fallback='')
    except (configparser.Error, OSError, UnicodeDecodeError) as e:
        logger.warning(f'Failed to parse CoreWeave config file: {e}. '
                       f'Using default endpoint: {_DEFAULT_ENDPOINT}')
        return _DEFAULT_ENDPOINT

    # A missing option and an empty `endpoint_url =` both fall back to the
    # default; an empty string is never a usable endpoint.
    endpoint = endpoint.strip()
    return endpoint if endpoint else _DEFAULT_ENDPOINT
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def check_credentials(
        cloud_capability: cloud.CloudCapability) -> Tuple[bool, Optional[str]]:
    """Check CoreWeave credentials for the requested capability.

    Only the STORAGE capability is handled; it is delegated to
    check_storage_credentials().

    Raises:
        exceptions.NotSupportedError: For any other capability.
    """
    if cloud_capability != cloud.CloudCapability.STORAGE:
        raise exceptions.NotSupportedError(
            f'{NAME} does not support {cloud_capability}.')
    return check_storage_credentials()
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def check_storage_credentials() -> Tuple[bool, Optional[str]]:
    """Checks if the user has access credentials to CoreWeave Object Storage.

    Returns:
        A ``(ok, hint)`` tuple. ``ok`` is True and ``hint`` is None when the
        CoreWeave profile is present in both the credentials file and the
        config file. Otherwise ``ok`` is False and ``hint`` names each
        missing piece followed by the commands to set it up.
    """
    cred_ok = coreweave_profile_in_cred()
    config_ok = coreweave_profile_in_config()

    # One sentence per missing profile, in credentials-then-config order.
    problems = []
    if not cred_ok:
        problems.append(f'[{COREWEAVE_PROFILE_NAME}] profile is not set in '
                        f'{COREWEAVE_CREDENTIALS_PATH}.')
    if not config_ok:
        problems.append(f'[{COREWEAVE_PROFILE_NAME}] profile is not set in '
                        f'{COREWEAVE_CONFIG_PATH}.')
    if not problems:
        return (True, None)

    pieces = [' Additionally, '.join(problems), ' Run the following commands:']
    if not cred_ok:
        pieces.append(f'\n{_INDENT_PREFIX} $ pip install boto3')
        pieces.append(f'\n{_INDENT_PREFIX} $ AWS_SHARED_CREDENTIALS_FILE='
                      f'{COREWEAVE_CREDENTIALS_PATH} aws configure --profile '
                      f'{COREWEAVE_PROFILE_NAME}')
    if not config_ok:
        pieces.append(f'\n{_INDENT_PREFIX} $ AWS_CONFIG_FILE='
                      f'{COREWEAVE_CONFIG_PATH} aws configure set endpoint_url'
                      f' <ENDPOINT_URL> --profile '
                      f'{COREWEAVE_PROFILE_NAME}')
        pieces.append(f'\n{_INDENT_PREFIX} $ AWS_CONFIG_FILE='
                      f'{COREWEAVE_CONFIG_PATH} aws configure set '
                      f's3.addressing_style virtual --profile '
                      f'{COREWEAVE_PROFILE_NAME}')
    pieces.append(f'\n{_INDENT_PREFIX}For more info: ')
    pieces.append(
        'https://docs.coreweave.com/docs/products/storage/object-storage/get-started-caios'  # pylint: disable=line-too-long
    )
    return (False, ''.join(pieces))
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def coreweave_profile_in_config() -> bool:
    """Report whether the config file mentions the CoreWeave profile.

    Returns:
        True if COREWEAVE_CONFIG_PATH (with ``~`` expanded) is a regular
        file and at least one of its lines contains ``[profile <name>]``
        for COREWEAVE_PROFILE_NAME; False otherwise.
    """
    conf_path = os.path.expanduser(COREWEAVE_CONFIG_PATH)
    if not os.path.isfile(conf_path):
        return False
    section_header = f'[profile {COREWEAVE_PROFILE_NAME}]'
    with open(conf_path, 'r', encoding='utf-8') as config_file:
        # any() stops at the first matching line.
        return any(section_header in line for line in config_file)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def coreweave_profile_in_cred() -> bool:
    """Report whether the credentials file mentions the CoreWeave profile.

    Returns:
        True if COREWEAVE_CREDENTIALS_PATH (with ``~`` expanded) is a
        regular file and at least one of its lines contains ``[<name>]``
        for COREWEAVE_PROFILE_NAME; False otherwise.
    """
    cred_path = os.path.expanduser(COREWEAVE_CREDENTIALS_PATH)
    if not os.path.isfile(cred_path):
        return False
    section_header = f'[{COREWEAVE_PROFILE_NAME}]'
    with open(cred_path, 'r', encoding='utf-8') as cred_file:
        # any() stops at the first matching line.
        return any(section_header in line for line in cred_file)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def get_credential_file_mounts() -> Dict[str, str]:
    """Returns credential file mounts for CoreWeave.

    Returns:
        Dict[str, str]: A dictionary with one entry each for the CoreWeave
        credentials file and config file; every path maps to itself.
    """
    credential_paths = (COREWEAVE_CREDENTIALS_PATH, COREWEAVE_CONFIG_PATH)
    return {path: path for path in credential_paths}
|
sky/backends/backend_utils.py
CHANGED
|
@@ -3157,6 +3157,7 @@ def get_clusters(
|
|
|
3157
3157
|
all_users: bool = True,
|
|
3158
3158
|
include_credentials: bool = False,
|
|
3159
3159
|
summary_response: bool = False,
|
|
3160
|
+
include_handle: bool = True,
|
|
3160
3161
|
# Internal only:
|
|
3161
3162
|
# pylint: disable=invalid-name
|
|
3162
3163
|
_include_is_managed: bool = False,
|
|
@@ -3240,13 +3241,13 @@ def get_clusters(
|
|
|
3240
3241
|
"""Add resource str to record"""
|
|
3241
3242
|
for record in _get_records_with_handle(records):
|
|
3242
3243
|
handle = record['handle']
|
|
3243
|
-
|
|
3244
|
-
|
|
3245
|
-
handle,
|
|
3246
|
-
record[
|
|
3247
|
-
'resources_str_full'] = resources_utils.get_readable_resources_repr(
|
|
3248
|
-
handle, simplify=False)
|
|
3244
|
+
resource_str_simple, resource_str_full = (
|
|
3245
|
+
resources_utils.get_readable_resources_repr(
|
|
3246
|
+
handle, simplified_only=summary_response))
|
|
3247
|
+
record['resources_str'] = resource_str_simple
|
|
3249
3248
|
if not summary_response:
|
|
3249
|
+
assert resource_str_full is not None
|
|
3250
|
+
record['resources_str_full'] = resource_str_full
|
|
3250
3251
|
record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
|
|
3251
3252
|
|
|
3252
3253
|
def _update_records_with_credentials(
|
|
@@ -3313,6 +3314,8 @@ def get_clusters(
|
|
|
3313
3314
|
record['accelerators'] = (
|
|
3314
3315
|
f'{handle.launched_resources.accelerators}'
|
|
3315
3316
|
if handle.launched_resources.accelerators else None)
|
|
3317
|
+
if not include_handle:
|
|
3318
|
+
record.pop('handle', None)
|
|
3316
3319
|
|
|
3317
3320
|
# Add handle info to the records
|
|
3318
3321
|
_update_records_with_handle_info(records)
|
|
@@ -2369,9 +2369,8 @@ class RetryingVmProvisioner(object):
|
|
|
2369
2369
|
for (resource, exception) in resource_exceptions.items():
|
|
2370
2370
|
table.add_row([
|
|
2371
2371
|
resource.infra.formatted_str(),
|
|
2372
|
-
resources_utils.format_resource(
|
|
2373
|
-
|
|
2374
|
-
exception
|
|
2372
|
+
resources_utils.format_resource(
|
|
2373
|
+
resource, simplified_only=True)[0], exception
|
|
2375
2374
|
])
|
|
2376
2375
|
# Set the max width of REASON column to 80 to avoid the table
|
|
2377
2376
|
# being wrapped in a unreadable way.
|
sky/check.py
CHANGED
|
@@ -14,6 +14,7 @@ from sky import global_user_state
|
|
|
14
14
|
from sky import sky_logging
|
|
15
15
|
from sky import skypilot_config
|
|
16
16
|
from sky.adaptors import cloudflare
|
|
17
|
+
from sky.adaptors import coreweave
|
|
17
18
|
from sky.clouds import cloud as sky_cloud
|
|
18
19
|
from sky.skylet import constants
|
|
19
20
|
from sky.utils import common_utils
|
|
@@ -33,7 +34,8 @@ def _get_workspace_allowed_clouds(workspace: str) -> List[str]:
|
|
|
33
34
|
# clouds. Also validate names with get_cloud_tuple.
|
|
34
35
|
config_allowed_cloud_names = skypilot_config.get_nested(
|
|
35
36
|
('allowed_clouds',),
|
|
36
|
-
[repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
37
|
+
[repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
38
|
+
[cloudflare.NAME, coreweave.NAME])
|
|
37
39
|
# filter out the clouds that are disabled in the workspace config
|
|
38
40
|
workspace_disabled_clouds = []
|
|
39
41
|
for cloud in config_allowed_cloud_names:
|
|
@@ -81,7 +83,7 @@ def check_capabilities(
|
|
|
81
83
|
|
|
82
84
|
def get_all_clouds() -> Tuple[str, ...]:
|
|
83
85
|
return tuple([repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
84
|
-
[cloudflare.NAME])
|
|
86
|
+
[cloudflare.NAME, coreweave.NAME])
|
|
85
87
|
|
|
86
88
|
def _execute_check_logic_for_workspace(
|
|
87
89
|
current_workspace_name: str,
|
|
@@ -121,9 +123,12 @@ def check_capabilities(
|
|
|
121
123
|
cloud_name: str
|
|
122
124
|
) -> Tuple[str, Union[sky_clouds.Cloud, ModuleType]]:
|
|
123
125
|
# Validates cloud_name and returns a tuple of the cloud's name and
|
|
124
|
-
# the cloud object. Includes special handling for Cloudflare
|
|
126
|
+
# the cloud object. Includes special handling for Cloudflare and
|
|
127
|
+
# CoreWeave.
|
|
125
128
|
if cloud_name.lower().startswith('cloudflare'):
|
|
126
129
|
return cloudflare.NAME, cloudflare
|
|
130
|
+
elif cloud_name.lower().startswith('coreweave'):
|
|
131
|
+
return coreweave.NAME, coreweave
|
|
127
132
|
else:
|
|
128
133
|
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
|
129
134
|
assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
|
|
@@ -219,23 +224,24 @@ def check_capabilities(
|
|
|
219
224
|
# allowed_clouds in config.yaml, it will be disabled.
|
|
220
225
|
all_enabled_clouds: Set[str] = set()
|
|
221
226
|
for capability in capabilities:
|
|
222
|
-
# Cloudflare
|
|
223
|
-
# should not be inserted into the DB
|
|
224
|
-
# other code would error out when it's
|
|
225
|
-
# registry).
|
|
227
|
+
# Cloudflare and CoreWeave are not real clouds in
|
|
228
|
+
# registry.CLOUD_REGISTRY, and should not be inserted into the DB
|
|
229
|
+
# (otherwise `sky launch` and other code would error out when it's
|
|
230
|
+
# trying to look it up in the registry).
|
|
226
231
|
enabled_clouds_set = {
|
|
227
232
|
cloud for cloud, capabilities in enabled_clouds.items()
|
|
228
|
-
if capability in capabilities and
|
|
229
|
-
|
|
233
|
+
if capability in capabilities and not cloud.startswith(
|
|
234
|
+
'Cloudflare') and not cloud.startswith('CoreWeave')
|
|
230
235
|
}
|
|
231
236
|
disabled_clouds_set = {
|
|
232
237
|
cloud for cloud, capabilities in disabled_clouds.items()
|
|
233
|
-
if capability in capabilities and
|
|
234
|
-
|
|
238
|
+
if capability in capabilities and not cloud.startswith(
|
|
239
|
+
'Cloudflare') and not cloud.startswith('CoreWeave')
|
|
235
240
|
}
|
|
236
241
|
config_allowed_clouds_set = {
|
|
237
242
|
cloud for cloud in config_allowed_cloud_names
|
|
238
|
-
if not cloud.startswith('Cloudflare')
|
|
243
|
+
if not cloud.startswith('Cloudflare') and
|
|
244
|
+
not cloud.startswith('CoreWeave')
|
|
239
245
|
}
|
|
240
246
|
previously_enabled_clouds_set = {
|
|
241
247
|
repr(cloud)
|
|
@@ -430,6 +436,12 @@ def get_cloud_credential_file_mounts(
|
|
|
430
436
|
if r2_is_enabled:
|
|
431
437
|
r2_credential_mounts = cloudflare.get_credential_file_mounts()
|
|
432
438
|
file_mounts.update(r2_credential_mounts)
|
|
439
|
+
|
|
440
|
+
# Similarly, handle CoreWeave storage credentials
|
|
441
|
+
coreweave_is_enabled, _ = coreweave.check_storage_credentials()
|
|
442
|
+
if coreweave_is_enabled:
|
|
443
|
+
coreweave_credential_mounts = coreweave.get_credential_file_mounts()
|
|
444
|
+
file_mounts.update(coreweave_credential_mounts)
|
|
433
445
|
return file_mounts
|
|
434
446
|
|
|
435
447
|
|
|
@@ -494,7 +506,7 @@ def _print_checked_cloud(
|
|
|
494
506
|
style_str = f'{colorama.Fore.GREEN}{colorama.Style.NORMAL}'
|
|
495
507
|
status_msg = 'enabled'
|
|
496
508
|
capability_string = f'[{", ".join(enabled_capabilities)}]'
|
|
497
|
-
if verbose and cloud is not cloudflare:
|
|
509
|
+
if verbose and cloud is not cloudflare and cloud is not coreweave:
|
|
498
510
|
activated_account = cloud.get_active_user_identity_str()
|
|
499
511
|
if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
|
|
500
512
|
detail_string = _format_context_details(cloud_tuple[1],
|
sky/client/cli/command.py
CHANGED
|
@@ -1383,7 +1383,26 @@ def _handle_jobs_queue_request(
|
|
|
1383
1383
|
try:
|
|
1384
1384
|
if not is_called_by_user:
|
|
1385
1385
|
usage_lib.messages.usage.set_internal()
|
|
1386
|
-
|
|
1386
|
+
# Call both stream_and_get functions in parallel
|
|
1387
|
+
def get_jobs_queue_result():
|
|
1388
|
+
return sdk.stream_and_get(request_id)
|
|
1389
|
+
|
|
1390
|
+
def get_pool_status_result():
|
|
1391
|
+
if pool_status_request_id is not None:
|
|
1392
|
+
try:
|
|
1393
|
+
return sdk.stream_and_get(pool_status_request_id)
|
|
1394
|
+
except Exception: # pylint: disable=broad-except
|
|
1395
|
+
# If getting pool status fails, just continue without it
|
|
1396
|
+
return None
|
|
1397
|
+
return None
|
|
1398
|
+
|
|
1399
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
1400
|
+
jobs_future = executor.submit(get_jobs_queue_result)
|
|
1401
|
+
pool_status_future = executor.submit(get_pool_status_result)
|
|
1402
|
+
|
|
1403
|
+
result = jobs_future.result()
|
|
1404
|
+
pool_status_result = pool_status_future.result()
|
|
1405
|
+
|
|
1387
1406
|
if isinstance(result, tuple):
|
|
1388
1407
|
managed_jobs_, total, status_counts, _ = result
|
|
1389
1408
|
if only_in_progress:
|
|
@@ -1400,13 +1419,6 @@ def _handle_jobs_queue_request(
|
|
|
1400
1419
|
managed_jobs_ = result
|
|
1401
1420
|
num_in_progress_jobs = len(
|
|
1402
1421
|
set(job['job_id'] for job in managed_jobs_))
|
|
1403
|
-
# Try to get pool status if request was made
|
|
1404
|
-
if pool_status_request_id is not None:
|
|
1405
|
-
try:
|
|
1406
|
-
pool_status_result = sdk.stream_and_get(pool_status_request_id)
|
|
1407
|
-
except Exception: # pylint: disable=broad-except
|
|
1408
|
-
# If getting pool status fails, just continue without it
|
|
1409
|
-
pool_status_result = None
|
|
1410
1422
|
except exceptions.ClusterNotUpError as e:
|
|
1411
1423
|
controller_status = e.cluster_status
|
|
1412
1424
|
msg = str(e)
|
|
@@ -3452,7 +3464,7 @@ def _down_or_stop_clusters(
|
|
|
3452
3464
|
click.echo(f' {name} ({first})')
|
|
3453
3465
|
|
|
3454
3466
|
if failures:
|
|
3455
|
-
|
|
3467
|
+
click.echo('Cluster(s) failed. See details above.')
|
|
3456
3468
|
|
|
3457
3469
|
|
|
3458
3470
|
@cli.command(cls=_DocumentedCodeCommand)
|
|
@@ -4253,6 +4265,10 @@ def volumes():
|
|
|
4253
4265
|
pass
|
|
4254
4266
|
|
|
4255
4267
|
|
|
4268
|
+
# Add 'volume' as an alias for 'volumes'
|
|
4269
|
+
cli.add_command(volumes, name='volume')
|
|
4270
|
+
|
|
4271
|
+
|
|
4256
4272
|
@volumes.command('apply', cls=_DocumentedCodeCommand)
|
|
4257
4273
|
@flags.config_option(expose_value=False)
|
|
4258
4274
|
@click.argument('entrypoint',
|
|
@@ -4661,7 +4677,8 @@ def jobs_launch(
|
|
|
4661
4677
|
else:
|
|
4662
4678
|
# TODO(tian): This can be very long. Considering have a "group id"
|
|
4663
4679
|
# and query all job ids with the same group id.
|
|
4664
|
-
|
|
4680
|
+
# Sort job ids to ensure consistent ordering.
|
|
4681
|
+
job_ids_str = ','.join(map(str, sorted(job_ids)))
|
|
4665
4682
|
click.secho(
|
|
4666
4683
|
f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
|
|
4667
4684
|
f'{job_ids_str}{colorama.Style.RESET_ALL}.'
|
|
@@ -4775,19 +4792,28 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
|
|
4775
4792
|
fields = fields + _USER_NAME_FIELD
|
|
4776
4793
|
if verbose:
|
|
4777
4794
|
fields = fields + _USER_HASH_FIELD
|
|
4778
|
-
|
|
4779
|
-
|
|
4780
|
-
|
|
4781
|
-
|
|
4782
|
-
|
|
4783
|
-
|
|
4784
|
-
|
|
4785
|
-
|
|
4786
|
-
|
|
4787
|
-
|
|
4788
|
-
|
|
4789
|
-
#
|
|
4790
|
-
|
|
4795
|
+
# Call both managed_jobs.queue and managed_jobs.pool_status in parallel
|
|
4796
|
+
def get_managed_jobs_queue():
|
|
4797
|
+
return managed_jobs.queue(refresh=refresh,
|
|
4798
|
+
skip_finished=skip_finished,
|
|
4799
|
+
all_users=all_users,
|
|
4800
|
+
limit=max_num_jobs_to_show,
|
|
4801
|
+
fields=fields)
|
|
4802
|
+
|
|
4803
|
+
def get_pool_status():
|
|
4804
|
+
try:
|
|
4805
|
+
return managed_jobs.pool_status(pool_names=None)
|
|
4806
|
+
except Exception: # pylint: disable=broad-except
|
|
4807
|
+
# If pool_status fails, we'll just skip the worker information
|
|
4808
|
+
return None
|
|
4809
|
+
|
|
4810
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
4811
|
+
managed_jobs_future = executor.submit(get_managed_jobs_queue)
|
|
4812
|
+
pool_status_future = executor.submit(get_pool_status)
|
|
4813
|
+
|
|
4814
|
+
managed_jobs_request_id = managed_jobs_future.result()
|
|
4815
|
+
pool_status_request_id = pool_status_future.result()
|
|
4816
|
+
|
|
4791
4817
|
num_jobs, msg = _handle_jobs_queue_request(
|
|
4792
4818
|
managed_jobs_request_id,
|
|
4793
4819
|
pool_status_request_id=pool_status_request_id,
|
|
@@ -6364,7 +6390,9 @@ INT_OR_NONE = IntOrNone()
|
|
|
6364
6390
|
is_flag=True,
|
|
6365
6391
|
default=False,
|
|
6366
6392
|
required=False,
|
|
6367
|
-
help='Show requests of all statuses
|
|
6393
|
+
help=('Show requests of all statuses, including finished ones '
|
|
6394
|
+
'(SUCCEEDED, FAILED, CANCELLED). By default, only active '
|
|
6395
|
+
'requests (PENDING, RUNNING) are shown.'))
|
|
6368
6396
|
@click.option(
|
|
6369
6397
|
'--limit',
|
|
6370
6398
|
'-l',
|