skypilot-nightly 1.0.0.dev20250715__py3-none-any.whl → 1.0.0.dev20250717__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +6 -0
- sky/backends/backend.py +8 -4
- sky/backends/cloud_vm_ray_backend.py +50 -1
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +2 -1
- sky/catalog/common.py +60 -50
- sky/catalog/data_fetchers/fetch_gcp.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +308 -0
- sky/catalog/gcp_catalog.py +24 -7
- sky/catalog/kubernetes_catalog.py +5 -1
- sky/client/cli/command.py +180 -77
- sky/client/cli/git.py +549 -0
- sky/client/common.py +1 -1
- sky/clouds/gcp.py +1 -1
- sky/clouds/hyperbolic.py +2 -0
- sky/clouds/nebius.py +6 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Et5IQ5Y3WvH608nXClo4z/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-d8c6404a7c6fffe6.js +11 -0
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +60 -0
- sky/dashboard/out/_next/static/chunks/2641.35edc9ccaeaad9e3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.b3cc2bc1d49d2c3c.js → 3785.95b94f18aaec7233.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +16 -0
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9025.a7c44babfe56ce09.js → 9025.133e9ba5c780afeb.js} +1 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-e0f63ea4704026ad.js → _app-771a40cde532309b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-14d404b7dd28502a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-cd43fb3c122eedde.js → users-19e98664bdd61643.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-06bde99155fa6292.js → workspaces-a1e43d9ef51a9cea.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-4a336bff3bcec29a.js → webpack-c3b45b7b0eaef66f.js} +1 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/execution.py +5 -1
- sky/provision/hyperbolic/__init__.py +1 -0
- sky/provision/hyperbolic/instance.py +10 -0
- sky/provision/kubernetes/utils.py +6 -0
- sky/server/common.py +4 -3
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +3 -1
- sky/task.py +12 -2
- sky/utils/command_runner.py +144 -35
- sky/utils/controller_utils.py +4 -3
- sky/utils/git.py +9 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/schemas.py +15 -1
- sky/utils/tempstore.py +20 -1
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/METADATA +5 -3
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/RECORD +79 -75
- sky/dashboard/out/_next/static/chunks/1141-726e5a3f00b67185.js +0 -11
- sky/dashboard/out/_next/static/chunks/1691.44e378727a41f3b5.js +0 -21
- sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +0 -11
- sky/dashboard/out/_next/static/chunks/4697.f5421144224da9fc.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +0 -1
- sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4608dc89f95eba89.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-980d6f6b64ca7833.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +0 -1
- sky/dashboard/out/_next/static/css/eacc7d65a8686c76.css +0 -3
- sky/dashboard/out/_next/static/y4pSeZ-9XymSDfPlcWhVO/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{y4pSeZ-9XymSDfPlcWhVO → Et5IQ5Y3WvH608nXClo4z}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1043-5e5ef6198735ff7e.js → 1043-90a88c46f27b3df5.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-4d1f786e83bd9ffe.js → 1871-76491ac174a95278.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3698-52ad1ca228faa776.js → 3698-9fa11dafb5cad4a6.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6601-d38d10f957dff832.js → 6601-d4a381403a8bae91.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{8969-13bb52ce3cffa4e3.js → 8969-743abf4bc86baf48.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-8e25c8ea0baa271a.js → 938-6a9ffdaa21eee969.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{9470-21d059a1dfa03f61.js → 9470-b6f6a35283863a6f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0fbfb1dd0b08c90c.js → [cluster]-0c37ee1ac5f3474d.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250715.dist-info → skypilot_nightly-1.0.0.dev20250717.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '5ed1775c386bbd0cc4b9d1c80fc0d2d91b352870'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250715'
+__version__ = '1.0.0.dev20250717'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -88,6 +88,7 @@ from sky.admin_policy import UserRequest
 from sky.catalog import list_accelerators
 from sky.client.sdk import api_cancel
 from sky.client.sdk import api_info
+from sky.client.sdk import api_login
 from sky.client.sdk import api_server_logs
 from sky.client.sdk import api_start
 from sky.client.sdk import api_status
@@ -206,6 +207,7 @@ __all__ = [
     'api_status',
     'api_cancel',
     'api_info',
+    'api_login',
     'api_start',
     'api_stop',
     'api_server_logs',
sky/adaptors/nebius.py
CHANGED
@@ -89,6 +89,12 @@ def iam():
     return iam_v1
 
 
+def billing():
+    # pylint: disable=import-outside-toplevel
+    from nebius.api.nebius.billing import v1alpha1 as billing_v1alpha1
+    return billing_v1alpha1
+
+
 def nebius_common():
     # pylint: disable=import-outside-toplevel
     from nebius.api.nebius.common import v1 as common_v1
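Like the surrounding accessors (compute(), iam(), nebius_common()), the new billing() helper defers importing the optional nebius SDK module until first use, so importing sky does not require the SDK to be installed. A minimal, self-contained sketch of this lazy-import adaptor pattern (the module here is an illustrative stand-in, not SkyPilot's):

import functools


@functools.lru_cache(maxsize=1)
def heavy_sdk():
    """Import an optional dependency only when first called."""
    # pylint: disable=import-outside-toplevel
    import json as sdk  # stand-in for a heavy/optional SDK module
    return sdk


# Call sites use heavy_sdk().<member> instead of a top-level import:
print(heavy_sdk().dumps({'lazy': True}))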
sky/backends/backend.py
CHANGED
@@ -1,6 +1,6 @@
 """Sky backend interface."""
 import typing
-from typing import Dict, Generic, Optional, Tuple
+from typing import Any, Dict, Generic, Optional, Tuple, Union
 
 from sky.usage import usage_lib
 from sky.utils import cluster_utils
@@ -90,8 +90,10 @@ class Backend(Generic[_ResourceHandleType]):
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_workdir')
-    def sync_workdir(self, handle: _ResourceHandleType,
-    [old line 94 elided in this rendering]
+    def sync_workdir(self, handle: _ResourceHandleType,
+                     workdir: Union[Path, Dict[str, Any]],
+                     envs_and_secrets: Dict[str, str]) -> None:
+        return self._sync_workdir(handle, workdir, envs_and_secrets)
 
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
@@ -165,7 +167,9 @@ class Backend(Generic[_ResourceHandleType]):
     ) -> Tuple[Optional[_ResourceHandleType], bool]:
         raise NotImplementedError
 
-    def _sync_workdir(self, handle: _ResourceHandleType,
+    def _sync_workdir(self, handle: _ResourceHandleType,
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         raise NotImplementedError
 
     def _sync_file_mounts(
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -3240,10 +3240,59 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         common_utils.remove_file_if_exists(lock_path)
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         # Even though provision() takes care of it, there may be cases where
         # this function is called in isolation, without calling provision(),
         # e.g., in CLI. So we should rerun rsync_up.
+        if isinstance(workdir, dict):
+            self._sync_git_workdir(handle, envs_and_secrets)
+        else:
+            self._sync_path_workdir(handle, workdir)
+
+    def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
+                          envs_and_secrets: Dict[str, str]) -> None:
+        style = colorama.Style
+        ip_list = handle.external_ips()
+        assert ip_list is not None, 'external_ips is not cached in handle'
+
+        log_path = os.path.join(self.log_dir, 'workdir_sync.log')
+
+        # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
+        runners = handle.get_command_runners()
+
+        def _sync_git_workdir_node(
+                runner: command_runner.CommandRunner) -> None:
+            # Type assertion to help mypy understand the type
+            assert hasattr(
+                runner, 'git_clone'
+            ), f'CommandRunner should have git_clone method, ' \
+               f'got {type(runner)}'
+            runner.git_clone(
+                target_dir=SKY_REMOTE_WORKDIR,
+                log_path=log_path,
+                stream_logs=False,
+                max_retry=3,
+                envs_and_secrets=envs_and_secrets,
+            )
+
+        num_nodes = handle.launched_nodes
+        plural = 's' if num_nodes > 1 else ''
+        logger.info(
+            f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
+        os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
+
+    def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
+                           workdir: Path) -> None:
         fore = colorama.Fore
         style = colorama.Style
         ip_list = handle.external_ips()
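With this change a task's workdir may be either a plain path (synced with rsync, as before) or a dict describing a git source (handled by the new _sync_git_workdir path, which calls the command runner's git_clone on each node). A minimal sketch of the dispatch shape, with hypothetical helpers standing in for the backend's rsync and git-clone code paths (the dict schema shown is illustrative, not SkyPilot's exact one):

from pathlib import Path
from typing import Any, Dict, Union


def sync_workdir(workdir: Union[str, Path, Dict[str, Any]]) -> str:
    # Mirrors the isinstance() dispatch in _sync_workdir above.
    if isinstance(workdir, dict):
        # Git-based workdir: clone/checkout on the remote nodes.
        return f"git sync: {workdir.get('url', '<unset>')}"
    # Path-based workdir: rsync the local directory to the remote nodes.
    return f"rsync: {Path(workdir).expanduser()}"


print(sync_workdir({'url': 'https://github.com/example/repo.git'}))
print(sync_workdir('~/my-project'))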
sky/backends/docker_utils.py
CHANGED
@@ -168,7 +168,7 @@ def build_dockerimage(task: task_mod.Task,
                       build_dir=temp_dir)
 
     dst = os.path.join(temp_dir, SKY_DOCKER_WORKDIR)
-    if task.workdir is not None:
+    if task.workdir is not None and isinstance(task.workdir, str):
         # Copy workdir contents to tempdir
         shutil.copytree(os.path.expanduser(task.workdir), dst)
     else:
sky/backends/local_docker_backend.py
CHANGED
@@ -178,7 +178,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         return handle, False
 
     def _sync_workdir(self, handle: LocalDockerResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         """Workdir is sync'd by adding to the docker image.
 
         This happens in the execute step.
sky/catalog/common.py
CHANGED
@@ -13,6 +13,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import registry
 from sky.utils import rich_utils
@@ -125,17 +126,21 @@ class LazyDataFrame:
 
     We don't need to load the catalog for every SkyPilot call, and this class
     allows us to load the catalog only when needed.
+
+    Use update_if_stale_func to pass in a function that decides whether to
+    update the catalog on disk, updates it if needed, and returns
+    a bool indicating whether the update was done.
     """
 
-    def __init__(self, filename: str,
+    def __init__(self, filename: str, update_if_stale_func: Callable[[], bool]):
         self._filename = filename
         self._df: Optional['pd.DataFrame'] = None
-        self.
+        self._update_if_stale_func = update_if_stale_func
 
+    @annotations.lru_cache(scope='request')
     def _load_df(self) -> 'pd.DataFrame':
-        if self._df is None:
+        if self._update_if_stale_func() or self._df is None:
             try:
-                self._update_func()
                 self._df = pd.read_csv(self._filename)
             except Exception as e:  # pylint: disable=broad-except
                 # As users can manually modify the catalog, read_csv can fail.
@@ -193,55 +198,60 @@ def read_catalog(filename: str,
         return last_update + pull_frequency_hours * 3600 < time.time()
 
     def _update_catalog():
+        # Fast path: Exit early to avoid lock contention.
+        if not _need_update():
+            return False
+
         # Atomic check, to avoid conflicts with other processes.
         with filelock.FileLock(meta_path + '.lock'):
-            [old lines 198-229 elided in this rendering]
-                                     'connection.')
-                        with ux_utils.print_exception_no_traceback():
-                            raise e
+            # Double check after acquiring the lock.
+            if not _need_update():
+                return False
+
+            url = f'{constants.HOSTED_CATALOG_DIR_URL}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            url_fallback = f'{constants.HOSTED_CATALOG_DIR_URL_S3_MIRROR}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            headers = {'User-Agent': 'SkyPilot/0.7'}
+            update_frequency_str = ''
+            if pull_frequency_hours is not None:
+                update_frequency_str = (
+                    f' (every {pull_frequency_hours} hours)')
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message(
+                        f'Updating {cloud} catalog: {filename}') +
+                    f'{update_frequency_str}'):
+                try:
+                    r = requests.get(url=url, headers=headers)
+                    if r.status_code == 429:
+                        # fallback to s3 mirror, github introduced rate
+                        # limit after 2025-05, see
+                        # https://github.com/skypilot-org/skypilot/issues/5438
+                        # for more details
+                        r = requests.get(url=url_fallback, headers=headers)
+                    r.raise_for_status()
+                except requests.exceptions.RequestException as e:
+                    error_str = (f'Failed to fetch {cloud} catalog '
+                                 f'{filename}. ')
+                    if os.path.exists(catalog_path):
+                        logger.warning(
+                            f'{error_str}Using cached catalog files.')
+                        # Update catalog file modification time.
+                        os.utime(catalog_path, None)  # Sets to current time
                     else:
-                        [old lines 234-244 elided in this rendering]
+                        logger.error(f'{error_str}Please check your internet '
+                                     'connection.')
+                        with ux_utils.print_exception_no_traceback():
+                            raise e
+                else:
+                    # Download successful, save the catalog to a local file.
+                    os.makedirs(os.path.dirname(catalog_path), exist_ok=True)
+                    with open(catalog_path, 'w', encoding='utf-8') as f:
+                        f.write(r.text)
+                    with open(meta_path + '.md5', 'w', encoding='utf-8') as f:
+                        f.write(hashlib.md5(r.text.encode()).hexdigest())
+                    logger.debug(f'Updated {cloud} catalog {filename}.')
+        return True
+
+    return LazyDataFrame(catalog_path, update_if_stale_func=_update_catalog)
 
 
 def _get_instance_type(
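The rewritten _update_catalog is a double-checked staleness test: a lock-free fast path first, then a re-check under filelock.FileLock so that concurrent SkyPilot processes do not redundantly re-download the same catalog, returning True only when this process actually refreshed the file. A minimal, self-contained sketch of the pattern (the cache path and staleness rule here are illustrative):

import os
import time

import filelock

CACHE_PATH = '/tmp/example_catalog.csv'  # illustrative path
MAX_AGE_SECONDS = 3600


def _is_stale() -> bool:
    if not os.path.exists(CACHE_PATH):
        return True
    return time.time() - os.path.getmtime(CACHE_PATH) > MAX_AGE_SECONDS


def update_if_stale() -> bool:
    """Returns True iff this process refreshed the cache."""
    # Fast path: avoid lock contention when the cache is fresh.
    if not _is_stale():
        return False
    with filelock.FileLock(CACHE_PATH + '.lock'):
        # Double check: another process may have refreshed the file
        # while we were waiting for the lock.
        if not _is_stale():
            return False
        with open(CACHE_PATH, 'w', encoding='utf-8') as f:
            f.write('InstanceType,Price\n')  # stand-in for the download
    return True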
sky/catalog/data_fetchers/fetch_nebius.py
ADDED
@@ -0,0 +1,308 @@
+"""A script that queries Nebius API to get instance types and pricing info.
+
+This script takes about 1 minute to finish.
+"""
+import csv
+from dataclasses import dataclass
+import decimal
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+from sky.adaptors import nebius
+from sky.adaptors.nebius import billing
+from sky.adaptors.nebius import compute
+from sky.adaptors.nebius import iam
+from sky.adaptors.nebius import nebius_common
+
+logger = logging.getLogger(__name__)
+
+TIMEOUT = 10
+PARENT_ID_TEMPLATE = 'project-{}public-images'
+ACCELERATOR_MANUFACTURER = 'NVIDIA'
+
+
+@dataclass
+class PresetInfo:
+    """Represents information about a specific compute preset,
+    including its pricing.
+
+    Attributes:
+        region (str): The geographical region where the preset is available.
+        fullname (str): The full name of the preset, a combination of platform
+            and preset name.
+        name (str): The name of the preset.
+        platform_name (str): The name of the platform the preset belongs to.
+        gpu (int): The number of GPUs in the preset.
+        vcpu (int): The number of virtual CPUs in the preset.
+        memory_gib (int): The amount of memory in GiB in the preset.
+        accelerator_manufacturer (str | None): The manufacturer of the
+            accelerator (e.g., "NVIDIA"), or None if no accelerator.
+        accelerator_name (str | None): The name of the accelerator
+            (e.g., "H100"), or None if no accelerator.
+        price_hourly (decimal.Decimal): The hourly price of the preset.
+    """
+
+    region: str
+    fullname: str
+    name: str
+    platform_name: str
+    gpu: int
+    vcpu: int
+    memory_gib: int
+    accelerator_manufacturer: Optional[str]
+    accelerator_name: Optional[str]
+    price_hourly: decimal.Decimal
+
+
+def _format_decimal(value: decimal.Decimal) -> str:
+    """Formats a decimal value to a string with at least two decimal places,
+    removing trailing zeros and ensuring a two-digit decimal part.
+
+    Args:
+        value (decimal.Decimal): The decimal value to format.
+
+    Returns:
+        str: The formatted string representation of the decimal.
+    """
+    formatted_value = f'{value:f}'.rstrip('0').rstrip('.')
+    integer_part, decimal_part = formatted_value.split(
+        '.') if '.' in formatted_value else (formatted_value, '')
+    if len(decimal_part) < 2:
+        decimal_part += '0' * (2 - len(decimal_part))
+
+    return f'{integer_part}.{decimal_part}'
+
+
+def _estimate_platforms(platforms: List[Any], parent_id: str,
+                        region: str) -> List[PresetInfo]:
+    """Collects specifications for all presets on the given platforms to form a
+    batch price request. It then sends the request and processes the responses
+    to create a list of PresetInfo objects.
+
+    Args:
+        platforms (List[Platform]): A List of compute platforms to estimate
+            prices for.
+        parent_id (str): The parent ID used for resource metadata
+            in the estimate request.
+        region (str): The region associated with the platforms.
+
+    Returns:
+        List[PresetInfo]: A list of PresetInfo objects containing details and
+            estimated prices for each preset.
+    """
+
+    calculator_service = billing().CalculatorServiceClient(nebius.sdk())
+    futures = []
+
+    for platform in platforms:
+        platform_name = platform.metadata.name
+
+        for preset in platform.spec.presets:
+            # Form the specification for the price request
+            estimate_spec = billing().ResourceSpec(
+                compute_instance_spec=compute().CreateInstanceRequest(
+                    metadata=nebius_common().ResourceMetadata(
+                        parent_id=parent_id,),
+                    spec=compute().InstanceSpec(
+                        resources=compute().ResourcesSpec(
+                            platform=platform_name,
+                            preset=preset.name,
+                        )),
+                ))
+
+            price_request = billing().EstimateBatchRequest(
+                resource_specs=[estimate_spec])
+            # Start future for each preset
+            futures.append((
+                platform,
+                preset,
+                calculator_service.estimate_batch(price_request,
+                                                  timeout=TIMEOUT),
+            ))
+
+    # wait all futures to complete and collect results
+    result = []
+    for platform, preset, future in futures:
+        platform_name = platform.metadata.name
+        result.append(
+            PresetInfo(
+                region=region,
+                fullname=f'{platform_name}_{preset.name}',
+                name=preset.name,
+                platform_name=platform_name,
+                gpu=preset.resources.gpu_count or 0,
+                vcpu=preset.resources.vcpu_count,
+                memory_gib=preset.resources.memory_gibibytes,
+                accelerator_manufacturer=ACCELERATOR_MANUFACTURER
+                if platform_name.startswith('gpu-') else '',
+                accelerator_name=platform_name.split('-')[1].upper()
+                if platform_name.startswith('gpu-') else '',
+                price_hourly=decimal.Decimal(
+                    future.wait().hourly_cost.general.total.cost),
+            ))
+
+    return result
+
+
+def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
+    """Writes the provided preset information to a CSV file.
+
+    Args:
+        presets (List[PresetInfo]): A list of PresetInfo objects to write.
+        output_file (str): The path to the output CSV file.
+    """
+    # Set up the CSV writer to output to stdout
+    with open(output_file, 'w', encoding='utf-8') as out:
+        header = [
+            'InstanceType',
+            'AcceleratorName',
+            'AcceleratorCount',
+            'vCPUs',
+            'MemoryGiB',
+            'Price',
+            'Region',
+            'GpuInfo',
+            'SpotPrice',
+        ]
+        writer = csv.DictWriter(out, fieldnames=header)
+        writer.writeheader()
+
+        for preset in sorted(presets,
+                             key=lambda x:
+                             (bool(x.gpu), x.region, x.platform_name, x.vcpu)):
+            gpu_info = ''
+            if preset.gpu > 0:
+                gpu_info_dict = {
+                    'Gpus': [{
+                        'Name': preset.accelerator_name,
+                        'Manufacturer': preset.accelerator_manufacturer,
+                        'Count': preset.gpu,
+                        'MemoryInfo': {
+                            'SizeInMiB': preset.memory_gib * 1024 // preset.gpu
+                        },
+                    }],
+                    'TotalGpuMemoryInMiB': preset.memory_gib * 1024,
+                }
+                gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')
+
+            writer.writerow({
+                'InstanceType': preset.fullname,
+                'AcceleratorName': preset.accelerator_name,
+                'AcceleratorCount': preset.gpu,
+                'vCPUs': preset.vcpu,
+                'MemoryGiB': preset.memory_gib,
+                'Price': _format_decimal(preset.price_hourly),
+                'Region': preset.region,
+                'GpuInfo': gpu_info,
+                'SpotPrice': '',
+            })
+
+
+def _fetch_platforms_for_project(project_id: str) -> List[Any]:
+    """Fetches all available compute platforms for a given project.
+
+    Args:
+        project_id (str): The ID of the project to fetch platforms from.
+
+    Returns:
+        List[ComputePlatform]: A list of ComputePlatform objects available
+            in the project.
+    """
+    platform_service = compute().PlatformServiceClient(nebius.sdk())
+
+    platform_request = compute().ListPlatformsRequest(page_size=999,
+                                                      parent_id=project_id)
+    platform_response = platform_service.list(platform_request,
+                                              timeout=TIMEOUT).wait()
+
+    return platform_response.items
+
+
+def _get_regions_map() -> Dict[str, str]:
+    """Maps region codes to their full names by iterating through tenants and
+    projects.
+
+    Returns:
+        dict[str, str]: A dictionary where keys are region codes (e.g., "e00")
+            and values are full region names (e.g., "eu-north1").
+    """
+    result = {}
+    response = iam().TenantServiceClient(nebius.sdk()).list(
+        iam().ListTenantsRequest(), timeout=TIMEOUT).wait()
+
+    for tenant in response.items:
+        projects = (iam().ProjectServiceClient(nebius.sdk()).list(
+            iam().ListProjectsRequest(parent_id=tenant.metadata.id),
+            timeout=TIMEOUT).wait())
+
+        for project in projects.items:
+            match = re.match(r'^project-([a-z0-9]{3})', project.metadata.id)
+            if match is None:
+                logger.error('Could not parse project id %s',
+                             project.metadata.id)
+                continue
+            result[match.group(1)] = project.status.region
+
+    return result
+
+
+def _get_all_platform_prices() -> List[PresetInfo]:
+    """Orchestrates fetching specifications and prices for all platforms across
+    all regions.
+
+    This function first retrieves a map of region codes to full names, then
+    iterates through each region, fetches available platforms for
+    the corresponding project ID, and finally estimates prices for all presets
+    on those platforms.
+
+    Returns:
+        List[PresetInfo]: A consolidated list of PresetInfo objects for all
+            platforms and presets across all regions.
+    """
+
+    # Get regions codes to names
+    regions_map = _get_regions_map()
+
+    presets = []
+
+    for region_code in sorted(regions_map.keys()):
+        project_id = PARENT_ID_TEMPLATE.format(region_code)
+        region = regions_map[region_code]
+        logger.info('Processing region: %s (project: %s)...', region,
+                    project_id)
+
+        platforms = _fetch_platforms_for_project(project_id)
+        if not platforms:
+            logger.warning('No platforms found in region %s', region)
+            continue
+
+        presets.extend(
+            _estimate_platforms(platforms=platforms,
+                                parent_id=project_id,
+                                region=region))
+
+    return presets
+
+
+def main() -> None:
+    """Main function to fetch and write Nebius platform prices to a CSV file.
+
+    It initializes the SDK, fetches all platform prices, and then writes them
+    to the specified CSV file.
+    """
+
+    output_file = 'nebius/vms.csv'
+
+    # Fetch presets and estimate
+    presets = _get_all_platform_prices()
+
+    # Write CSV
+    _write_preset_prices(presets, output_file)
+
+    logger.info('Done!')
+
+
+if __name__ == '__main__':
+    main()
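For reference, the fetcher's _format_decimal guarantees at least two decimal places in the CSV's Price column. A small runnable restatement of its logic with sample values (copied from the function above rather than imported from the module):

from decimal import Decimal


def format_decimal(value: Decimal) -> str:
    # Same logic as _format_decimal in fetch_nebius.py.
    formatted = f'{value:f}'.rstrip('0').rstrip('.')
    integer_part, decimal_part = formatted.split(
        '.') if '.' in formatted else (formatted, '')
    decimal_part += '0' * max(0, 2 - len(decimal_part))
    return f'{integer_part}.{decimal_part}'


assert format_decimal(Decimal('2')) == '2.00'
assert format_decimal(Decimal('1.50000')) == '1.50'
assert format_decimal(Decimal('0.123')) == '0.123'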
sky/catalog/gcp_catalog.py
CHANGED
@@ -37,20 +37,37 @@ _image_df = common.read_catalog('gcp/images.csv',
 _quotas_df = common.read_catalog('gcp/accelerator_quota_mapping.csv',
                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 
-# We will select from the following
+# We will select from the following six CPU instance families:
 _DEFAULT_INSTANCE_FAMILY = [
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a widely used general-purpose instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 4 GiB RAM per 1 vCPU;
     'n2-standard',
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a memory-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 8 GiB RAM per 1 vCPU;
     'n2-highmem',
-    # This is
-    # CPU: Intel Ice Lake
+    # This is a compute-optimized instance family as of July 2025.
+    # CPU: Primarily Intel Ice Lake (3rd Gen Intel Xeon Scalable Processors)
+    # or Cascade Lake (2nd Gen Intel Xeon Scalable Processors).
     # Memory: 1 GiB RAM per 1 vCPU;
     'n2-highcpu',
+    # This is the latest general-purpose instance family as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 4 GiB RAM per 1 vCPU;
+    'n4-standard',
+    # This is the latest general-purpose instance family
+    # with a higher vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 2 GiB RAM per 1 vCPU;
+    'n4-highcpu',
+    # This is the latest general-purpose instance family
+    # with a lower vCPU to memory ratio as of July 2025.
+    # CPU: Intel 5th Gen Xeon Scalable processor (Emerald Rapids).
+    # Memory: 8 GiB RAM per 1 vCPU;
+    'n4-highmem',
 ]
 # n2 is not allowed for launching GPUs for now.
 _DEFAULT_HOST_VM_FAMILY = (
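These family names are prefixes of concrete GCP machine types, which append the vCPU count (for example, n2-standard-8 has 8 vCPUs and, at the 4 GiB/vCPU ratio noted above, 32 GiB of RAM). A hypothetical sketch of expanding the family list into candidate machine types; the ratio table is transcribed from the comments above, and the helper is illustrative rather than SkyPilot's actual selection logic:

# GiB of RAM per vCPU for each default family (from the comments above).
_GIB_PER_VCPU = {
    'n2-standard': 4, 'n2-highmem': 8, 'n2-highcpu': 1,
    'n4-standard': 4, 'n4-highcpu': 2, 'n4-highmem': 8,
}


def candidate_machine_types(vcpus: int) -> list:
    """Expand family prefixes into names like 'n2-standard-8'."""
    return [f'{family}-{vcpus}' for family in _GIB_PER_VCPU]


for name in candidate_machine_types(8):
    family = name.rsplit('-', 1)[0]
    print(f'{name}: {8 * _GIB_PER_VCPU[family]} GiB RAM')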
|