skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/catalog/__init__.py CHANGED
```diff
@@ -247,6 +247,13 @@ def get_accelerators_from_instance_type(
                                instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str,
+                                clouds: CloudFilter = None) -> Optional[str]:
+    """Returns the arch from a instance type."""
+    return _map_clouds_catalog(clouds, 'get_arch_from_instance_type',
+                               instance_type)
+
+
 def get_instance_type_for_accelerator(
         acc_name: str,
         acc_count: Union[int, float],
```
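For context, the new accessor is dispatched to the per-cloud catalog modules like the existing ones. A minimal usage sketch, assuming the `clouds='aws'` filter argument follows the same `CloudFilter` convention as the neighboring accessors (the instance type and expected value are illustrative, not part of this diff):

```python
from sky import catalog

# Illustrative lookup: 'm6g.large' is an AWS Graviton (arm64) instance type.
# The call returns None when the catalog has no 'Arch' value for the type.
arch = catalog.get_arch_from_instance_type('m6g.large', clouds='aws')
print(arch)  # e.g. 'arm64'
```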
sky/catalog/aws_catalog.py CHANGED
```diff
@@ -271,6 +271,10 @@ def get_accelerators_from_instance_type(
         _get_df(), instance_type)
 
 
+def get_arch_from_instance_type(instance_type: str) -> Optional[str]:
+    return common.get_arch_from_instance_type_impl(_get_df(), instance_type)
+
+
 def get_instance_type_for_accelerator(
         acc_name: str,
         acc_count: int,
```
sky/catalog/common.py CHANGED
```diff
@@ -527,6 +527,24 @@ def get_accelerators_from_instance_type_impl(
     return {acc_name: _convert(acc_count)}
 
 
+def get_arch_from_instance_type_impl(
+    df: 'pd.DataFrame',
+    instance_type: str,
+) -> Optional[str]:
+    df = _get_instance_type(df, instance_type, None)
+    if df.empty:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'No instance type {instance_type} found.')
+    row = df.iloc[0]
+    if 'Arch' not in row:
+        return None
+    arch = row['Arch']
+    if pd.isnull(arch):
+        return None
+
+    return arch
+
+
 def get_instance_type_for_accelerator_impl(
     df: 'pd.DataFrame',
     acc_name: str,
```
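The shared impl degrades gracefully when a cloud's catalog carries no architecture information. A toy re-statement of that behavior with pandas (the DataFrame and helper below are illustrative, not the real `_get_instance_type`-based implementation):

```python
import pandas as pd

# Toy catalog rows mirroring the columns used above; values are invented.
df = pd.DataFrame([
    {'InstanceType': 'm6g.large', 'Arch': 'arm64'},
    {'InstanceType': 'm5.large', 'Arch': None},
])

def arch_of(catalog_df: pd.DataFrame, instance_type: str):
    rows = catalog_df[catalog_df['InstanceType'] == instance_type]
    if rows.empty:
        raise ValueError(f'No instance type {instance_type} found.')
    row = rows.iloc[0]
    # Catalogs without an 'Arch' column, or with a null value, yield None.
    if 'Arch' not in row or pd.isnull(row['Arch']):
        return None
    return row['Arch']

print(arch_of(df, 'm6g.large'))  # 'arm64'
print(arch_of(df, 'm5.large'))   # None
```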
sky/catalog/data_fetchers/fetch_aws.py CHANGED
```diff
@@ -67,17 +67,13 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
 # The following columns will be included in the final catalog.
 USEFUL_COLUMNS = [
     'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
-    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
+    'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
 ]
 
 # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
 # only available in this region, but it serves pricing information for all
 # regions.
 PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'  # pylint: disable=line-too-long
-# Hardcode the regions that offer p4de.24xlarge as our credential does not have
-# the permission to query the offerings of the instance.
-# Ref: https://aws.amazon.com/ec2/instance-types/p4/
-P4DE_REGIONS = ['us-east-1', 'us-west-2']
 # g6f instances have fractional GPUs, but the API returns Count: 1 under
 # GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
 # divided by the total memory of an L4 will give us the fraction of the GPU.
@@ -214,35 +210,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
     return df
 
 
-def _patch_p4de(region: str, df: 'pd.DataFrame',
-                pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
-    # Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
-    # to the instance type.
-    # Columns:
-    # InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
-    # Price,SpotPrice,Region,AvailabilityZone
-    records = []
-    for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
-        records.append({
-            'InstanceType': 'p4de.24xlarge',
-            'AcceleratorName': 'A100-80GB',
-            'AcceleratorCount': 8,
-            'vCPUs': 96,
-            'MemoryGiB': 1152,
-            'GpuInfo':
-                ('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
-                 '\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
-                 '81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
-            'AvailabilityZone': zone,
-            'Region': region,
-            'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
-                     ['Price'].values[0],
-            'SpotPrice': np.nan,
-        })
-    df = pd.concat([df, pd.DataFrame.from_records(records)])
-    return df
-
-
 def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
     try:
         # Fetch the zone info first to make sure the account has access to the
@@ -275,6 +242,17 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             return None, np.nan
         return accelerator['Name'], accelerator['Count']
 
+    def get_arch(row) -> Optional[str]:
+        if 'ProcessorInfo' in row:
+            processor = row['ProcessorInfo']
+            if 'SupportedArchitectures' in processor:
+                archs = processor['SupportedArchitectures']
+                if isinstance(archs, list):
+                    return archs[0]
+                elif isinstance(archs, str):
+                    return archs
+        return None
+
     def get_vcpus(row) -> float:
         if not np.isnan(row['vCPU']):
             return float(row['vCPU'])
@@ -332,6 +310,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
             'AcceleratorCount': acc_count,
             'vCPUs': get_vcpus(row),
             'MemoryGiB': get_memory_gib(row),
+            'Arch': get_arch(row),
         })
 
     # The AWS API may not have all the instance types in the pricing table,
@@ -355,9 +334,6 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
     df = pd.concat(
         [df, df.apply(get_additional_columns, axis='columns')],
         axis='columns')
-    # patch the df for p4de.24xlarge
-    if region in P4DE_REGIONS:
-        df = _patch_p4de(region, df, pricing_df)
     if 'GpuInfo' not in df.columns:
         df['GpuInfo'] = np.nan
     df = df[USEFUL_COLUMNS]
```
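The new `Arch` column is populated by reading `ProcessorInfo.SupportedArchitectures` from the raw EC2 instance-type data. A standalone sketch of that extraction, with an illustrative payload shaped like a `DescribeInstanceTypes` entry (not taken from this diff):

```python
from typing import Any, Dict, Optional

def get_arch(row: Dict[str, Any]) -> Optional[str]:
    # Same logic as the nested helper above: take the first entry of
    # ProcessorInfo.SupportedArchitectures, or the bare string if given.
    if 'ProcessorInfo' in row:
        processor = row['ProcessorInfo']
        if 'SupportedArchitectures' in processor:
            archs = processor['SupportedArchitectures']
            if isinstance(archs, list):
                return archs[0]
            if isinstance(archs, str):
                return archs
    return None

# Illustrative row; the real code applies this to rows of the merged frame.
sample = {'InstanceType': 'm6g.large',
          'ProcessorInfo': {'SupportedArchitectures': ['arm64']}}
print(get_arch(sample))  # 'arm64'
```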
sky/catalog/data_fetchers/fetch_seeweb.py ADDED
```diff
@@ -0,0 +1,329 @@
+"""A script that generates the Seeweb catalog.
+
+Usage:
+    python fetch_seeweb.py [-h] [--api-key API_KEY]
+                           [--api-key-path API_KEY_PATH]
+
+If neither --api-key nor --api-key-path are provided, this script will parse
+`~/.seeweb_cloud/seeweb_keys` to look for Seeweb API key.
+"""
+import argparse
+import configparser
+import csv
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+from sky.adaptors.seeweb import ecsapi
+
+# GPU name mapping from Seeweb to SkyPilot canonical names
+SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME = {
+    'H200 141GB': 'H200',
+    'RTX A6000 48GB': 'RTXA6000',
+    'A100 80GB': 'A100',
+    'L4 24GB': 'L4',
+    'L40s 48GB': 'L40s',
+    'H100 80GB': 'H100',
+    'MI300X': 'MI300X',
+    'A30': 'A30',
+    'RTX 6000 24GB': 'RTX6000',
+    'Tenstorrent Grayskull e75': 'GRAYSKULL-E75',
+    'Tenstorrent Grayskull e150': 'GRAYSKULL-E150',
+}
+
+# GPU VRAM mapping in MB
+VRAM = {
+    'RTXA6000': 48384,  # 48GB
+    'H200': 144384,  # 141GB
+    'A100': 81920,  # 80GB
+    'L4': 24576,  # 24GB
+    'L40s': 49152,  # 48GB
+    'H100': 81920,  # 80GB
+    'MI300X': 192000,  # 192GB
+    'A30': 24576,  # 24GB
+    'RTX6000': 24576,  # 24GB
+    'GRAYSKULL-E75': 8192,  # 8GB
+    'GRAYSKULL-E150': 8192,  # 8GB
+}
+
+
+def is_tenstorrent_gpu_name(gpu_name: Optional[str]) -> bool:
+    """Return True if the given GPU name refers to a Tenstorrent GPU.
+
+    Detects by common identifiers present in normalized names (e.g., GRAYSKULL)
+    or by the vendor name directly.
+    """
+    if not gpu_name:
+        return False
+    upper = str(gpu_name).upper()
+    return 'TENSTORRENT' in upper or 'GRAYSKULL' in upper
+
+
+def is_mi300x_gpu_name(gpu_name: Optional[str]) -> bool:
+    """Return True if the given GPU name refers to AMD MI300X."""
+    if not gpu_name:
+        return False
+    return 'MI300X' in str(gpu_name).upper()
+
+
+def get_api_key(path: Optional[str] = None) -> str:
+    """Get API key from config file or environment variable."""
+    # Step 1: Try to get from config file
+    if path is None:
+        path = os.path.expanduser('~/.seeweb_cloud/seeweb_keys')
+    else:
+        path = os.path.expanduser(path)
+
+    try:
+        parser = configparser.ConfigParser()
+        parser.read(path)
+        return parser['DEFAULT']['api_key'].strip()
+    except (KeyError, FileNotFoundError) as exc:
+        # Step 2: Try environment variable
+        api_key = os.environ.get('SEEWEB_API_KEY')
+        if api_key:
+            return api_key.strip()
+
+        # If neither found, raise error
+        raise ValueError(
+            f'API key not found in {path} or ENV variable SEEWEB_API_KEY'
+        ) from exc
+
+
+def normalize_gpu_name(gpu_name: str) -> str:
+    """Normalize GPU name from Seeweb API to SkyPilot canonical name."""
+    if not gpu_name:
+        return ''
+
+    # Map to canonical name if available
+    canonical_name = SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME.get(gpu_name)
+    if canonical_name:
+        return canonical_name
+
+    # If not found in mapping, return original name
+    print(f'Warning: GPU name "{gpu_name}" not found in mapping,'
+          f'using original name')
+    return gpu_name
+
+
+def parse_plan_info(plan: Any) -> Dict[str, Any]:
+    """Parse plan information from Seeweb API response."""
+    # Handle both dictionary and object formats
+    if hasattr(plan, 'name'):
+        # Object format from API
+        plan_name = getattr(plan, 'name', 'unknown')
+        vcpus = int(getattr(plan, 'cpu', 0))
+
+        # Handle memory conversion safely
+        memory_mb = getattr(plan, 'ram', 0)
+        try:
+            memory_gb = int(
+                memory_mb) / 1024 if memory_mb else 0  # Convert to GB
+        except (ValueError, TypeError):
+            memory_gb = 0
+
+        # Handle price safely
+        try:
+            price = float(getattr(plan, 'hourly_price', 0.0))
+        except (ValueError, TypeError):
+            price = 0.0
+
+        # Handle GPU info
+        try:
+            gpu_count = int(getattr(plan, 'gpu', 0))
+        except (ValueError, TypeError):
+            gpu_count = 0
+
+        gpu_label = getattr(plan, 'gpu_label', None)
+
+        # Determine GPU name - use gpu_label if available,
+        # otherwise try to infer from plan name
+        if gpu_label:
+            gpu_name = normalize_gpu_name(gpu_label)  # Normalize the GPU name
+        else:
+            # Try to extract GPU name from plan name
+            plan_name = getattr(plan, 'name', '')
+            if 'GPU' in plan_name:
+                # Extract GPU type from plan name (e.g., ECS1GPU11 -> GPU11)
+                parts = plan_name.split('GPU')
+                if len(parts) > 1:
+                    gpu_name = 'GPU' + parts[1]
+                else:
+                    gpu_name = 'GPU'
+            else:
+                gpu_name = None
+
+        # Get GPU VRAM from mapping using the normalized name
+        gpu_vram_mb = VRAM.get(gpu_name, 0) if gpu_name else 0
+    else:
+        raise ValueError(f'Unsupported plan format: {type(plan)}')
+
+    return {
+        'plan_name': plan_name,
+        'vcpus': vcpus,
+        'memory_gb': memory_gb,
+        'gpu_name': gpu_name,
+        'gpu_count': gpu_count,
+        'gpu_vram_mb': gpu_vram_mb,
+        'price': price,
+    }
+
+
+def get_gpu_info(gpu_count: int, gpu_name: str, gpu_vram_mb: int = 0) -> str:
+    """Generate GPU info JSON string compatible with SkyPilot."""
+    if not gpu_name or gpu_count == 0:
+        return ''
+
+    # Determine manufacturer based on GPU name
+    gpu_name_upper = str(gpu_name).upper()
+    if 'MI300' in gpu_name_upper or gpu_name_upper == 'MI300X':
+        manufacturer = 'AMD'
+    elif 'GRAYSKULL' in gpu_name_upper:
+        manufacturer = 'TENSTORRENT'
+    else:
+        manufacturer = 'NVIDIA'
+
+    gpu_info = {
+        'Gpus': [{
+            'Name': gpu_name,
+            'Manufacturer': manufacturer,
+            'Count': float(gpu_count),
+            'MemoryInfo': {
+                'SizeInMiB': gpu_vram_mb
+            },
+        }],
+        'TotalGpuMemoryInMiB': gpu_vram_mb * gpu_count if gpu_vram_mb else 0
+    }
+
+    return json.dumps(gpu_info).replace('"', '\'')
+
+
+def fetch_seeweb_data(api_key: str) -> List[Dict]:
+    """Fetch data from Seeweb API."""
+    if ecsapi is None:
+        raise ImportError('ecsapi not available')
+
+    try:
+        client = ecsapi.Api(token=api_key)
+
+        print('Fetching plans from Seeweb API...')
+        api_plans = client.fetch_plans()
+
+        if not api_plans:
+            raise ValueError('No plans returned from API')
+
+        print(f'Successfully fetched {len(api_plans)} plans from API')
+        plans = []
+
+        for plan in api_plans:
+            try:
+                # Parse first so we can filter
+                # Tenstorrent before extra API calls
+                parsed = parse_plan_info(plan)
+
+                if is_tenstorrent_gpu_name(parsed.get('gpu_name')):
+                    print(f'Skipping Tenstorrent plan {plan.name}')
+                    continue
+
+                if is_mi300x_gpu_name(parsed.get('gpu_name')):
+                    print(f'Skipping MI300X plan {plan.name}')
+                    continue
+
+                print(f'Fetching regions available for {plan.name}')
+                regions_available = client.fetch_regions_available(plan.name)
+
+                parsed.update({'regions_available': regions_available})
+                plans.append(parsed)
+            except Exception as e:  # pylint: disable=broad-except
+                print(f'Error parsing plan {plan.name}: {e}')
+                continue
+
+        print(f'Successfully parsed {len(plans)} plans')
+        return plans
+
+    except Exception as e:  # pylint: disable=broad-except
+        raise Exception(f'Error fetching data from Seeweb API: {e}') from e
+
+
+def create_catalog(api_key: str, output_path: str) -> None:
+    """Create Seeweb catalog by fetching data from API."""
+    plans = fetch_seeweb_data(api_key)
+
+    # Create CSV catalog
+    print(f'Writing catalog to {output_path}')
+    with open(output_path, mode='w', encoding='utf-8') as f:
+        writer = csv.writer(f, delimiter=',', quotechar='"')
+        writer.writerow([
+            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
+            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
+        ])
+
+        for plan in plans:
+            try:
+                gpu_info_str = ''
+                if plan['gpu_name'] and plan['gpu_count'] > 0:
+                    gpu_info_str = get_gpu_info(plan['gpu_count'],
+                                                plan['gpu_name'],
+                                                plan.get('gpu_vram_mb', 0))
+
+                # Handle regions - create a row for each available region
+                regions_available = plan['regions_available']
+                if isinstance(regions_available,
+                              list) and len(regions_available) > 0:
+                    # Create a row for each region
+                    for region in regions_available:
+                        writer.writerow([
+                            plan['plan_name'],  # InstanceType
+                            plan['gpu_name'],  # AcceleratorName (cleaned)
+                            plan['gpu_count'] if plan['gpu_count'] > 0 else
+                            '',  # AcceleratorCount
+                            plan['vcpus'],  # vCPUs
+                            plan['memory_gb'],  # MemoryGiB
+                            plan['price'],  # Price
+                            region,  # Region (single region per row)
+                            gpu_info_str,  # GpuInfo
+                            ''  # SpotPrice (Seeweb doesn't support spot)
+                        ])
+                else:
+                    # No regions available, create a row with empty region
+                    writer.writerow([
+                        plan['plan_name'],  # InstanceType
+                        plan['gpu_name'],  # AcceleratorName (cleaned)
+                        plan['gpu_count']
+                        if plan['gpu_count'] > 0 else '',  # AcceleratorCount
+                        plan['vcpus'],  # vCPUs
+                        plan['memory_gb'],  # MemoryGiB
+                        plan['price'],  # Price
+                        '',  # Region (empty)
+                        gpu_info_str,  # GpuInfo
+                        ''  # SpotPrice (Seeweb doesn't support spot)
+                    ])
+            except Exception as e:  # pylint: disable=broad-except
+                print(f'Error processing plan {plan["plan_name"]}: {e}')
+                continue
+
+    print(f'Seeweb catalog saved to {output_path}')
+    print(f'Created {len(plans)} instance types')
+
+
+def main() -> None:
+    """Main function to fetch and write Seeweb platform prices to a CSV file."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--api-key', help='Seeweb API key')
+    parser.add_argument('--api-key-path',
+                        help='Path to file containing Seeweb API key')
+    args = parser.parse_args()
+
+    # Get API key
+    if args.api_key:
+        api_key = args.api_key
+    else:
+        api_key = get_api_key(args.api_key_path)
+
+    os.makedirs('seeweb', exist_ok=True)
+    create_catalog(api_key, 'seeweb/vms.csv')
+    print('Seeweb Service Catalog saved to seeweb/vms.csv')
+
+
+if __name__ == '__main__':
+    main()
```
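As the module docstring notes, the fetcher can be run as a script or imported directly. A minimal sketch of the programmatic path, assuming a valid key in `~/.seeweb_cloud/seeweb_keys` or in the `SEEWEB_API_KEY` environment variable (both defaults come from the script above):

```python
from sky.catalog.data_fetchers import fetch_seeweb

api_key = fetch_seeweb.get_api_key()  # config file first, then the env var
fetch_seeweb.create_catalog(api_key, 'seeweb/vms.csv')
```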
sky/catalog/seeweb_catalog.py ADDED
```diff
@@ -0,0 +1,184 @@
+"""Seeweb service catalog.
+
+This module loads the service catalog file and can be used to
+query instance types and pricing information for Seeweb.
+"""
+
+import typing
+from typing import Dict, List, Optional, Tuple
+
+import pandas as pd
+
+from sky.catalog import common
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+    from sky.clouds import cloud
+
+_PULL_FREQUENCY_HOURS = 8
+_df = common.read_catalog('seeweb/vms.csv',
+                          pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    result = common.instance_type_exists_impl(_df, instance_type)
+    return result
+
+
+def validate_region_zone(
+        region: Optional[str],
+        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Seeweb does not support zones.')
+
+    result = common.validate_region_zone_impl('Seeweb', _df, region, zone)
+    return result
+
+
+def get_hourly_cost(instance_type: str,
+                    use_spot: bool = False,
+                    region: Optional[str] = None,
+                    zone: Optional[str] = None) -> float:
+    """Returns the cost, or the cheapest cost among all zones for spot."""
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Seeweb does not support zones.')
+
+    result = common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
+                                         zone)
+    return result
+
+
+def get_vcpus_mem_from_instance_type(
+        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
+    result = common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+    return result
+
+
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
+    del disk_tier  # unused
+    result = common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory,
+                                                        region, zone)
+    return result
+
+
+def get_accelerators_from_instance_type(
+        instance_type: str) -> Optional[Dict[str, int]]:
+    # Filter the dataframe for the specific instance type
+    df_filtered = _df[_df['InstanceType'] == instance_type]
+    if df_filtered.empty:
+        return None
+
+    # Get the first row (all rows for same instance
+    # type should have same accelerator info)
+    row = df_filtered.iloc[0]
+    acc_name = row['AcceleratorName']
+    acc_count = row['AcceleratorCount']
+
+    # Check if the instance has accelerators
+    if pd.isna(acc_name) or pd.isna(
+            acc_count) or acc_name == '' or acc_count == '':
+        return None
+
+    # Convert accelerator count to int/float
+    try:
+        if int(acc_count) == acc_count:
+            acc_count = int(acc_count)
+        else:
+            acc_count = float(acc_count)
+    except (ValueError, TypeError):
+        return None
+
+    result = {acc_name: acc_count}
+    return result
+
+
+def get_instance_type_for_accelerator(
+        acc_name: str,
+        acc_count: int,
+        cpus: Optional[str] = None,
+        memory: Optional[str] = None,
+        use_spot: bool = False,
+        region: Optional[str] = None,
+        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types satisfying
+    the required count of accelerators."""
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Seeweb does not support zones.')
+
+    result = common.get_instance_type_for_accelerator_impl(df=_df,
+                                                           acc_name=acc_name,
+                                                           acc_count=acc_count,
+                                                           cpus=cpus,
+                                                           memory=memory,
+                                                           use_spot=use_spot,
+                                                           region=region,
+                                                           zone=zone)
+    return result
+
+
+def regions() -> List['cloud.Region']:
+    result = common.get_region_zones(_df, use_spot=False)
+    return result
+
+
+def get_region_zones_for_instance_type(instance_type: str,
+                                       use_spot: bool = False
+                                      ) -> List['cloud.Region']:
+    """Returns a list of regions for a given instance type."""
+    # Filter the dataframe for the specific instance type
+    df_filtered = _df[_df['InstanceType'] == instance_type]
+    if df_filtered.empty:
+        return []
+
+    # Use common.get_region_zones() like all other providers
+    region_list = common.get_region_zones(df_filtered, use_spot)
+
+    # Default region: Frosinone (it-fr2)
+    # Other regions: Milano (it-mi2), Lugano (ch-lug1), Bulgaria (bg-sof1)
+    priority_regions = ['it-fr2']
+    prioritized_regions = []
+    other_regions = []
+
+    # First, add regions in priority order if they exist
+    for priority_region in priority_regions:
+        for region in region_list:
+            if region.name == priority_region:
+                prioritized_regions.append(region)
+                break
+
+    # Then, add any remaining regions that weren't in the priority list
+    for region in region_list:
+        if region.name not in priority_regions:
+            other_regions.append(region)
+
+    result = prioritized_regions + other_regions
+    return result
+
+
+def list_accelerators(
+        gpus_only: bool,
+        name_filter: Optional[str],
+        region_filter: Optional[str],
+        quantity_filter: Optional[int],
+        case_sensitive: bool = True,
+        all_regions: bool = False,
+        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """Lists accelerators offered in Seeweb."""
+    # Filter out rows with empty or null regions (indicating unavailability)
+    df_filtered = _df.dropna(subset=['Region'])
+    df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']
+
+    result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,
+                                           name_filter, region_filter,
+                                           quantity_filter, case_sensitive,
+                                           all_regions, require_price)
+    return result
```