skypilot-nightly 1.0.0.dev20250204__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/vast.py +29 -0
- sky/authentication.py +18 -0
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +1 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/vast.py +279 -0
- sky/jobs/dashboard/templates/index.html +117 -52
- sky/jobs/scheduler.py +14 -5
- sky/provision/__init__.py +1 -0
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +161 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/utils/controller_utils.py +5 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +27 -18
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/clouds/vast.py
ADDED
@@ -0,0 +1,279 @@
|
|
1
|
+
""" Vast Cloud. """
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
|
+
|
6
|
+
from sky import clouds
|
7
|
+
from sky.clouds import service_catalog
|
8
|
+
from sky.utils import resources_utils
|
9
|
+
|
10
|
+
if typing.TYPE_CHECKING:
|
11
|
+
from sky import resources as resources_lib
|
12
|
+
|
13
|
+
|
14
|
+
@clouds.CLOUD_REGISTRY.register
|
15
|
+
class Vast(clouds.Cloud):
|
16
|
+
""" Vast GPU Cloud
|
17
|
+
|
18
|
+
_REPR | The string representation for the Vast GPU cloud object.
|
19
|
+
"""
|
20
|
+
_REPR = 'Vast'
|
21
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
22
|
+
clouds.CloudImplementationFeatures.MULTI_NODE:
|
23
|
+
('Multi-node not supported yet, as the interconnection among nodes '
|
24
|
+
'are non-trivial on Vast.'),
|
25
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
26
|
+
('Customizing disk tier is not supported yet on Vast.'),
|
27
|
+
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
28
|
+
('Opening ports is currently not supported on Vast.'),
|
29
|
+
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|
30
|
+
('Mounting object stores is not supported on Vast.'),
|
31
|
+
}
|
32
|
+
#
|
33
|
+
# Vast doesn't have a max cluster name limit. This number
|
34
|
+
# is reasonably large and exists to play nicely with the
|
35
|
+
# other providers
|
36
|
+
#
|
37
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
38
|
+
_regions: List[clouds.Region] = []
|
39
|
+
|
40
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
41
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
42
|
+
|
43
|
+
@classmethod
|
44
|
+
def _unsupported_features_for_resources(
|
45
|
+
cls, resources: 'resources_lib.Resources'
|
46
|
+
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
47
|
+
"""The features not supported based on the resources provided.
|
48
|
+
|
49
|
+
This method is used by check_features_are_supported() to check if the
|
50
|
+
cloud implementation supports all the requested features.
|
51
|
+
|
52
|
+
Returns:
|
53
|
+
A dict of {feature: reason} for the features not supported by the
|
54
|
+
cloud implementation.
|
55
|
+
"""
|
56
|
+
del resources # unused
|
57
|
+
return cls._CLOUD_UNSUPPORTED_FEATURES
|
58
|
+
|
59
|
+
@classmethod
|
60
|
+
def _max_cluster_name_length(cls) -> Optional[int]:
|
61
|
+
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
62
|
+
|
63
|
+
@classmethod
|
64
|
+
def regions_with_offering(cls, instance_type: str,
|
65
|
+
accelerators: Optional[Dict[str, int]],
|
66
|
+
use_spot: bool, region: Optional[str],
|
67
|
+
zone: Optional[str]) -> List[clouds.Region]:
|
68
|
+
assert zone is None, 'Vast does not support zones.'
|
69
|
+
del accelerators, zone # unused
|
70
|
+
regions = service_catalog.get_region_zones_for_instance_type(
|
71
|
+
instance_type, use_spot, 'vast')
|
72
|
+
|
73
|
+
if region is not None:
|
74
|
+
regions = [r for r in regions if r.name == region]
|
75
|
+
return regions
|
76
|
+
|
77
|
+
@classmethod
|
78
|
+
def get_vcpus_mem_from_instance_type(
|
79
|
+
cls,
|
80
|
+
instance_type: str,
|
81
|
+
) -> Tuple[Optional[float], Optional[float]]:
|
82
|
+
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
|
83
|
+
clouds='vast')
|
84
|
+
|
85
|
+
@classmethod
|
86
|
+
def zones_provision_loop(
|
87
|
+
cls,
|
88
|
+
*,
|
89
|
+
region: str,
|
90
|
+
num_nodes: int,
|
91
|
+
instance_type: str,
|
92
|
+
accelerators: Optional[Dict[str, int]] = None,
|
93
|
+
use_spot: bool = False,
|
94
|
+
) -> Iterator[None]:
|
95
|
+
del num_nodes # unused
|
96
|
+
regions = cls.regions_with_offering(instance_type,
|
97
|
+
accelerators,
|
98
|
+
use_spot,
|
99
|
+
region=region,
|
100
|
+
zone=None)
|
101
|
+
for r in regions:
|
102
|
+
assert r.zones is None, r
|
103
|
+
yield r.zones
|
104
|
+
|
105
|
+
def instance_type_to_hourly_cost(self,
|
106
|
+
instance_type: str,
|
107
|
+
use_spot: bool,
|
108
|
+
region: Optional[str] = None,
|
109
|
+
zone: Optional[str] = None) -> float:
|
110
|
+
return service_catalog.get_hourly_cost(instance_type,
|
111
|
+
use_spot=use_spot,
|
112
|
+
region=region,
|
113
|
+
zone=zone,
|
114
|
+
clouds='vast')
|
115
|
+
|
116
|
+
def accelerators_to_hourly_cost(self,
|
117
|
+
accelerators: Dict[str, int],
|
118
|
+
use_spot: bool,
|
119
|
+
region: Optional[str] = None,
|
120
|
+
zone: Optional[str] = None) -> float:
|
121
|
+
"""Returns the hourly cost of the accelerators, in dollars/hour."""
|
122
|
+
del accelerators, use_spot, region, zone # unused
|
123
|
+
return 0.0 # Vast includes accelerators in the hourly cost.
|
124
|
+
|
125
|
+
def get_egress_cost(self, num_gigabytes: float) -> float:
|
126
|
+
return 0.0
|
127
|
+
|
128
|
+
@classmethod
|
129
|
+
def get_default_instance_type(
|
130
|
+
cls,
|
131
|
+
cpus: Optional[str] = None,
|
132
|
+
memory: Optional[str] = None,
|
133
|
+
disk_tier: Optional[resources_utils.DiskTier] = None
|
134
|
+
) -> Optional[str]:
|
135
|
+
"""Returns the default instance type for Vast."""
|
136
|
+
return service_catalog.get_default_instance_type(cpus=cpus,
|
137
|
+
memory=memory,
|
138
|
+
disk_tier=disk_tier,
|
139
|
+
clouds='vast')
|
140
|
+
|
141
|
+
@classmethod
|
142
|
+
def get_accelerators_from_instance_type(
|
143
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
144
|
+
return service_catalog.get_accelerators_from_instance_type(
|
145
|
+
instance_type, clouds='vast')
|
146
|
+
|
147
|
+
@classmethod
|
148
|
+
def get_zone_shell_cmd(cls) -> Optional[str]:
|
149
|
+
return None
|
150
|
+
|
151
|
+
def make_deploy_resources_variables(
|
152
|
+
self,
|
153
|
+
resources: 'resources_lib.Resources',
|
154
|
+
cluster_name: resources_utils.ClusterName,
|
155
|
+
region: 'clouds.Region',
|
156
|
+
zones: Optional[List['clouds.Zone']],
|
157
|
+
num_nodes: int,
|
158
|
+
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
159
|
+
del zones, dryrun, cluster_name, num_nodes # unused
|
160
|
+
|
161
|
+
r = resources
|
162
|
+
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
163
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
164
|
+
acc_dict)
|
165
|
+
|
166
|
+
if r.image_id is None:
|
167
|
+
image_id = 'vastai/base:0.0.2'
|
168
|
+
elif r.extract_docker_image() is not None:
|
169
|
+
image_id = r.extract_docker_image()
|
170
|
+
else:
|
171
|
+
image_id = r.image_id[r.region]
|
172
|
+
|
173
|
+
return {
|
174
|
+
'instance_type': resources.instance_type,
|
175
|
+
'custom_resources': custom_resources,
|
176
|
+
'region': region.name,
|
177
|
+
'image_id': image_id,
|
178
|
+
}
|
179
|
+
|
180
|
+
def _get_feasible_launchable_resources(
|
181
|
+
self, resources: 'resources_lib.Resources'
|
182
|
+
) -> 'resources_utils.FeasibleResources':
|
183
|
+
"""Returns a list of feasible resources for the given resources."""
|
184
|
+
if resources.instance_type is not None:
|
185
|
+
assert resources.is_launchable(), resources
|
186
|
+
resources = resources.copy(accelerators=None)
|
187
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
188
|
+
|
189
|
+
def _make(instance_list):
|
190
|
+
resource_list = []
|
191
|
+
for instance_type in instance_list:
|
192
|
+
r = resources.copy(
|
193
|
+
cloud=Vast(),
|
194
|
+
instance_type=instance_type,
|
195
|
+
accelerators=None,
|
196
|
+
cpus=None,
|
197
|
+
)
|
198
|
+
resource_list.append(r)
|
199
|
+
return resource_list
|
200
|
+
|
201
|
+
# Currently, handle a filter on accelerators only.
|
202
|
+
accelerators = resources.accelerators
|
203
|
+
if accelerators is None:
|
204
|
+
# Return a default instance type
|
205
|
+
default_instance_type = Vast.get_default_instance_type(
|
206
|
+
cpus=resources.cpus,
|
207
|
+
memory=resources.memory,
|
208
|
+
disk_tier=resources.disk_tier)
|
209
|
+
if default_instance_type is None:
|
210
|
+
# TODO: Add hints to all return values in this method to help
|
211
|
+
# users understand why the resources are not launchable.
|
212
|
+
return resources_utils.FeasibleResources([], [], None)
|
213
|
+
else:
|
214
|
+
return resources_utils.FeasibleResources(
|
215
|
+
_make([default_instance_type]), [], None)
|
216
|
+
|
217
|
+
assert len(accelerators) == 1, resources
|
218
|
+
acc, acc_count = list(accelerators.items())[0]
|
219
|
+
(instance_list, fuzzy_candidate_list
|
220
|
+
) = service_catalog.get_instance_type_for_accelerator(
|
221
|
+
acc,
|
222
|
+
acc_count,
|
223
|
+
use_spot=resources.use_spot,
|
224
|
+
cpus=resources.cpus,
|
225
|
+
region=resources.region,
|
226
|
+
zone=resources.zone,
|
227
|
+
memory=resources.memory,
|
228
|
+
clouds='vast')
|
229
|
+
if instance_list is None:
|
230
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
231
|
+
None)
|
232
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
233
|
+
fuzzy_candidate_list, None)
|
234
|
+
|
235
|
+
@classmethod
|
236
|
+
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
237
|
+
""" Verify that the user has valid credentials for Vast. """
|
238
|
+
try:
|
239
|
+
import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
|
240
|
+
vast = _vast.VastAI()
|
241
|
+
|
242
|
+
# We only support file-based credential passing
|
243
|
+
if vast.creds_source != 'FILE':
|
244
|
+
return False, (
|
245
|
+
'error \n' # First line is indented by 4 spaces
|
246
|
+
' Credentials can be set up by running: \n'
|
247
|
+
' $ pip install vastai\n'
|
248
|
+
' $ echo [key] > ~/.vast_api_key\n'
|
249
|
+
' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
|
250
|
+
)
|
251
|
+
|
252
|
+
return True, None
|
253
|
+
|
254
|
+
except ImportError:
|
255
|
+
return False, ('Failed to import vast. '
|
256
|
+
'To install, run: pip install skypilot[vast]')
|
257
|
+
|
258
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
259
|
+
return {
|
260
|
+
'~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
|
261
|
+
}
|
262
|
+
|
263
|
+
@classmethod
|
264
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
265
|
+
# NOTE: used for very advanced SkyPilot functionality
|
266
|
+
# Can implement later if desired
|
267
|
+
return None
|
268
|
+
|
269
|
+
def instance_type_exists(self, instance_type: str) -> bool:
|
270
|
+
return service_catalog.instance_type_exists(instance_type, 'vast')
|
271
|
+
|
272
|
+
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
273
|
+
return service_catalog.validate_region_zone(region, zone, clouds='vast')
|
274
|
+
|
275
|
+
@classmethod
|
276
|
+
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
277
|
+
# TODO: use 0.0 for now to allow all images. We should change this to
|
278
|
+
# return the docker image size.
|
279
|
+
return 0.0
|
@@ -10,7 +10,6 @@
|
|
10
10
|
integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" crossorigin="anonymous">
|
11
11
|
<style>
|
12
12
|
:root {
|
13
|
-
--primary-color: #0d6efd;
|
14
13
|
--secondary-color: #6c757d;
|
15
14
|
--success-color: #198754;
|
16
15
|
--warning-color: #ffc107;
|
@@ -98,10 +97,38 @@
|
|
98
97
|
|
99
98
|
/* Allow Details column to wrap */
|
100
99
|
.table th:nth-child(12), /* Details column */
|
100
|
+
|
101
101
|
.table td:nth-child(12) {
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
max-width: 250px; /* Limit width */
|
103
|
+
overflow: hidden; /* Hide overflow */
|
104
|
+
text-overflow: ellipsis; /* Show ellipsis for overflow */
|
105
|
+
position: relative; /* For tooltip positioning */
|
106
|
+
cursor: pointer !important; /* Force show pointer cursor */
|
107
|
+
padding-right: 24px; /* Make room for the arrow */
|
108
|
+
}
|
109
|
+
|
110
|
+
.table td:nth-child(12)::after {
|
111
|
+
content: '▼';
|
112
|
+
position: absolute;
|
113
|
+
right: 8px;
|
114
|
+
top: 12px;
|
115
|
+
font-size: 0.6em;
|
116
|
+
opacity: 0.5;
|
117
|
+
display: none; /* Hide by default */
|
118
|
+
}
|
119
|
+
|
120
|
+
.table td:nth-child(12).expandable::after {
|
121
|
+
display: block; /* Only show for expandable content */
|
122
|
+
}
|
123
|
+
|
124
|
+
.table td:nth-child(12).expanded::after {
|
125
|
+
transform: rotate(180deg);
|
126
|
+
}
|
127
|
+
|
128
|
+
.table td:nth-child(12).expanded {
|
129
|
+
max-width: none;
|
130
|
+
white-space: normal;
|
131
|
+
word-wrap: break-word;
|
105
132
|
}
|
106
133
|
|
107
134
|
.badge {
|
@@ -190,11 +217,11 @@
|
|
190
217
|
}
|
191
218
|
|
192
219
|
.status-container:hover::after {
|
193
|
-
content: attr(
|
220
|
+
content: attr(data-tooltip);
|
194
221
|
position: absolute;
|
195
|
-
left: 0;
|
196
|
-
|
197
|
-
transform: translateY(
|
222
|
+
left: 0;
|
223
|
+
top: 100%;
|
224
|
+
transform: translateY(8px);
|
198
225
|
z-index: 1001;
|
199
226
|
background-color: rgba(33, 37, 41, 0.9);
|
200
227
|
color: white;
|
@@ -214,11 +241,11 @@
|
|
214
241
|
.status-container:hover::before {
|
215
242
|
content: '';
|
216
243
|
position: absolute;
|
217
|
-
left: 20px;
|
218
|
-
bottom: 100
|
219
|
-
transform: none;
|
244
|
+
left: 20px;
|
245
|
+
top: 100%; /* Changed from bottom: 100% */
|
246
|
+
transform: none;
|
220
247
|
border: 8px solid transparent;
|
221
|
-
border-
|
248
|
+
border-bottom-color: rgba(33, 37, 41, 0.9); /* Changed from border-top-color */
|
222
249
|
z-index: 1001;
|
223
250
|
pointer-events: none;
|
224
251
|
opacity: 0;
|
@@ -236,7 +263,7 @@
|
|
236
263
|
.table {
|
237
264
|
overflow: visible !important;
|
238
265
|
}
|
239
|
-
|
266
|
+
|
240
267
|
.fixed-header-table {
|
241
268
|
overflow: visible !important;
|
242
269
|
}
|
@@ -262,9 +289,9 @@
|
|
262
289
|
}
|
263
290
|
|
264
291
|
#last-updated:hover::after {
|
265
|
-
content: attr(
|
292
|
+
content: attr(data-tooltip);
|
266
293
|
position: absolute;
|
267
|
-
bottom: 100
|
294
|
+
top: 100%; /* Changed from bottom: 100% to top: 100% */
|
268
295
|
left: 50%;
|
269
296
|
transform: translateX(-50%);
|
270
297
|
padding: 0.5rem 1rem;
|
@@ -274,34 +301,33 @@
|
|
274
301
|
font-size: 0.875rem;
|
275
302
|
white-space: nowrap;
|
276
303
|
z-index: 1000;
|
277
|
-
margin-
|
304
|
+
margin-top: 8px; /* Changed from margin-bottom to margin-top */
|
278
305
|
}
|
279
306
|
|
280
307
|
#last-updated:hover::before {
|
281
308
|
content: '';
|
282
309
|
position: absolute;
|
283
|
-
bottom: 100
|
310
|
+
top: 100%; /* Changed from bottom: 100% to top: 100% */
|
284
311
|
left: 50%;
|
285
312
|
transform: translateX(-50%);
|
286
313
|
border: 8px solid transparent;
|
287
|
-
border-
|
288
|
-
margin-
|
314
|
+
border-bottom-color: rgba(33, 37, 41, 0.9); /* Changed from border-top-color to border-bottom-color */
|
315
|
+
margin-top: -8px; /* Changed from margin-bottom to margin-top */
|
289
316
|
z-index: 1000;
|
290
317
|
}
|
291
318
|
|
292
319
|
.clickable-badge {
|
293
320
|
cursor: pointer;
|
294
321
|
transition: transform 0.2s, opacity 0.2s;
|
295
|
-
opacity: 0.4;
|
322
|
+
opacity: 0.4;
|
296
323
|
}
|
297
324
|
|
298
325
|
.clickable-badge:hover {
|
299
|
-
transform: scale(1.
|
326
|
+
transform: scale(1.05);
|
300
327
|
opacity: 1;
|
301
328
|
}
|
302
329
|
|
303
330
|
.clickable-badge.selected-filter {
|
304
|
-
transform: scale(1.1);
|
305
331
|
opacity: 1;
|
306
332
|
box-shadow: 0 0 0 2px #fff, 0 0 0 4px currentColor;
|
307
333
|
}
|
@@ -313,6 +339,39 @@
|
|
313
339
|
#last-updated:hover::before {
|
314
340
|
z-index: 1001;
|
315
341
|
}
|
342
|
+
|
343
|
+
/* Add tooltip styles for refresh label */
|
344
|
+
.refresh-label {
|
345
|
+
position: relative;
|
346
|
+
cursor: help;
|
347
|
+
}
|
348
|
+
|
349
|
+
.refresh-label:hover::after {
|
350
|
+
content: attr(data-tooltip);
|
351
|
+
position: absolute;
|
352
|
+
left: 50%;
|
353
|
+
top: 100%;
|
354
|
+
transform: translateX(-50%) translateY(8px);
|
355
|
+
z-index: 1001;
|
356
|
+
background-color: rgba(33, 37, 41, 0.9);
|
357
|
+
color: white;
|
358
|
+
padding: 0.5rem 1rem;
|
359
|
+
border-radius: 6px;
|
360
|
+
font-size: 0.875rem;
|
361
|
+
white-space: nowrap;
|
362
|
+
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.15);
|
363
|
+
}
|
364
|
+
|
365
|
+
.refresh-label:hover::before {
|
366
|
+
content: '';
|
367
|
+
position: absolute;
|
368
|
+
left: 50%;
|
369
|
+
top: 100%;
|
370
|
+
transform: translateX(-50%);
|
371
|
+
border: 8px solid transparent;
|
372
|
+
border-bottom-color: rgba(33, 37, 41, 0.9);
|
373
|
+
z-index: 1001;
|
374
|
+
}
|
316
375
|
</style>
|
317
376
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.29.1/moment.min.js"></script>
|
318
377
|
<script
|
@@ -328,14 +387,14 @@
|
|
328
387
|
<div class="d-flex align-items-center">
|
329
388
|
<div class="form-check form-switch me-3">
|
330
389
|
<input class="form-check-input" type="checkbox" id="refresh-toggle" checked>
|
331
|
-
<label class="form-check-label" for="refresh-toggle">
|
390
|
+
<label class="form-check-label refresh-label" for="refresh-toggle" data-tooltip="Refreshes every 30 seconds">
|
332
391
|
Auto-refresh
|
333
392
|
<span class="refresh-indicator">
|
334
393
|
<span class="refresh-spinner" id="refresh-spinner"></span>
|
335
394
|
</span>
|
336
395
|
</label>
|
337
396
|
</div>
|
338
|
-
<p class="text-muted mb-0" id="last-updated"></p>
|
397
|
+
<p class="text-muted mb-0" id="last-updated" data-tooltip="{{ utcTimestamp }}"></p>
|
339
398
|
</div>
|
340
399
|
</div>
|
341
400
|
</header>
|
@@ -349,7 +408,7 @@
|
|
349
408
|
</select>
|
350
409
|
|
351
410
|
{% if rows %}
|
352
|
-
<p>Filter
|
411
|
+
<p>Filter by status:
|
353
412
|
<span class="badge bg-secondary clickable-badge selected-filter me-2" data-status="ALL">All</span>
|
354
413
|
{% set status_dict = {} %}
|
355
414
|
{% for row in rows %}
|
@@ -362,7 +421,7 @@
|
|
362
421
|
{% endfor %}
|
363
422
|
{% for status, count in status_dict|dictsort %}
|
364
423
|
<span class="me-2">
|
365
|
-
<span class="me-1"
|
424
|
+
<span class="me-1">| {{ count }}</span>
|
366
425
|
{% if status.startswith('RUNNING') %}
|
367
426
|
<span class="badge bg-primary clickable-badge" data-status="{{ status }}">{{ status }}</span>
|
368
427
|
{% elif status.startswith('PENDING') or status.startswith('SUBMITTED') %}
|
@@ -414,7 +473,7 @@
|
|
414
473
|
<td>{{ row[7] }}</td>
|
415
474
|
<td>
|
416
475
|
<!-- Status column with tooltip -->
|
417
|
-
<div class="status-container" style="position: relative;"
|
476
|
+
<div class="status-container" style="position: relative;" data-tooltip="{{ row[14] }}">
|
418
477
|
{% if row[9].startswith('RUNNING') %}
|
419
478
|
<span class="badge bg-primary">{{ row[9].split()[0] }}</span>{{ row[9][row[9].split()[0]|length:] }}
|
420
479
|
{% elif row[9].startswith('PENDING') or row[9].startswith('SUBMITTED') %}
|
@@ -436,12 +495,12 @@
|
|
436
495
|
<td>{{ row[11] }}</td> {# Cluster #}
|
437
496
|
<td>{{ row[12] }}</td> {# Region #}
|
438
497
|
<td>{{ row[8] }}</td> {# Recoveries #}
|
439
|
-
<td>{{ row[13] }}</td> {# Details #}
|
498
|
+
<td data-full-text="{{ row[13] }}">{{ row[13] }}</td> {# Details #}
|
440
499
|
<td>
|
441
500
|
{% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
|
442
|
-
<a href="{{ url_for('download_log', job_id=row[1]|string|replace(' \u21B3', '')) }}"
|
501
|
+
<a href="{{ url_for('download_log', job_id=row[1]|string|replace(' \u21B3', '')) }}"
|
443
502
|
class="btn btn-sm btn-outline-secondary">
|
444
|
-
|
503
|
+
controller log
|
445
504
|
</a>
|
446
505
|
{% endif %}
|
447
506
|
</td>
|
@@ -515,7 +574,7 @@
|
|
515
574
|
var localTimestamp = moment.utc(timestamp).tz(moment.tz.guess()).format('YYYY-MM-DD HH:mm:ss z');
|
516
575
|
var utcTimestamp = moment.utc(timestamp).format('YYYY-MM-DD HH:mm:ss UTC');
|
517
576
|
document.getElementById("last-updated").textContent = "Last updated: " + localTimestamp;
|
518
|
-
document.getElementById("last-updated").
|
577
|
+
document.getElementById("last-updated").setAttribute('data-tooltip', utcTimestamp);
|
519
578
|
});
|
520
579
|
</script>
|
521
580
|
<script>
|
@@ -567,7 +626,7 @@
|
|
567
626
|
var statusCell = row.querySelector("td:nth-child(7)"); // Status is now in the 7th column
|
568
627
|
if (statusCell) {
|
569
628
|
var statusText = statusCell.textContent.trim().split(' ')[0]; // Get first word of status
|
570
|
-
|
629
|
+
|
571
630
|
if (status === '' || statusText === status) {
|
572
631
|
row.style.display = "";
|
573
632
|
} else {
|
@@ -589,18 +648,6 @@
|
|
589
648
|
window.addEventListener('beforeunload', function() {
|
590
649
|
document.getElementById('refresh-spinner').style.display = 'inline-block';
|
591
650
|
});
|
592
|
-
|
593
|
-
// Enhance table row hover effect
|
594
|
-
document.querySelectorAll('tbody tr').forEach(row => {
|
595
|
-
row.addEventListener('mouseenter', function() {
|
596
|
-
this.style.transform = 'scale(1.002)';
|
597
|
-
this.style.transition = 'transform 0.2s';
|
598
|
-
});
|
599
|
-
|
600
|
-
row.addEventListener('mouseleave', function() {
|
601
|
-
this.style.transform = 'scale(1)';
|
602
|
-
});
|
603
|
-
});
|
604
651
|
</script>
|
605
652
|
<script>
|
606
653
|
// Update column indices for job table
|
@@ -615,20 +662,20 @@
|
|
615
662
|
document.addEventListener("DOMContentLoaded", function() {
|
616
663
|
const statusFilter = document.getElementById('status-filter');
|
617
664
|
const badges = document.querySelectorAll('.clickable-badge');
|
618
|
-
|
665
|
+
|
619
666
|
// Set initial state
|
620
667
|
const savedFilter = localStorage.getItem("statusFilter") || '';
|
621
668
|
updateSelectedBadge(savedFilter);
|
622
|
-
|
669
|
+
|
623
670
|
badges.forEach(function(badge) {
|
624
671
|
badge.addEventListener('click', function() {
|
625
672
|
const status = this.dataset.status;
|
626
673
|
const currentFilter = statusFilter.value;
|
627
|
-
|
674
|
+
|
628
675
|
// If clicking the already selected filter, clear it (show all)
|
629
|
-
const newStatus = (status === currentFilter || (status === 'ALL' && currentFilter === '')) ? '' :
|
676
|
+
const newStatus = (status === currentFilter || (status === 'ALL' && currentFilter === '')) ? '' :
|
630
677
|
(status === 'ALL' ? '' : status);
|
631
|
-
|
678
|
+
|
632
679
|
// Update filter and UI
|
633
680
|
statusFilter.value = newStatus;
|
634
681
|
filterStatus(newStatus);
|
@@ -636,11 +683,11 @@
|
|
636
683
|
updateSelectedBadge(newStatus);
|
637
684
|
});
|
638
685
|
});
|
639
|
-
|
686
|
+
|
640
687
|
function updateSelectedBadge(selectedStatus) {
|
641
688
|
badges.forEach(badge => {
|
642
689
|
badge.classList.remove('selected-filter');
|
643
|
-
if ((selectedStatus === '' && badge.dataset.status === 'ALL') ||
|
690
|
+
if ((selectedStatus === '' && badge.dataset.status === 'ALL') ||
|
644
691
|
badge.dataset.status === selectedStatus) {
|
645
692
|
badge.classList.add('selected-filter');
|
646
693
|
}
|
@@ -653,7 +700,7 @@
|
|
653
700
|
document.addEventListener("DOMContentLoaded", function() {
|
654
701
|
const header = document.querySelector('header');
|
655
702
|
const container = document.querySelector('.container');
|
656
|
-
|
703
|
+
|
657
704
|
window.addEventListener('scroll', function() {
|
658
705
|
if (container.getBoundingClientRect().top < 0) {
|
659
706
|
header.classList.add('sticky');
|
@@ -663,7 +710,25 @@
|
|
663
710
|
});
|
664
711
|
});
|
665
712
|
</script>
|
713
|
+
<script>
|
714
|
+
// Add click handler for Details cells
|
715
|
+
document.addEventListener('DOMContentLoaded', function() {
|
716
|
+
const detailsCells = document.querySelectorAll('#jobs-table td:nth-child(12)');
|
717
|
+
|
718
|
+
detailsCells.forEach(cell => {
|
719
|
+
// Check if content is truncated
|
720
|
+
if (cell.scrollWidth > cell.clientWidth) {
|
721
|
+
cell.classList.add('expandable');
|
722
|
+
}
|
666
723
|
|
724
|
+
cell.addEventListener('click', function() {
|
725
|
+
if (this.scrollWidth > this.clientWidth || this.classList.contains('expanded')) {
|
726
|
+
this.classList.toggle('expanded');
|
727
|
+
}
|
728
|
+
});
|
729
|
+
});
|
730
|
+
});
|
731
|
+
</script>
|
667
732
|
</body>
|
668
733
|
|
669
734
|
</html>
|
sky/jobs/scheduler.py
CHANGED
@@ -60,6 +60,14 @@ logger = sky_logging.init_logger('sky.jobs.controller')
|
|
60
60
|
_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
|
61
61
|
_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
|
62
62
|
|
63
|
+
# Based on testing, assume a running job uses 350MB memory.
|
64
|
+
JOB_MEMORY_MB = 350
|
65
|
+
# Past 2000 simultaneous jobs, we become unstable.
|
66
|
+
# See https://github.com/skypilot-org/skypilot/issues/4649.
|
67
|
+
MAX_JOB_LIMIT = 2000
|
68
|
+
# Number of ongoing launches allowed per CPU.
|
69
|
+
LAUNCHES_PER_CPU = 4
|
70
|
+
|
63
71
|
|
64
72
|
@lru_cache(maxsize=1)
|
65
73
|
def _get_lock_path() -> str:
|
@@ -247,15 +255,16 @@ def _set_alive_waiting(job_id: int) -> None:
|
|
247
255
|
|
248
256
|
|
249
257
|
def _get_job_parallelism() -> int:
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
258
|
+
job_memory = JOB_MEMORY_MB * 1024 * 1024
|
259
|
+
|
260
|
+
job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
|
261
|
+
|
262
|
+
return max(job_limit, 1)
|
254
263
|
|
255
264
|
|
256
265
|
def _get_launch_parallelism() -> int:
|
257
266
|
cpus = os.cpu_count()
|
258
|
-
return cpus *
|
267
|
+
return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
|
259
268
|
|
260
269
|
|
261
270
|
def _can_start_new_job() -> bool:
|
sky/provision/__init__.py
CHANGED
@@ -22,6 +22,7 @@ from sky.provision import kubernetes
|
|
22
22
|
from sky.provision import lambda_cloud
|
23
23
|
from sky.provision import oci
|
24
24
|
from sky.provision import runpod
|
25
|
+
from sky.provision import vast
|
25
26
|
from sky.provision import vsphere
|
26
27
|
from sky.utils import command_runner
|
27
28
|
from sky.utils import timeline
|