skypilot-nightly 1.0.0.dev20250204__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/vast.py +29 -0
  3. sky/authentication.py +18 -0
  4. sky/backends/backend_utils.py +4 -1
  5. sky/backends/cloud_vm_ray_backend.py +1 -0
  6. sky/clouds/__init__.py +2 -0
  7. sky/clouds/service_catalog/constants.py +1 -1
  8. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  9. sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
  10. sky/clouds/service_catalog/vast_catalog.py +104 -0
  11. sky/clouds/vast.py +279 -0
  12. sky/jobs/dashboard/templates/index.html +117 -52
  13. sky/jobs/scheduler.py +14 -5
  14. sky/provision/__init__.py +1 -0
  15. sky/provision/vast/__init__.py +10 -0
  16. sky/provision/vast/config.py +11 -0
  17. sky/provision/vast/instance.py +247 -0
  18. sky/provision/vast/utils.py +161 -0
  19. sky/setup_files/dependencies.py +1 -0
  20. sky/templates/vast-ray.yml.j2 +70 -0
  21. sky/utils/controller_utils.py +5 -0
  22. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
  23. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +27 -18
  24. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
  25. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
  26. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
  27. {skypilot_nightly-1.0.0.dev20250204.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/clouds/vast.py ADDED
@@ -0,0 +1,279 @@
1
+ """ Vast Cloud. """
2
+
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import clouds
7
+ from sky.clouds import service_catalog
8
+ from sky.utils import resources_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ from sky import resources as resources_lib
12
+
13
+
14
+ @clouds.CLOUD_REGISTRY.register
15
+ class Vast(clouds.Cloud):
16
+ """ Vast GPU Cloud
17
+
18
+ _REPR | The string representation for the Vast GPU cloud object.
19
+ """
20
+ _REPR = 'Vast'
21
+ _CLOUD_UNSUPPORTED_FEATURES = {
22
+ clouds.CloudImplementationFeatures.MULTI_NODE:
23
+ ('Multi-node not supported yet, as the interconnection among nodes '
24
+ 'are non-trivial on Vast.'),
25
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
26
+ ('Customizing disk tier is not supported yet on Vast.'),
27
+ clouds.CloudImplementationFeatures.OPEN_PORTS:
28
+ ('Opening ports is currently not supported on Vast.'),
29
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
30
+ ('Mounting object stores is not supported on Vast.'),
31
+ }
32
+ #
33
+ # Vast doesn't have a max cluster name limit. This number
34
+ # is reasonably large and exists to play nicely with the
35
+ # other providers
36
+ #
37
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
38
+ _regions: List[clouds.Region] = []
39
+
40
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
41
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
42
+
43
+ @classmethod
44
+ def _unsupported_features_for_resources(
45
+ cls, resources: 'resources_lib.Resources'
46
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
47
+ """The features not supported based on the resources provided.
48
+
49
+ This method is used by check_features_are_supported() to check if the
50
+ cloud implementation supports all the requested features.
51
+
52
+ Returns:
53
+ A dict of {feature: reason} for the features not supported by the
54
+ cloud implementation.
55
+ """
56
+ del resources # unused
57
+ return cls._CLOUD_UNSUPPORTED_FEATURES
58
+
59
+ @classmethod
60
+ def _max_cluster_name_length(cls) -> Optional[int]:
61
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
62
+
63
+ @classmethod
64
+ def regions_with_offering(cls, instance_type: str,
65
+ accelerators: Optional[Dict[str, int]],
66
+ use_spot: bool, region: Optional[str],
67
+ zone: Optional[str]) -> List[clouds.Region]:
68
+ assert zone is None, 'Vast does not support zones.'
69
+ del accelerators, zone # unused
70
+ regions = service_catalog.get_region_zones_for_instance_type(
71
+ instance_type, use_spot, 'vast')
72
+
73
+ if region is not None:
74
+ regions = [r for r in regions if r.name == region]
75
+ return regions
76
+
77
+ @classmethod
78
+ def get_vcpus_mem_from_instance_type(
79
+ cls,
80
+ instance_type: str,
81
+ ) -> Tuple[Optional[float], Optional[float]]:
82
+ return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
83
+ clouds='vast')
84
+
85
+ @classmethod
86
+ def zones_provision_loop(
87
+ cls,
88
+ *,
89
+ region: str,
90
+ num_nodes: int,
91
+ instance_type: str,
92
+ accelerators: Optional[Dict[str, int]] = None,
93
+ use_spot: bool = False,
94
+ ) -> Iterator[None]:
95
+ del num_nodes # unused
96
+ regions = cls.regions_with_offering(instance_type,
97
+ accelerators,
98
+ use_spot,
99
+ region=region,
100
+ zone=None)
101
+ for r in regions:
102
+ assert r.zones is None, r
103
+ yield r.zones
104
+
105
+ def instance_type_to_hourly_cost(self,
106
+ instance_type: str,
107
+ use_spot: bool,
108
+ region: Optional[str] = None,
109
+ zone: Optional[str] = None) -> float:
110
+ return service_catalog.get_hourly_cost(instance_type,
111
+ use_spot=use_spot,
112
+ region=region,
113
+ zone=zone,
114
+ clouds='vast')
115
+
116
+ def accelerators_to_hourly_cost(self,
117
+ accelerators: Dict[str, int],
118
+ use_spot: bool,
119
+ region: Optional[str] = None,
120
+ zone: Optional[str] = None) -> float:
121
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
122
+ del accelerators, use_spot, region, zone # unused
123
+ return 0.0 # Vast includes accelerators in the hourly cost.
124
+
125
+ def get_egress_cost(self, num_gigabytes: float) -> float:
126
+ return 0.0
127
+
128
+ @classmethod
129
+ def get_default_instance_type(
130
+ cls,
131
+ cpus: Optional[str] = None,
132
+ memory: Optional[str] = None,
133
+ disk_tier: Optional[resources_utils.DiskTier] = None
134
+ ) -> Optional[str]:
135
+ """Returns the default instance type for Vast."""
136
+ return service_catalog.get_default_instance_type(cpus=cpus,
137
+ memory=memory,
138
+ disk_tier=disk_tier,
139
+ clouds='vast')
140
+
141
+ @classmethod
142
+ def get_accelerators_from_instance_type(
143
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
144
+ return service_catalog.get_accelerators_from_instance_type(
145
+ instance_type, clouds='vast')
146
+
147
+ @classmethod
148
+ def get_zone_shell_cmd(cls) -> Optional[str]:
149
+ return None
150
+
151
+ def make_deploy_resources_variables(
152
+ self,
153
+ resources: 'resources_lib.Resources',
154
+ cluster_name: resources_utils.ClusterName,
155
+ region: 'clouds.Region',
156
+ zones: Optional[List['clouds.Zone']],
157
+ num_nodes: int,
158
+ dryrun: bool = False) -> Dict[str, Optional[str]]:
159
+ del zones, dryrun, cluster_name, num_nodes # unused
160
+
161
+ r = resources
162
+ acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
163
+ custom_resources = resources_utils.make_ray_custom_resources_str(
164
+ acc_dict)
165
+
166
+ if r.image_id is None:
167
+ image_id = 'vastai/base:0.0.2'
168
+ elif r.extract_docker_image() is not None:
169
+ image_id = r.extract_docker_image()
170
+ else:
171
+ image_id = r.image_id[r.region]
172
+
173
+ return {
174
+ 'instance_type': resources.instance_type,
175
+ 'custom_resources': custom_resources,
176
+ 'region': region.name,
177
+ 'image_id': image_id,
178
+ }
179
+
180
+ def _get_feasible_launchable_resources(
181
+ self, resources: 'resources_lib.Resources'
182
+ ) -> 'resources_utils.FeasibleResources':
183
+ """Returns a list of feasible resources for the given resources."""
184
+ if resources.instance_type is not None:
185
+ assert resources.is_launchable(), resources
186
+ resources = resources.copy(accelerators=None)
187
+ return resources_utils.FeasibleResources([resources], [], None)
188
+
189
+ def _make(instance_list):
190
+ resource_list = []
191
+ for instance_type in instance_list:
192
+ r = resources.copy(
193
+ cloud=Vast(),
194
+ instance_type=instance_type,
195
+ accelerators=None,
196
+ cpus=None,
197
+ )
198
+ resource_list.append(r)
199
+ return resource_list
200
+
201
+ # Currently, handle a filter on accelerators only.
202
+ accelerators = resources.accelerators
203
+ if accelerators is None:
204
+ # Return a default instance type
205
+ default_instance_type = Vast.get_default_instance_type(
206
+ cpus=resources.cpus,
207
+ memory=resources.memory,
208
+ disk_tier=resources.disk_tier)
209
+ if default_instance_type is None:
210
+ # TODO: Add hints to all return values in this method to help
211
+ # users understand why the resources are not launchable.
212
+ return resources_utils.FeasibleResources([], [], None)
213
+ else:
214
+ return resources_utils.FeasibleResources(
215
+ _make([default_instance_type]), [], None)
216
+
217
+ assert len(accelerators) == 1, resources
218
+ acc, acc_count = list(accelerators.items())[0]
219
+ (instance_list, fuzzy_candidate_list
220
+ ) = service_catalog.get_instance_type_for_accelerator(
221
+ acc,
222
+ acc_count,
223
+ use_spot=resources.use_spot,
224
+ cpus=resources.cpus,
225
+ region=resources.region,
226
+ zone=resources.zone,
227
+ memory=resources.memory,
228
+ clouds='vast')
229
+ if instance_list is None:
230
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
231
+ None)
232
+ return resources_utils.FeasibleResources(_make(instance_list),
233
+ fuzzy_candidate_list, None)
234
+
235
+ @classmethod
236
+ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
237
+ """ Verify that the user has valid credentials for Vast. """
238
+ try:
239
+ import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
240
+ vast = _vast.VastAI()
241
+
242
+ # We only support file-based credential passing
243
+ if vast.creds_source != 'FILE':
244
+ return False, (
245
+ 'error \n' # First line is indented by 4 spaces
246
+ ' Credentials can be set up by running: \n'
247
+ ' $ pip install vastai\n'
248
+ ' $ echo [key] > ~/.vast_api_key\n'
249
+ ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
250
+ )
251
+
252
+ return True, None
253
+
254
+ except ImportError:
255
+ return False, ('Failed to import vast. '
256
+ 'To install, run: pip install skypilot[vast]')
257
+
258
+ def get_credential_file_mounts(self) -> Dict[str, str]:
259
+ return {
260
+ '~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
261
+ }
262
+
263
+ @classmethod
264
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
265
+ # NOTE: used for very advanced SkyPilot functionality
266
+ # Can implement later if desired
267
+ return None
268
+
269
+ def instance_type_exists(self, instance_type: str) -> bool:
270
+ return service_catalog.instance_type_exists(instance_type, 'vast')
271
+
272
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
273
+ return service_catalog.validate_region_zone(region, zone, clouds='vast')
274
+
275
+ @classmethod
276
+ def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
277
+ # TODO: use 0.0 for now to allow all images. We should change this to
278
+ # return the docker image size.
279
+ return 0.0
@@ -10,7 +10,6 @@
10
10
  integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" crossorigin="anonymous">
11
11
  <style>
12
12
  :root {
13
- --primary-color: #0d6efd;
14
13
  --secondary-color: #6c757d;
15
14
  --success-color: #198754;
16
15
  --warning-color: #ffc107;
@@ -98,10 +97,38 @@
98
97
 
99
98
  /* Allow Details column to wrap */
100
99
  .table th:nth-child(12), /* Details column */
100
+
101
101
  .table td:nth-child(12) {
102
- white-space: normal; /* Allow text wrapping */
103
- max-width: 250px; /* Limit width to prevent excessive stretching */
104
- word-wrap: break-word; /* Break long words if needed */
102
+ max-width: 250px; /* Limit width */
103
+ overflow: hidden; /* Hide overflow */
104
+ text-overflow: ellipsis; /* Show ellipsis for overflow */
105
+ position: relative; /* For tooltip positioning */
106
+ cursor: pointer !important; /* Force show pointer cursor */
107
+ padding-right: 24px; /* Make room for the arrow */
108
+ }
109
+
110
+ .table td:nth-child(12)::after {
111
+ content: '▼';
112
+ position: absolute;
113
+ right: 8px;
114
+ top: 12px;
115
+ font-size: 0.6em;
116
+ opacity: 0.5;
117
+ display: none; /* Hide by default */
118
+ }
119
+
120
+ .table td:nth-child(12).expandable::after {
121
+ display: block; /* Only show for expandable content */
122
+ }
123
+
124
+ .table td:nth-child(12).expanded::after {
125
+ transform: rotate(180deg);
126
+ }
127
+
128
+ .table td:nth-child(12).expanded {
129
+ max-width: none;
130
+ white-space: normal;
131
+ word-wrap: break-word;
105
132
  }
106
133
 
107
134
  .badge {
@@ -190,11 +217,11 @@
190
217
  }
191
218
 
192
219
  .status-container:hover::after {
193
- content: attr(title);
220
+ content: attr(data-tooltip);
194
221
  position: absolute;
195
- left: 0; /* Changed from 50% */
196
- bottom: 100%;
197
- transform: translateY(-8px); /* Removed translateX */
222
+ left: 0;
223
+ top: 100%;
224
+ transform: translateY(8px);
198
225
  z-index: 1001;
199
226
  background-color: rgba(33, 37, 41, 0.9);
200
227
  color: white;
@@ -214,11 +241,11 @@
214
241
  .status-container:hover::before {
215
242
  content: '';
216
243
  position: absolute;
217
- left: 20px; /* Changed from 50% */
218
- bottom: 100%;
219
- transform: none; /* Removed transform */
244
+ left: 20px;
245
+ top: 100%; /* Changed from bottom: 100% */
246
+ transform: none;
220
247
  border: 8px solid transparent;
221
- border-top-color: rgba(33, 37, 41, 0.9);
248
+ border-bottom-color: rgba(33, 37, 41, 0.9); /* Changed from border-top-color */
222
249
  z-index: 1001;
223
250
  pointer-events: none;
224
251
  opacity: 0;
@@ -236,7 +263,7 @@
236
263
  .table {
237
264
  overflow: visible !important;
238
265
  }
239
-
266
+
240
267
  .fixed-header-table {
241
268
  overflow: visible !important;
242
269
  }
@@ -262,9 +289,9 @@
262
289
  }
263
290
 
264
291
  #last-updated:hover::after {
265
- content: attr(title);
292
+ content: attr(data-tooltip);
266
293
  position: absolute;
267
- bottom: 100%;
294
+ top: 100%; /* Changed from bottom: 100% to top: 100% */
268
295
  left: 50%;
269
296
  transform: translateX(-50%);
270
297
  padding: 0.5rem 1rem;
@@ -274,34 +301,33 @@
274
301
  font-size: 0.875rem;
275
302
  white-space: nowrap;
276
303
  z-index: 1000;
277
- margin-bottom: 8px;
304
+ margin-top: 8px; /* Changed from margin-bottom to margin-top */
278
305
  }
279
306
 
280
307
  #last-updated:hover::before {
281
308
  content: '';
282
309
  position: absolute;
283
- bottom: 100%;
310
+ top: 100%; /* Changed from bottom: 100% to top: 100% */
284
311
  left: 50%;
285
312
  transform: translateX(-50%);
286
313
  border: 8px solid transparent;
287
- border-top-color: rgba(33, 37, 41, 0.9);
288
- margin-bottom: -8px;
314
+ border-bottom-color: rgba(33, 37, 41, 0.9); /* Changed from border-top-color to border-bottom-color */
315
+ margin-top: -8px; /* Changed from margin-bottom to margin-top */
289
316
  z-index: 1000;
290
317
  }
291
318
 
292
319
  .clickable-badge {
293
320
  cursor: pointer;
294
321
  transition: transform 0.2s, opacity 0.2s;
295
- opacity: 0.4;
322
+ opacity: 0.4;
296
323
  }
297
324
 
298
325
  .clickable-badge:hover {
299
- transform: scale(1.1);
326
+ transform: scale(1.05);
300
327
  opacity: 1;
301
328
  }
302
329
 
303
330
  .clickable-badge.selected-filter {
304
- transform: scale(1.1);
305
331
  opacity: 1;
306
332
  box-shadow: 0 0 0 2px #fff, 0 0 0 4px currentColor;
307
333
  }
@@ -313,6 +339,39 @@
313
339
  #last-updated:hover::before {
314
340
  z-index: 1001;
315
341
  }
342
+
343
+ /* Add tooltip styles for refresh label */
344
+ .refresh-label {
345
+ position: relative;
346
+ cursor: help;
347
+ }
348
+
349
+ .refresh-label:hover::after {
350
+ content: attr(data-tooltip);
351
+ position: absolute;
352
+ left: 50%;
353
+ top: 100%;
354
+ transform: translateX(-50%) translateY(8px);
355
+ z-index: 1001;
356
+ background-color: rgba(33, 37, 41, 0.9);
357
+ color: white;
358
+ padding: 0.5rem 1rem;
359
+ border-radius: 6px;
360
+ font-size: 0.875rem;
361
+ white-space: nowrap;
362
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.15);
363
+ }
364
+
365
+ .refresh-label:hover::before {
366
+ content: '';
367
+ position: absolute;
368
+ left: 50%;
369
+ top: 100%;
370
+ transform: translateX(-50%);
371
+ border: 8px solid transparent;
372
+ border-bottom-color: rgba(33, 37, 41, 0.9);
373
+ z-index: 1001;
374
+ }
316
375
  </style>
317
376
  <script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.29.1/moment.min.js"></script>
318
377
  <script
@@ -328,14 +387,14 @@
328
387
  <div class="d-flex align-items-center">
329
388
  <div class="form-check form-switch me-3">
330
389
  <input class="form-check-input" type="checkbox" id="refresh-toggle" checked>
331
- <label class="form-check-label" for="refresh-toggle">
390
+ <label class="form-check-label refresh-label" for="refresh-toggle" data-tooltip="Refreshes every 30 seconds">
332
391
  Auto-refresh
333
392
  <span class="refresh-indicator">
334
393
  <span class="refresh-spinner" id="refresh-spinner"></span>
335
394
  </span>
336
395
  </label>
337
396
  </div>
338
- <p class="text-muted mb-0" id="last-updated"></p>
397
+ <p class="text-muted mb-0" id="last-updated" data-tooltip="{{ utcTimestamp }}"></p>
339
398
  </div>
340
399
  </div>
341
400
  </header>
@@ -349,7 +408,7 @@
349
408
  </select>
350
409
 
351
410
  {% if rows %}
352
- <p>Filter By :
411
+ <p>Filter by status:
353
412
  <span class="badge bg-secondary clickable-badge selected-filter me-2" data-status="ALL">All</span>
354
413
  {% set status_dict = {} %}
355
414
  {% for row in rows %}
@@ -362,7 +421,7 @@
362
421
  {% endfor %}
363
422
  {% for status, count in status_dict|dictsort %}
364
423
  <span class="me-2">
365
- <span class="me-1">; {{ count }}</span>
424
+ <span class="me-1">| {{ count }}</span>
366
425
  {% if status.startswith('RUNNING') %}
367
426
  <span class="badge bg-primary clickable-badge" data-status="{{ status }}">{{ status }}</span>
368
427
  {% elif status.startswith('PENDING') or status.startswith('SUBMITTED') %}
@@ -414,7 +473,7 @@
414
473
  <td>{{ row[7] }}</td>
415
474
  <td>
416
475
  <!-- Status column with tooltip -->
417
- <div class="status-container" style="position: relative;" title="{{ row[14] }}">
476
+ <div class="status-container" style="position: relative;" data-tooltip="{{ row[14] }}">
418
477
  {% if row[9].startswith('RUNNING') %}
419
478
  <span class="badge bg-primary">{{ row[9].split()[0] }}</span>{{ row[9][row[9].split()[0]|length:] }}
420
479
  {% elif row[9].startswith('PENDING') or row[9].startswith('SUBMITTED') %}
@@ -436,12 +495,12 @@
436
495
  <td>{{ row[11] }}</td> {# Cluster #}
437
496
  <td>{{ row[12] }}</td> {# Region #}
438
497
  <td>{{ row[8] }}</td> {# Recoveries #}
439
- <td>{{ row[13] }}</td> {# Details #}
498
+ <td data-full-text="{{ row[13] }}">{{ row[13] }}</td> {# Details #}
440
499
  <td>
441
500
  {% if row[1]|string|replace(' \u21B3', '') and row[1]|string|replace(' \u21B3', '') != '-' %}
442
- <a href="{{ url_for('download_log', job_id=row[1]|string|replace(' \u21B3', '')) }}"
501
+ <a href="{{ url_for('download_log', job_id=row[1]|string|replace(' \u21B3', '')) }}"
443
502
  class="btn btn-sm btn-outline-secondary">
444
- Controller Log
503
+ controller log
445
504
  </a>
446
505
  {% endif %}
447
506
  </td>
@@ -515,7 +574,7 @@
515
574
  var localTimestamp = moment.utc(timestamp).tz(moment.tz.guess()).format('YYYY-MM-DD HH:mm:ss z');
516
575
  var utcTimestamp = moment.utc(timestamp).format('YYYY-MM-DD HH:mm:ss UTC');
517
576
  document.getElementById("last-updated").textContent = "Last updated: " + localTimestamp;
518
- document.getElementById("last-updated").title = utcTimestamp;
577
+ document.getElementById("last-updated").setAttribute('data-tooltip', utcTimestamp);
519
578
  });
520
579
  </script>
521
580
  <script>
@@ -567,7 +626,7 @@
567
626
  var statusCell = row.querySelector("td:nth-child(7)"); // Status is now in the 7th column
568
627
  if (statusCell) {
569
628
  var statusText = statusCell.textContent.trim().split(' ')[0]; // Get first word of status
570
-
629
+
571
630
  if (status === '' || statusText === status) {
572
631
  row.style.display = "";
573
632
  } else {
@@ -589,18 +648,6 @@
589
648
  window.addEventListener('beforeunload', function() {
590
649
  document.getElementById('refresh-spinner').style.display = 'inline-block';
591
650
  });
592
-
593
- // Enhance table row hover effect
594
- document.querySelectorAll('tbody tr').forEach(row => {
595
- row.addEventListener('mouseenter', function() {
596
- this.style.transform = 'scale(1.002)';
597
- this.style.transition = 'transform 0.2s';
598
- });
599
-
600
- row.addEventListener('mouseleave', function() {
601
- this.style.transform = 'scale(1)';
602
- });
603
- });
604
651
  </script>
605
652
  <script>
606
653
  // Update column indices for job table
@@ -615,20 +662,20 @@
615
662
  document.addEventListener("DOMContentLoaded", function() {
616
663
  const statusFilter = document.getElementById('status-filter');
617
664
  const badges = document.querySelectorAll('.clickable-badge');
618
-
665
+
619
666
  // Set initial state
620
667
  const savedFilter = localStorage.getItem("statusFilter") || '';
621
668
  updateSelectedBadge(savedFilter);
622
-
669
+
623
670
  badges.forEach(function(badge) {
624
671
  badge.addEventListener('click', function() {
625
672
  const status = this.dataset.status;
626
673
  const currentFilter = statusFilter.value;
627
-
674
+
628
675
  // If clicking the already selected filter, clear it (show all)
629
- const newStatus = (status === currentFilter || (status === 'ALL' && currentFilter === '')) ? '' :
676
+ const newStatus = (status === currentFilter || (status === 'ALL' && currentFilter === '')) ? '' :
630
677
  (status === 'ALL' ? '' : status);
631
-
678
+
632
679
  // Update filter and UI
633
680
  statusFilter.value = newStatus;
634
681
  filterStatus(newStatus);
@@ -636,11 +683,11 @@
636
683
  updateSelectedBadge(newStatus);
637
684
  });
638
685
  });
639
-
686
+
640
687
  function updateSelectedBadge(selectedStatus) {
641
688
  badges.forEach(badge => {
642
689
  badge.classList.remove('selected-filter');
643
- if ((selectedStatus === '' && badge.dataset.status === 'ALL') ||
690
+ if ((selectedStatus === '' && badge.dataset.status === 'ALL') ||
644
691
  badge.dataset.status === selectedStatus) {
645
692
  badge.classList.add('selected-filter');
646
693
  }
@@ -653,7 +700,7 @@
653
700
  document.addEventListener("DOMContentLoaded", function() {
654
701
  const header = document.querySelector('header');
655
702
  const container = document.querySelector('.container');
656
-
703
+
657
704
  window.addEventListener('scroll', function() {
658
705
  if (container.getBoundingClientRect().top < 0) {
659
706
  header.classList.add('sticky');
@@ -663,7 +710,25 @@
663
710
  });
664
711
  });
665
712
  </script>
713
+ <script>
714
+ // Add click handler for Details cells
715
+ document.addEventListener('DOMContentLoaded', function() {
716
+ const detailsCells = document.querySelectorAll('#jobs-table td:nth-child(12)');
717
+
718
+ detailsCells.forEach(cell => {
719
+ // Check if content is truncated
720
+ if (cell.scrollWidth > cell.clientWidth) {
721
+ cell.classList.add('expandable');
722
+ }
666
723
 
724
+ cell.addEventListener('click', function() {
725
+ if (this.scrollWidth > this.clientWidth || this.classList.contains('expanded')) {
726
+ this.classList.toggle('expanded');
727
+ }
728
+ });
729
+ });
730
+ });
731
+ </script>
667
732
  </body>
668
733
 
669
734
  </html>
sky/jobs/scheduler.py CHANGED
@@ -60,6 +60,14 @@ logger = sky_logging.init_logger('sky.jobs.controller')
60
60
  _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
61
61
  _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
62
62
 
63
+ # Based on testing, assume a running job uses 350MB memory.
64
+ JOB_MEMORY_MB = 350
65
+ # Past 2000 simultaneous jobs, we become unstable.
66
+ # See https://github.com/skypilot-org/skypilot/issues/4649.
67
+ MAX_JOB_LIMIT = 2000
68
+ # Number of ongoing launches allowed per CPU.
69
+ LAUNCHES_PER_CPU = 4
70
+
63
71
 
64
72
  @lru_cache(maxsize=1)
65
73
  def _get_lock_path() -> str:
@@ -247,15 +255,16 @@ def _set_alive_waiting(job_id: int) -> None:
247
255
 
248
256
 
249
257
  def _get_job_parallelism() -> int:
250
- # Assume a running job uses 350MB memory.
251
- # We observe 230-300 in practice.
252
- job_memory = 350 * 1024 * 1024
253
- return max(psutil.virtual_memory().total // job_memory, 1)
258
+ job_memory = JOB_MEMORY_MB * 1024 * 1024
259
+
260
+ job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
261
+
262
+ return max(job_limit, 1)
254
263
 
255
264
 
256
265
  def _get_launch_parallelism() -> int:
257
266
  cpus = os.cpu_count()
258
- return cpus * 4 if cpus is not None else 1
267
+ return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
259
268
 
260
269
 
261
270
  def _can_start_new_job() -> bool:
sky/provision/__init__.py CHANGED
@@ -22,6 +22,7 @@ from sky.provision import kubernetes
22
22
  from sky.provision import lambda_cloud
23
23
  from sky.provision import oci
24
24
  from sky.provision import runpod
25
+ from sky.provision import vast
25
26
  from sky.provision import vsphere
26
27
  from sky.utils import command_runner
27
28
  from sky.utils import timeline