skypilot-nightly 1.0.0.dev20250707__py3-none-any.whl → 1.0.0.dev20250709__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +52 -8
- sky/client/common.py +6 -1
- sky/clouds/aws.py +5 -0
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +3 -0
- sky/clouds/cudo.py +3 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +3 -2
- sky/clouds/ibm.py +3 -0
- sky/clouds/kubernetes.py +63 -24
- sky/clouds/lambda_cloud.py +3 -0
- sky/clouds/nebius.py +3 -0
- sky/clouds/oci.py +3 -0
- sky/clouds/paperspace.py +3 -0
- sky/clouds/runpod.py +3 -0
- sky/clouds/scp.py +3 -0
- sky/clouds/utils/gcp_utils.py +61 -1
- sky/clouds/vast.py +3 -0
- sky/clouds/vsphere.py +3 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +6 -3
- sky/jobs/state.py +6 -3
- sky/provision/__init__.py +11 -0
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +106 -13
- sky/provision/gcp/constants.py +0 -3
- sky/provision/gcp/instance.py +21 -0
- sky/provision/kubernetes/instance.py +16 -0
- sky/provision/kubernetes/utils.py +9 -2
- sky/resources.py +1 -30
- sky/server/metrics.py +2 -3
- sky/server/requests/executor.py +2 -5
- sky/server/requests/payloads.py +1 -0
- sky/server/requests/requests.py +94 -4
- sky/server/server.py +20 -6
- sky/server/uvicorn.py +4 -1
- sky/skylet/constants.py +6 -2
- sky/templates/gcp-ray.yml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -2
- sky/utils/resources_utils.py +30 -0
- sky/utils/schemas.py +22 -0
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/RECORD +65 -65
- /sky/dashboard/out/_next/static/{wEkAg9F21A-COXJLf20VU → EqELoF4IXcALfWVihInou}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{wEkAg9F21A-COXJLf20VU → EqELoF4IXcALfWVihInou}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250707.dist-info → skypilot_nightly-1.0.0.dev20250709.dist-info}/top_level.txt +0 -0
sky/server/requests/payloads.py
CHANGED
sky/server/requests/requests.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Utilities for REST API."""
|
2
|
+
import asyncio
|
2
3
|
import contextlib
|
3
4
|
import dataclasses
|
4
5
|
import enum
|
@@ -20,6 +21,7 @@ import filelock
|
|
20
21
|
from sky import exceptions
|
21
22
|
from sky import global_user_state
|
22
23
|
from sky import sky_logging
|
24
|
+
from sky import skypilot_config
|
23
25
|
from sky.server import common as server_common
|
24
26
|
from sky.server import constants as server_constants
|
25
27
|
from sky.server.requests import payloads
|
@@ -29,6 +31,7 @@ from sky.utils import common
|
|
29
31
|
from sky.utils import common_utils
|
30
32
|
from sky.utils import db_utils
|
31
33
|
from sky.utils import env_options
|
34
|
+
from sky.utils import subprocess_utils
|
32
35
|
from sky.utils import ux_utils
|
33
36
|
|
34
37
|
logger = sky_logging.init_logger(__name__)
|
@@ -39,8 +42,11 @@ COL_CLUSTER_NAME = 'cluster_name'
|
|
39
42
|
COL_USER_ID = 'user_id'
|
40
43
|
COL_STATUS_MSG = 'status_msg'
|
41
44
|
COL_SHOULD_RETRY = 'should_retry'
|
45
|
+
COL_FINISHED_AT = 'finished_at'
|
42
46
|
REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
|
43
47
|
|
48
|
+
DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
|
49
|
+
|
44
50
|
# TODO(zhwu): For scalability, there are several TODOs:
|
45
51
|
# [x] Have a way to queue requests.
|
46
52
|
# [ ] Move logs to persistent place.
|
@@ -64,6 +70,10 @@ class RequestStatus(enum.Enum):
|
|
64
70
|
color = _STATUS_TO_COLOR[self]
|
65
71
|
return f'{color}{self.value}{colorama.Style.RESET_ALL}'
|
66
72
|
|
73
|
+
@classmethod
|
74
|
+
def finished_status(cls) -> List['RequestStatus']:
|
75
|
+
return [cls.SUCCEEDED, cls.FAILED, cls.CANCELLED]
|
76
|
+
|
67
77
|
|
68
78
|
_STATUS_TO_COLOR = {
|
69
79
|
RequestStatus.PENDING: colorama.Fore.BLUE,
|
@@ -88,6 +98,7 @@ REQUEST_COLUMNS = [
|
|
88
98
|
COL_USER_ID,
|
89
99
|
COL_STATUS_MSG,
|
90
100
|
COL_SHOULD_RETRY,
|
101
|
+
COL_FINISHED_AT,
|
91
102
|
]
|
92
103
|
|
93
104
|
|
@@ -120,6 +131,8 @@ class Request:
|
|
120
131
|
status_msg: Optional[str] = None
|
121
132
|
# Whether the request should be retried.
|
122
133
|
should_retry: bool = False
|
134
|
+
# When the request finished.
|
135
|
+
finished_at: Optional[float] = None
|
123
136
|
|
124
137
|
@property
|
125
138
|
def log_path(self) -> pathlib.Path:
|
@@ -206,6 +219,7 @@ class Request:
|
|
206
219
|
cluster_name=self.cluster_name,
|
207
220
|
status_msg=self.status_msg,
|
208
221
|
should_retry=self.should_retry,
|
222
|
+
finished_at=self.finished_at,
|
209
223
|
)
|
210
224
|
|
211
225
|
def encode(self) -> payloads.RequestPayload:
|
@@ -228,6 +242,7 @@ class Request:
|
|
228
242
|
cluster_name=self.cluster_name,
|
229
243
|
status_msg=self.status_msg,
|
230
244
|
should_retry=self.should_retry,
|
245
|
+
finished_at=self.finished_at,
|
231
246
|
)
|
232
247
|
except (TypeError, ValueError) as e:
|
233
248
|
# The error is unexpected, so we don't suppress the stack trace.
|
@@ -260,6 +275,7 @@ class Request:
|
|
260
275
|
cluster_name=payload.cluster_name,
|
261
276
|
status_msg=payload.status_msg,
|
262
277
|
should_retry=payload.should_retry,
|
278
|
+
finished_at=payload.finished_at,
|
263
279
|
)
|
264
280
|
except (TypeError, ValueError) as e:
|
265
281
|
logger.error(
|
@@ -439,6 +455,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
439
455
|
# process for each request.
|
440
456
|
os.kill(request_record.pid, signal.SIGTERM)
|
441
457
|
request_record.status = RequestStatus.CANCELLED
|
458
|
+
request_record.finished_at = time.time()
|
442
459
|
cancelled_request_ids.append(request_id)
|
443
460
|
return cancelled_request_ids
|
444
461
|
|
@@ -474,13 +491,16 @@ def create_table(cursor, conn):
|
|
474
491
|
schedule_type TEXT,
|
475
492
|
{COL_USER_ID} TEXT,
|
476
493
|
{COL_STATUS_MSG} TEXT,
|
477
|
-
{COL_SHOULD_RETRY} INTEGER
|
494
|
+
{COL_SHOULD_RETRY} INTEGER,
|
495
|
+
{COL_FINISHED_AT} REAL
|
478
496
|
)""")
|
479
497
|
|
480
498
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
|
481
499
|
'TEXT')
|
482
500
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
|
483
501
|
'INTEGER')
|
502
|
+
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
|
503
|
+
'REAL')
|
484
504
|
|
485
505
|
|
486
506
|
_DB = None
|
@@ -583,6 +603,7 @@ def get_request_tasks(
|
|
583
603
|
user_id: Optional[str] = None,
|
584
604
|
exclude_request_names: Optional[List[str]] = None,
|
585
605
|
include_request_names: Optional[List[str]] = None,
|
606
|
+
finished_before: Optional[float] = None,
|
586
607
|
) -> List[Request]:
|
587
608
|
"""Get a list of requests that match the given filters.
|
588
609
|
|
@@ -595,6 +616,8 @@ def get_request_tasks(
|
|
595
616
|
If None, all users are included.
|
596
617
|
include_request_names: a list of request names to filter on.
|
597
618
|
Mutually exclusive with exclude_request_names.
|
619
|
+
finished_before: if provided, only include requests finished before this
|
620
|
+
timestamp.
|
598
621
|
|
599
622
|
Raises:
|
600
623
|
ValueError: If both exclude_request_names and include_request_names are
|
@@ -606,7 +629,7 @@ def get_request_tasks(
|
|
606
629
|
'provided, not both.')
|
607
630
|
|
608
631
|
filters = []
|
609
|
-
filter_params = []
|
632
|
+
filter_params: List[Any] = []
|
610
633
|
if status is not None:
|
611
634
|
status_list_str = ','.join(repr(status.value) for status in status)
|
612
635
|
filters.append(f'status IN ({status_list_str})')
|
@@ -624,6 +647,9 @@ def get_request_tasks(
|
|
624
647
|
request_names_str = ','.join(
|
625
648
|
repr(name) for name in include_request_names)
|
626
649
|
filters.append(f'name IN ({request_names_str})')
|
650
|
+
if finished_before is not None:
|
651
|
+
filters.append('finished_at < ?')
|
652
|
+
filter_params.append(finished_before)
|
627
653
|
assert _DB is not None
|
628
654
|
with _DB.conn:
|
629
655
|
cursor = _DB.conn.cursor()
|
@@ -665,19 +691,83 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
665
691
|
with update_request(request_id) as request_task:
|
666
692
|
assert request_task is not None, request_id
|
667
693
|
request_task.status = RequestStatus.FAILED
|
694
|
+
request_task.finished_at = time.time()
|
668
695
|
request_task.set_error(e)
|
669
696
|
|
670
697
|
|
671
|
-
def set_request_succeeded(request_id: str, result: Any) -> None:
|
698
|
+
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
672
699
|
"""Set a request to succeeded and populate the result."""
|
673
700
|
with update_request(request_id) as request_task:
|
674
701
|
assert request_task is not None, request_id
|
675
702
|
request_task.status = RequestStatus.SUCCEEDED
|
676
|
-
request_task.set_return_value(result)
|
703
|
+
request_task.finished_at = time.time()
|
704
|
+
if result is not None:
|
705
|
+
request_task.set_return_value(result)
|
677
706
|
|
678
707
|
|
679
708
|
def set_request_cancelled(request_id: str) -> None:
|
680
709
|
"""Set a request to cancelled."""
|
681
710
|
with update_request(request_id) as request_task:
|
682
711
|
assert request_task is not None, request_id
|
712
|
+
request_task.finished_at = time.time()
|
683
713
|
request_task.status = RequestStatus.CANCELLED
|
714
|
+
|
715
|
+
|
716
|
+
@init_db
|
717
|
+
def _delete_requests(requests: List[Request]):
|
718
|
+
"""Clean up requests by their IDs."""
|
719
|
+
id_list_str = ','.join(repr(req.request_id) for req in requests)
|
720
|
+
assert _DB is not None
|
721
|
+
with _DB.conn:
|
722
|
+
cursor = _DB.conn.cursor()
|
723
|
+
cursor.execute(
|
724
|
+
f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
|
725
|
+
|
726
|
+
|
727
|
+
def clean_finished_requests_with_retention(retention_seconds: int):
|
728
|
+
"""Clean up finished requests older than the retention period.
|
729
|
+
|
730
|
+
This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
|
731
|
+
from the database and cleans up their associated log files.
|
732
|
+
|
733
|
+
Args:
|
734
|
+
retention_seconds: Requests older than this many seconds will be
|
735
|
+
deleted.
|
736
|
+
"""
|
737
|
+
reqs = get_request_tasks(status=RequestStatus.finished_status(),
|
738
|
+
finished_before=time.time() - retention_seconds)
|
739
|
+
|
740
|
+
subprocess_utils.run_in_parallel(
|
741
|
+
func=lambda req: req.log_path.unlink(missing_ok=True),
|
742
|
+
args=reqs,
|
743
|
+
num_threads=len(reqs))
|
744
|
+
|
745
|
+
_delete_requests(reqs)
|
746
|
+
|
747
|
+
# To avoid leakage of the log file, logs must be deleted before the
|
748
|
+
# request task in the database.
|
749
|
+
logger.info(f'Cleaned up {len(reqs)} finished requests '
|
750
|
+
f'older than {retention_seconds} seconds')
|
751
|
+
|
752
|
+
|
753
|
+
async def requests_gc_daemon():
|
754
|
+
"""Garbage collect finished requests periodically."""
|
755
|
+
while True:
|
756
|
+
logger.info('Running requests GC daemon...')
|
757
|
+
# Use the latest config.
|
758
|
+
skypilot_config.reload_config()
|
759
|
+
retention_seconds = skypilot_config.get_nested(
|
760
|
+
('api_server', 'requests_retention_hours'),
|
761
|
+
DEFAULT_REQUESTS_RETENTION_HOURS) * 3600
|
762
|
+
try:
|
763
|
+
# Negative value disables the requests GC
|
764
|
+
if retention_seconds >= 0:
|
765
|
+
clean_finished_requests_with_retention(retention_seconds)
|
766
|
+
except asyncio.CancelledError:
|
767
|
+
logger.info('Requests GC daemon cancelled')
|
768
|
+
break
|
769
|
+
except Exception as e: # pylint: disable=broad-except
|
770
|
+
logger.error(f'Error running requests GC daemon: {e}')
|
771
|
+
# Run the daemon at most once every hour to avoid too frequent
|
772
|
+
# cleanup.
|
773
|
+
await asyncio.sleep(max(retention_seconds, 3600))
|
sky/server/server.py
CHANGED
@@ -26,6 +26,7 @@ import fastapi
|
|
26
26
|
from fastapi.middleware import cors
|
27
27
|
from passlib.hash import apr_md5_crypt
|
28
28
|
import starlette.middleware.base
|
29
|
+
import uvloop
|
29
30
|
|
30
31
|
import sky
|
31
32
|
from sky import catalog
|
@@ -128,7 +129,7 @@ async def _override_user_info_in_request_body(request: fastapi.Request,
|
|
128
129
|
if body:
|
129
130
|
try:
|
130
131
|
original_json = await request.json()
|
131
|
-
except json.JSONDecodeError as e:
|
132
|
+
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
132
133
|
logger.error(f'Error parsing request JSON: {e}')
|
133
134
|
else:
|
134
135
|
logger.debug(f'Overriding user for {request.state.request_id}: '
|
@@ -1461,6 +1462,12 @@ async def stream(
|
|
1461
1462
|
raise fastapi.HTTPException(
|
1462
1463
|
status_code=404, detail=f'Request {request_id!r} not found')
|
1463
1464
|
log_path_to_stream = request_task.log_path
|
1465
|
+
if not log_path_to_stream.exists():
|
1466
|
+
# The log file might be deleted by the request GC daemon but the
|
1467
|
+
# request task is still in the database.
|
1468
|
+
raise fastapi.HTTPException(
|
1469
|
+
status_code=404,
|
1470
|
+
detail=f'Log of request {request_id!r} has been deleted')
|
1464
1471
|
else:
|
1465
1472
|
assert log_path is not None, (request_id, log_path)
|
1466
1473
|
if log_path == constants.API_SERVER_LOGS:
|
@@ -1775,13 +1782,18 @@ if __name__ == '__main__':
|
|
1775
1782
|
|
1776
1783
|
queue_server: Optional[multiprocessing.Process] = None
|
1777
1784
|
workers: List[executor.RequestWorker] = []
|
1785
|
+
# Global background tasks that will be scheduled in a separate event loop.
|
1786
|
+
global_tasks: List[asyncio.Task] = []
|
1778
1787
|
try:
|
1788
|
+
background = uvloop.new_event_loop()
|
1779
1789
|
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1790
|
+
metrics_server = metrics.build_metrics_server(
|
1791
|
+
cmd_args.host, cmd_args.metrics_port)
|
1792
|
+
global_tasks.append(background.create_task(metrics_server.serve()))
|
1793
|
+
global_tasks.append(
|
1794
|
+
background.create_task(requests_lib.requests_gc_daemon()))
|
1795
|
+
threading.Thread(target=background.run_forever, daemon=True).start()
|
1796
|
+
|
1785
1797
|
queue_server, workers = executor.start(config)
|
1786
1798
|
|
1787
1799
|
logger.info(f'Starting SkyPilot API server, workers={num_workers}')
|
@@ -1799,6 +1811,8 @@ if __name__ == '__main__':
|
|
1799
1811
|
finally:
|
1800
1812
|
logger.info('Shutting down SkyPilot API server...')
|
1801
1813
|
|
1814
|
+
for gt in global_tasks:
|
1815
|
+
gt.cancel()
|
1802
1816
|
subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
|
1803
1817
|
workers,
|
1804
1818
|
num_threads=len(workers))
|
sky/server/uvicorn.py
CHANGED
@@ -150,7 +150,10 @@ class Server(uvicorn.Server):
|
|
150
150
|
if req is None:
|
151
151
|
return
|
152
152
|
if req.pid is not None:
|
153
|
-
os.kill(req.pid, signal.SIGTERM)
|
153
|
+
try:
|
154
|
+
os.kill(req.pid, signal.SIGTERM)
|
155
|
+
except ProcessLookupError:
|
156
|
+
logger.debug(f'Process {req.pid} already finished.')
|
154
157
|
req.status = requests_lib.RequestStatus.CANCELLED
|
155
158
|
req.should_retry = True
|
156
159
|
logger.info(
|
sky/skylet/constants.py
CHANGED
@@ -375,6 +375,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
375
375
|
('ssh', 'pod_config'),
|
376
376
|
('kubernetes', 'pod_config'),
|
377
377
|
('kubernetes', 'provision_timeout'),
|
378
|
+
('kubernetes', 'dws'),
|
379
|
+
('kubernetes', 'kueue'),
|
378
380
|
('gcp', 'managed_instance_group'),
|
379
381
|
('gcp', 'enable_gvnic'),
|
380
382
|
('gcp', 'enable_gpu_direct'),
|
@@ -460,8 +462,10 @@ TIME_UNITS = {
|
|
460
462
|
'w': 7 * 24 * 60,
|
461
463
|
}
|
462
464
|
|
463
|
-
TIME_PATTERN: str = (
|
464
|
-
|
465
|
+
TIME_PATTERN: str = ('^[0-9]+('
|
466
|
+
f'{"|".join([unit.lower() for unit in TIME_UNITS])}|'
|
467
|
+
f'{"|".join([unit.upper() for unit in TIME_UNITS])}|'
|
468
|
+
')?$')
|
465
469
|
|
466
470
|
MEMORY_SIZE_UNITS = {
|
467
471
|
'kb': 2**10,
|
sky/templates/gcp-ray.yml.j2
CHANGED
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -297,6 +297,9 @@ available_node_types:
|
|
297
297
|
annotations:
|
298
298
|
kueue.x-k8s.io/retriable-in-group: "false"
|
299
299
|
kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
|
300
|
+
{% if k8s_max_run_duration_seconds %}
|
301
|
+
provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
|
302
|
+
{% endif %}
|
300
303
|
{% endif %}
|
301
304
|
spec:
|
302
305
|
# serviceAccountName: skypilot-service-account
|
@@ -309,7 +312,7 @@ available_node_types:
|
|
309
312
|
{% endif %}
|
310
313
|
|
311
314
|
# Add node selector if GPU/TPUs are requested:
|
312
|
-
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
|
315
|
+
{% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) or (k8s_enable_flex_start) %}
|
313
316
|
nodeSelector:
|
314
317
|
{% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
|
315
318
|
{{k8s_topology_label_key}}: {{k8s_topology_label_value}}
|
@@ -317,6 +320,9 @@ available_node_types:
|
|
317
320
|
{% if k8s_spot_label_key is not none %}
|
318
321
|
{{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
|
319
322
|
{% endif %}
|
323
|
+
{% if k8s_enable_flex_start %}
|
324
|
+
cloud.google.com/gke-flex-start: "true"
|
325
|
+
{% endif %}
|
320
326
|
{% endif %}
|
321
327
|
{% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) or (avoid_label_keys is not none) %}
|
322
328
|
affinity:
|
@@ -392,7 +398,7 @@ available_node_types:
|
|
392
398
|
{% endfor %}
|
393
399
|
containers:
|
394
400
|
- name: ray-node
|
395
|
-
imagePullPolicy:
|
401
|
+
imagePullPolicy: Always
|
396
402
|
image: {{image_id}}
|
397
403
|
env:
|
398
404
|
- name: SKYPILOT_POD_NODE_TYPE
|
sky/utils/resources_utils.py
CHANGED
@@ -405,3 +405,33 @@ def parse_memory_resource(resource_qty_str: Union[str, int, float],
|
|
405
405
|
continue
|
406
406
|
|
407
407
|
raise ValueError(error_msg)
|
408
|
+
|
409
|
+
|
410
|
+
def parse_time_minutes(time: str) -> int:
|
411
|
+
"""Convert a time string to minutes.
|
412
|
+
|
413
|
+
Args:
|
414
|
+
time: Time string with optional unit suffix (e.g., '30m', '2h', '1d')
|
415
|
+
|
416
|
+
Returns:
|
417
|
+
Time in minutes as an integer
|
418
|
+
"""
|
419
|
+
time_str = str(time)
|
420
|
+
|
421
|
+
if time_str.isdecimal():
|
422
|
+
# We assume it is already in minutes to maintain backwards
|
423
|
+
# compatibility
|
424
|
+
return int(time_str)
|
425
|
+
|
426
|
+
time_str = time_str.lower()
|
427
|
+
for unit, multiplier in constants.TIME_UNITS.items():
|
428
|
+
if time_str.endswith(unit):
|
429
|
+
try:
|
430
|
+
value = float(time_str[:-len(unit)])
|
431
|
+
final_value = math.ceil(value * multiplier)
|
432
|
+
if final_value >= 0:
|
433
|
+
return final_value
|
434
|
+
except ValueError:
|
435
|
+
continue
|
436
|
+
|
437
|
+
raise ValueError(f'Invalid time format: {time}')
|
sky/utils/schemas.py
CHANGED
@@ -1084,6 +1084,25 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
|
|
1084
1084
|
},
|
1085
1085
|
},
|
1086
1086
|
},
|
1087
|
+
'dws': {
|
1088
|
+
'type': 'object',
|
1089
|
+
'required': [],
|
1090
|
+
'additionalProperties': False,
|
1091
|
+
'properties': {
|
1092
|
+
'enabled': {
|
1093
|
+
'type': 'boolean',
|
1094
|
+
},
|
1095
|
+
# Only used when Kueue is enabled.
|
1096
|
+
'max_run_duration': {
|
1097
|
+
'anyOf': [{
|
1098
|
+
'type': 'string',
|
1099
|
+
'pattern': constants.TIME_PATTERN,
|
1100
|
+
}, {
|
1101
|
+
'type': 'integer',
|
1102
|
+
}]
|
1103
|
+
},
|
1104
|
+
},
|
1105
|
+
},
|
1087
1106
|
}
|
1088
1107
|
|
1089
1108
|
|
@@ -1430,6 +1449,9 @@ def get_config_schema():
|
|
1430
1449
|
}
|
1431
1450
|
]
|
1432
1451
|
},
|
1452
|
+
'requests_retention_hours': {
|
1453
|
+
'type': 'integer',
|
1454
|
+
},
|
1433
1455
|
}
|
1434
1456
|
}
|
1435
1457
|
|