skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +6 -2
- sky/backends/cloud_vm_ray_backend.py +13 -4
- sky/client/cli/command.py +22 -8
- sky/client/sdk.py +50 -0
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +13 -10
- sky/global_user_state.py +128 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -8
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +44 -2
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +99 -98
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -6,6 +6,7 @@ Concepts:
 - Cluster handle: (non-user facing) an opaque backend handle for us to
   interact with a cluster.
 """
+import asyncio
 import enum
 import functools
 import json
@@ -51,6 +52,9 @@ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()

+DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
+
 Base = declarative.declarative_base()

 config_table = sqlalchemy.Table(
@@ -102,6 +106,9 @@ cluster_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )

 storage_table = sqlalchemy.Table(
@@ -161,6 +168,9 @@ cluster_history_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )


@@ -430,6 +440,17 @@ def get_user_by_name(username: str) -> List[models.User]:
     ]


+@_init_db
+def get_user_by_name_match(username_match: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.name.like(f'%{username_match}%')).all()
+    return [
+        models.User(id=row.id, name=row.name, created_at=row.created_at)
+        for row in rows
+    ]
+
+
 @_init_db
 def delete_user(user_id: str) -> None:
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -458,7 +479,8 @@ def add_or_update_cluster(cluster_name: str,
                           is_launch: bool = True,
                           config_hash: Optional[str] = None,
                           task_config: Optional[Dict[str, Any]] = None,
-                          is_managed: bool = False
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -473,6 +495,7 @@ def add_or_update_cluster(cluster_name: str,
         task_config: The config of the task being launched.
         is_managed: Whether the cluster is launched by the
             controller.
+        provision_log_path: Absolute path to provision.log, if available.
     """
     assert _SQLALCHEMY_ENGINE is not None
     # FIXME: launched_at will be changed when `sky launch -c` is called.
@@ -555,6 +578,10 @@ def add_or_update_cluster(cluster_name: str,
                                 if task_config else None,
             'last_creation_command': last_use,
         })
+    if provision_log_path is not None:
+        conditional_values.update({
+            'provision_log_path': provision_log_path,
+        })

     if (_SQLALCHEMY_ENGINE.dialect.name ==
             db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -618,6 +645,7 @@ def add_or_update_cluster(cluster_name: str,
             usage_intervals=pickle.dumps(usage_intervals),
             user_hash=user_hash,
             workspace=history_workspace,
+            provision_log_path=provision_log_path,
             **creation_info,
         )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -633,6 +661,7 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(usage_intervals),
                 cluster_history_table.c.user_hash: history_hash,
                 cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
                 **creation_info,
             })
         session.execute(do_update_stmt)
@@ -731,6 +760,41 @@ def get_last_cluster_event(cluster_hash: str,
     return row.reason


+def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at < time.time() -
+            retention_hours * 3600)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                cleanup_cluster_events_with_retention(retention_hours)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run daemon at most once every hour to avoid too frequent cleanup.
+        sleep_amount = max(retention_hours * 3600,
+                           MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
 def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
                        event_type: ClusterEventType) -> List[str]:
     """Returns the cluster events for the cluster.
@@ -798,6 +862,7 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
     assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)

     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         # usage_intervals is not None and not empty
@@ -808,6 +873,16 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
             usage_intervals.append((start_time, end_time))
             _set_cluster_usage_intervals(cluster_hash, usage_intervals)

+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
         if terminate:
             session.query(cluster_table).filter_by(name=cluster_name).delete()
             session.query(cluster_event_table).filter_by(
@@ -915,6 +990,58 @@ def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
     return json.loads(row.metadata)


+@_init_db
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
+            return None
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
+
+
 @_init_db
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
     assert _SQLALCHEMY_ENGINE is not None
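The retention logic added above reduces to two pieces of arithmetic: cluster events whose transitioned_at is older than retention_hours * 3600 seconds are deleted, and the daemon sleeps at least MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS between passes. Below is a minimal standalone sketch of just that timing math, assuming the constants mirror the ones added in the diff; the database and config plumbing are omitted.

import time

# Constants mirroring the values added in the diff above.
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600


def retention_cutoff(retention_hours: float, now: float) -> float:
    """Events with transitioned_at earlier than this timestamp are deleted."""
    return now - retention_hours * 3600


def daemon_sleep_seconds(retention_hours: float) -> float:
    """The daemon never wakes up more often than once per hour."""
    return max(retention_hours * 3600,
               MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)


if __name__ == '__main__':
    now = time.time()
    hours = DEFAULT_CLUSTER_EVENT_RETENTION_HOURS
    print(f'cutoff: {retention_cutoff(hours, now):.0f} '
          f'(events older than {hours}h are dropped)')
    print(f'sleep between passes: {daemon_sleep_seconds(hours):.0f}s')
    # Even with a short retention (0.5h), the sleep is clamped to 3600s.
    print(f'sleep at 0.5h retention: {daemon_sleep_seconds(0.5):.0f}s')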
sky/jobs/constants.py
CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 9

 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/scheduler.py
CHANGED
@@ -93,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')


-def maybe_schedule_next_jobs(
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.

     Here, "schedule" means to select job that is waiting, and allow it to
@@ -139,7 +139,7 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
         with filelock.FileLock(controller_utils.get_resources_lock_path(),
                                blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job(
+                maybe_next_job = state.get_waiting_job()
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
@@ -158,22 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
                 # an ALIVE_WAITING job, but we would be able to launch a WAITING
                 # job.
                 if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not
-                            actual_pool is not None):
+                    if not controller_utils.can_provision():
                         # Can't schedule anything, break from scheduling loop.
                         break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
                     if not _can_start_new_job(actual_pool):
-                        # If there is no job can be scheduled in the pool, we
-                        # try to schedule another job regardless of the pool.
-                        # This is to avoid the case where the pool is scaled
-                        # down at the same time as a job is done. In this case,
-                        # we won't have any job to schedule in the pool, but
-                        # other jobs in other pool (or no pool) can still be
-                        # scheduled.
-                        if pool is not None:
-                            pool = None
-                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break

@@ -218,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 @contextlib.contextmanager
@@ -243,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return

     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -254,7 +250,6 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)

     try:
         yield
@@ -268,7 +263,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(controller_utils.get_resources_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -283,19 +278,17 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)

     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()


 def _can_start_new_job(pool: Optional[str]) -> bool:
sky/jobs/server/core.py
CHANGED
@@ -497,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]

-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -513,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e

-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -568,10 +576,18 @@ def _maybe_restart_controller(


 @usage_lib.entrypoint
-def queue(
-
-
-
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.

@@ -601,6 +617,17 @@ def queue(refresh: bool,
         does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -609,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

-
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -622,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')

-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total

+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:

         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -636,7 +684,6 @@ def queue(refresh: bool,

         jobs = list(filter(user_hash_matches_or_missing, jobs))

-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -655,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]

-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)


 @usage_lib.entrypoint
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version

     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0

sky/jobs/state.py
CHANGED
@@ -1528,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,


 @_init_db
-def get_waiting_job(
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.

     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1559,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,