skypilot-nightly 1.0.0.dev20250826__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +4 -10
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +41 -56
- sky/backends/cloud_vm_ray_backend.py +13 -24
- sky/backends/local_docker_backend.py +3 -8
- sky/client/cli/command.py +43 -10
- sky/client/common.py +41 -14
- sky/client/sdk.py +24 -9
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +1 -1
- sky/clouds/cloud.py +15 -0
- sky/clouds/kubernetes.py +27 -0
- sky/clouds/ssh.py +2 -3
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +127 -23
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/config.py +2 -8
- sky/provision/kubernetes/instance.py +58 -8
- sky/provision/kubernetes/network_utils.py +3 -4
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/utils.py +51 -9
- sky/provision/vsphere/vsphere_utils.py +2 -8
- sky/schemas/api/responses.py +7 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +4 -4
- sky/serve/server/impl.py +3 -2
- sky/serve/service_spec.py +2 -8
- sky/server/auth/authn.py +4 -0
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +10 -3
- sky/server/daemons.py +10 -5
- sky/server/requests/executor.py +6 -1
- sky/server/requests/requests.py +21 -0
- sky/server/server.py +34 -33
- sky/server/uvicorn.py +33 -0
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +4 -1
- sky/skylet/events.py +4 -5
- sky/skypilot_config.py +14 -12
- sky/ssh_node_pools/core.py +3 -1
- sky/task.py +4 -10
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/users/server.py +6 -6
- sky/utils/common_utils.py +0 -71
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/schemas.py +3 -0
- sky/utils/yaml_utils.py +102 -0
- sky/volumes/volume.py +8 -3
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +83 -82
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0

sky/dashboard/out/workspace/new.html
CHANGED

@@ -1 +1 @@
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces/[name].html
CHANGED

@@ -1 +1 @@
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html
CHANGED

@@ -1 +1 @@
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py
CHANGED

@@ -25,7 +25,6 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
-import yaml
 
 from sky import models
 from sky import sky_logging

@@ -35,6 +34,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import registry
 from sky.utils import status_lib
+from sky.utils import yaml_utils
 from sky.utils.db import db_utils
 from sky.utils.db import migration_utils
 

@@ -53,6 +53,7 @@ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
 MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
 
 _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [

@@ -433,6 +434,20 @@ def get_user(user_id: str) -> Optional[models.User]:
                        created_at=row.created_at)
 
 
+@_init_db
+def _get_users(user_ids: Set[str]) -> Dict[str, models.User]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.id.in_(user_ids)).all()
+    return {
+        row.id: models.User(id=row.id,
+                            name=row.name,
+                            password=row.password,
+                            created_at=row.created_at) for row in rows
+    }
+
+
 @_init_db
 def get_user_by_name(username: str) -> List[models.User]:
     with orm.Session(_SQLALCHEMY_ENGINE) as session:

@@ -581,7 +596,7 @@ def add_or_update_cluster(cluster_name: str,
         if (is_launch and not cluster_row or
                 cluster_row.status != status_lib.ClusterStatus.UP.value):
             conditional_values.update({
-                'last_creation_yaml':
+                'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
                                       if task_config else None,
                 'last_creation_command': last_use,
             })

@@ -767,12 +782,41 @@ def get_last_cluster_event(cluster_hash: str,
         return row.reason
 
 
-def
+def _get_last_cluster_event_multiple(
+        cluster_hashes: Set[str],
+        event_type: ClusterEventType) -> Dict[str, str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Use a subquery to get the latest event for each cluster_hash
+        latest_events = session.query(
+            cluster_event_table.c.cluster_hash,
+            sqlalchemy.func.max(cluster_event_table.c.transitioned_at).label(
+                'max_time')).filter(
+                    cluster_event_table.c.cluster_hash.in_(cluster_hashes),
+                    cluster_event_table.c.type == event_type.value).group_by(
+                        cluster_event_table.c.cluster_hash).subquery()
+
+        # Join with original table to get the full event details
+        rows = session.query(cluster_event_table).join(
+            latest_events,
+            sqlalchemy.and_(
+                cluster_event_table.c.cluster_hash ==
+                latest_events.c.cluster_hash,
+                cluster_event_table.c.transitioned_at ==
+                latest_events.c.max_time)).all()
+
+        return {row.cluster_hash: row.reason for row in rows}
+
+
+def cleanup_cluster_events_with_retention(retention_hours: float,
+                                          event_type: ClusterEventType) -> None:
     assert _SQLALCHEMY_ENGINE is not None
+    # Once for events with type STATUS_CHANGE.
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         query = session.query(cluster_event_table).filter(
-            cluster_event_table.c.transitioned_at <
-            retention_hours * 3600
+            cluster_event_table.c.transitioned_at <
+            time.time() - retention_hours * 3600,
+            cluster_event_table.c.type == event_type.value)
         logger.debug(f'Deleting {query.count()} cluster events.')
         query.delete()
         session.commit()

@@ -787,9 +831,20 @@ async def cluster_event_retention_daemon():
         retention_hours = skypilot_config.get_nested(
             ('api_server', 'cluster_event_retention_hours'),
             DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        debug_retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_debug_event_retention_hours'),
+            DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
         try:
             if retention_hours >= 0:
-
+                logger.debug('Cleaning up cluster events with retention '
+                             f'{retention_hours} hours.')
+                cleanup_cluster_events_with_retention(
+                    retention_hours, ClusterEventType.STATUS_CHANGE)
+            if debug_retention_hours >= 0:
+                logger.debug('Cleaning up debug cluster events with retention '
+                             f'{debug_retention_hours} hours.')
+                cleanup_cluster_events_with_retention(debug_retention_hours,
+                                                      ClusterEventType.DEBUG)
         except asyncio.CancelledError:
             logger.info('Cluster event retention daemon cancelled')
             break

@@ -797,8 +852,9 @@ async def cluster_event_retention_daemon():
            logger.error(f'Error running cluster event retention daemon: {e}')
 
        # Run daemon at most once every hour to avoid too frequent cleanup.
-        sleep_amount = max(
-
+        sleep_amount = max(
+            min(retention_hours * 3600, debug_retention_hours * 3600),
+            MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
         await asyncio.sleep(sleep_amount)
 
 

@@ -864,8 +920,7 @@ def update_last_use(cluster_name: str):
 
 
 @_init_db
-def remove_cluster(cluster_name: str, terminate: bool
-                   remove_events: bool) -> None:
+def remove_cluster(cluster_name: str, terminate: bool) -> None:
     """Removes cluster_name mapping."""
     assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)

@@ -893,9 +948,6 @@ def remove_cluster(cluster_name: str, terminate: bool,
 
         if terminate:
             session.query(cluster_table).filter_by(name=cluster_name).delete()
-            if remove_events:
-                session.query(cluster_event_table).filter_by(
-                    cluster_hash=cluster_hash).delete()
         else:
             handle = get_handle_from_cluster_name(cluster_name)
             if handle is None:

@@ -1266,18 +1318,70 @@ def get_cluster_from_name(
 
 
 @_init_db
-def get_clusters(
-
-
-
-
+def get_clusters(
+    *,  # keyword only separator
+    exclude_managed_clusters: bool = False,
+    workspaces_filter: Optional[Set[str]] = None,
+    user_hashes_filter: Optional[Set[str]] = None,
+) -> List[Dict[str, Any]]:
+    """Get clusters from the database.
+
+    Args:
+        exclude_managed_clusters: If True, exclude clusters that have
+            is_managed field set to True.
+        workspaces_filter: If specified, only include clusters
+            that has workspace field set to one of the values.
+        user_hashes_filter: If specified, only include clusters
+            that has user_hash field set to one of the values.
+    """
+    # is a cluster has a null user_hash,
+    # we treat it as belonging to the current user.
+    current_user_hash = common_utils.get_user_hash()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table)
+        if exclude_managed_clusters:
+            query = query.filter(cluster_table.c.is_managed == int(False))
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        if user_hashes_filter is not None:
+            if current_user_hash in user_hashes_filter:
+                # backwards compatibility for old clusters.
+                # If current_user_hash is in user_hashes_filter, we include
+                # clusters that have a null user_hash.
+                query = query.filter(
+                    cluster_table.c.user_hash.in_(user_hashes_filter) |
+                    (cluster_table.c.user_hash is None))
+            else:
+                query = query.filter(
+                    cluster_table.c.user_hash.in_(user_hashes_filter))
+        query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
+        rows = query.all()
     records = []
+
+    # get user hash for each row
+    row_to_user_hash = {}
     for row in rows:
-        user_hash =
-
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    # get all users needed for the rows at once
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = _get_users(user_hashes)
+
+    # get last cluster event for each row
+    cluster_hashes = set(row_to_user_hash.keys())
+    last_cluster_event_dict = _get_last_cluster_event_multiple(
+        cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+    # get user for each row
+    for row in rows:
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
         user_name = user.name if user is not None else None
-        last_event =
-            row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
+        last_event = last_cluster_event_dict.get(row.cluster_hash, None)
         # TODO: use namedtuple instead of dict
         record = {
             'name': row.name,

@@ -1999,7 +2103,7 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
     yaml_str = get_cluster_yaml_str(cluster_yaml_path)
     if yaml_str is None:
         raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
-    return
+    return yaml_utils.safe_load(yaml_str)
 
 
 @_init_db
sky/jobs/client/sdk.py
CHANGED

@@ -243,7 +243,7 @@ def tail_logs(name: Optional[str] = None,
               controller: bool = False,
               refresh: bool = False,
               tail: Optional[int] = None,
-              output_stream: Optional['io.TextIOBase'] = None) -> int:
+              output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
     """Tails logs of managed jobs.
 
     You can provide either a job name or a job ID to tail logs. If both are not

@@ -263,6 +263,8 @@ def tail_logs(name: Optional[str] = None,
         Exit code based on success or failure of the job. 0 if success,
         100 if the job failed. See exceptions.JobExitCode for possible exit
         codes.
+        Will return None if follow is False
+        (see note in sky/client/sdk.py::stream_response)
 
     Request Raises:
         ValueError: invalid arguments.

@@ -289,7 +291,8 @@ def tail_logs(name: Optional[str] = None,
     return sdk.stream_response(request_id=request_id,
                                response=response,
                                output_stream=output_stream,
-                               resumable=(tail == 0)
+                               resumable=(tail == 0),
+                               get_result=follow)
 
 
 @usage_lib.entrypoint
sky/jobs/recovery_strategy.py
CHANGED

@@ -327,10 +327,15 @@ class StrategyExecutor:
                 cluster_name=self.cluster_name,
                 # We expect to tear down the cluster as soon as
                 # the job is finished. However, in case the
-                # controller dies,
-                #
-
-
+                # controller dies, we may end up with a
+                # resource leak.
+                # Ideally, we should autodown to be safe,
+                # but it's fine to disable it for now, as
+                # Nebius doesn't support autodown yet.
+                # TODO(kevin): set down=True once Nebius
+                # supports autodown.
+                # idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                # down=True,
                 _is_launched_by_jobs_controller=True)
         else:
             self.cluster_name = (
sky/logs/agent.py
CHANGED

@@ -5,8 +5,8 @@ import shlex
 from typing import Any, Dict
 
 from sky.skylet import constants
-from sky.utils import common_utils
 from sky.utils import resources_utils
+from sky.utils import yaml_utils
 
 
 class LoggingAgent(abc.ABC):

@@ -65,7 +65,7 @@ class FluentbitAgent(LoggingAgent):
                 'outputs': [self.fluentbit_output_config(cluster_name)],
             }
         }
-        return
+        return yaml_utils.dump_yaml_str(cfg_dict)
 
     @abc.abstractmethod
     def fluentbit_output_config(
sky/logs/aws.py
CHANGED

@@ -6,8 +6,8 @@ import pydantic
 
 from sky.logs.agent import FluentbitAgent
 from sky.skylet import constants
-from sky.utils import common_utils
 from sky.utils import resources_utils
+from sky.utils import yaml_utils
 
 EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
 

@@ -130,7 +130,10 @@ class CloudwatchLoggingAgent(FluentbitAgent):
 
         # If region is specified, set it in the environment
         if self.config.region:
-            pre_cmd += f' export AWS_REGION={self.config.region}
+            pre_cmd += (f' export AWS_REGION={self.config.region}'
+                        f' AWS_DEFAULT_REGION={self.config.region};'
+                        ' command -v aws &>/dev/null && '
+                        f'aws configure set region {self.config.region};')
         else:
             # If region is not specified, check if it's available in
             # the environment or credentials file

@@ -213,7 +216,7 @@ class CloudwatchLoggingAgent(FluentbitAgent):
             }
         }
 
-        return
+        return yaml_utils.dump_yaml_str(cfg_dict)
 
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
sky/provision/do/utils.py
CHANGED

@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
 from sky.provision.do import constants
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import yaml_utils
 
 logger = sky_logging.init_logger(__name__)
 

@@ -61,7 +62,7 @@ def _init_client():
     if get_credentials_path() is None:
         raise DigitalOceanError(
             'No credentials found, please run `doctl auth init`')
-    credentials =
+    credentials = yaml_utils.read_yaml(get_credentials_path())
     default_token = credentials.get('access-token', None)
     if default_token is not None:
         try:
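
These call sites, like the earlier ones in global_user_state.py and sky/logs/, now go through the new sky/utils/yaml_utils.py module, which the summary lists as +102 lines but which is not shown in this diff. Inferred only from the functions used here (safe_load, read_yaml, dump_yaml_str), a minimal stand-in could look roughly like the sketch below; the shipped module is larger and may differ, for example by lazy-importing PyYAML or using a custom dumper:

"""Illustrative stand-in for sky/utils/yaml_utils.py (not the shipped code)."""
from typing import Any

import yaml


def safe_load(stream) -> Any:
    # Parse a YAML document from a string or an open file object.
    return yaml.safe_load(stream)


def read_yaml(path: str) -> Any:
    # Load a YAML file from disk.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def dump_yaml_str(config: Any) -> str:
    # Serialize a config object to a YAML string.
    return yaml.safe_dump(config, default_flow_style=False)
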
sky/provision/kubernetes/config.py
CHANGED

@@ -3,20 +3,14 @@ import copy
 import logging
 import math
 import os
-import typing
 from typing import Any, Dict, Optional, Union
 
-from sky.adaptors import common as adaptors_common
 from sky.adaptors import kubernetes
 from sky.provision import common
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import kubernetes_enums
-
-if typing.TYPE_CHECKING:
-    import yaml
-else:
-    yaml = adaptors_common.LazyImport('yaml')
+from sky.utils import yaml_utils
 
 logger = logging.getLogger(__name__)
 

@@ -592,7 +586,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
     daemonset_path = os.path.join(
         root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
     with open(daemonset_path, 'r', encoding='utf-8') as file:
-        daemonset =
+        daemonset = yaml_utils.safe_load(file)
     kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
     try:
         kubernetes.apps_api(context).create_namespaced_daemon_set(
sky/provision/kubernetes/instance.py
CHANGED

@@ -1,5 +1,6 @@
 """Kubernetes instance provisioning."""
 import copy
+import datetime
 import json
 import re
 import time

@@ -1254,9 +1255,11 @@ def get_cluster_info(
         provider_config=provider_config)
 
 
-def _get_pod_termination_reason(pod: Any) -> str:
+def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
+    """Get pod termination reason and write to cluster events."""
     reasons = []
-
+    latest_timestamp = pod.status.start_time or datetime.datetime.min
+    if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
             if terminated:

@@ -1264,20 +1267,38 @@ def _get_pod_termination_reason(pod: Any) -> str:
                 reason = terminated.reason
                 if exit_code == 0:
                     # skip exit 0 (non-failed) just for sanity
+                    logger.debug(f'{pod.metadata.name}/{container_status.name} '
+                                 'had exit code 0. Skipping.')
                     continue
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
                 reasons.append(reason)
+                if terminated.finished_at > latest_timestamp:
+                    latest_timestamp = terminated.finished_at
+
     # TODO (kyuds): later, if needed, query `last_state` too.
 
+    if not reasons:
+        return ''
+
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-
+    pod_reason = ' | '.join(reasons)
+
+    global_user_state.add_cluster_event(
+        cluster_name,
+        None,
+        f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
+        global_user_state.ClusterEventType.DEBUG,
+        transitioned_at=int(latest_timestamp.timestamp()),
+    )
+    return pod_reason
 
 
 def _get_pod_missing_reason(context: Optional[str], namespace: str,
                             cluster_name: str, pod_name: str) -> Optional[str]:
+    """Get events for missing pod and write to cluster events."""
     logger.debug(f'Analyzing events for pod {pod_name}')
     pod_field_selector = (
         f'involvedObject.kind=Pod,involvedObject.name={pod_name}')

@@ -1293,6 +1314,8 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
     last_scheduled_node = None
     insert_new_pod_event = True
     new_event_inserted = False
+    inserted_pod_events = 0
+
     for event in pod_events:
         if event.reason == 'Scheduled':
             pattern = r'Successfully assigned (\S+) to (\S+)'

@@ -1313,10 +1336,18 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                     transitioned_at=int(
                         event.metadata.creation_timestamp.timestamp()),
                     expose_duplicate_error=True)
+                logger.debug(f'[pod {pod_name}] encountered new pod event: '
+                             f'{event.metadata.creation_timestamp} '
+                             f'{event.reason} {event.message}')
             except db_utils.UniqueConstraintViolationError:
                 insert_new_pod_event = False
             else:
                 new_event_inserted = True
+                inserted_pod_events += 1
+
+    logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
+                 f'inserted {inserted_pod_events} new pod events '
+                 'previously unseen')
 
     if last_scheduled_node is not None:
         node_field_selector = ('involvedObject.kind=Node,'

@@ -1331,6 +1362,7 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
             # latest event appears first
             reverse=True)
         insert_new_node_event = True
+        inserted_node_events = 0
         for event in node_events:
             if insert_new_node_event:
                 # Try inserting the latest events first. If the event is a

@@ -1345,10 +1377,23 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                         transitioned_at=int(
                             event.metadata.creation_timestamp.timestamp()),
                         expose_duplicate_error=True)
+                    logger.debug(
+                        f'[pod {pod_name}] encountered new node event: '
+                        f'{event.metadata.creation_timestamp} '
+                        f'{event.reason} {event.message}')
                 except db_utils.UniqueConstraintViolationError:
                     insert_new_node_event = False
                 else:
                     new_event_inserted = True
+                    inserted_node_events += 1
+
+        logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
+                     f'processed {len(node_events)} node events and '
+                     f'inserted {inserted_node_events} new node events '
+                     'previously unseen')
+    else:
+        logger.debug(f'[pod {pod_name}] could not determine the node '
+                     'the pod was scheduled to')
 
     if not new_event_inserted:
         # If new event is not inserted, there is no useful information to

@@ -1390,13 +1435,15 @@ def query_instances(
         provider_config: Optional[Dict[str, Any]] = None,
         non_terminated_only: bool = True
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
         'Failed': status_lib.ClusterStatus.INIT,
         'Unknown': None,
         'Succeeded': None,
-        'Terminating': None,
     }
 
     assert provider_config is not None

@@ -1440,12 +1487,15 @@ def query_instances(
     for pod in pods:
         phase = pod.status.phase
         pod_status = status_map[phase]
+        reason = None
+        if phase in ('Failed', 'Unknown'):
+            reason = _get_pod_termination_reason(pod, cluster_name)
+            logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
+            logger.debug(f'Pod {pod.metadata.name} is terminated, but '
+                         'query_instances is called with '
+                         f'non_terminated_only=True. Phase: {phase}')
            continue
-        reason = None
-        if phase == 'Failed':
-            reason = _get_pod_termination_reason(pod)
-            logger.debug(f'Pod Status Reason(s): {reason}')
         pod_name = pod.metadata.name
         reason = f'{pod_name}: {reason}' if reason is not None else None
         cluster_status[pod_name] = (pod_status, reason)
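
The _get_pod_termination_reason() and query_instances() hunks above read container termination details from the Kubernetes Python client (state.terminated.exit_code, .reason, .finished_at) and record them as DEBUG cluster events. A standalone sketch of reading those same fields with the official client, using a placeholder pod name and namespace, looks like this:

from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()
# Placeholder identifiers; substitute a real pod and namespace.
pod = v1.read_namespaced_pod(name='my-pod', namespace='default')

for cs in (pod.status.container_statuses or []):
    terminated = cs.state.terminated
    if terminated is None:
        continue
    print(f'{pod.metadata.name}/{cs.name}: exit={terminated.exit_code} '
          f'reason={terminated.reason} finished_at={terminated.finished_at}')
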
sky/provision/kubernetes/network_utils.py
CHANGED

@@ -13,13 +13,12 @@ from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import directory_utils
 from sky.utils import kubernetes_enums
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import jinja2
-    import yaml
 else:
     jinja2 = adaptors_common.LazyImport('jinja2')
-    yaml = adaptors_common.LazyImport('yaml')
 
 logger = sky_logging.init_logger(__name__)
 

@@ -108,7 +107,7 @@ def fill_loadbalancer_template(namespace: str, context: Optional[str],
         annotations=annotations,
         labels=labels,
     )
-    content =
+    content = yaml_utils.safe_load(cont)
     return content
 
 

@@ -147,7 +146,7 @@ def fill_ingress_template(namespace: str, context: Optional[str],
         annotations=annotations,
         labels=labels,
     )
-    content =
+    content = yaml_utils.safe_load(cont)
 
     # Return a dictionary containing both specs
     return {