skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +207 -79
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +112 -53
- sky/client/common.py +4 -2
- sky/client/sdk.py +17 -7
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +9 -54
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +271 -67
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +11 -7
- sky/jobs/server/core.py +5 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +32 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +5 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +24 -18
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/core.py +1 -0
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +35 -28
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -24,6 +24,7 @@ from sqlalchemy import exc as sqlalchemy_exc
 from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
+from sqlalchemy.ext import asyncio as sql_async
 from sqlalchemy.ext import declarative

 from sky import models
@@ -51,6 +52,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
@@ -183,6 +185,14 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('provision_log_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('last_activity_time',
+                      sqlalchemy.Integer,
+                      server_default=None,
+                      index=True),
+    sqlalchemy.Column('launched_at',
+                      sqlalchemy.Integer,
+                      server_default=None,
+                      index=True),
 )

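The two new indexed integer columns line up with the migration file sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py added in this release (see the file list above). For orientation only, an Alembic-style migration adding such columns could look like the sketch below; the revision identifiers, table name, and index names here are assumptions, not taken from the actual migration.

# Hypothetical sketch of migration 009; names and revision ids are assumed.
import sqlalchemy as sa
from alembic import op

revision = '009'
down_revision = '008'


def upgrade():
    # Add the two columns the new code reads and writes, nullable so that
    # existing rows stay valid.
    op.add_column(
        'cluster_history',
        sa.Column('last_activity_time', sa.Integer(), nullable=True))
    op.add_column('cluster_history',
                  sa.Column('launched_at', sa.Integer(), nullable=True))
    # Index both columns, matching index=True in the table definition.
    op.create_index('ix_cluster_history_last_activity_time',
                    'cluster_history', ['last_activity_time'])
    op.create_index('ix_cluster_history_launched_at', 'cluster_history',
                    ['launched_at'])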
@@ -296,6 +306,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
                                 migration_utils.GLOBAL_USER_STATE_VERSION)


+def initialize_and_get_db_async() -> sql_async.AsyncEngine:
+    global _SQLALCHEMY_ENGINE_ASYNC
+    if _SQLALCHEMY_ENGINE_ASYNC is not None:
+        return _SQLALCHEMY_ENGINE_ASYNC
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE_ASYNC is not None:
+            return _SQLALCHEMY_ENGINE_ASYNC
+
+        _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
+                                                       async_engine=True)
+        initialize_and_get_db()
+        return _SQLALCHEMY_ENGINE_ASYNC
+
+
 # We wrap the sqlalchemy engine initialization in a thread
 # lock to ensure that multiple threads do not initialize the
 # engine which could result in a rare race condition where
@@ -321,6 +345,22 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
         return _SQLALCHEMY_ENGINE


+def _init_db_async(func):
+    """Initialize the async database."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if _SQLALCHEMY_ENGINE_ASYNC is None:
+            # this may happen multiple times since there is no locking
+            # here but thats fine, this is just a short circuit for the
+            # common case.
+            await context_utils.to_thread(initialize_and_get_db_async)
+
+        return await func(*args, **kwargs)
+
+    return wrapper
+
+
 def _init_db(func):
     """Initialize the database."""

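The decorator added above mirrors the existing _init_db: it lazily builds the async engine off the event loop (via context_utils.to_thread) before the wrapped coroutine runs, while the real double-checked locking lives in initialize_and_get_db_async. A minimal self-contained sketch of the same pattern, with placeholder names standing in for the SkyPilot helpers:

import asyncio
import functools
import threading

_ENGINE = None
_LOCK = threading.Lock()


def _get_engine():
    """Create the engine exactly once (double-checked locking)."""
    global _ENGINE
    if _ENGINE is None:
        with _LOCK:
            if _ENGINE is None:
                _ENGINE = object()  # stand-in for db_utils.get_engine(...)
    return _ENGINE


def init_engine(func):
    """Ensure the engine exists before the wrapped coroutine runs."""

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        if _ENGINE is None:
            # Unlocked fast-path check; the lock lives in _get_engine, so a
            # benign race here only costs an extra thread hop.
            await asyncio.to_thread(_get_engine)
        return await func(*args, **kwargs)

    return wrapper


@init_engine
async def query_something() -> bool:
    return _ENGINE is not None


assert asyncio.run(query_something())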
@@ -688,6 +728,10 @@ def add_or_update_cluster(cluster_name: str,
                 conditional_values.get('last_creation_command'),
         }

+        # Calculate last_activity_time and launched_at from usage_intervals
+        last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+
         insert_stmnt = insert_func(cluster_history_table).values(
             cluster_hash=cluster_hash,
             name=cluster_name,
@@ -698,6 +742,8 @@ def add_or_update_cluster(cluster_name: str,
             user_hash=user_hash,
             workspace=history_workspace,
             provision_log_path=provision_log_path,
+            last_activity_time=last_activity_time,
+            launched_at=launched_at,
             **creation_info,
         )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -714,6 +760,8 @@ def add_or_update_cluster(cluster_name: str,
                 cluster_history_table.c.user_hash: history_hash,
                 cluster_history_table.c.workspace: history_workspace,
                 cluster_history_table.c.provision_log_path: provision_log_path,
+                cluster_history_table.c.last_activity_time: last_activity_time,
+                cluster_history_table.c.launched_at: launched_at,
                 **creation_info,
             })
         session.execute(do_update_stmt)
@@ -1010,29 +1058,68 @@ def get_handle_from_cluster_name(
     assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
+        row = (session.query(
+            cluster_table.c.handle).filter_by(name=cluster_name).first())
     if row is None:
         return None
     return pickle.loads(row.handle)


+@_init_db_async
+@metrics_lib.time_me
+async def get_status_from_cluster_name_async(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    """Get the status of a cluster."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.select(cluster_table.c.status).where(
+                cluster_table.c.name == cluster_name))
+        row = result.first()
+
+    if row is None:
+        return None
+    return status_lib.ClusterStatus(row[0])
+
+
 @_init_db
 @metrics_lib.time_me
-def
+def get_status_from_cluster_name(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.status).filter_by(name=cluster_name).first()
+    if row is None:
+        return None
+    return status_lib.ClusterStatus[row.status]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_glob_cluster_names(
+        cluster_name: str,
+        workspaces_filter: Optional[Set[str]] = None) -> List[str]:
     assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
                 db_utils.SQLAlchemyDialect.SQLITE.value):
-
-                cluster_table.c.name.op('GLOB')(cluster_name))
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('GLOB')(cluster_name))
         elif (_SQLALCHEMY_ENGINE.dialect.name ==
               db_utils.SQLAlchemyDialect.POSTGRESQL.value):
-
+            query = session.query(cluster_table.c.name).filter(
                 cluster_table.c.name.op('SIMILAR TO')(
-                    _glob_to_similar(cluster_name)))
+                    _glob_to_similar(cluster_name)))
         else:
             raise ValueError('Unsupported database dialect')
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        rows = query.all()
     return [row.name for row in rows]

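With the async variant in place, code running on the API server's event loop can fetch a cluster's status without blocking a worker thread for the whole call. A usage sketch, assuming a configured SkyPilot state database (the cluster name is illustrative):

import asyncio

from sky import global_user_state


async def main() -> None:
    # Returns a status_lib.ClusterStatus, or None for an unknown cluster.
    status = await global_user_state.get_status_from_cluster_name_async(
        'my-cluster')
    print(status)


asyncio.run(main())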
@@ -1076,7 +1163,8 @@ def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
 def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
+        row = session.query(
+            cluster_table.c.launched_at).filter_by(name=cluster_name).first()
     if row is None or row.launched_at is None:
         return None
     return int(row.launched_at)
@@ -1087,7 +1175,8 @@ def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
 def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
+        row = session.query(
+            cluster_table.c.metadata).filter_by(name=cluster_name).first()
     if row is None or row.metadata is None:
         return None
     return json.loads(row.metadata)
@@ -1167,7 +1256,8 @@ def get_cluster_storage_mounts_metadata(
         cluster_name: str) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(
+        row = (session.query(cluster_table.c.storage_mounts_metadata).filter_by(
+            name=cluster_name).first())
     if row is None or row.storage_mounts_metadata is None:
         return None
     return pickle.loads(row.storage_mounts_metadata)
@@ -1196,7 +1286,9 @@ def get_cluster_skylet_ssh_tunnel_metadata(
         cluster_name: str) -> Optional[Tuple[int, int]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
+        row = session.query(
+            cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
+                name=cluster_name).first()
     if row is None or row.skylet_ssh_tunnel_metadata is None:
         return None
     return pickle.loads(row.skylet_ssh_tunnel_metadata)
@@ -1230,7 +1322,7 @@ def _get_cluster_usage_intervals(
     if cluster_hash is None:
         return None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_history_table).filter_by(
+        row = session.query(cluster_history_table.c.usage_intervals).filter_by(
             cluster_hash=cluster_hash).first()
     if row is None or row.usage_intervals is None:
         return None
@@ -1264,17 +1356,33 @@ def _get_cluster_duration(
     return total_duration


+def _get_cluster_last_activity_time(
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
+    last_activity_time = None
+    if usage_intervals:
+        last_interval = usage_intervals[-1]
+        last_activity_time = (last_interval[1] if last_interval[1] is not None
+                              else last_interval[0])
+    return last_activity_time
+
+
 @_init_db
 @metrics_lib.time_me
 def _set_cluster_usage_intervals(
         cluster_hash: str, usage_intervals: List[Tuple[int,
                                                        Optional[int]]]) -> None:
     assert _SQLALCHEMY_ENGINE is not None
+
+    # Calculate last_activity_time from usage_intervals
+    last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         count = session.query(cluster_history_table).filter_by(
             cluster_hash=cluster_hash).update({
                 cluster_history_table.c.usage_intervals:
-                    pickle.dumps(usage_intervals)
+                    pickle.dumps(usage_intervals),
+                cluster_history_table.c.last_activity_time: last_activity_time,
             })
         session.commit()
     assert count <= 1, count
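_get_cluster_last_activity_time takes the final (start, end) usage interval and prefers its end timestamp, falling back to the start timestamp while the interval is still open (cluster still running). Concretely:

# Closed last interval: its end time is the last activity.
assert _get_cluster_last_activity_time([(100, 200), (300, 400)]) == 400
# Open last interval (cluster still up): fall back to its start time.
assert _get_cluster_last_activity_time([(100, 200), (300, None)]) == 300
# No usage recorded yet.
assert _get_cluster_last_activity_time(None) is None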
@@ -1305,7 +1413,8 @@ def set_owner_identity_for_cluster(cluster_name: str,
 def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
+        row = (session.query(
+            cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
     if row is None or row.cluster_hash is None:
         return None
     return row.cluster_hash
@@ -1317,8 +1426,10 @@ def get_launched_resources_from_cluster_hash(
         cluster_hash: str) -> Optional[Tuple[int, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(
-
+        row = session.query(
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.launched_resources).filter_by(
+                cluster_hash=cluster_hash).first()
     if row is None:
         return None
     num_nodes = row.num_nodes
@@ -1362,17 +1473,56 @@ def _load_storage_mounts_metadata(
 @metrics_lib.time_me
 @context_utils.cancellation_guard
 def get_cluster_from_name(
-        cluster_name: Optional[str]
+        cluster_name: Optional[str],
+        *,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-
+        if summary_response:
+            query = session.query(
+                cluster_table.c.name, cluster_table.c.launched_at,
+                cluster_table.c.handle, cluster_table.c.last_use,
+                cluster_table.c.status, cluster_table.c.autostop,
+                cluster_table.c.to_down, cluster_table.c.owner,
+                cluster_table.c.metadata, cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at, cluster_table.c.user_hash,
+                cluster_table.c.config_hash, cluster_table.c.workspace,
+                cluster_table.c.is_managed)
+        else:
+            query = session.query(
+                cluster_table.c.name,
+                cluster_table.c.launched_at,
+                cluster_table.c.handle,
+                cluster_table.c.last_use,
+                cluster_table.c.status,
+                cluster_table.c.autostop,
+                cluster_table.c.to_down,
+                cluster_table.c.owner,
+                cluster_table.c.metadata,
+                cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at,
+                cluster_table.c.user_hash,
+                cluster_table.c.config_hash,
+                cluster_table.c.workspace,
+                cluster_table.c.is_managed,
+                # extra fields compared to above query
+                cluster_table.c.last_creation_yaml,
+                cluster_table.c.last_creation_command)
+        row = query.filter_by(name=cluster_name).first()
     if row is None:
         return None
-
-
-
-
-
+    if include_user_info:
+        user_hash = _get_user_hash_or_current_user(row.user_hash)
+        user = get_user(user_hash)
+        user_name = user.name if user is not None else None
+    if not summary_response:
+        last_event = get_last_cluster_event(
+            row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
     # TODO: use namedtuple instead of dict
     record = {
         'name': row.name,
@@ -1389,27 +1539,43 @@ def get_cluster_from_name(
             row.storage_mounts_metadata),
         'cluster_ever_up': bool(row.cluster_ever_up),
         'status_updated_at': row.status_updated_at,
-        'user_hash': user_hash,
-        'user_name': user_name,
-        'config_hash': row.config_hash,
         'workspace': row.workspace,
-        'last_creation_yaml': row.last_creation_yaml,
-        'last_creation_command': row.last_creation_command,
         'is_managed': bool(row.is_managed),
-        '
+        'config_hash': row.config_hash,
     }
+    if not summary_response:
+        record['last_creation_yaml'] = row.last_creation_yaml
+        record['last_creation_command'] = row.last_creation_command
+        record['last_event'] = last_event
+    if include_user_info:
+        record['user_hash'] = user_hash
+        record['user_name'] = user_name

     return record


+@_init_db
+@metrics_lib.time_me
+@context_utils.cancellation_guard
+def cluster_with_name_exists(cluster_name: str) -> bool:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.name).filter_by(name=cluster_name).first()
+    if row is None:
+        return False
+    return True
+
+
 @_init_db
 @metrics_lib.time_me
 def get_clusters(
         *,  # keyword only separator
         exclude_managed_clusters: bool = False,
-        workspaces_filter: Optional[
+        workspaces_filter: Optional[Dict[str, Any]] = None,
         user_hashes_filter: Optional[Set[str]] = None,
         cluster_names: Optional[List[str]] = None,
+        summary_response: bool = False,
 ) -> List[Dict[str, Any]]:
     """Get clusters from the database.

@@ -1428,7 +1594,40 @@ def get_clusters(
     current_user_hash = common_utils.get_user_hash()
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-
+        if summary_response:
+            query = session.query(
+                cluster_table.c.name, cluster_table.c.launched_at,
+                cluster_table.c.handle, cluster_table.c.last_use,
+                cluster_table.c.status, cluster_table.c.autostop,
+                cluster_table.c.to_down, cluster_table.c.owner,
+                cluster_table.c.metadata, cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at, cluster_table.c.user_hash,
+                cluster_table.c.config_hash, cluster_table.c.workspace,
+                cluster_table.c.is_managed)
+        else:
+            query = session.query(
+                cluster_table.c.name,
+                cluster_table.c.launched_at,
+                cluster_table.c.handle,
+                cluster_table.c.last_use,
+                cluster_table.c.status,
+                cluster_table.c.autostop,
+                cluster_table.c.to_down,
+                cluster_table.c.owner,
+                cluster_table.c.metadata,
+                cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at,
+                cluster_table.c.user_hash,
+                cluster_table.c.config_hash,
+                cluster_table.c.workspace,
+                cluster_table.c.is_managed,
+                # extra fields compared to above query
+                cluster_table.c.last_creation_yaml,
+                cluster_table.c.last_creation_command)
         if exclude_managed_clusters:
             query = query.filter(cluster_table.c.is_managed == int(False))
         if workspaces_filter is not None:
@@ -1464,15 +1663,15 @@ def get_clusters(

         # get last cluster event for each row
         cluster_hashes = set(row_to_user_hash.keys())
-
-
+        if not summary_response:
+            last_cluster_event_dict = _get_last_cluster_event_multiple(
+                cluster_hashes, ClusterEventType.STATUS_CHANGE)

         # get user for each row
         for row in rows:
             user_hash = row_to_user_hash[row.cluster_hash]
             user = user_hash_to_user.get(user_hash, None)
             user_name = user.name if user is not None else None
-            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
             # TODO: use namedtuple instead of dict
             record = {
                 'name': row.name,
@@ -1491,18 +1690,32 @@ def get_clusters(
                 'status_updated_at': row.status_updated_at,
                 'user_hash': user_hash,
                 'user_name': user_name,
-                'config_hash': row.config_hash,
                 'workspace': row.workspace,
-                'last_creation_yaml': row.last_creation_yaml,
-                'last_creation_command': row.last_creation_command,
                 'is_managed': bool(row.is_managed),
-                '
+                'config_hash': row.config_hash,
             }
+            if not summary_response:
+                record['last_creation_yaml'] = row.last_creation_yaml
+                record['last_creation_command'] = row.last_creation_command
+                record['last_event'] = last_cluster_event_dict.get(
+                    row.cluster_hash, None)

             records.append(record)
         return records


+@_init_db
+@metrics_lib.time_me
+def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table.c.name)
+        if exclude_managed_clusters:
+            query = query.filter(cluster_table.c.is_managed == int(False))
+        rows = query.all()
+    return [row[0] for row in rows]
+
+
 @_init_db
 @metrics_lib.time_me
 def get_clusters_from_history(
@@ -1525,7 +1738,7 @@ def get_clusters_from_history(
     current_user_hash = common_utils.get_user_hash()

     # Prepare filtering parameters
-    cutoff_time =
+    cutoff_time = 0
     if days is not None:
         cutoff_time = int(time.time()) - (days * 24 * 60 * 60)

@@ -1539,7 +1752,9 @@ def get_clusters_from_history(
                 cluster_history_table.c.usage_intervals,
                 cluster_history_table.c.user_hash,
                 cluster_history_table.c.workspace.label('history_workspace'),
-
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
         else:
             query = session.query(
                 cluster_history_table.c.cluster_hash,
@@ -1550,19 +1765,33 @@ def get_clusters_from_history(
                 cluster_history_table.c.last_creation_yaml,
                 cluster_history_table.c.last_creation_command,
                 cluster_history_table.c.workspace.label('history_workspace'),
-
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)

         query = query.select_from(
             cluster_history_table.join(cluster_table,
                                        cluster_history_table.c.cluster_hash ==
                                        cluster_table.c.cluster_hash,
                                        isouter=True))
+
+        # Only include clusters that are either active (status is not None)
+        # or are within the cutoff time (cutoff_time <= last_activity_time).
+        # If days is not specified, we include all clusters by setting
+        # cutoff_time to 0.
+        query = query.filter(
+            (cluster_table.c.status.isnot(None) |
+             (cluster_history_table.c.last_activity_time >= cutoff_time)))
+
+        # Order by launched_at descending (most recent first)
+        query = query.order_by(
+            sqlalchemy.desc(cluster_history_table.c.launched_at))
+
         if cluster_hashes is not None:
             query = query.filter(
                 cluster_history_table.c.cluster_hash.in_(cluster_hashes))
         rows = query.all()

-    filtered_rows = []
     usage_intervals_dict = {}
     row_to_user_hash = {}
     for row in rows:
@@ -1572,36 +1801,11 @@ def get_clusters_from_history(
                 row_usage_intervals = pickle.loads(row.usage_intervals)
             except (pickle.PickleError, AttributeError):
                 pass
-        # Parse status
-        status = None
-        if row.status:
-            status = status_lib.ClusterStatus[row.status]
-        # Apply filtering: always include active clusters, filter historical
-        # ones by time
-        if cutoff_time is not None and status is None:  # Historical cluster
-            # For historical clusters, check if they were used recently
-            # Use the most recent activity from usage_intervals to determine
-            # last use
-            # Find the most recent activity time from usage_intervals
-            last_activity_time = None
-            if row_usage_intervals:
-                # Get the end time of the last interval (or start time if
-                # still running)
-                last_interval = row_usage_intervals[-1]
-                last_activity_time = (last_interval[1] if last_interval[1]
-                                      is not None else last_interval[0])
-
-            # Skip historical clusters that haven't been used recently
-            if last_activity_time is None or last_activity_time < cutoff_time:
-                continue
-
-        filtered_rows.append(row)
         usage_intervals_dict[row.cluster_hash] = row_usage_intervals
         user_hash = (row.user_hash
                      if row.user_hash is not None else current_user_hash)
         row_to_user_hash[row.cluster_hash] = user_hash

-    rows = filtered_rows
     user_hashes = set(row_to_user_hash.values())
     user_hash_to_user = _get_users(user_hashes)
     cluster_hashes = set(row_to_user_hash.keys())
@@ -1616,10 +1820,10 @@ def get_clusters_from_history(
         user_name = user.name if user is not None else None
         if not abbreviate_response:
             last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        launched_at = row.launched_at
         usage_intervals: Optional[List[Tuple[
             int,
             Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
-        launched_at = _get_cluster_launch_time(usage_intervals)
         duration = _get_cluster_duration(usage_intervals)

         # Parse status
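Taken together, the summary_response and include_user_info flags let callers skip the heavyweight fields (creation YAML/command, last event, user lookup) when only a cheap listing is needed, and the history filter that used to be a per-row Python loop is now a single SQL predicate over the precomputed last_activity_time column. A hedged usage sketch of the new flags (field availability assumed from the hunks above):

from sky import global_user_state

# Lightweight lookup: skips the user and last-event queries entirely.
record = global_user_state.get_cluster_from_name('my-cluster',
                                                 include_user_info=False,
                                                 summary_response=True)
if record is not None:
    # Summary records omit last_creation_yaml/last_creation_command,
    # last_event, user_hash and user_name.
    print(record['name'], record['status'])

# Name-only queries avoid deserializing any cluster handles.
names = global_user_state.get_cluster_names(exclude_managed_clusters=True)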
sky/jobs/client/sdk.py
CHANGED
@@ -383,15 +383,24 @@ def dashboard() -> None:
 @server_common.check_server_healthy_or_start
 @versions.minimal_api_version(12)
 def pool_apply(
-        task: Union['sky.Task', 'sky.Dag'],
+        task: Optional[Union['sky.Task', 'sky.Dag']],
         pool_name: str,
         mode: 'serve_utils.UpdateMode',
+        workers: Optional[int] = None,
         # Internal only:
         # pylint: disable=invalid-name
         _need_confirmation: bool = False
 ) -> server_common.RequestId[None]:
     """Apply a config to a pool."""
+    remote_api_version = versions.get_remote_api_version()
+    if (workers is not None and
+            (remote_api_version is None or remote_api_version < 19)):
+        raise click.UsageError('Updating the number of workers in a pool is '
+                               'not supported in your API server. Please '
+                               'upgrade to a newer API server to use this '
+                               'feature.')
     return impl.apply(task,
+                      workers,
                       pool_name,
                       mode,
                       pool=True,
sky/jobs/constants.py
CHANGED
@@ -10,6 +10,8 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'

 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'

+JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
+
 CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
 SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Resources as a dict for the jobs controller.
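The new JOB_CONTROLLER_INDICATOR_FILE path reads as a marker file for detecting that the current machine is a jobs controller. This diff does not show the consumer, but a plausible check would look like the following sketch (the helper function is hypothetical):

import os

from sky.jobs import constants


def is_jobs_controller() -> bool:
    # Hypothetical: the indicator file is assumed to exist only on the
    # jobs controller VM.
    return os.path.exists(
        os.path.expanduser(constants.JOB_CONTROLLER_INDICATOR_FILE))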