skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -24,6 +24,7 @@ from sqlalchemy import exc as sqlalchemy_exc
24
24
  from sqlalchemy import orm
25
25
  from sqlalchemy.dialects import postgresql
26
26
  from sqlalchemy.dialects import sqlite
27
+ from sqlalchemy.ext import asyncio as sql_async
27
28
  from sqlalchemy.ext import declarative
28
29
 
29
30
  from sky import models
@@ -51,6 +52,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
51
52
  _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
52
53
 
53
54
  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
55
+ _SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
54
56
  _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
55
57
 
56
58
  DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
@@ -183,6 +185,14 @@ cluster_history_table = sqlalchemy.Table(
183
185
  sqlalchemy.Column('provision_log_path',
184
186
  sqlalchemy.Text,
185
187
  server_default=None),
188
+ sqlalchemy.Column('last_activity_time',
189
+ sqlalchemy.Integer,
190
+ server_default=None,
191
+ index=True),
192
+ sqlalchemy.Column('launched_at',
193
+ sqlalchemy.Integer,
194
+ server_default=None,
195
+ index=True),
186
196
  )
187
197
 
188
198
 
@@ -296,6 +306,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
296
306
  migration_utils.GLOBAL_USER_STATE_VERSION)
297
307
 
298
308
 
309
+ def initialize_and_get_db_async() -> sql_async.AsyncEngine:
310
+ global _SQLALCHEMY_ENGINE_ASYNC
311
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
312
+ return _SQLALCHEMY_ENGINE_ASYNC
313
+ with _SQLALCHEMY_ENGINE_LOCK:
314
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
315
+ return _SQLALCHEMY_ENGINE_ASYNC
316
+
317
+ _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
318
+ async_engine=True)
319
+ initialize_and_get_db()
320
+ return _SQLALCHEMY_ENGINE_ASYNC
321
+
322
+
299
323
  # We wrap the sqlalchemy engine initialization in a thread
300
324
  # lock to ensure that multiple threads do not initialize the
301
325
  # engine which could result in a rare race condition where
@@ -321,6 +345,22 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
321
345
  return _SQLALCHEMY_ENGINE
322
346
 
323
347
 
348
+ def _init_db_async(func):
349
+ """Initialize the async database."""
350
+
351
+ @functools.wraps(func)
352
+ async def wrapper(*args, **kwargs):
353
+ if _SQLALCHEMY_ENGINE_ASYNC is None:
354
+ # this may happen multiple times since there is no locking
355
+ # here but thats fine, this is just a short circuit for the
356
+ # common case.
357
+ await context_utils.to_thread(initialize_and_get_db_async)
358
+
359
+ return await func(*args, **kwargs)
360
+
361
+ return wrapper
362
+
363
+
324
364
  def _init_db(func):
325
365
  """Initialize the database."""
326
366
 
@@ -688,6 +728,10 @@ def add_or_update_cluster(cluster_name: str,
688
728
  conditional_values.get('last_creation_command'),
689
729
  }
690
730
 
731
+ # Calculate last_activity_time and launched_at from usage_intervals
732
+ last_activity_time = _get_cluster_last_activity_time(usage_intervals)
733
+ launched_at = _get_cluster_launch_time(usage_intervals)
734
+
691
735
  insert_stmnt = insert_func(cluster_history_table).values(
692
736
  cluster_hash=cluster_hash,
693
737
  name=cluster_name,
@@ -698,6 +742,8 @@ def add_or_update_cluster(cluster_name: str,
698
742
  user_hash=user_hash,
699
743
  workspace=history_workspace,
700
744
  provision_log_path=provision_log_path,
745
+ last_activity_time=last_activity_time,
746
+ launched_at=launched_at,
701
747
  **creation_info,
702
748
  )
703
749
  do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -714,6 +760,8 @@ def add_or_update_cluster(cluster_name: str,
714
760
  cluster_history_table.c.user_hash: history_hash,
715
761
  cluster_history_table.c.workspace: history_workspace,
716
762
  cluster_history_table.c.provision_log_path: provision_log_path,
763
+ cluster_history_table.c.last_activity_time: last_activity_time,
764
+ cluster_history_table.c.launched_at: launched_at,
717
765
  **creation_info,
718
766
  })
719
767
  session.execute(do_update_stmt)
@@ -1010,29 +1058,68 @@ def get_handle_from_cluster_name(
1010
1058
  assert _SQLALCHEMY_ENGINE is not None
1011
1059
  assert cluster_name is not None, 'cluster_name cannot be None'
1012
1060
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1013
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1061
+ row = (session.query(
1062
+ cluster_table.c.handle).filter_by(name=cluster_name).first())
1014
1063
  if row is None:
1015
1064
  return None
1016
1065
  return pickle.loads(row.handle)
1017
1066
 
1018
1067
 
1068
+ @_init_db_async
1069
+ @metrics_lib.time_me
1070
+ async def get_status_from_cluster_name_async(
1071
+ cluster_name: str) -> Optional[status_lib.ClusterStatus]:
1072
+ """Get the status of a cluster."""
1073
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1074
+ assert cluster_name is not None, 'cluster_name cannot be None'
1075
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1076
+ result = await session.execute(
1077
+ sqlalchemy.select(cluster_table.c.status).where(
1078
+ cluster_table.c.name == cluster_name))
1079
+ row = result.first()
1080
+
1081
+ if row is None:
1082
+ return None
1083
+ return status_lib.ClusterStatus(row[0])
1084
+
1085
+
1019
1086
  @_init_db
1020
1087
  @metrics_lib.time_me
1021
- def get_glob_cluster_names(cluster_name: str) -> List[str]:
1088
+ def get_status_from_cluster_name(
1089
+ cluster_name: str) -> Optional[status_lib.ClusterStatus]:
1090
+ assert _SQLALCHEMY_ENGINE is not None
1091
+ assert cluster_name is not None, 'cluster_name cannot be None'
1092
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1093
+ row = session.query(
1094
+ cluster_table.c.status).filter_by(name=cluster_name).first()
1095
+ if row is None:
1096
+ return None
1097
+ return status_lib.ClusterStatus[row.status]
1098
+
1099
+
1100
+ @_init_db
1101
+ @metrics_lib.time_me
1102
+ def get_glob_cluster_names(
1103
+ cluster_name: str,
1104
+ workspaces_filter: Optional[Set[str]] = None) -> List[str]:
1022
1105
  assert _SQLALCHEMY_ENGINE is not None
1023
1106
  assert cluster_name is not None, 'cluster_name cannot be None'
1024
1107
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1025
1108
  if (_SQLALCHEMY_ENGINE.dialect.name ==
1026
1109
  db_utils.SQLAlchemyDialect.SQLITE.value):
1027
- rows = session.query(cluster_table).filter(
1028
- cluster_table.c.name.op('GLOB')(cluster_name)).all()
1110
+ query = session.query(cluster_table.c.name).filter(
1111
+ cluster_table.c.name.op('GLOB')(cluster_name))
1029
1112
  elif (_SQLALCHEMY_ENGINE.dialect.name ==
1030
1113
  db_utils.SQLAlchemyDialect.POSTGRESQL.value):
1031
- rows = session.query(cluster_table).filter(
1114
+ query = session.query(cluster_table.c.name).filter(
1032
1115
  cluster_table.c.name.op('SIMILAR TO')(
1033
- _glob_to_similar(cluster_name))).all()
1116
+ _glob_to_similar(cluster_name)))
1034
1117
  else:
1035
1118
  raise ValueError('Unsupported database dialect')
1119
+ if workspaces_filter is not None:
1120
+ query = query.filter(
1121
+ cluster_table.c.workspace.in_(workspaces_filter))
1122
+ rows = query.all()
1036
1123
  return [row.name for row in rows]
1037
1124
 
1038
1125
 
@@ -1076,7 +1163,8 @@ def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
1076
1163
  def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
1077
1164
  assert _SQLALCHEMY_ENGINE is not None
1078
1165
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1079
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1166
+ row = session.query(
1167
+ cluster_table.c.launched_at).filter_by(name=cluster_name).first()
1080
1168
  if row is None or row.launched_at is None:
1081
1169
  return None
1082
1170
  return int(row.launched_at)
@@ -1087,7 +1175,8 @@ def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
1087
1175
  def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
1088
1176
  assert _SQLALCHEMY_ENGINE is not None
1089
1177
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1090
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1178
+ row = session.query(
1179
+ cluster_table.c.metadata).filter_by(name=cluster_name).first()
1091
1180
  if row is None or row.metadata is None:
1092
1181
  return None
1093
1182
  return json.loads(row.metadata)
@@ -1167,7 +1256,8 @@ def get_cluster_storage_mounts_metadata(
1167
1256
  cluster_name: str) -> Optional[Dict[str, Any]]:
1168
1257
  assert _SQLALCHEMY_ENGINE is not None
1169
1258
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1170
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1259
+ row = (session.query(cluster_table.c.storage_mounts_metadata).filter_by(
1260
+ name=cluster_name).first())
1171
1261
  if row is None or row.storage_mounts_metadata is None:
1172
1262
  return None
1173
1263
  return pickle.loads(row.storage_mounts_metadata)
@@ -1196,7 +1286,9 @@ def get_cluster_skylet_ssh_tunnel_metadata(
1196
1286
  cluster_name: str) -> Optional[Tuple[int, int]]:
1197
1287
  assert _SQLALCHEMY_ENGINE is not None
1198
1288
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1199
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1289
+ row = session.query(
1290
+ cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
1291
+ name=cluster_name).first()
1200
1292
  if row is None or row.skylet_ssh_tunnel_metadata is None:
1201
1293
  return None
1202
1294
  return pickle.loads(row.skylet_ssh_tunnel_metadata)
@@ -1230,7 +1322,7 @@ def _get_cluster_usage_intervals(
1230
1322
  if cluster_hash is None:
1231
1323
  return None
1232
1324
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1233
- row = session.query(cluster_history_table).filter_by(
1325
+ row = session.query(cluster_history_table.c.usage_intervals).filter_by(
1234
1326
  cluster_hash=cluster_hash).first()
1235
1327
  if row is None or row.usage_intervals is None:
1236
1328
  return None
@@ -1264,17 +1356,33 @@ def _get_cluster_duration(
1264
1356
  return total_duration
1265
1357
 
1266
1358
 
1359
+ def _get_cluster_last_activity_time(
1360
+ usage_intervals: Optional[List[Tuple[int,
1361
+ Optional[int]]]]) -> Optional[int]:
1362
+ last_activity_time = None
1363
+ if usage_intervals:
1364
+ last_interval = usage_intervals[-1]
1365
+ last_activity_time = (last_interval[1] if last_interval[1] is not None
1366
+ else last_interval[0])
1367
+ return last_activity_time
1368
+
1369
+
1267
1370
  @_init_db
1268
1371
  @metrics_lib.time_me
1269
1372
  def _set_cluster_usage_intervals(
1270
1373
  cluster_hash: str, usage_intervals: List[Tuple[int,
1271
1374
  Optional[int]]]) -> None:
1272
1375
  assert _SQLALCHEMY_ENGINE is not None
1376
+
1377
+ # Calculate last_activity_time from usage_intervals
1378
+ last_activity_time = _get_cluster_last_activity_time(usage_intervals)
1379
+
1273
1380
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1274
1381
  count = session.query(cluster_history_table).filter_by(
1275
1382
  cluster_hash=cluster_hash).update({
1276
1383
  cluster_history_table.c.usage_intervals:
1277
- pickle.dumps(usage_intervals)
1384
+ pickle.dumps(usage_intervals),
1385
+ cluster_history_table.c.last_activity_time: last_activity_time,
1278
1386
  })
1279
1387
  session.commit()
1280
1388
  assert count <= 1, count
@@ -1305,7 +1413,8 @@ def set_owner_identity_for_cluster(cluster_name: str,
1305
1413
  def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
1306
1414
  assert _SQLALCHEMY_ENGINE is not None
1307
1415
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1308
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1416
+ row = (session.query(
1417
+ cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
1309
1418
  if row is None or row.cluster_hash is None:
1310
1419
  return None
1311
1420
  return row.cluster_hash
@@ -1317,8 +1426,10 @@ def get_launched_resources_from_cluster_hash(
1317
1426
  cluster_hash: str) -> Optional[Tuple[int, Any]]:
1318
1427
  assert _SQLALCHEMY_ENGINE is not None
1319
1428
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1320
- row = session.query(cluster_history_table).filter_by(
1321
- cluster_hash=cluster_hash).first()
1429
+ row = session.query(
1430
+ cluster_history_table.c.num_nodes,
1431
+ cluster_history_table.c.launched_resources).filter_by(
1432
+ cluster_hash=cluster_hash).first()
1322
1433
  if row is None:
1323
1434
  return None
1324
1435
  num_nodes = row.num_nodes
@@ -1362,17 +1473,56 @@ def _load_storage_mounts_metadata(
1362
1473
  @metrics_lib.time_me
1363
1474
  @context_utils.cancellation_guard
1364
1475
  def get_cluster_from_name(
1365
- cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
1476
+ cluster_name: Optional[str],
1477
+ *,
1478
+ include_user_info: bool = True,
1479
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
1366
1480
  assert _SQLALCHEMY_ENGINE is not None
1367
1481
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1368
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1482
+ if summary_response:
1483
+ query = session.query(
1484
+ cluster_table.c.name, cluster_table.c.launched_at,
1485
+ cluster_table.c.handle, cluster_table.c.last_use,
1486
+ cluster_table.c.status, cluster_table.c.autostop,
1487
+ cluster_table.c.to_down, cluster_table.c.owner,
1488
+ cluster_table.c.metadata, cluster_table.c.cluster_hash,
1489
+ cluster_table.c.storage_mounts_metadata,
1490
+ cluster_table.c.cluster_ever_up,
1491
+ cluster_table.c.status_updated_at, cluster_table.c.user_hash,
1492
+ cluster_table.c.config_hash, cluster_table.c.workspace,
1493
+ cluster_table.c.is_managed)
1494
+ else:
1495
+ query = session.query(
1496
+ cluster_table.c.name,
1497
+ cluster_table.c.launched_at,
1498
+ cluster_table.c.handle,
1499
+ cluster_table.c.last_use,
1500
+ cluster_table.c.status,
1501
+ cluster_table.c.autostop,
1502
+ cluster_table.c.to_down,
1503
+ cluster_table.c.owner,
1504
+ cluster_table.c.metadata,
1505
+ cluster_table.c.cluster_hash,
1506
+ cluster_table.c.storage_mounts_metadata,
1507
+ cluster_table.c.cluster_ever_up,
1508
+ cluster_table.c.status_updated_at,
1509
+ cluster_table.c.user_hash,
1510
+ cluster_table.c.config_hash,
1511
+ cluster_table.c.workspace,
1512
+ cluster_table.c.is_managed,
1513
+ # extra fields compared to above query
1514
+ cluster_table.c.last_creation_yaml,
1515
+ cluster_table.c.last_creation_command)
1516
+ row = query.filter_by(name=cluster_name).first()
1369
1517
  if row is None:
1370
1518
  return None
1371
- user_hash = _get_user_hash_or_current_user(row.user_hash)
1372
- user = get_user(user_hash)
1373
- user_name = user.name if user is not None else None
1374
- last_event = get_last_cluster_event(
1375
- row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
1519
+ if include_user_info:
1520
+ user_hash = _get_user_hash_or_current_user(row.user_hash)
1521
+ user = get_user(user_hash)
1522
+ user_name = user.name if user is not None else None
1523
+ if not summary_response:
1524
+ last_event = get_last_cluster_event(
1525
+ row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
1376
1526
  # TODO: use namedtuple instead of dict
1377
1527
  record = {
1378
1528
  'name': row.name,
@@ -1389,27 +1539,43 @@ def get_cluster_from_name(
1389
1539
  row.storage_mounts_metadata),
1390
1540
  'cluster_ever_up': bool(row.cluster_ever_up),
1391
1541
  'status_updated_at': row.status_updated_at,
1392
- 'user_hash': user_hash,
1393
- 'user_name': user_name,
1394
- 'config_hash': row.config_hash,
1395
1542
  'workspace': row.workspace,
1396
- 'last_creation_yaml': row.last_creation_yaml,
1397
- 'last_creation_command': row.last_creation_command,
1398
1543
  'is_managed': bool(row.is_managed),
1399
- 'last_event': last_event,
1544
+ 'config_hash': row.config_hash,
1400
1545
  }
1546
+ if not summary_response:
1547
+ record['last_creation_yaml'] = row.last_creation_yaml
1548
+ record['last_creation_command'] = row.last_creation_command
1549
+ record['last_event'] = last_event
1550
+ if include_user_info:
1551
+ record['user_hash'] = user_hash
1552
+ record['user_name'] = user_name
1401
1553
 
1402
1554
  return record
1403
1555
 
1404
1556
 
1557
+ @_init_db
1558
+ @metrics_lib.time_me
1559
+ @context_utils.cancellation_guard
1560
+ def cluster_with_name_exists(cluster_name: str) -> bool:
1561
+ assert _SQLALCHEMY_ENGINE is not None
1562
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1563
+ row = session.query(
1564
+ cluster_table.c.name).filter_by(name=cluster_name).first()
1565
+ if row is None:
1566
+ return False
1567
+ return True
1568
+
1569
+
1405
1570
  @_init_db
1406
1571
  @metrics_lib.time_me
1407
1572
  def get_clusters(
1408
1573
  *, # keyword only separator
1409
1574
  exclude_managed_clusters: bool = False,
1410
- workspaces_filter: Optional[Set[str]] = None,
1575
+ workspaces_filter: Optional[Dict[str, Any]] = None,
1411
1576
  user_hashes_filter: Optional[Set[str]] = None,
1412
1577
  cluster_names: Optional[List[str]] = None,
1578
+ summary_response: bool = False,
1413
1579
  ) -> List[Dict[str, Any]]:
1414
1580
  """Get clusters from the database.
1415
1581
 
@@ -1428,7 +1594,40 @@ def get_clusters(
1428
1594
  current_user_hash = common_utils.get_user_hash()
1429
1595
  assert _SQLALCHEMY_ENGINE is not None
1430
1596
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1431
- query = session.query(cluster_table)
1597
+ if summary_response:
1598
+ query = session.query(
1599
+ cluster_table.c.name, cluster_table.c.launched_at,
1600
+ cluster_table.c.handle, cluster_table.c.last_use,
1601
+ cluster_table.c.status, cluster_table.c.autostop,
1602
+ cluster_table.c.to_down, cluster_table.c.owner,
1603
+ cluster_table.c.metadata, cluster_table.c.cluster_hash,
1604
+ cluster_table.c.storage_mounts_metadata,
1605
+ cluster_table.c.cluster_ever_up,
1606
+ cluster_table.c.status_updated_at, cluster_table.c.user_hash,
1607
+ cluster_table.c.config_hash, cluster_table.c.workspace,
1608
+ cluster_table.c.is_managed)
1609
+ else:
1610
+ query = session.query(
1611
+ cluster_table.c.name,
1612
+ cluster_table.c.launched_at,
1613
+ cluster_table.c.handle,
1614
+ cluster_table.c.last_use,
1615
+ cluster_table.c.status,
1616
+ cluster_table.c.autostop,
1617
+ cluster_table.c.to_down,
1618
+ cluster_table.c.owner,
1619
+ cluster_table.c.metadata,
1620
+ cluster_table.c.cluster_hash,
1621
+ cluster_table.c.storage_mounts_metadata,
1622
+ cluster_table.c.cluster_ever_up,
1623
+ cluster_table.c.status_updated_at,
1624
+ cluster_table.c.user_hash,
1625
+ cluster_table.c.config_hash,
1626
+ cluster_table.c.workspace,
1627
+ cluster_table.c.is_managed,
1628
+ # extra fields compared to above query
1629
+ cluster_table.c.last_creation_yaml,
1630
+ cluster_table.c.last_creation_command)
1432
1631
  if exclude_managed_clusters:
1433
1632
  query = query.filter(cluster_table.c.is_managed == int(False))
1434
1633
  if workspaces_filter is not None:
@@ -1464,15 +1663,15 @@ def get_clusters(
1464
1663
 
1465
1664
  # get last cluster event for each row
1466
1665
  cluster_hashes = set(row_to_user_hash.keys())
1467
- last_cluster_event_dict = _get_last_cluster_event_multiple(
1468
- cluster_hashes, ClusterEventType.STATUS_CHANGE)
1666
+ if not summary_response:
1667
+ last_cluster_event_dict = _get_last_cluster_event_multiple(
1668
+ cluster_hashes, ClusterEventType.STATUS_CHANGE)
1469
1669
 
1470
1670
  # get user for each row
1471
1671
  for row in rows:
1472
1672
  user_hash = row_to_user_hash[row.cluster_hash]
1473
1673
  user = user_hash_to_user.get(user_hash, None)
1474
1674
  user_name = user.name if user is not None else None
1475
- last_event = last_cluster_event_dict.get(row.cluster_hash, None)
1476
1675
  # TODO: use namedtuple instead of dict
1477
1676
  record = {
1478
1677
  'name': row.name,
@@ -1491,18 +1690,32 @@ def get_clusters(
1491
1690
  'status_updated_at': row.status_updated_at,
1492
1691
  'user_hash': user_hash,
1493
1692
  'user_name': user_name,
1494
- 'config_hash': row.config_hash,
1495
1693
  'workspace': row.workspace,
1496
- 'last_creation_yaml': row.last_creation_yaml,
1497
- 'last_creation_command': row.last_creation_command,
1498
1694
  'is_managed': bool(row.is_managed),
1499
- 'last_event': last_event,
1695
+ 'config_hash': row.config_hash,
1500
1696
  }
1697
+ if not summary_response:
1698
+ record['last_creation_yaml'] = row.last_creation_yaml
1699
+ record['last_creation_command'] = row.last_creation_command
1700
+ record['last_event'] = last_cluster_event_dict.get(
1701
+ row.cluster_hash, None)
1501
1702
 
1502
1703
  records.append(record)
1503
1704
  return records
1504
1705
 
1505
1706
 
1707
+ @_init_db
1708
+ @metrics_lib.time_me
1709
+ def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
1710
+ assert _SQLALCHEMY_ENGINE is not None
1711
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1712
+ query = session.query(cluster_table.c.name)
1713
+ if exclude_managed_clusters:
1714
+ query = query.filter(cluster_table.c.is_managed == int(False))
1715
+ rows = query.all()
1716
+ return [row[0] for row in rows]
1717
+
1718
+
1506
1719
  @_init_db
1507
1720
  @metrics_lib.time_me
1508
1721
  def get_clusters_from_history(
@@ -1525,7 +1738,7 @@ def get_clusters_from_history(
1525
1738
  current_user_hash = common_utils.get_user_hash()
1526
1739
 
1527
1740
  # Prepare filtering parameters
1528
- cutoff_time = None
1741
+ cutoff_time = 0
1529
1742
  if days is not None:
1530
1743
  cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
1531
1744
 
@@ -1539,7 +1752,9 @@ def get_clusters_from_history(
1539
1752
  cluster_history_table.c.usage_intervals,
1540
1753
  cluster_history_table.c.user_hash,
1541
1754
  cluster_history_table.c.workspace.label('history_workspace'),
1542
- cluster_table.c.status, cluster_table.c.workspace)
1755
+ cluster_history_table.c.last_activity_time,
1756
+ cluster_history_table.c.launched_at, cluster_table.c.status,
1757
+ cluster_table.c.workspace)
1543
1758
  else:
1544
1759
  query = session.query(
1545
1760
  cluster_history_table.c.cluster_hash,
@@ -1550,19 +1765,33 @@ def get_clusters_from_history(
1550
1765
  cluster_history_table.c.last_creation_yaml,
1551
1766
  cluster_history_table.c.last_creation_command,
1552
1767
  cluster_history_table.c.workspace.label('history_workspace'),
1553
- cluster_table.c.status, cluster_table.c.workspace)
1768
+ cluster_history_table.c.last_activity_time,
1769
+ cluster_history_table.c.launched_at, cluster_table.c.status,
1770
+ cluster_table.c.workspace)
1554
1771
 
1555
1772
  query = query.select_from(
1556
1773
  cluster_history_table.join(cluster_table,
1557
1774
  cluster_history_table.c.cluster_hash ==
1558
1775
  cluster_table.c.cluster_hash,
1559
1776
  isouter=True))
1777
+
1778
+ # Only include clusters that are either active (status is not None)
1779
+ # or are within the cutoff time (cutoff_time <= last_activity_time).
1780
+ # If days is not specified, we include all clusters by setting
1781
+ # cutoff_time to 0.
1782
+ query = query.filter(
1783
+ (cluster_table.c.status.isnot(None) |
1784
+ (cluster_history_table.c.last_activity_time >= cutoff_time)))
1785
+
1786
+ # Order by launched_at descending (most recent first)
1787
+ query = query.order_by(
1788
+ sqlalchemy.desc(cluster_history_table.c.launched_at))
1789
+
1560
1790
  if cluster_hashes is not None:
1561
1791
  query = query.filter(
1562
1792
  cluster_history_table.c.cluster_hash.in_(cluster_hashes))
1563
1793
  rows = query.all()
1564
1794
 
1565
- filtered_rows = []
1566
1795
  usage_intervals_dict = {}
1567
1796
  row_to_user_hash = {}
1568
1797
  for row in rows:
@@ -1572,36 +1801,11 @@ def get_clusters_from_history(
1572
1801
  row_usage_intervals = pickle.loads(row.usage_intervals)
1573
1802
  except (pickle.PickleError, AttributeError):
1574
1803
  pass
1575
- # Parse status
1576
- status = None
1577
- if row.status:
1578
- status = status_lib.ClusterStatus[row.status]
1579
- # Apply filtering: always include active clusters, filter historical
1580
- # ones by time
1581
- if cutoff_time is not None and status is None: # Historical cluster
1582
- # For historical clusters, check if they were used recently
1583
- # Use the most recent activity from usage_intervals to determine
1584
- # last use
1585
- # Find the most recent activity time from usage_intervals
1586
- last_activity_time = None
1587
- if row_usage_intervals:
1588
- # Get the end time of the last interval (or start time if
1589
- # still running)
1590
- last_interval = row_usage_intervals[-1]
1591
- last_activity_time = (last_interval[1] if last_interval[1]
1592
- is not None else last_interval[0])
1593
-
1594
- # Skip historical clusters that haven't been used recently
1595
- if last_activity_time is None or last_activity_time < cutoff_time:
1596
- continue
1597
-
1598
- filtered_rows.append(row)
1599
1804
  usage_intervals_dict[row.cluster_hash] = row_usage_intervals
1600
1805
  user_hash = (row.user_hash
1601
1806
  if row.user_hash is not None else current_user_hash)
1602
1807
  row_to_user_hash[row.cluster_hash] = user_hash
1603
1808
 
1604
- rows = filtered_rows
1605
1809
  user_hashes = set(row_to_user_hash.values())
1606
1810
  user_hash_to_user = _get_users(user_hashes)
1607
1811
  cluster_hashes = set(row_to_user_hash.keys())
@@ -1616,10 +1820,10 @@ def get_clusters_from_history(
1616
1820
  user_name = user.name if user is not None else None
1617
1821
  if not abbreviate_response:
1618
1822
  last_event = last_cluster_event_dict.get(row.cluster_hash, None)
1823
+ launched_at = row.launched_at
1619
1824
  usage_intervals: Optional[List[Tuple[
1620
1825
  int,
1621
1826
  Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
1622
- launched_at = _get_cluster_launch_time(usage_intervals)
1623
1827
  duration = _get_cluster_duration(usage_intervals)
1624
1828
 
1625
1829
  # Parse status
sky/jobs/client/sdk.py CHANGED
@@ -383,15 +383,24 @@ def dashboard() -> None:
383
383
  @server_common.check_server_healthy_or_start
384
384
  @versions.minimal_api_version(12)
385
385
  def pool_apply(
386
- task: Union['sky.Task', 'sky.Dag'],
386
+ task: Optional[Union['sky.Task', 'sky.Dag']],
387
387
  pool_name: str,
388
388
  mode: 'serve_utils.UpdateMode',
389
+ workers: Optional[int] = None,
389
390
  # Internal only:
390
391
  # pylint: disable=invalid-name
391
392
  _need_confirmation: bool = False
392
393
  ) -> server_common.RequestId[None]:
393
394
  """Apply a config to a pool."""
395
+ remote_api_version = versions.get_remote_api_version()
396
+ if (workers is not None and
397
+ (remote_api_version is None or remote_api_version < 19)):
398
+ raise click.UsageError('Updating the number of workers in a pool is '
399
+ 'not supported in your API server. Please '
400
+ 'upgrade to a newer API server to use this '
401
+ 'feature.')
394
402
  return impl.apply(task,
403
+ workers,
395
404
  pool_name,
396
405
  mode,
397
406
  pool=True,
sky/jobs/constants.py CHANGED
@@ -10,6 +10,8 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
10
10
 
11
11
  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
12
12
 
13
+ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
14
+
13
15
  CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
14
16
  SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
15
17
  # Resources as a dict for the jobs controller.