skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (113)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +200 -78
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +104 -53
  7. sky/client/sdk.py +13 -5
  8. sky/client/sdk_async.py +4 -2
  9. sky/clouds/kubernetes.py +2 -1
  10. sky/clouds/runpod.py +20 -7
  11. sky/core.py +7 -53
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
  14. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
  20. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  21. sky/dashboard/out/clusters/[cluster].html +1 -1
  22. sky/dashboard/out/clusters.html +1 -1
  23. sky/dashboard/out/config.html +1 -1
  24. sky/dashboard/out/index.html +1 -1
  25. sky/dashboard/out/infra/[context].html +1 -1
  26. sky/dashboard/out/infra.html +1 -1
  27. sky/dashboard/out/jobs/[job].html +1 -1
  28. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  29. sky/dashboard/out/jobs.html +1 -1
  30. sky/dashboard/out/users.html +1 -1
  31. sky/dashboard/out/volumes.html +1 -1
  32. sky/dashboard/out/workspace/new.html +1 -1
  33. sky/dashboard/out/workspaces/[name].html +1 -1
  34. sky/dashboard/out/workspaces.html +1 -1
  35. sky/data/mounting_utils.py +19 -10
  36. sky/execution.py +4 -2
  37. sky/global_user_state.py +224 -38
  38. sky/jobs/client/sdk.py +10 -1
  39. sky/jobs/controller.py +7 -7
  40. sky/jobs/server/core.py +3 -3
  41. sky/jobs/server/server.py +15 -11
  42. sky/jobs/utils.py +1 -1
  43. sky/logs/agent.py +30 -3
  44. sky/logs/aws.py +9 -19
  45. sky/provision/__init__.py +2 -1
  46. sky/provision/aws/instance.py +2 -1
  47. sky/provision/azure/instance.py +2 -1
  48. sky/provision/cudo/instance.py +2 -2
  49. sky/provision/do/instance.py +2 -2
  50. sky/provision/docker_utils.py +41 -19
  51. sky/provision/fluidstack/instance.py +2 -2
  52. sky/provision/gcp/instance.py +2 -1
  53. sky/provision/hyperbolic/instance.py +2 -1
  54. sky/provision/instance_setup.py +1 -1
  55. sky/provision/kubernetes/instance.py +134 -8
  56. sky/provision/lambda_cloud/instance.py +2 -1
  57. sky/provision/nebius/instance.py +2 -1
  58. sky/provision/oci/instance.py +2 -1
  59. sky/provision/paperspace/instance.py +2 -2
  60. sky/provision/primeintellect/instance.py +2 -2
  61. sky/provision/provisioner.py +1 -0
  62. sky/provision/runpod/instance.py +2 -2
  63. sky/provision/scp/instance.py +2 -2
  64. sky/provision/seeweb/instance.py +2 -1
  65. sky/provision/vast/instance.py +2 -1
  66. sky/provision/vsphere/instance.py +6 -5
  67. sky/schemas/api/responses.py +2 -1
  68. sky/serve/autoscalers.py +2 -0
  69. sky/serve/client/impl.py +45 -19
  70. sky/serve/replica_managers.py +12 -5
  71. sky/serve/serve_utils.py +5 -11
  72. sky/serve/server/core.py +9 -6
  73. sky/serve/server/impl.py +78 -25
  74. sky/serve/server/server.py +4 -5
  75. sky/serve/service_spec.py +33 -0
  76. sky/server/auth/oauth2_proxy.py +2 -2
  77. sky/server/constants.py +1 -1
  78. sky/server/daemons.py +2 -3
  79. sky/server/requests/executor.py +56 -6
  80. sky/server/requests/payloads.py +31 -8
  81. sky/server/requests/preconditions.py +2 -3
  82. sky/server/rest.py +2 -0
  83. sky/server/server.py +28 -19
  84. sky/server/stream_utils.py +34 -12
  85. sky/setup_files/dependencies.py +12 -2
  86. sky/setup_files/setup.py +44 -44
  87. sky/skylet/constants.py +2 -3
  88. sky/templates/kubernetes-ray.yml.j2 +16 -15
  89. sky/usage/usage_lib.py +3 -0
  90. sky/utils/cli_utils/status_utils.py +4 -5
  91. sky/utils/context.py +104 -29
  92. sky/utils/controller_utils.py +7 -6
  93. sky/utils/kubernetes/create_cluster.sh +13 -28
  94. sky/utils/kubernetes/delete_cluster.sh +10 -7
  95. sky/utils/kubernetes/generate_kind_config.py +6 -66
  96. sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
  97. sky/utils/kubernetes_enums.py +5 -0
  98. sky/utils/ux_utils.py +35 -1
  99. sky/utils/yaml_utils.py +9 -0
  100. sky/volumes/client/sdk.py +44 -8
  101. sky/volumes/server/server.py +33 -7
  102. sky/volumes/volume.py +22 -14
  103. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
  104. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
  105. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  109. /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
  110. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
  111. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
  112. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
  113. {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -265,8 +265,10 @@ def _execute_dag(
 
     cluster_exists = False
     if cluster_name is not None:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        cluster_exists = cluster_record is not None
+        # We use launched_at to check if the cluster exists, because this
+        # db query is faster than get_cluster_from_name.
+        cluster_exists = global_user_state.cluster_with_name_exists(
+            cluster_name)
         # TODO(woosuk): If the cluster exists, print a warning that
         # `cpus` and `memory` are not used as a job scheduling constraint,
         # unlike `gpus`.
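
Note: the change above is the pattern that recurs throughout this release — call sites that only need an existence check (or a couple of fields) stop fetching and unpickling entire cluster rows. A minimal stand-alone sketch of the two query shapes; the table and engine here are illustrative, not SkyPilot's real schema:

    import sqlalchemy
    from sqlalchemy import orm

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    cluster_table = sqlalchemy.Table(
        'clusters', metadata,
        sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('handle', sqlalchemy.LargeBinary))
    metadata.create_all(engine)

    with orm.Session(engine) as session:
        # Old shape: materialize every column just to test existence.
        exists_slow = session.query(cluster_table).filter_by(
            name='c1').first() is not None
        # New shape: project only the indexed name column, so the database
        # never returns the potentially large pickled handle.
        exists_fast = session.query(
            cluster_table.c.name).filter_by(name='c1').first() is not None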
sky/global_user_state.py CHANGED
@@ -24,6 +24,7 @@ from sqlalchemy import exc as sqlalchemy_exc
 from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
+from sqlalchemy.ext import asyncio as sql_async
 from sqlalchemy.ext import declarative
 
 from sky import models
@@ -51,6 +52,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
@@ -296,6 +298,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
                               migration_utils.GLOBAL_USER_STATE_VERSION)
 
 
+def initialize_and_get_db_async() -> sql_async.AsyncEngine:
+    global _SQLALCHEMY_ENGINE_ASYNC
+    if _SQLALCHEMY_ENGINE_ASYNC is not None:
+        return _SQLALCHEMY_ENGINE_ASYNC
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE_ASYNC is not None:
+            return _SQLALCHEMY_ENGINE_ASYNC
+
+        _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
+                                                       async_engine=True)
+        initialize_and_get_db()
+        return _SQLALCHEMY_ENGINE_ASYNC
+
+
 # We wrap the sqlalchemy engine initialization in a thread
 # lock to ensure that multiple threads do not initialize the
 # engine which could result in a rare race condition where
@@ -321,6 +337,22 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     return _SQLALCHEMY_ENGINE
 
 
+def _init_db_async(func):
+    """Initialize the async database."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if _SQLALCHEMY_ENGINE_ASYNC is None:
+            # this may happen multiple times since there is no locking
+            # here but thats fine, this is just a short circuit for the
+            # common case.
+            await context_utils.to_thread(initialize_and_get_db_async)
+
+        return await func(*args, **kwargs)
+
+    return wrapper
+
+
 def _init_db(func):
     """Initialize the database."""
 
@@ -1010,29 +1042,68 @@ def get_handle_from_cluster_name(
     assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = (session.query(
+            cluster_table.c.handle).filter_by(name=cluster_name).first())
     if row is None:
         return None
     return pickle.loads(row.handle)
 
 
+@_init_db_async
+@metrics_lib.time_me
+async def get_status_from_cluster_name_async(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    """Get the status of a cluster."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.select(cluster_table.c.status).where(
+                cluster_table.c.name == cluster_name))
+        row = result.first()
+
+    if row is None:
+        return None
+    return status_lib.ClusterStatus(row[0])
+
+
 @_init_db
 @metrics_lib.time_me
-def get_glob_cluster_names(cluster_name: str) -> List[str]:
+def get_status_from_cluster_name(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.status).filter_by(name=cluster_name).first()
+    if row is None:
+        return None
+    return status_lib.ClusterStatus[row.status]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_glob_cluster_names(
+        cluster_name: str,
+        workspaces_filter: Optional[Set[str]] = None) -> List[str]:
     assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
                 db_utils.SQLAlchemyDialect.SQLITE.value):
-            rows = session.query(cluster_table).filter(
-                cluster_table.c.name.op('GLOB')(cluster_name)).all()
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('GLOB')(cluster_name))
         elif (_SQLALCHEMY_ENGINE.dialect.name ==
               db_utils.SQLAlchemyDialect.POSTGRESQL.value):
-            rows = session.query(cluster_table).filter(
+            query = session.query(cluster_table.c.name).filter(
                 cluster_table.c.name.op('SIMILAR TO')(
-                    _glob_to_similar(cluster_name))).all()
+                    _glob_to_similar(cluster_name)))
         else:
             raise ValueError('Unsupported database dialect')
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        rows = query.all()
     return [row.name for row in rows]
 
 
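Note: with the async engine and @_init_db_async in place, coroutine-based callers can await a status lookup without blocking a worker thread on engine setup. A usage sketch (the polling function is hypothetical; the accessor is the one added above):

    import asyncio

    from sky import global_user_state

    async def poll_status(name: str) -> None:
        # @_init_db_async lazily creates _SQLALCHEMY_ENGINE_ASYNC in a
        # worker thread on first call, so no explicit setup is needed here.
        status = await global_user_state.get_status_from_cluster_name_async(
            name)
        print(f'{name}: {status}')

    asyncio.run(poll_status('my-cluster'))
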
@@ -1076,7 +1147,8 @@ def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
 def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = session.query(
+            cluster_table.c.launched_at).filter_by(name=cluster_name).first()
     if row is None or row.launched_at is None:
         return None
     return int(row.launched_at)
@@ -1087,7 +1159,8 @@ def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
 def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = session.query(
+            cluster_table.c.metadata).filter_by(name=cluster_name).first()
     if row is None or row.metadata is None:
         return None
     return json.loads(row.metadata)
@@ -1167,7 +1240,8 @@ def get_cluster_storage_mounts_metadata(
         cluster_name: str) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = (session.query(cluster_table.c.storage_mounts_metadata).filter_by(
+            name=cluster_name).first())
     if row is None or row.storage_mounts_metadata is None:
         return None
     return pickle.loads(row.storage_mounts_metadata)
@@ -1196,7 +1270,9 @@ def get_cluster_skylet_ssh_tunnel_metadata(
         cluster_name: str) -> Optional[Tuple[int, int]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = session.query(
+            cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
+                name=cluster_name).first()
     if row is None or row.skylet_ssh_tunnel_metadata is None:
         return None
     return pickle.loads(row.skylet_ssh_tunnel_metadata)
@@ -1230,7 +1306,7 @@ def _get_cluster_usage_intervals(
     if cluster_hash is None:
         return None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_history_table).filter_by(
+        row = session.query(cluster_history_table.c.usage_intervals).filter_by(
             cluster_hash=cluster_hash).first()
     if row is None or row.usage_intervals is None:
         return None
@@ -1305,7 +1381,8 @@ def set_owner_identity_for_cluster(cluster_name: str,
 def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = (session.query(
+            cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
     if row is None or row.cluster_hash is None:
         return None
     return row.cluster_hash
@@ -1317,8 +1394,10 @@ def get_launched_resources_from_cluster_hash(
         cluster_hash: str) -> Optional[Tuple[int, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_history_table).filter_by(
-            cluster_hash=cluster_hash).first()
+        row = session.query(
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.launched_resources).filter_by(
+                cluster_hash=cluster_hash).first()
     if row is None:
         return None
     num_nodes = row.num_nodes
@@ -1362,17 +1441,56 @@ def _load_storage_mounts_metadata(
 @metrics_lib.time_me
 @context_utils.cancellation_guard
 def get_cluster_from_name(
-        cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
+        cluster_name: Optional[str],
+        *,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if summary_response:
+            query = session.query(
+                cluster_table.c.name, cluster_table.c.launched_at,
+                cluster_table.c.handle, cluster_table.c.last_use,
+                cluster_table.c.status, cluster_table.c.autostop,
+                cluster_table.c.to_down, cluster_table.c.owner,
+                cluster_table.c.metadata, cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at, cluster_table.c.user_hash,
+                cluster_table.c.config_hash, cluster_table.c.workspace,
+                cluster_table.c.is_managed)
+        else:
+            query = session.query(
+                cluster_table.c.name,
+                cluster_table.c.launched_at,
+                cluster_table.c.handle,
+                cluster_table.c.last_use,
+                cluster_table.c.status,
+                cluster_table.c.autostop,
+                cluster_table.c.to_down,
+                cluster_table.c.owner,
+                cluster_table.c.metadata,
+                cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at,
+                cluster_table.c.user_hash,
+                cluster_table.c.config_hash,
+                cluster_table.c.workspace,
+                cluster_table.c.is_managed,
+                # extra fields compared to above query
+                cluster_table.c.last_creation_yaml,
+                cluster_table.c.last_creation_command)
+        row = query.filter_by(name=cluster_name).first()
     if row is None:
         return None
-    user_hash = _get_user_hash_or_current_user(row.user_hash)
-    user = get_user(user_hash)
-    user_name = user.name if user is not None else None
-    last_event = get_last_cluster_event(
-        row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
+    if include_user_info:
+        user_hash = _get_user_hash_or_current_user(row.user_hash)
+        user = get_user(user_hash)
+        user_name = user.name if user is not None else None
+    if not summary_response:
+        last_event = get_last_cluster_event(
+            row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
     # TODO: use namedtuple instead of dict
     record = {
         'name': row.name,
@@ -1389,26 +1507,43 @@ def get_cluster_from_name(
             row.storage_mounts_metadata),
         'cluster_ever_up': bool(row.cluster_ever_up),
         'status_updated_at': row.status_updated_at,
-        'user_hash': user_hash,
-        'user_name': user_name,
-        'config_hash': row.config_hash,
         'workspace': row.workspace,
-        'last_creation_yaml': row.last_creation_yaml,
-        'last_creation_command': row.last_creation_command,
         'is_managed': bool(row.is_managed),
-        'last_event': last_event,
+        'config_hash': row.config_hash,
     }
+    if not summary_response:
+        record['last_creation_yaml'] = row.last_creation_yaml
+        record['last_creation_command'] = row.last_creation_command
+        record['last_event'] = last_event
+    if include_user_info:
+        record['user_hash'] = user_hash
+        record['user_name'] = user_name
 
     return record
 
 
+@_init_db
+@metrics_lib.time_me
+@context_utils.cancellation_guard
+def cluster_with_name_exists(cluster_name: str) -> bool:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.name).filter_by(name=cluster_name).first()
+    if row is None:
+        return False
+    return True
+
+
 @_init_db
 @metrics_lib.time_me
 def get_clusters(
     *,  # keyword only separator
     exclude_managed_clusters: bool = False,
-    workspaces_filter: Optional[Set[str]] = None,
+    workspaces_filter: Optional[Dict[str, Any]] = None,
     user_hashes_filter: Optional[Set[str]] = None,
+    cluster_names: Optional[List[str]] = None,
+    summary_response: bool = False,
 ) -> List[Dict[str, Any]]:
     """Get clusters from the database.
 
@@ -1419,13 +1554,48 @@
             that has workspace field set to one of the values.
         user_hashes_filter: If specified, only include clusters
             that has user_hash field set to one of the values.
+        cluster_names: If specified, only include clusters
+            that has name field set to one of the values.
     """
    # if a cluster has a null user_hash,
    # we treat it as belonging to the current user.
     current_user_hash = common_utils.get_user_hash()
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        query = session.query(cluster_table)
+        if summary_response:
+            query = session.query(
+                cluster_table.c.name, cluster_table.c.launched_at,
+                cluster_table.c.handle, cluster_table.c.last_use,
+                cluster_table.c.status, cluster_table.c.autostop,
+                cluster_table.c.to_down, cluster_table.c.owner,
+                cluster_table.c.metadata, cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at, cluster_table.c.user_hash,
+                cluster_table.c.config_hash, cluster_table.c.workspace,
+                cluster_table.c.is_managed)
+        else:
+            query = session.query(
+                cluster_table.c.name,
+                cluster_table.c.launched_at,
+                cluster_table.c.handle,
+                cluster_table.c.last_use,
+                cluster_table.c.status,
+                cluster_table.c.autostop,
+                cluster_table.c.to_down,
+                cluster_table.c.owner,
+                cluster_table.c.metadata,
+                cluster_table.c.cluster_hash,
+                cluster_table.c.storage_mounts_metadata,
+                cluster_table.c.cluster_ever_up,
+                cluster_table.c.status_updated_at,
+                cluster_table.c.user_hash,
+                cluster_table.c.config_hash,
+                cluster_table.c.workspace,
+                cluster_table.c.is_managed,
+                # extra fields compared to above query
+                cluster_table.c.last_creation_yaml,
+                cluster_table.c.last_creation_command)
         if exclude_managed_clusters:
             query = query.filter(cluster_table.c.is_managed == int(False))
         if workspaces_filter is not None:
@@ -1437,11 +1607,13 @@
             # If current_user_hash is in user_hashes_filter, we include
             # clusters that have a null user_hash.
             query = query.filter(
-                cluster_table.c.user_hash.in_(user_hashes_filter) |
-                (cluster_table.c.user_hash is None))
+                (cluster_table.c.user_hash.in_(user_hashes_filter) |
+                 (cluster_table.c.user_hash is None)))
         else:
             query = query.filter(
                 cluster_table.c.user_hash.in_(user_hashes_filter))
+        if cluster_names is not None:
+            query = query.filter(cluster_table.c.name.in_(cluster_names))
         query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
         rows = query.all()
         records = []
@@ -1459,15 +1631,15 @@
 
         # get last cluster event for each row
         cluster_hashes = set(row_to_user_hash.keys())
-        last_cluster_event_dict = _get_last_cluster_event_multiple(
-            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+        if not summary_response:
+            last_cluster_event_dict = _get_last_cluster_event_multiple(
+                cluster_hashes, ClusterEventType.STATUS_CHANGE)
 
         # get user for each row
         for row in rows:
             user_hash = row_to_user_hash[row.cluster_hash]
             user = user_hash_to_user.get(user_hash, None)
             user_name = user.name if user is not None else None
-            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
             # TODO: use namedtuple instead of dict
             record = {
                 'name': row.name,
@@ -1486,18 +1658,32 @@
                 'status_updated_at': row.status_updated_at,
                 'user_hash': user_hash,
                 'user_name': user_name,
-                'config_hash': row.config_hash,
                 'workspace': row.workspace,
-                'last_creation_yaml': row.last_creation_yaml,
-                'last_creation_command': row.last_creation_command,
                 'is_managed': bool(row.is_managed),
-                'last_event': last_event,
+                'config_hash': row.config_hash,
             }
+            if not summary_response:
+                record['last_creation_yaml'] = row.last_creation_yaml
+                record['last_creation_command'] = row.last_creation_command
+                record['last_event'] = last_cluster_event_dict.get(
+                    row.cluster_hash, None)
 
             records.append(record)
     return records
 
 
+@_init_db
+@metrics_lib.time_me
+def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table.c.name)
+        if exclude_managed_clusters:
+            query = query.filter(cluster_table.c.is_managed == int(False))
+        rows = query.all()
+    return [row[0] for row in rows]
+
+
 @_init_db
 @metrics_lib.time_me
 def get_clusters_from_history(
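
Note: the new get_clusters keyword arguments compose. Summary records omit last_creation_yaml, last_creation_command, and the per-cluster last-event lookup, which is what makes listings cheaper. A usage sketch under those assumptions (cluster names are illustrative):

    from sky import global_user_state

    # Narrow the query to named, unmanaged clusters and skip the expensive
    # event lookup by requesting summary records.
    records = global_user_state.get_clusters(
        exclude_managed_clusters=True,
        cluster_names=['dev-cluster', 'train-cluster'],
        summary_response=True,
    )
    for record in records:
        print(record['name'], record['status'])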
sky/jobs/client/sdk.py CHANGED
@@ -383,15 +383,24 @@ def dashboard() -> None:
 @server_common.check_server_healthy_or_start
 @versions.minimal_api_version(12)
 def pool_apply(
-    task: Union['sky.Task', 'sky.Dag'],
+    task: Optional[Union['sky.Task', 'sky.Dag']],
     pool_name: str,
     mode: 'serve_utils.UpdateMode',
+    workers: Optional[int] = None,
     # Internal only:
     # pylint: disable=invalid-name
     _need_confirmation: bool = False
 ) -> server_common.RequestId[None]:
     """Apply a config to a pool."""
+    remote_api_version = versions.get_remote_api_version()
+    if (workers is not None and
+            (remote_api_version is None or remote_api_version < 19)):
+        raise click.UsageError('Updating the number of workers in a pool is '
+                               'not supported in your API server. Please '
+                               'upgrade to a newer API server to use this '
+                               'feature.')
     return impl.apply(task,
+                      workers,
                       pool_name,
                       mode,
                       pool=True,
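
Note: with task now Optional and the new workers argument, a pool can be resized without re-submitting a task spec; the client refuses to send workers to API servers older than version 19. A hedged sketch of such a call (import paths follow this diff; the pool name is illustrative):

    from sky.jobs.client import sdk as jobs_sdk
    from sky.serve import serve_utils

    # Resize an existing pool to 4 workers; no Task/Dag is needed.
    request_id = jobs_sdk.pool_apply(
        task=None,
        pool_name='my-pool',
        mode=serve_utils.DEFAULT_UPDATE_MODE,
        workers=4,
    )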
sky/jobs/controller.py CHANGED
@@ -3,6 +3,7 @@
 import asyncio
 import logging
 import os
+import pathlib
 import resource
 import shutil
 import sys
@@ -56,6 +57,7 @@ async def create_background_task(coro: typing.Coroutine) -> None:
     async with _background_tasks_lock:
         task = asyncio.create_task(coro)
         _background_tasks.add(task)
+        # TODO(cooperc): Discard needs a lock?
         task.add_done_callback(_background_tasks.discard)
 
 
@@ -896,6 +898,9 @@ class Controller:
             # some data here.
             raise error
 
+    # Use context.contextual to enable per-job output redirection and env var
+    # isolation.
+    @context.contextual
     async def run_job_loop(self,
                            job_id: int,
                            dag_yaml: str,
@@ -904,13 +909,9 @@ class Controller:
                            env_file_path: Optional[str] = None,
                            pool: Optional[str] = None):
         """Background task that runs the job loop."""
-        # Replace os.environ with ContextualEnviron to enable per-job
-        # environment isolation. This allows each job to have its own
-        # environment variables without affecting other jobs or the main
-        # process.
-        context.initialize()
         ctx = context.get()
-        ctx.redirect_log(log_file)  # type: ignore
+        assert ctx is not None, 'Context is not initialized'
+        ctx.redirect_log(pathlib.Path(log_file))
 
         # Load and apply environment variables from the job's environment file
         if env_file_path and os.path.exists(env_file_path):
@@ -921,7 +922,6 @@ class Controller:
                     f'{list(env_vars.keys())}')
 
             # Apply environment variables to the job's context
-            ctx = context.get()
             if ctx is not None:
                 for key, value in env_vars.items():
                     if value is not None:
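
Note: the @context.contextual decorator now gives each job-loop invocation its own context, replacing the explicit context.initialize() call. A sketch of the resulting pattern, assuming context.contextual creates a fresh per-call context as the diff's comment describes (the function name here is hypothetical):

    import pathlib

    from sky.utils import context

    @context.contextual
    async def run_one_job(log_file: str) -> None:
        ctx = context.get()
        assert ctx is not None, 'Context is not initialized'
        # Redirect this coroutine's output to a per-job log file without
        # affecting other jobs or the main process.
        ctx.redirect_log(pathlib.Path(log_file))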
sky/jobs/server/core.py CHANGED
@@ -281,8 +281,7 @@ def launch(
     # Check whether cached jobs controller cluster is accessible
     cluster_name = (
         controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-    record = global_user_state.get_cluster_from_name(cluster_name)
-    if record is not None:
+    if global_user_state.cluster_with_name_exists(cluster_name):
         # there is a cached jobs controller cluster
         try:
             # TODO: do something with returned status?
@@ -959,9 +958,10 @@ def pool_apply(
     task: 'sky.Task',
     pool_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+    workers: Optional[int] = None,
 ) -> None:
     """Apply a config to a pool."""
-    return impl.apply(task, pool_name, mode, pool=True)
+    return impl.apply(task, workers, pool_name, mode, pool=True)
 
 
 @usage_lib.entrypoint
sky/jobs/server/server.py CHANGED
@@ -94,23 +94,27 @@ async def logs(
     request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.schedule_request(
+    schedule_type = api_requests.ScheduleType.SHORT
+    if jobs_logs_body.refresh:
+        # When refresh is specified, the job controller might be restarted,
+        # which takes longer time to finish. We schedule it to long executor.
+        schedule_type = api_requests.ScheduleType.LONG
+    request_task = executor.prepare_request(
         request_id=request.state.request_id,
         request_name='jobs.logs',
         request_body=jobs_logs_body,
         func=core.tail_logs,
-        # TODO(aylei): We have tail logs scheduled as SHORT request, because it
-        # should be responsive. However, it can be long running if the user's
-        # job keeps running, and we should avoid it taking the SHORT worker
-        # indefinitely.
-        # When refresh is True we schedule it as LONG because a controller
-        # restart might be needed.
-        schedule_type=api_requests.ScheduleType.LONG
-        if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
+        schedule_type=schedule_type,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
-    request_task = await api_requests.get_request_async(request.state.request_id
-                                                        )
+    if schedule_type == api_requests.ScheduleType.LONG:
+        executor.schedule_prepared_request(request_task)
+    else:
+        # For short request, run in the coroutine to avoid blocking
+        # short workers.
+        task = executor.execute_request_in_coroutine(request_task)
+        # Cancel the coroutine after the request is done or client disconnects
+        background_tasks.add_task(task.cancel)
 
     return stream_utils.stream_response(
         request_id=request_task.request_id,
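
Note: the new short-request path is a standard asyncio shape: run the handler body as a task, stream from it, and register cancellation so the coroutine stops once the response finishes or the client disconnects. A stand-alone sketch of that shape (names here are illustrative, not SkyPilot APIs):

    import asyncio

    async def tail_logs() -> None:
        # Stand-in for the scheduled request body; runs until cancelled.
        while True:
            await asyncio.sleep(1)

    async def handle_request() -> None:
        task = asyncio.create_task(tail_logs())
        await asyncio.sleep(3)  # pretend the client streamed for a while
        # Equivalent of background_tasks.add_task(task.cancel): tear the
        # coroutine down once the response is done.
        task.cancel()

    asyncio.run(handle_request())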
sky/jobs/utils.py CHANGED
@@ -156,7 +156,7 @@ def _validate_consolidation_mode_config(
     if current_is_consolidation_mode:
         controller_cn = (
             controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+        if global_user_state.cluster_with_name_exists(controller_cn):
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.InconsistentConsolidationModeError(
                     f'{colorama.Fore.RED}Consolidation mode for jobs is '
sky/logs/agent.py CHANGED
@@ -34,7 +34,8 @@ class FluentbitAgent(LoggingAgent):
     def get_setup_command(self,
                           cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
-            'if ! command -v fluent-bit >/dev/null 2>&1; then '
+            # pylint: disable=line-too-long
+            'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
             'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
             'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
@@ -51,14 +52,32 @@ class FluentbitAgent(LoggingAgent):
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
         config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
                       f'echo {shlex.quote(cfg)} > {cfg_path}')
+        kill_prior_cmd = (
+            'if [ -f "/tmp/fluentbit.pid" ]; then '
+            # pylint: disable=line-too-long
+            'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
+            'kill "$(cat /tmp/fluentbit.pid)" || true; '
+            'fi')
         start_cmd = ('nohup $(command -v fluent-bit || '
                      'echo "/opt/fluent-bit/bin/fluent-bit") '
-                     f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
-        return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
+                     f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
+                     'echo $! > /tmp/fluentbit.pid')
+        return ('set -e; '
+                f'{install_cmd}; '
+                f'{config_cmd}; '
+                f'{kill_prior_cmd}; '
+                f'{start_cmd}')
 
     def fluentbit_config(self,
                          cluster_name: resources_utils.ClusterName) -> str:
         cfg_dict = {
+            'parsers': [{
+                'name': 'sky-ray-parser',
+                'format': 'regex',
+                # pylint: disable=line-too-long
+                'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
+                'types': 'rank:integer pid:integer',
+            }],
             'pipeline': {
                 'inputs': [{
                     'name': 'tail',
@@ -70,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
                     # right after the job completion.
                     'refresh_interval': 1,
                 }],
+                'filters': [{
+                    'name': 'parser',
+                    'match': '*',
+                    'key_name': 'log',
+                    'parser': 'sky-ray-parser',
+                    'preserve_key': 'on',  # preserve field for backwards compat
+                    'reserve_data': 'on',
+                }],
                 'outputs': [self.fluentbit_output_config(cluster_name)],
             }
         }
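
Note: the sky-ray-parser regex splits a Ray-style worker prefix into named fields before the log record reaches the output plugin. Fluent Bit uses Ruby-style (?<name>...) groups; translated to Python's (?P<name>...) syntax, the same pattern can be checked locally (the sample line is illustrative):

    import re

    RAY_PREFIX = re.compile(
        r'(?:\x1b\[[\d;]+m)?\((?P<worker_name>[^,]+)'
        r'(?:,\s*rank=(?P<rank>\d+))?'
        r'(?:,\s*pid=(?P<pid>\d+))'
        r'(?:,\s*ip=(?P<ip>[\d.]+))?\)'
        r'(?:\x1b\[[\d;]+m)?\s*(?P<log_line>.*)')

    match = RAY_PREFIX.match('(worker1, rank=0, pid=4567, ip=10.0.0.1) step done')
    assert match is not None
    print(match.groupdict())
    # {'worker_name': 'worker1', 'rank': '0', 'pid': '4567',
    #  'ip': '10.0.0.1', 'log_line': 'step done'}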