skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend.py +5 -3
- sky/backends/backend_utils.py +22 -7
- sky/backends/cloud_vm_ray_backend.py +50 -18
- sky/backends/local_docker_backend.py +8 -3
- sky/client/cli/command.py +25 -10
- sky/client/sdk.py +51 -1
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/core.py +9 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
- sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +29 -9
- sky/execution.py +13 -10
- sky/global_user_state.py +131 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/recovery_strategy.py +0 -3
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -11
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/api/responses.py +50 -1
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/common.py +2 -3
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +20 -5
- sky/server/requests/serializers/encoders.py +21 -8
- sky/server/server.py +57 -11
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +2 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py
CHANGED
@@ -252,17 +252,28 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
 
 def zip_files_and_folders(items: List[str],
                           output_file: Union[str, pathlib.Path],
-                          log_file: Optional[TextIO] = None
+                          log_file: Optional[TextIO] = None,
+                          relative_to_items: bool = False):
+
+    def _get_archive_name(file_path: str, item_path: str) -> str:
+        """Get the archive name for a file based on the relative parameters."""
+        if relative_to_items:
+            # Make paths relative to the item itself
+            return os.path.relpath(file_path, os.path.dirname(item_path))
+        else:
+            # Default: use full path (existing behavior)
+            return file_path
 
-    def _store_symlink(zipf, path: str, is_dir: bool):
+    def _store_symlink(zipf, path: str, archive_name: str, is_dir: bool):
         # Get the target of the symlink
         target = os.readlink(path)
         # Use relative path as absolute path will not be able to resolve on
         # remote API server.
         if os.path.isabs(target):
             target = os.path.relpath(target, os.path.dirname(path))
-        # Create a ZipInfo instance
-        zi = zipfile.ZipInfo(
+        # Create a ZipInfo instance using the archive name
+        zi = zipfile.ZipInfo(archive_name +
+                             '/') if is_dir else zipfile.ZipInfo(archive_name)
         # Set external attributes to mark as symlink
         zi.external_attr = 0xA1ED0000
         # Write symlink target as content
@@ -281,7 +292,8 @@ def zip_files_and_folders(items: List[str],
                 # Add the file to the zip archive even if it matches
                 # patterns in dot ignore files, as it was explicitly
                 # specified by user.
-
+                archive_name = _get_archive_name(item, item)
+                zipf.write(item, archive_name)
             elif os.path.isdir(item):
                 excluded_files = set([
                     os.path.join(item, f.rstrip('/'))
@@ -304,21 +316,29 @@ def zip_files_and_folders(items: List[str],
                 # directories)
                 for dir_name in dirs:
                     dir_path = os.path.join(root, dir_name)
+                    archive_name = _get_archive_name(dir_path, item)
                     # If it's a symlink, store it as a symlink
                     if os.path.islink(dir_path):
-                        _store_symlink(zipf,
+                        _store_symlink(zipf,
+                                       dir_path,
+                                       archive_name,
+                                       is_dir=True)
                     else:
-                        zipf.write(dir_path)
+                        zipf.write(dir_path, archive_name)
 
                 for file in files:
                     file_path = os.path.join(root, file)
                     if file_path in excluded_files:
                         continue
+                    archive_name = _get_archive_name(file_path, item)
                     if os.path.islink(file_path):
-                        _store_symlink(zipf,
+                        _store_symlink(zipf,
+                                       file_path,
+                                       archive_name,
+                                       is_dir=False)
                         continue
                     if stat.S_ISSOCK(os.stat(file_path).st_mode):
                         continue
-                    zipf.write(file_path)
+                    zipf.write(file_path, archive_name)
                 if log_file is not None:
                     log_file.write(f'Zipped {item}\n')
sky/execution.py
CHANGED
@@ -173,19 +173,12 @@ def _execute(
       if dryrun.
     """
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
-    dag.resolve_and_validate_volumes()
-    if (not _is_launched_by_jobs_controller and
-            not _is_launched_by_sky_serve_controller):
-        # Only process pre-mount operations on API server.
-        dag.pre_mount_volumes()
     for task in dag.tasks:
-        if task.storage_mounts is not None:
-            for storage in task.storage_mounts.values():
-                # Ensure the storage is constructed.
-                storage.construct()
         for resource in task.resources:
             # For backward compatibility, we need to override the autostop
-            # config at server-side for legacy clients.
+            # config at server-side for legacy clients. This should be set
+            # before admin policy to make the admin policy get the final
+            # value of autostop config.
             # TODO(aylei): remove this after we bump the API version.
             resource.override_autostop_config(
                 down=down, idle_minutes=idle_minutes_to_autostop)
@@ -200,6 +193,16 @@ def _execute(
             down=down,
             dryrun=dryrun,
         )) as dag:
+        dag.resolve_and_validate_volumes()
+        if (not _is_launched_by_jobs_controller and
+                not _is_launched_by_sky_serve_controller):
+            # Only process pre-mount operations on API server.
+            dag.pre_mount_volumes()
+        for task in dag.tasks:
+            if task.storage_mounts is not None:
+                for storage in task.storage_mounts.values():
+                    # Ensure the storage is constructed.
+                    storage.construct()
         return _execute_dag(
             dag,
             dryrun=dryrun,
sky/global_user_state.py
CHANGED
@@ -6,6 +6,7 @@ Concepts:
 - Cluster handle: (non-user facing) an opaque backend handle for us to
   interact with a cluster.
 """
+import asyncio
 import enum
 import functools
 import json
@@ -51,6 +52,9 @@ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
+DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
+
 Base = declarative.declarative_base()
 
 config_table = sqlalchemy.Table(
@@ -102,6 +106,9 @@ cluster_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -161,6 +168,9 @@ cluster_history_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 
@@ -430,6 +440,17 @@ def get_user_by_name(username: str) -> List[models.User]:
     ]
 
 
+@_init_db
+def get_user_by_name_match(username_match: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.name.like(f'%{username_match}%')).all()
+        return [
+            models.User(id=row.id, name=row.name, created_at=row.created_at)
+            for row in rows
+        ]
+
+
 @_init_db
 def delete_user(user_id: str) -> None:
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -458,7 +479,8 @@ def add_or_update_cluster(cluster_name: str,
                           is_launch: bool = True,
                           config_hash: Optional[str] = None,
                           task_config: Optional[Dict[str, Any]] = None,
-                          is_managed: bool = False
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -473,6 +495,7 @@ def add_or_update_cluster(cluster_name: str,
         task_config: The config of the task being launched.
         is_managed: Whether the cluster is launched by the
             controller.
+        provision_log_path: Absolute path to provision.log, if available.
     """
     assert _SQLALCHEMY_ENGINE is not None
     # FIXME: launched_at will be changed when `sky launch -c` is called.
@@ -555,6 +578,10 @@ def add_or_update_cluster(cluster_name: str,
                 if task_config else None,
             'last_creation_command': last_use,
         })
+    if provision_log_path is not None:
+        conditional_values.update({
+            'provision_log_path': provision_log_path,
+        })
 
     if (_SQLALCHEMY_ENGINE.dialect.name ==
             db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -618,6 +645,7 @@ def add_or_update_cluster(cluster_name: str,
             usage_intervals=pickle.dumps(usage_intervals),
             user_hash=user_hash,
             workspace=history_workspace,
+            provision_log_path=provision_log_path,
             **creation_info,
         )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -633,6 +661,7 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(usage_intervals),
                 cluster_history_table.c.user_hash: history_hash,
                 cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
                 **creation_info,
             })
         session.execute(do_update_stmt)
@@ -731,6 +760,41 @@ def get_last_cluster_event(cluster_hash: str,
     return row.reason
 
 
+def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at < time.time() -
+            retention_hours * 3600)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                cleanup_cluster_events_with_retention(retention_hours)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run daemon at most once every hour to avoid too frequent cleanup.
+        sleep_amount = max(retention_hours * 3600,
+                           MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
 def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
                        event_type: ClusterEventType) -> List[str]:
     """Returns the cluster events for the cluster.
@@ -793,11 +857,13 @@ def update_last_use(cluster_name: str):
 
 
 @_init_db
-def remove_cluster(cluster_name: str, terminate: bool
+def remove_cluster(cluster_name: str, terminate: bool,
+                   remove_events: bool) -> None:
     """Removes cluster_name mapping."""
    assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)
 
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         # usage_intervals is not None and not empty
@@ -808,8 +874,19 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
            usage_intervals.append((start_time, end_time))
            _set_cluster_usage_intervals(cluster_hash, usage_intervals)
 
+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
         if terminate:
             session.query(cluster_table).filter_by(name=cluster_name).delete()
+            if remove_events:
             session.query(cluster_event_table).filter_by(
                 cluster_hash=cluster_hash).delete()
         else:
@@ -915,6 +992,58 @@ def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
         return json.loads(row.metadata)
 
 
+@_init_db
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
+            return None
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
+
+
 @_init_db
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
     assert _SQLALCHEMY_ENGINE is not None
sky/jobs/constants.py
CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 9
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/recovery_strategy.py
CHANGED
@@ -261,9 +261,6 @@ class StrategyExecutor:
         if self.cluster_name is None:
             return
         if self.pool is None:
-            global_user_state.add_cluster_event(
-                self.cluster_name, None, 'Cluster was cleaned up.',
-                global_user_state.ClusterEventType.STATUS_CHANGE)
             managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _launch(self,
sky/jobs/scheduler.py
CHANGED
@@ -93,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs(
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -139,7 +139,7 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
         with filelock.FileLock(controller_utils.get_resources_lock_path(),
                                blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job(
+                maybe_next_job = state.get_waiting_job()
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
@@ -158,22 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
                 # an ALIVE_WAITING job, but we would be able to launch a WAITING
                 # job.
                 if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not
-                            actual_pool is not None):
+                    if not controller_utils.can_provision():
                         # Can't schedule anything, break from scheduling loop.
                         break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
                     if not _can_start_new_job(actual_pool):
-                        # If there is no job can be scheduled in the pool, we
-                        # try to schedule another job regardless of the pool.
-                        # This is to avoid the case where the pool is scaled
-                        # down at the same time as a job is done. In this case,
-                        # we won't have any job to schedule in the pool, but
-                        # other jobs in other pool (or no pool) can still be
-                        # scheduled.
-                        if pool is not None:
-                            pool = None
-                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break
 
@@ -218,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
@@ -243,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return
 
     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -254,7 +250,6 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
@@ -268,7 +263,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(controller_utils.get_resources_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -283,19 +278,17 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)
 
     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
-    maybe_schedule_next_jobs(pool)
+    maybe_schedule_next_jobs()
 
 
 def _can_start_new_job(pool: Optional[str]) -> bool:
sky/jobs/server/core.py
CHANGED
@@ -497,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -513,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -568,10 +576,18 @@ def _maybe_restart_controller(
 
 
 @usage_lib.entrypoint
-def queue(
-
-
-
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
@@ -601,6 +617,17 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -609,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -622,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total
 
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:
 
         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -636,7 +684,6 @@ def queue(refresh: bool,
 
         jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -655,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)
 
 
 @usage_lib.entrypoint
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
sky/jobs/state.py
CHANGED
@@ -1528,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,
 
 
 @_init_db
-def get_waiting_job(
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.
 
     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1559,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,