skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -60
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +164 -31
- sky/jobs/utils.py +144 -68
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/nebius/constants.py +3 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/py.typed +0 -0
- sky/resources.py +24 -14
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +6 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import enum
|
|
5
5
|
import functools
|
|
6
6
|
import json
|
|
7
|
+
import threading
|
|
7
8
|
import time
|
|
8
9
|
import typing
|
|
9
10
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
|
|
|
33
34
|
logger = sky_logging.init_logger(__name__)
|
|
34
35
|
|
|
35
36
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
37
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
36
38
|
|
|
37
39
|
Base = declarative.declarative_base()
|
|
38
40
|
|
|
@@ -98,6 +100,13 @@ job_info_table = sqlalchemy.Table(
|
|
|
98
100
|
sqlalchemy.Column('original_user_yaml_path',
|
|
99
101
|
sqlalchemy.Text,
|
|
100
102
|
server_default=None),
|
|
103
|
+
sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
|
|
104
|
+
sqlalchemy.Column('current_cluster_name',
|
|
105
|
+
sqlalchemy.Text,
|
|
106
|
+
server_default=None),
|
|
107
|
+
sqlalchemy.Column('job_id_on_pool_cluster',
|
|
108
|
+
sqlalchemy.Integer,
|
|
109
|
+
server_default=None),
|
|
101
110
|
)
|
|
102
111
|
|
|
103
112
|
ha_recovery_script_table = sqlalchemy.Table(
|
|
@@ -131,21 +140,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
131
140
|
migration_utils.SPOT_JOBS_VERSION)
|
|
132
141
|
|
|
133
142
|
|
|
143
|
+
# We wrap the sqlalchemy engine initialization in a thread
|
|
144
|
+
# lock to ensure that multiple threads do not initialize the
|
|
145
|
+
# engine which could result in a rare race condition where
|
|
146
|
+
# a session has already been created with _SQLALCHEMY_ENGINE = e1,
|
|
147
|
+
# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
|
|
148
|
+
# which could result in e1 being garbage collected unexpectedly.
|
|
134
149
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
135
150
|
global _SQLALCHEMY_ENGINE
|
|
136
151
|
|
|
137
152
|
if _SQLALCHEMY_ENGINE is not None:
|
|
138
153
|
return _SQLALCHEMY_ENGINE
|
|
139
154
|
|
|
140
|
-
|
|
141
|
-
|
|
155
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
156
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
157
|
+
return _SQLALCHEMY_ENGINE
|
|
158
|
+
# get an engine to the db
|
|
159
|
+
engine = migration_utils.get_engine('spot_jobs')
|
|
142
160
|
|
|
143
|
-
|
|
144
|
-
|
|
161
|
+
# run migrations if needed
|
|
162
|
+
create_table(engine)
|
|
145
163
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
164
|
+
# return engine
|
|
165
|
+
_SQLALCHEMY_ENGINE = engine
|
|
166
|
+
return _SQLALCHEMY_ENGINE
|
|
149
167
|
|
|
150
168
|
|
|
151
169
|
def _init_db(func):
|
|
@@ -204,6 +222,9 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
|
|
|
204
222
|
'priority': r['priority'],
|
|
205
223
|
'entrypoint': r['entrypoint'],
|
|
206
224
|
'original_user_yaml_path': r['original_user_yaml_path'],
|
|
225
|
+
'pool': r['pool'],
|
|
226
|
+
'current_cluster_name': r['current_cluster_name'],
|
|
227
|
+
'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
|
|
207
228
|
}
|
|
208
229
|
|
|
209
230
|
|
|
@@ -440,8 +461,8 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
|
|
|
440
461
|
|
|
441
462
|
|
|
442
463
|
@_init_db
|
|
443
|
-
def set_job_info_without_job_id(name: str, workspace: str,
|
|
444
|
-
|
|
464
|
+
def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
|
|
465
|
+
pool: Optional[str]) -> int:
|
|
445
466
|
assert _SQLALCHEMY_ENGINE is not None
|
|
446
467
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
447
468
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -458,6 +479,7 @@ def set_job_info_without_job_id(name: str, workspace: str,
|
|
|
458
479
|
schedule_state=ManagedJobScheduleState.INACTIVE.value,
|
|
459
480
|
workspace=workspace,
|
|
460
481
|
entrypoint=entrypoint,
|
|
482
|
+
pool=pool,
|
|
461
483
|
)
|
|
462
484
|
|
|
463
485
|
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
@@ -1045,6 +1067,23 @@ def _get_all_task_ids_statuses(
|
|
|
1045
1067
|
return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
|
|
1046
1068
|
|
|
1047
1069
|
|
|
1070
|
+
@_init_db
|
|
1071
|
+
def get_all_task_ids_names_statuses_logs(
|
|
1072
|
+
job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
|
|
1073
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1074
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1075
|
+
id_names = session.execute(
|
|
1076
|
+
sqlalchemy.select(
|
|
1077
|
+
spot_table.c.task_id,
|
|
1078
|
+
spot_table.c.task_name,
|
|
1079
|
+
spot_table.c.status,
|
|
1080
|
+
spot_table.c.local_log_file,
|
|
1081
|
+
).where(spot_table.c.spot_job_id == job_id).order_by(
|
|
1082
|
+
spot_table.c.task_id.asc())).fetchall()
|
|
1083
|
+
return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
|
|
1084
|
+
for row in id_names]
|
|
1085
|
+
|
|
1086
|
+
|
|
1048
1087
|
@_init_db
|
|
1049
1088
|
def get_job_status_with_task_id(job_id: int,
|
|
1050
1089
|
task_id: int) -> Optional[ManagedJobStatus]:
|
|
@@ -1250,6 +1289,56 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
|
|
|
1250
1289
|
return updated_count == 0
|
|
1251
1290
|
|
|
1252
1291
|
|
|
1292
|
+
@_init_db
|
|
1293
|
+
def get_pool_from_job_id(job_id: int) -> Optional[str]:
|
|
1294
|
+
"""Get the pool from the job id."""
|
|
1295
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1296
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1297
|
+
pool = session.execute(
|
|
1298
|
+
sqlalchemy.select(job_info_table.c.pool).where(
|
|
1299
|
+
job_info_table.c.spot_job_id == job_id)).fetchone()
|
|
1300
|
+
return pool[0] if pool else None
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
@_init_db
|
|
1304
|
+
def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
|
|
1305
|
+
"""Set the current cluster name for a job."""
|
|
1306
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1307
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1308
|
+
session.query(job_info_table).filter(
|
|
1309
|
+
job_info_table.c.spot_job_id == job_id).update(
|
|
1310
|
+
{job_info_table.c.current_cluster_name: current_cluster_name})
|
|
1311
|
+
session.commit()
|
|
1312
|
+
|
|
1313
|
+
|
|
1314
|
+
@_init_db
|
|
1315
|
+
def set_job_id_on_pool_cluster(job_id: int,
|
|
1316
|
+
job_id_on_pool_cluster: int) -> None:
|
|
1317
|
+
"""Set the job id on the pool cluster for a job."""
|
|
1318
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1319
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1320
|
+
session.query(job_info_table).filter(
|
|
1321
|
+
job_info_table.c.spot_job_id == job_id).update({
|
|
1322
|
+
job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
|
|
1323
|
+
})
|
|
1324
|
+
session.commit()
|
|
1325
|
+
|
|
1326
|
+
|
|
1327
|
+
@_init_db
|
|
1328
|
+
def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
|
|
1329
|
+
"""Get the cluster name and job id on the pool from the managed job id."""
|
|
1330
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1331
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1332
|
+
info = session.execute(
|
|
1333
|
+
sqlalchemy.select(
|
|
1334
|
+
job_info_table.c.current_cluster_name,
|
|
1335
|
+
job_info_table.c.job_id_on_pool_cluster).where(
|
|
1336
|
+
job_info_table.c.spot_job_id == job_id)).fetchone()
|
|
1337
|
+
if info is None:
|
|
1338
|
+
return None, None
|
|
1339
|
+
return info[0], info[1]
|
|
1340
|
+
|
|
1341
|
+
|
|
1253
1342
|
@_init_db
|
|
1254
1343
|
def scheduler_set_launching(job_id: int,
|
|
1255
1344
|
current_state: ManagedJobScheduleState) -> None:
|
|
@@ -1370,28 +1459,68 @@ def get_num_launching_jobs() -> int:
|
|
|
1370
1459
|
sqlalchemy.select(
|
|
1371
1460
|
sqlalchemy.func.count() # pylint: disable=not-callable
|
|
1372
1461
|
).select_from(job_info_table).where(
|
|
1373
|
-
|
|
1374
|
-
|
|
1462
|
+
sqlalchemy.and_(
|
|
1463
|
+
job_info_table.c.schedule_state ==
|
|
1464
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
|
1465
|
+
# We only count jobs that are not in the pool, because the
|
|
1466
|
+
# job in the pool does not actually call sky.launch.
|
|
1467
|
+
job_info_table.c.pool.is_(None)))).fetchone()[0]
|
|
1375
1468
|
|
|
1376
1469
|
|
|
1377
1470
|
@_init_db
|
|
1378
|
-
def get_num_alive_jobs() -> int:
|
|
1471
|
+
def get_num_alive_jobs(pool: Optional[str] = None) -> int:
|
|
1379
1472
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1380
1473
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1474
|
+
where_conditions = [
|
|
1475
|
+
job_info_table.c.schedule_state.in_([
|
|
1476
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
|
1477
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
|
1478
|
+
ManagedJobScheduleState.ALIVE.value,
|
|
1479
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value,
|
|
1480
|
+
])
|
|
1481
|
+
]
|
|
1482
|
+
|
|
1483
|
+
if pool is not None:
|
|
1484
|
+
where_conditions.append(job_info_table.c.pool == pool)
|
|
1485
|
+
|
|
1381
1486
|
return session.execute(
|
|
1382
1487
|
sqlalchemy.select(
|
|
1383
1488
|
sqlalchemy.func.count() # pylint: disable=not-callable
|
|
1384
1489
|
).select_from(job_info_table).where(
|
|
1385
|
-
|
|
1386
|
-
ManagedJobScheduleState.ALIVE_WAITING.value,
|
|
1387
|
-
ManagedJobScheduleState.LAUNCHING.value,
|
|
1388
|
-
ManagedJobScheduleState.ALIVE.value,
|
|
1389
|
-
ManagedJobScheduleState.ALIVE_BACKOFF.value,
|
|
1390
|
-
]))).fetchone()[0]
|
|
1490
|
+
sqlalchemy.and_(*where_conditions))).fetchone()[0]
|
|
1391
1491
|
|
|
1392
1492
|
|
|
1393
1493
|
@_init_db
|
|
1394
|
-
def
|
|
1494
|
+
def get_nonterminal_job_ids_by_pool(pool: str,
|
|
1495
|
+
cluster_name: Optional[str] = None
|
|
1496
|
+
) -> List[int]:
|
|
1497
|
+
"""Get nonterminal job ids in a pool."""
|
|
1498
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1499
|
+
|
|
1500
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1501
|
+
query = sqlalchemy.select(
|
|
1502
|
+
spot_table.c.spot_job_id.distinct()).select_from(
|
|
1503
|
+
spot_table.outerjoin(
|
|
1504
|
+
job_info_table,
|
|
1505
|
+
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
|
1506
|
+
and_conditions = [
|
|
1507
|
+
~spot_table.c.status.in_([
|
|
1508
|
+
status.value for status in ManagedJobStatus.terminal_statuses()
|
|
1509
|
+
]),
|
|
1510
|
+
job_info_table.c.pool == pool,
|
|
1511
|
+
]
|
|
1512
|
+
if cluster_name is not None:
|
|
1513
|
+
and_conditions.append(
|
|
1514
|
+
job_info_table.c.current_cluster_name == cluster_name)
|
|
1515
|
+
query = query.where(sqlalchemy.and_(*and_conditions)).order_by(
|
|
1516
|
+
spot_table.c.spot_job_id.asc())
|
|
1517
|
+
rows = session.execute(query).fetchall()
|
|
1518
|
+
job_ids = [row[0] for row in rows if row[0] is not None]
|
|
1519
|
+
return job_ids
|
|
1520
|
+
|
|
1521
|
+
|
|
1522
|
+
@_init_db
|
|
1523
|
+
def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
|
|
1395
1524
|
"""Get the next job that should transition to LAUNCHING.
|
|
1396
1525
|
|
|
1397
1526
|
Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
|
|
@@ -1414,23 +1543,26 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
|
|
|
1414
1543
|
ManagedJobScheduleState.ALIVE_BACKOFF.value,
|
|
1415
1544
|
])).scalar_subquery()
|
|
1416
1545
|
# Main query for waiting jobs
|
|
1546
|
+
select_conds = [
|
|
1547
|
+
job_info_table.c.schedule_state.in_([
|
|
1548
|
+
ManagedJobScheduleState.WAITING.value,
|
|
1549
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
|
1550
|
+
]),
|
|
1551
|
+
job_info_table.c.priority >= sqlalchemy.func.coalesce(
|
|
1552
|
+
max_priority_subquery, 0),
|
|
1553
|
+
]
|
|
1554
|
+
if pool is not None:
|
|
1555
|
+
select_conds.append(job_info_table.c.pool == pool)
|
|
1417
1556
|
query = sqlalchemy.select(
|
|
1418
1557
|
job_info_table.c.spot_job_id,
|
|
1419
1558
|
job_info_table.c.schedule_state,
|
|
1420
1559
|
job_info_table.c.dag_yaml_path,
|
|
1421
1560
|
job_info_table.c.env_file_path,
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
]),
|
|
1428
|
-
job_info_table.c.priority >= sqlalchemy.func.coalesce(
|
|
1429
|
-
max_priority_subquery, 0),
|
|
1430
|
-
)).order_by(
|
|
1431
|
-
job_info_table.c.priority.desc(),
|
|
1432
|
-
job_info_table.c.spot_job_id.asc(),
|
|
1433
|
-
).limit(1)
|
|
1561
|
+
job_info_table.c.pool,
|
|
1562
|
+
).where(sqlalchemy.and_(*select_conds)).order_by(
|
|
1563
|
+
job_info_table.c.priority.desc(),
|
|
1564
|
+
job_info_table.c.spot_job_id.asc(),
|
|
1565
|
+
).limit(1)
|
|
1434
1566
|
waiting_job_row = session.execute(query).fetchone()
|
|
1435
1567
|
if waiting_job_row is None:
|
|
1436
1568
|
return None
|
|
@@ -1440,6 +1572,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
|
|
|
1440
1572
|
'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
|
|
1441
1573
|
'dag_yaml_path': waiting_job_row[2],
|
|
1442
1574
|
'env_file_path': waiting_job_row[3],
|
|
1575
|
+
'pool': waiting_job_row[4],
|
|
1443
1576
|
}
|
|
1444
1577
|
|
|
1445
1578
|
|