skypilot-nightly 1.0.0.dev20250717__py3-none-any.whl → 1.0.0.dev20250720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +23 -13
- sky/backends/cloud_vm_ray_backend.py +19 -11
- sky/catalog/__init__.py +3 -1
- sky/catalog/aws_catalog.py +8 -5
- sky/catalog/azure_catalog.py +8 -5
- sky/catalog/common.py +8 -2
- sky/catalog/cudo_catalog.py +5 -2
- sky/catalog/do_catalog.py +4 -1
- sky/catalog/fluidstack_catalog.py +5 -2
- sky/catalog/gcp_catalog.py +8 -5
- sky/catalog/hyperbolic_catalog.py +5 -2
- sky/catalog/ibm_catalog.py +8 -5
- sky/catalog/lambda_catalog.py +8 -5
- sky/catalog/nebius_catalog.py +8 -5
- sky/catalog/oci_catalog.py +8 -5
- sky/catalog/paperspace_catalog.py +4 -1
- sky/catalog/runpod_catalog.py +5 -2
- sky/catalog/scp_catalog.py +8 -5
- sky/catalog/vast_catalog.py +5 -2
- sky/catalog/vsphere_catalog.py +4 -1
- sky/client/cli/command.py +25 -2
- sky/client/sdk.py +10 -5
- sky/clouds/aws.py +12 -7
- sky/clouds/azure.py +12 -7
- sky/clouds/cloud.py +9 -8
- sky/clouds/cudo.py +13 -7
- sky/clouds/do.py +12 -7
- sky/clouds/fluidstack.py +11 -6
- sky/clouds/gcp.py +12 -7
- sky/clouds/hyperbolic.py +11 -6
- sky/clouds/ibm.py +11 -6
- sky/clouds/kubernetes.py +7 -3
- sky/clouds/lambda_cloud.py +11 -6
- sky/clouds/nebius.py +12 -7
- sky/clouds/oci.py +12 -7
- sky/clouds/paperspace.py +12 -7
- sky/clouds/runpod.py +12 -7
- sky/clouds/scp.py +11 -6
- sky/clouds/vast.py +12 -7
- sky/clouds/vsphere.py +11 -6
- sky/core.py +6 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-a821dcaaae2a3823.js +6 -0
- sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.5233e938f14e31a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-63fc419cb82ad9b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +1 -0
- sky/dashboard/out/_next/static/chunks/{9984.b56614f3c4c5961d.js → 9984.2b5e3fa69171bff9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa406155b4223d0d.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-14d404b7dd28502a.js → [job]-c5b357bfd9502fbe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +1 -0
- sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +3 -0
- sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +13 -143
- sky/jobs/client/sdk.py +1 -1
- sky/jobs/server/core.py +14 -0
- sky/jobs/state.py +9 -88
- sky/jobs/utils.py +28 -13
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/serve/client/sdk.py +7 -3
- sky/serve/controller.py +7 -3
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +171 -75
- sky/serve/server/core.py +17 -6
- sky/server/common.py +4 -0
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +1 -1
- sky/server/rest.py +71 -26
- sky/setup_files/MANIFEST.in +2 -0
- sky/setup_files/alembic.ini +152 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/configs.py +1 -1
- sky/skylet/job_lib.py +1 -1
- sky/skypilot_config.py +32 -6
- sky/users/permission.py +1 -1
- sky/utils/common_utils.py +77 -0
- sky/utils/db/__init__.py +0 -0
- sky/utils/{db_utils.py → db/db_utils.py} +59 -0
- sky/utils/db/migration_utils.py +53 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/RECORD +110 -101
- sky/dashboard/out/_next/static/chunks/1043-90a88c46f27b3df5.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-743abf4bc86baf48.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
- sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/webpack-c3b45b7b0eaef66f.js +0 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
- sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
|
@@ -12,12 +12,12 @@ import os
|
|
|
12
12
|
import pathlib
|
|
13
13
|
import pickle
|
|
14
14
|
import re
|
|
15
|
-
import threading
|
|
16
15
|
import time
|
|
17
16
|
import typing
|
|
18
17
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
19
18
|
import uuid
|
|
20
19
|
|
|
20
|
+
from alembic import command as alembic_command
|
|
21
21
|
import sqlalchemy
|
|
22
22
|
from sqlalchemy import exc as sqlalchemy_exc
|
|
23
23
|
from sqlalchemy import orm
|
|
@@ -32,9 +32,10 @@ from sky import skypilot_config
|
|
|
32
32
|
from sky.skylet import constants
|
|
33
33
|
from sky.utils import common_utils
|
|
34
34
|
from sky.utils import context_utils
|
|
35
|
-
from sky.utils import db_utils
|
|
36
35
|
from sky.utils import registry
|
|
37
36
|
from sky.utils import status_lib
|
|
37
|
+
from sky.utils.db import db_utils
|
|
38
|
+
from sky.utils.db import migration_utils
|
|
38
39
|
|
|
39
40
|
if typing.TYPE_CHECKING:
|
|
40
41
|
from sky import backends
|
|
@@ -48,7 +49,6 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
|
|
|
48
49
|
_ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
|
|
49
50
|
|
|
50
51
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
51
|
-
_DB_INIT_LOCK = threading.Lock()
|
|
52
52
|
|
|
53
53
|
Base = declarative.declarative_base()
|
|
54
54
|
|
|
@@ -238,152 +238,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
238
238
|
# If the database is locked, it is OK to continue, as the WAL mode
|
|
239
239
|
# is not critical and is likely to be enabled by other processes.
|
|
240
240
|
|
|
241
|
-
#
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
#
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
# Add autostop column to clusters table
|
|
249
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
250
|
-
'clusters',
|
|
251
|
-
'autostop',
|
|
252
|
-
sqlalchemy.Integer(),
|
|
253
|
-
default_statement='DEFAULT -1')
|
|
254
|
-
|
|
255
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
256
|
-
session,
|
|
257
|
-
'clusters',
|
|
258
|
-
'metadata',
|
|
259
|
-
sqlalchemy.Text(),
|
|
260
|
-
default_statement='DEFAULT \'{}\'')
|
|
261
|
-
|
|
262
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
263
|
-
'clusters',
|
|
264
|
-
'to_down',
|
|
265
|
-
sqlalchemy.Integer(),
|
|
266
|
-
default_statement='DEFAULT 0')
|
|
267
|
-
|
|
268
|
-
# The cloud identity that created the cluster.
|
|
269
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
270
|
-
session,
|
|
271
|
-
'clusters',
|
|
272
|
-
'owner',
|
|
273
|
-
sqlalchemy.Text(),
|
|
274
|
-
default_statement='DEFAULT NULL')
|
|
275
|
-
|
|
276
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
277
|
-
session,
|
|
278
|
-
'clusters',
|
|
279
|
-
'cluster_hash',
|
|
280
|
-
sqlalchemy.Text(),
|
|
281
|
-
default_statement='DEFAULT NULL')
|
|
282
|
-
|
|
283
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
284
|
-
session,
|
|
285
|
-
'clusters',
|
|
286
|
-
'storage_mounts_metadata',
|
|
287
|
-
sqlalchemy.LargeBinary(),
|
|
288
|
-
default_statement='DEFAULT NULL')
|
|
289
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
290
|
-
session,
|
|
291
|
-
'clusters',
|
|
292
|
-
'cluster_ever_up',
|
|
293
|
-
sqlalchemy.Integer(),
|
|
294
|
-
default_statement='DEFAULT 0',
|
|
295
|
-
# Set the value to 1 so that all the existing clusters before #2977
|
|
296
|
-
# are considered as ever up, i.e:
|
|
297
|
-
# existing cluster's default (null) -> 1;
|
|
298
|
-
# new cluster's default -> 0;
|
|
299
|
-
# This is conservative for the existing clusters: even if some INIT
|
|
300
|
-
# clusters were never really UP, setting it to 1 means they won't be
|
|
301
|
-
# auto-deleted during any failover.
|
|
302
|
-
value_to_replace_existing_entries=1)
|
|
303
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
304
|
-
session,
|
|
305
|
-
'clusters',
|
|
306
|
-
'status_updated_at',
|
|
307
|
-
sqlalchemy.Integer(),
|
|
308
|
-
default_statement='DEFAULT NULL')
|
|
309
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
310
|
-
session,
|
|
311
|
-
'clusters',
|
|
312
|
-
'user_hash',
|
|
313
|
-
sqlalchemy.Text(),
|
|
314
|
-
default_statement='DEFAULT NULL',
|
|
315
|
-
value_to_replace_existing_entries=common_utils.get_current_user(
|
|
316
|
-
).id)
|
|
317
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
318
|
-
session,
|
|
319
|
-
'clusters',
|
|
320
|
-
'config_hash',
|
|
321
|
-
sqlalchemy.Text(),
|
|
322
|
-
default_statement='DEFAULT NULL')
|
|
323
|
-
|
|
324
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
325
|
-
session,
|
|
326
|
-
'cluster_history',
|
|
327
|
-
'user_hash',
|
|
328
|
-
sqlalchemy.Text(),
|
|
329
|
-
default_statement='DEFAULT NULL')
|
|
330
|
-
|
|
331
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
332
|
-
session,
|
|
333
|
-
'clusters',
|
|
334
|
-
'workspace',
|
|
335
|
-
sqlalchemy.Text(),
|
|
336
|
-
default_statement='DEFAULT \'default\'',
|
|
337
|
-
value_to_replace_existing_entries=constants.
|
|
338
|
-
SKYPILOT_DEFAULT_WORKSPACE)
|
|
339
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
340
|
-
session,
|
|
341
|
-
'clusters',
|
|
342
|
-
'last_creation_yaml',
|
|
343
|
-
sqlalchemy.Text(),
|
|
344
|
-
default_statement='DEFAULT NULL',
|
|
345
|
-
)
|
|
346
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
347
|
-
session,
|
|
348
|
-
'clusters',
|
|
349
|
-
'last_creation_command',
|
|
350
|
-
sqlalchemy.Text(),
|
|
351
|
-
default_statement='DEFAULT NULL')
|
|
352
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
353
|
-
session,
|
|
354
|
-
'users',
|
|
355
|
-
'password',
|
|
356
|
-
sqlalchemy.Text(),
|
|
357
|
-
default_statement='DEFAULT NULL')
|
|
358
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
359
|
-
session,
|
|
360
|
-
'users',
|
|
361
|
-
'created_at',
|
|
362
|
-
sqlalchemy.Integer(),
|
|
363
|
-
default_statement='DEFAULT NULL')
|
|
364
|
-
|
|
365
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
366
|
-
session,
|
|
367
|
-
'cluster_history',
|
|
368
|
-
'last_creation_yaml',
|
|
369
|
-
sqlalchemy.Text(),
|
|
370
|
-
default_statement='DEFAULT NULL')
|
|
371
|
-
|
|
372
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
373
|
-
session,
|
|
374
|
-
'cluster_history',
|
|
375
|
-
'last_creation_command',
|
|
376
|
-
sqlalchemy.Text(),
|
|
377
|
-
default_statement='DEFAULT NULL')
|
|
378
|
-
|
|
379
|
-
session.commit()
|
|
241
|
+
# Get alembic config for state db and run migrations
|
|
242
|
+
alembic_config = migration_utils.get_alembic_config(
|
|
243
|
+
engine, migration_utils.GLOBAL_USER_STATE_DB_NAME)
|
|
244
|
+
# pylint: disable=line-too-long
|
|
245
|
+
alembic_config.config_ini_section = migration_utils.GLOBAL_USER_STATE_DB_NAME
|
|
246
|
+
alembic_command.upgrade(alembic_config,
|
|
247
|
+
migration_utils.GLOBAL_USER_STATE_VERSION)
|
|
380
248
|
|
|
381
249
|
|
|
382
250
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
383
251
|
global _SQLALCHEMY_ENGINE
|
|
384
252
|
if _SQLALCHEMY_ENGINE is not None:
|
|
385
253
|
return _SQLALCHEMY_ENGINE
|
|
386
|
-
with _DB_INIT_LOCK:
|
|
254
|
+
with migration_utils.db_lock(migration_utils.GLOBAL_USER_STATE_DB_NAME):
|
|
387
255
|
if _SQLALCHEMY_ENGINE is None:
|
|
388
256
|
conn_string = None
|
|
389
257
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
@@ -520,6 +388,7 @@ def get_user(user_id: str) -> Optional[models.User]:
|
|
|
520
388
|
created_at=row.created_at)
|
|
521
389
|
|
|
522
390
|
|
|
391
|
+
@_init_db
|
|
523
392
|
def get_user_by_name(username: str) -> List[models.User]:
|
|
524
393
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
525
394
|
rows = session.query(user_table).filter_by(name=username).all()
|
|
@@ -533,6 +402,7 @@ def get_user_by_name(username: str) -> List[models.User]:
|
|
|
533
402
|
]
|
|
534
403
|
|
|
535
404
|
|
|
405
|
+
@_init_db
|
|
536
406
|
def delete_user(user_id: str) -> None:
|
|
537
407
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
538
408
|
session.query(user_table).filter_by(id=user_id).delete()
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -192,7 +192,7 @@ def cancel(
|
|
|
192
192
|
|
|
193
193
|
@usage_lib.entrypoint
|
|
194
194
|
@server_common.check_server_healthy_or_start
|
|
195
|
-
@rest.
|
|
195
|
+
@rest.retry_transient_errors()
|
|
196
196
|
def tail_logs(name: Optional[str] = None,
|
|
197
197
|
job_id: Optional[int] = None,
|
|
198
198
|
follow: bool = True,
|
sky/jobs/server/core.py
CHANGED
|
@@ -147,6 +147,18 @@ def launch(
|
|
|
147
147
|
None if dryrun.
|
|
148
148
|
"""
|
|
149
149
|
entrypoint = task
|
|
150
|
+
# using hasattr instead of isinstance to avoid importing sky
|
|
151
|
+
if hasattr(task, 'metadata'):
|
|
152
|
+
metadata = task.metadata
|
|
153
|
+
else:
|
|
154
|
+
# we are a Dag, not a Task
|
|
155
|
+
if len(task.tasks) == 1:
|
|
156
|
+
metadata = task.tasks[0].metadata
|
|
157
|
+
else:
|
|
158
|
+
# doesn't make sense to have a git commit since there might be
|
|
159
|
+
# different metadatas for each task
|
|
160
|
+
metadata = {}
|
|
161
|
+
|
|
150
162
|
dag_uuid = str(uuid.uuid4().hex[:4])
|
|
151
163
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
152
164
|
dag.resolve_and_validate_volumes()
|
|
@@ -311,6 +323,8 @@ def launch(
|
|
|
311
323
|
controller_task.set_resources(controller_resources)
|
|
312
324
|
|
|
313
325
|
controller_task.managed_job_dag = dag
|
|
326
|
+
# pylint: disable=protected-access
|
|
327
|
+
controller_task._metadata = metadata
|
|
314
328
|
|
|
315
329
|
logger.info(
|
|
316
330
|
f'{colorama.Fore.YELLOW}'
|
sky/jobs/state.py
CHANGED
|
@@ -6,11 +6,11 @@ import functools
|
|
|
6
6
|
import json
|
|
7
7
|
import os
|
|
8
8
|
import pathlib
|
|
9
|
-
import threading
|
|
10
9
|
import time
|
|
11
10
|
import typing
|
|
12
11
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
13
12
|
|
|
13
|
+
from alembic import command as alembic_command
|
|
14
14
|
import colorama
|
|
15
15
|
import sqlalchemy
|
|
16
16
|
from sqlalchemy import exc as sqlalchemy_exc
|
|
@@ -24,7 +24,8 @@ from sky import sky_logging
|
|
|
24
24
|
from sky import skypilot_config
|
|
25
25
|
from sky.skylet import constants
|
|
26
26
|
from sky.utils import common_utils
|
|
27
|
-
from sky.utils import db_utils
|
|
27
|
+
from sky.utils.db import db_utils
|
|
28
|
+
from sky.utils.db import migration_utils
|
|
28
29
|
|
|
29
30
|
if typing.TYPE_CHECKING:
|
|
30
31
|
from sqlalchemy.engine import row
|
|
@@ -36,7 +37,6 @@ CallbackType = Callable[[str], None]
|
|
|
36
37
|
logger = sky_logging.init_logger(__name__)
|
|
37
38
|
|
|
38
39
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
39
|
-
_DB_INIT_LOCK = threading.Lock()
|
|
40
40
|
|
|
41
41
|
Base = declarative.declarative_base()
|
|
42
42
|
|
|
@@ -130,97 +130,18 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
130
130
|
# If the database is locked, it is OK to continue, as the WAL mode
|
|
131
131
|
# is not critical and is likely to be enabled by other processes.
|
|
132
132
|
|
|
133
|
-
#
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'spot',
|
|
139
|
-
'failure_reason',
|
|
140
|
-
sqlalchemy.Text())
|
|
141
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
142
|
-
'spot',
|
|
143
|
-
'spot_job_id',
|
|
144
|
-
sqlalchemy.Integer(),
|
|
145
|
-
copy_from='job_id')
|
|
146
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
147
|
-
session,
|
|
148
|
-
'spot',
|
|
149
|
-
'task_id',
|
|
150
|
-
sqlalchemy.Integer(),
|
|
151
|
-
default_statement='DEFAULT 0',
|
|
152
|
-
value_to_replace_existing_entries=0)
|
|
153
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
154
|
-
'spot',
|
|
155
|
-
'task_name',
|
|
156
|
-
sqlalchemy.Text(),
|
|
157
|
-
copy_from='job_name')
|
|
158
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
159
|
-
session,
|
|
160
|
-
'spot',
|
|
161
|
-
'specs',
|
|
162
|
-
sqlalchemy.Text(),
|
|
163
|
-
value_to_replace_existing_entries=json.dumps({
|
|
164
|
-
'max_restarts_on_errors': 0,
|
|
165
|
-
}))
|
|
166
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
167
|
-
session,
|
|
168
|
-
'spot',
|
|
169
|
-
'local_log_file',
|
|
170
|
-
sqlalchemy.Text(),
|
|
171
|
-
default_statement='DEFAULT NULL')
|
|
172
|
-
|
|
173
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
174
|
-
session,
|
|
175
|
-
'spot',
|
|
176
|
-
'metadata',
|
|
177
|
-
sqlalchemy.Text(),
|
|
178
|
-
default_statement='DEFAULT \'{}\'',
|
|
179
|
-
value_to_replace_existing_entries='{}')
|
|
180
|
-
|
|
181
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
182
|
-
'schedule_state',
|
|
183
|
-
sqlalchemy.Text())
|
|
184
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
185
|
-
session,
|
|
186
|
-
'job_info',
|
|
187
|
-
'controller_pid',
|
|
188
|
-
sqlalchemy.Integer(),
|
|
189
|
-
default_statement='DEFAULT NULL')
|
|
190
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
191
|
-
'dag_yaml_path',
|
|
192
|
-
sqlalchemy.Text())
|
|
193
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
194
|
-
'env_file_path',
|
|
195
|
-
sqlalchemy.Text())
|
|
196
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
197
|
-
'user_hash', sqlalchemy.Text())
|
|
198
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
199
|
-
session,
|
|
200
|
-
'job_info',
|
|
201
|
-
'workspace',
|
|
202
|
-
sqlalchemy.Text(),
|
|
203
|
-
default_statement='DEFAULT NULL',
|
|
204
|
-
value_to_replace_existing_entries='default')
|
|
205
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
206
|
-
session,
|
|
207
|
-
'job_info',
|
|
208
|
-
'priority',
|
|
209
|
-
sqlalchemy.Integer(),
|
|
210
|
-
value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
|
|
211
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
212
|
-
'entrypoint', sqlalchemy.Text())
|
|
213
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
214
|
-
'original_user_yaml_path',
|
|
215
|
-
sqlalchemy.Text())
|
|
216
|
-
session.commit()
|
|
133
|
+
# Get alembic config for spot jobs db and run migrations
|
|
134
|
+
alembic_config = migration_utils.get_alembic_config(
|
|
135
|
+
engine, migration_utils.SPOT_JOBS_DB_NAME)
|
|
136
|
+
alembic_config.config_ini_section = migration_utils.SPOT_JOBS_DB_NAME
|
|
137
|
+
alembic_command.upgrade(alembic_config, migration_utils.SPOT_JOBS_VERSION)
|
|
217
138
|
|
|
218
139
|
|
|
219
140
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
220
141
|
global _SQLALCHEMY_ENGINE
|
|
221
142
|
if _SQLALCHEMY_ENGINE is not None:
|
|
222
143
|
return _SQLALCHEMY_ENGINE
|
|
223
|
-
with _DB_INIT_LOCK:
|
|
144
|
+
with migration_utils.db_lock(migration_utils.SPOT_JOBS_DB_NAME):
|
|
224
145
|
if _SQLALCHEMY_ENGINE is None:
|
|
225
146
|
conn_string = None
|
|
226
147
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
sky/jobs/utils.py
CHANGED
|
@@ -67,6 +67,9 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
|
|
|
67
67
|
|
|
68
68
|
_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
|
|
69
69
|
|
|
70
|
+
_JOB_STATUS_FETCH_MAX_RETRIES = 3
|
|
71
|
+
_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
|
|
72
|
+
|
|
70
73
|
_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
|
|
71
74
|
'Waiting for task to start[/]'
|
|
72
75
|
'{status_str}. It may take a few minutes.\n'
|
|
@@ -250,19 +253,31 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
|
|
|
250
253
|
logger.info(f'Cluster {cluster_name} not found.')
|
|
251
254
|
return None
|
|
252
255
|
assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
256
|
+
for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
|
|
257
|
+
try:
|
|
258
|
+
logger.info('=== Checking the job status... ===')
|
|
259
|
+
statuses = backend.get_job_status(handle, stream_logs=False)
|
|
260
|
+
status = list(statuses.values())[0]
|
|
261
|
+
if status is None:
|
|
262
|
+
logger.info('No job found.')
|
|
263
|
+
else:
|
|
264
|
+
logger.info(f'Job status: {status}')
|
|
265
|
+
logger.info('=' * 34)
|
|
266
|
+
return status
|
|
267
|
+
except exceptions.CommandError as e:
|
|
268
|
+
# Retry on k8s transient network errors. This is useful when using
|
|
269
|
+
# coreweave which may have transient network issue sometimes.
|
|
270
|
+
if (e.detailed_reason is not None and
|
|
271
|
+
_JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
|
|
272
|
+
logger.info('Failed to connect to the cluster. Retrying '
|
|
273
|
+
f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
|
|
274
|
+
logger.info('=' * 34)
|
|
275
|
+
time.sleep(1)
|
|
276
|
+
else:
|
|
277
|
+
logger.info(f'Failed to get job status: {e.detailed_reason}')
|
|
278
|
+
logger.info('=' * 34)
|
|
279
|
+
return None
|
|
280
|
+
return None
|
|
266
281
|
|
|
267
282
|
|
|
268
283
|
def _controller_process_alive(pid: int, job_id: int) -> bool:
|
sky/schemas/db/README
ADDED
sky/schemas/db/env.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Alembic environment configuration for state database migrations."""
|
|
2
|
+
from logging.config import fileConfig
|
|
3
|
+
|
|
4
|
+
from alembic import context
|
|
5
|
+
from sqlalchemy import engine_from_config
|
|
6
|
+
from sqlalchemy import pool
|
|
7
|
+
|
|
8
|
+
# this is the Alembic Config object, which provides
|
|
9
|
+
# access to the values within the .ini file in use.
|
|
10
|
+
config = context.config
|
|
11
|
+
|
|
12
|
+
# NOTE: Alembic's fileConfig() call globally reconfigures Python's logging
|
|
13
|
+
# system and, by default, disables loggers that already exist, which can
|
|
14
|
+
# suppress SkyPilot's output messages that tests expect to see.
|
|
15
|
+
#
|
|
16
|
+
# We still load the logging config from the .ini file, but pass
|
|
17
|
+
# disable_existing_loggers=False so that SkyPilot's loggers stay enabled.
|
|
18
|
+
if config.config_file_name is not None:
|
|
19
|
+
fileConfig(config.config_file_name, disable_existing_loggers=False)
|
|
20
|
+
|
|
21
|
+
# add your model's MetaData object here
|
|
22
|
+
# for 'autogenerate' support
|
|
23
|
+
# from myapp import mymodel
|
|
24
|
+
# target_metadata = mymodel.Base.metadata
|
|
25
|
+
target_metadata = None
|
|
26
|
+
|
|
27
|
+
# other values from the config, defined by the needs of env.py,
|
|
28
|
+
# can be acquired:
|
|
29
|
+
# my_important_option = config.get_main_option("my_important_option")
|
|
30
|
+
# ... etc.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def run_migrations_offline() -> None:
|
|
34
|
+
"""Run migrations in 'offline' mode.
|
|
35
|
+
|
|
36
|
+
This configures the context with just a URL
|
|
37
|
+
and not an Engine, though an Engine is acceptable
|
|
38
|
+
here as well. By skipping the Engine creation
|
|
39
|
+
we don't even need a DBAPI to be available.
|
|
40
|
+
|
|
41
|
+
Calls to context.execute() here emit the given string to the
|
|
42
|
+
script output.
|
|
43
|
+
|
|
44
|
+
"""
|
|
45
|
+
url = config.get_main_option('sqlalchemy.url')
|
|
46
|
+
version_table = config.get_section_option(config.config_ini_section,
|
|
47
|
+
'version_table',
|
|
48
|
+
'alembic_version')
|
|
49
|
+
context.configure(
|
|
50
|
+
url=url,
|
|
51
|
+
target_metadata=target_metadata,
|
|
52
|
+
literal_binds=True,
|
|
53
|
+
dialect_opts={'paramstyle': 'named'},
|
|
54
|
+
version_table=version_table,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
with context.begin_transaction():
|
|
58
|
+
context.run_migrations()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def run_migrations_online() -> None:
|
|
62
|
+
"""Run migrations in 'online' mode.
|
|
63
|
+
|
|
64
|
+
In this scenario we need to create an Engine
|
|
65
|
+
and associate a connection with the context.
|
|
66
|
+
|
|
67
|
+
"""
|
|
68
|
+
connectable = engine_from_config(
|
|
69
|
+
config.get_section(config.config_ini_section, {}),
|
|
70
|
+
prefix='sqlalchemy.',
|
|
71
|
+
poolclass=pool.NullPool,
|
|
72
|
+
)
|
|
73
|
+
version_table = config.get_section_option(config.config_ini_section,
|
|
74
|
+
'version_table',
|
|
75
|
+
'alembic_version')
|
|
76
|
+
with connectable.connect() as connection:
|
|
77
|
+
context.configure(
|
|
78
|
+
connection=connection,
|
|
79
|
+
target_metadata=target_metadata,
|
|
80
|
+
version_table=version_table,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
with context.begin_transaction():
|
|
84
|
+
context.run_migrations()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
if context.is_offline_mode():
|
|
88
|
+
run_migrations_offline()
|
|
89
|
+
else:
|
|
90
|
+
run_migrations_online()
|
|
sky/schemas/db/global_user_state/001_initial_schema.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Initial schema for state database with backwards compatibility columns
|
|
2
|
+
|
|
3
|
+
Revision ID: 001
|
|
4
|
+
Revises:
|
|
5
|
+
Create Date: 2024-01-01 12:00:00.000000
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
from alembic import op
|
|
10
|
+
import sqlalchemy as sa
|
|
11
|
+
|
|
12
|
+
from sky.global_user_state import Base
|
|
13
|
+
from sky.utils.db import db_utils
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision = '001'
|
|
17
|
+
down_revision = None
|
|
18
|
+
branch_labels = None
|
|
19
|
+
depends_on = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade():
|
|
23
|
+
with op.get_context().autocommit_block():
|
|
24
|
+
# Create any missing tables with current schema first
|
|
25
|
+
db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
|
|
26
|
+
|
|
27
|
+
# Run the column additions in an autocommit block, not one transaction.
|
|
28
|
+
# This allows each column addition to fail independently without rolling
|
|
29
|
+
# back the entire migration, which is needed for backwards compatibility
|
|
30
|
+
|
|
31
|
+
# Add all missing columns to clusters table
|
|
32
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
33
|
+
'autostop',
|
|
34
|
+
sa.Integer(),
|
|
35
|
+
server_default='-1')
|
|
36
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
37
|
+
'metadata',
|
|
38
|
+
sa.Text(),
|
|
39
|
+
server_default='{}')
|
|
40
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
41
|
+
'to_down',
|
|
42
|
+
sa.Integer(),
|
|
43
|
+
server_default='0')
|
|
44
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
45
|
+
'owner',
|
|
46
|
+
sa.Text(),
|
|
47
|
+
server_default=None)
|
|
48
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
49
|
+
'cluster_hash',
|
|
50
|
+
sa.Text(),
|
|
51
|
+
server_default=None)
|
|
52
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
53
|
+
'launched_nodes',
|
|
54
|
+
sa.Integer(),
|
|
55
|
+
server_default='0')
|
|
56
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
57
|
+
'disk_tier',
|
|
58
|
+
sa.Text(),
|
|
59
|
+
server_default=None)
|
|
60
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
61
|
+
'config_hash',
|
|
62
|
+
sa.Text(),
|
|
63
|
+
server_default=None)
|
|
64
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
65
|
+
'user_hash',
|
|
66
|
+
sa.Text(),
|
|
67
|
+
server_default=None)
|
|
68
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
69
|
+
'workspace',
|
|
70
|
+
sa.Text(),
|
|
71
|
+
server_default='default')
|
|
72
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
73
|
+
'last_creation_yaml',
|
|
74
|
+
sa.Text(),
|
|
75
|
+
server_default=None)
|
|
76
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
77
|
+
'last_creation_command',
|
|
78
|
+
sa.Text(),
|
|
79
|
+
server_default=None)
|
|
80
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
81
|
+
'config_hash_locked',
|
|
82
|
+
sa.Boolean(),
|
|
83
|
+
server_default='FALSE')
|
|
84
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
85
|
+
'handle_locked',
|
|
86
|
+
sa.Boolean(),
|
|
87
|
+
server_default='FALSE')
|
|
88
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
89
|
+
'num_failures',
|
|
90
|
+
sa.Integer(),
|
|
91
|
+
server_default='0')
|
|
92
|
+
db_utils.add_column_to_table_alembic('clusters',
|
|
93
|
+
'configs',
|
|
94
|
+
sa.Text(),
|
|
95
|
+
server_default='[]')
|
|
96
|
+
|
|
97
|
+
# Add all missing columns to cluster_history table
|
|
98
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
99
|
+
'user_hash',
|
|
100
|
+
sa.Text(),
|
|
101
|
+
server_default=None)
|
|
102
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
103
|
+
'last_creation_yaml',
|
|
104
|
+
sa.Text(),
|
|
105
|
+
server_default=None)
|
|
106
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
107
|
+
'last_creation_command',
|
|
108
|
+
sa.Text(),
|
|
109
|
+
server_default=None)
|
|
110
|
+
|
|
111
|
+
# Add all missing columns to users table
|
|
112
|
+
db_utils.add_column_to_table_alembic('users',
|
|
113
|
+
'password',
|
|
114
|
+
sa.Text(),
|
|
115
|
+
server_default=None)
|
|
116
|
+
db_utils.add_column_to_table_alembic('users',
|
|
117
|
+
'created_at',
|
|
118
|
+
sa.Integer(),
|
|
119
|
+
server_default=None)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def downgrade():
|
|
123
|
+
# Drop all tables
|
|
124
|
+
Base.metadata.drop_all(bind=op.get_bind())
|