skypilot-nightly 1.0.0.dev20250717__py3-none-any.whl → 1.0.0.dev20250720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/backends/backend_utils.py +23 -13
  3. sky/backends/cloud_vm_ray_backend.py +19 -11
  4. sky/catalog/__init__.py +3 -1
  5. sky/catalog/aws_catalog.py +8 -5
  6. sky/catalog/azure_catalog.py +8 -5
  7. sky/catalog/common.py +8 -2
  8. sky/catalog/cudo_catalog.py +5 -2
  9. sky/catalog/do_catalog.py +4 -1
  10. sky/catalog/fluidstack_catalog.py +5 -2
  11. sky/catalog/gcp_catalog.py +8 -5
  12. sky/catalog/hyperbolic_catalog.py +5 -2
  13. sky/catalog/ibm_catalog.py +8 -5
  14. sky/catalog/lambda_catalog.py +8 -5
  15. sky/catalog/nebius_catalog.py +8 -5
  16. sky/catalog/oci_catalog.py +8 -5
  17. sky/catalog/paperspace_catalog.py +4 -1
  18. sky/catalog/runpod_catalog.py +5 -2
  19. sky/catalog/scp_catalog.py +8 -5
  20. sky/catalog/vast_catalog.py +5 -2
  21. sky/catalog/vsphere_catalog.py +4 -1
  22. sky/client/cli/command.py +25 -2
  23. sky/client/sdk.py +10 -5
  24. sky/clouds/aws.py +12 -7
  25. sky/clouds/azure.py +12 -7
  26. sky/clouds/cloud.py +9 -8
  27. sky/clouds/cudo.py +13 -7
  28. sky/clouds/do.py +12 -7
  29. sky/clouds/fluidstack.py +11 -6
  30. sky/clouds/gcp.py +12 -7
  31. sky/clouds/hyperbolic.py +11 -6
  32. sky/clouds/ibm.py +11 -6
  33. sky/clouds/kubernetes.py +7 -3
  34. sky/clouds/lambda_cloud.py +11 -6
  35. sky/clouds/nebius.py +12 -7
  36. sky/clouds/oci.py +12 -7
  37. sky/clouds/paperspace.py +12 -7
  38. sky/clouds/runpod.py +12 -7
  39. sky/clouds/scp.py +11 -6
  40. sky/clouds/vast.py +12 -7
  41. sky/clouds/vsphere.py +11 -6
  42. sky/core.py +6 -1
  43. sky/dashboard/out/404.html +1 -1
  44. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/1871-a821dcaaae2a3823.js +6 -0
  46. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.5233e938f14e31a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/938-63fc419cb82ad9b3.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/{9984.b56614f3c4c5961d.js → 9984.2b5e3fa69171bff9.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +20 -0
  54. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa406155b4223d0d.js +11 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-14d404b7dd28502a.js → [job]-c5b357bfd9502fbe.js} +1 -1
  56. sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +1 -0
  57. sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +3 -0
  58. sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_buildManifest.js +1 -1
  59. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  60. sky/dashboard/out/clusters/[cluster].html +1 -1
  61. sky/dashboard/out/clusters.html +1 -1
  62. sky/dashboard/out/config.html +1 -1
  63. sky/dashboard/out/index.html +1 -1
  64. sky/dashboard/out/infra/[context].html +1 -1
  65. sky/dashboard/out/infra.html +1 -1
  66. sky/dashboard/out/jobs/[job].html +1 -1
  67. sky/dashboard/out/jobs.html +1 -1
  68. sky/dashboard/out/users.html +1 -1
  69. sky/dashboard/out/volumes.html +1 -1
  70. sky/dashboard/out/workspace/new.html +1 -1
  71. sky/dashboard/out/workspaces/[name].html +1 -1
  72. sky/dashboard/out/workspaces.html +1 -1
  73. sky/global_user_state.py +13 -143
  74. sky/jobs/client/sdk.py +1 -1
  75. sky/jobs/server/core.py +14 -0
  76. sky/jobs/state.py +9 -88
  77. sky/jobs/utils.py +28 -13
  78. sky/schemas/db/README +4 -0
  79. sky/schemas/db/env.py +90 -0
  80. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  81. sky/schemas/db/script.py.mako +28 -0
  82. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  83. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  84. sky/serve/client/sdk.py +7 -3
  85. sky/serve/controller.py +7 -3
  86. sky/serve/serve_state.py +1 -1
  87. sky/serve/serve_utils.py +171 -75
  88. sky/serve/server/core.py +17 -6
  89. sky/server/common.py +4 -0
  90. sky/server/requests/payloads.py +2 -0
  91. sky/server/requests/requests.py +1 -1
  92. sky/server/rest.py +71 -26
  93. sky/setup_files/MANIFEST.in +2 -0
  94. sky/setup_files/alembic.ini +152 -0
  95. sky/setup_files/dependencies.py +1 -0
  96. sky/skylet/configs.py +1 -1
  97. sky/skylet/job_lib.py +1 -1
  98. sky/skypilot_config.py +32 -6
  99. sky/users/permission.py +1 -1
  100. sky/utils/common_utils.py +77 -0
  101. sky/utils/db/__init__.py +0 -0
  102. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  103. sky/utils/db/migration_utils.py +53 -0
  104. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/METADATA +2 -1
  105. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/RECORD +110 -101
  106. sky/dashboard/out/_next/static/chunks/1043-90a88c46f27b3df5.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  108. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  109. sky/dashboard/out/_next/static/chunks/8969-743abf4bc86baf48.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  113. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-9096ea50b8e2cf9e.js +0 -6
  114. sky/dashboard/out/_next/static/chunks/webpack-c3b45b7b0eaef66f.js +0 -1
  115. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  116. sky/dashboard/out/_next/static/{Et5IQ5Y3WvH608nXClo4z → pTQKG61ng32Zc7gsAROFJ}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250717.dist-info → skypilot_nightly-1.0.0.dev20250720.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -12,12 +12,12 @@ import os
12
12
  import pathlib
13
13
  import pickle
14
14
  import re
15
- import threading
16
15
  import time
17
16
  import typing
18
17
  from typing import Any, Dict, List, Optional, Set, Tuple
19
18
  import uuid
20
19
 
20
+ from alembic import command as alembic_command
21
21
  import sqlalchemy
22
22
  from sqlalchemy import exc as sqlalchemy_exc
23
23
  from sqlalchemy import orm
@@ -32,9 +32,10 @@ from sky import skypilot_config
32
32
  from sky.skylet import constants
33
33
  from sky.utils import common_utils
34
34
  from sky.utils import context_utils
35
- from sky.utils import db_utils
36
35
  from sky.utils import registry
37
36
  from sky.utils import status_lib
37
+ from sky.utils.db import db_utils
38
+ from sky.utils.db import migration_utils
38
39
 
39
40
  if typing.TYPE_CHECKING:
40
41
  from sky import backends
@@ -48,7 +49,6 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
48
49
  _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
49
50
 
50
51
  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
51
- _DB_INIT_LOCK = threading.Lock()
52
52
 
53
53
  Base = declarative.declarative_base()
54
54
 
@@ -238,152 +238,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
238
238
  # If the database is locked, it is OK to continue, as the WAL mode
239
239
  # is not critical and is likely to be enabled by other processes.
240
240
 
241
- # Create tables if they don't exist
242
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
243
-
244
- # For backward compatibility.
245
- # TODO(zhwu): Remove this function after all users have migrated to
246
- # the latest version of SkyPilot.
247
- with orm.Session(engine) as session:
248
- # Add autostop column to clusters table
249
- db_utils.add_column_to_table_sqlalchemy(session,
250
- 'clusters',
251
- 'autostop',
252
- sqlalchemy.Integer(),
253
- default_statement='DEFAULT -1')
254
-
255
- db_utils.add_column_to_table_sqlalchemy(
256
- session,
257
- 'clusters',
258
- 'metadata',
259
- sqlalchemy.Text(),
260
- default_statement='DEFAULT \'{}\'')
261
-
262
- db_utils.add_column_to_table_sqlalchemy(session,
263
- 'clusters',
264
- 'to_down',
265
- sqlalchemy.Integer(),
266
- default_statement='DEFAULT 0')
267
-
268
- # The cloud identity that created the cluster.
269
- db_utils.add_column_to_table_sqlalchemy(
270
- session,
271
- 'clusters',
272
- 'owner',
273
- sqlalchemy.Text(),
274
- default_statement='DEFAULT NULL')
275
-
276
- db_utils.add_column_to_table_sqlalchemy(
277
- session,
278
- 'clusters',
279
- 'cluster_hash',
280
- sqlalchemy.Text(),
281
- default_statement='DEFAULT NULL')
282
-
283
- db_utils.add_column_to_table_sqlalchemy(
284
- session,
285
- 'clusters',
286
- 'storage_mounts_metadata',
287
- sqlalchemy.LargeBinary(),
288
- default_statement='DEFAULT NULL')
289
- db_utils.add_column_to_table_sqlalchemy(
290
- session,
291
- 'clusters',
292
- 'cluster_ever_up',
293
- sqlalchemy.Integer(),
294
- default_statement='DEFAULT 0',
295
- # Set the value to 1 so that all the existing clusters before #2977
296
- # are considered as ever up, i.e:
297
- # existing cluster's default (null) -> 1;
298
- # new cluster's default -> 0;
299
- # This is conservative for the existing clusters: even if some INIT
300
- # clusters were never really UP, setting it to 1 means they won't be
301
- # auto-deleted during any failover.
302
- value_to_replace_existing_entries=1)
303
- db_utils.add_column_to_table_sqlalchemy(
304
- session,
305
- 'clusters',
306
- 'status_updated_at',
307
- sqlalchemy.Integer(),
308
- default_statement='DEFAULT NULL')
309
- db_utils.add_column_to_table_sqlalchemy(
310
- session,
311
- 'clusters',
312
- 'user_hash',
313
- sqlalchemy.Text(),
314
- default_statement='DEFAULT NULL',
315
- value_to_replace_existing_entries=common_utils.get_current_user(
316
- ).id)
317
- db_utils.add_column_to_table_sqlalchemy(
318
- session,
319
- 'clusters',
320
- 'config_hash',
321
- sqlalchemy.Text(),
322
- default_statement='DEFAULT NULL')
323
-
324
- db_utils.add_column_to_table_sqlalchemy(
325
- session,
326
- 'cluster_history',
327
- 'user_hash',
328
- sqlalchemy.Text(),
329
- default_statement='DEFAULT NULL')
330
-
331
- db_utils.add_column_to_table_sqlalchemy(
332
- session,
333
- 'clusters',
334
- 'workspace',
335
- sqlalchemy.Text(),
336
- default_statement='DEFAULT \'default\'',
337
- value_to_replace_existing_entries=constants.
338
- SKYPILOT_DEFAULT_WORKSPACE)
339
- db_utils.add_column_to_table_sqlalchemy(
340
- session,
341
- 'clusters',
342
- 'last_creation_yaml',
343
- sqlalchemy.Text(),
344
- default_statement='DEFAULT NULL',
345
- )
346
- db_utils.add_column_to_table_sqlalchemy(
347
- session,
348
- 'clusters',
349
- 'last_creation_command',
350
- sqlalchemy.Text(),
351
- default_statement='DEFAULT NULL')
352
- db_utils.add_column_to_table_sqlalchemy(
353
- session,
354
- 'users',
355
- 'password',
356
- sqlalchemy.Text(),
357
- default_statement='DEFAULT NULL')
358
- db_utils.add_column_to_table_sqlalchemy(
359
- session,
360
- 'users',
361
- 'created_at',
362
- sqlalchemy.Integer(),
363
- default_statement='DEFAULT NULL')
364
-
365
- db_utils.add_column_to_table_sqlalchemy(
366
- session,
367
- 'cluster_history',
368
- 'last_creation_yaml',
369
- sqlalchemy.Text(),
370
- default_statement='DEFAULT NULL')
371
-
372
- db_utils.add_column_to_table_sqlalchemy(
373
- session,
374
- 'cluster_history',
375
- 'last_creation_command',
376
- sqlalchemy.Text(),
377
- default_statement='DEFAULT NULL')
378
-
379
- session.commit()
241
+ # Get alembic config for state db and run migrations
242
+ alembic_config = migration_utils.get_alembic_config(
243
+ engine, migration_utils.GLOBAL_USER_STATE_DB_NAME)
244
+ # pylint: disable=line-too-long
245
+ alembic_config.config_ini_section = migration_utils.GLOBAL_USER_STATE_DB_NAME
246
+ alembic_command.upgrade(alembic_config,
247
+ migration_utils.GLOBAL_USER_STATE_VERSION)
380
248
 
381
249
 
382
250
  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
383
251
  global _SQLALCHEMY_ENGINE
384
252
  if _SQLALCHEMY_ENGINE is not None:
385
253
  return _SQLALCHEMY_ENGINE
386
- with _DB_INIT_LOCK:
254
+ with migration_utils.db_lock(migration_utils.GLOBAL_USER_STATE_DB_NAME):
387
255
  if _SQLALCHEMY_ENGINE is None:
388
256
  conn_string = None
389
257
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
@@ -520,6 +388,7 @@ def get_user(user_id: str) -> Optional[models.User]:
520
388
  created_at=row.created_at)
521
389
 
522
390
 
391
+ @_init_db
523
392
  def get_user_by_name(username: str) -> List[models.User]:
524
393
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
525
394
  rows = session.query(user_table).filter_by(name=username).all()
@@ -533,6 +402,7 @@ def get_user_by_name(username: str) -> List[models.User]:
533
402
  ]
534
403
 
535
404
 
405
+ @_init_db
536
406
  def delete_user(user_id: str) -> None:
537
407
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
538
408
  session.query(user_table).filter_by(id=user_id).delete()
sky/jobs/client/sdk.py CHANGED
@@ -192,7 +192,7 @@ def cancel(
192
192
 
193
193
  @usage_lib.entrypoint
194
194
  @server_common.check_server_healthy_or_start
195
- @rest.retry_on_server_unavailable()
195
+ @rest.retry_transient_errors()
196
196
  def tail_logs(name: Optional[str] = None,
197
197
  job_id: Optional[int] = None,
198
198
  follow: bool = True,
sky/jobs/server/core.py CHANGED
@@ -147,6 +147,18 @@ def launch(
147
147
  None if dryrun.
148
148
  """
149
149
  entrypoint = task
150
+ # using hasattr instead of isinstance to avoid importing sky
151
+ if hasattr(task, 'metadata'):
152
+ metadata = task.metadata
153
+ else:
154
+ # we are a Dag, not a Task
155
+ if len(task.tasks) == 1:
156
+ metadata = task.tasks[0].metadata
157
+ else:
158
+ # doesn't make sense to have a git commit since there might be
159
+ # different metadatas for each task
160
+ metadata = {}
161
+
150
162
  dag_uuid = str(uuid.uuid4().hex[:4])
151
163
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
152
164
  dag.resolve_and_validate_volumes()
@@ -311,6 +323,8 @@ def launch(
311
323
  controller_task.set_resources(controller_resources)
312
324
 
313
325
  controller_task.managed_job_dag = dag
326
+ # pylint: disable=protected-access
327
+ controller_task._metadata = metadata
314
328
 
315
329
  logger.info(
316
330
  f'{colorama.Fore.YELLOW}'
sky/jobs/state.py CHANGED
@@ -6,11 +6,11 @@ import functools
6
6
  import json
7
7
  import os
8
8
  import pathlib
9
- import threading
10
9
  import time
11
10
  import typing
12
11
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
13
12
 
13
+ from alembic import command as alembic_command
14
14
  import colorama
15
15
  import sqlalchemy
16
16
  from sqlalchemy import exc as sqlalchemy_exc
@@ -24,7 +24,8 @@ from sky import sky_logging
24
24
  from sky import skypilot_config
25
25
  from sky.skylet import constants
26
26
  from sky.utils import common_utils
27
- from sky.utils import db_utils
27
+ from sky.utils.db import db_utils
28
+ from sky.utils.db import migration_utils
28
29
 
29
30
  if typing.TYPE_CHECKING:
30
31
  from sqlalchemy.engine import row
@@ -36,7 +37,6 @@ CallbackType = Callable[[str], None]
36
37
  logger = sky_logging.init_logger(__name__)
37
38
 
38
39
  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
39
- _DB_INIT_LOCK = threading.Lock()
40
40
 
41
41
  Base = declarative.declarative_base()
42
42
 
@@ -130,97 +130,18 @@ def create_table(engine: sqlalchemy.engine.Engine):
130
130
  # If the database is locked, it is OK to continue, as the WAL mode
131
131
  # is not critical and is likely to be enabled by other processes.
132
132
 
133
- # Create tables if they don't exist
134
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, engine)
135
-
136
- # Backward compatibility: add columns that not exist in older databases
137
- with orm.Session(engine) as session:
138
- db_utils.add_column_to_table_sqlalchemy(session, 'spot',
139
- 'failure_reason',
140
- sqlalchemy.Text())
141
- db_utils.add_column_to_table_sqlalchemy(session,
142
- 'spot',
143
- 'spot_job_id',
144
- sqlalchemy.Integer(),
145
- copy_from='job_id')
146
- db_utils.add_column_to_table_sqlalchemy(
147
- session,
148
- 'spot',
149
- 'task_id',
150
- sqlalchemy.Integer(),
151
- default_statement='DEFAULT 0',
152
- value_to_replace_existing_entries=0)
153
- db_utils.add_column_to_table_sqlalchemy(session,
154
- 'spot',
155
- 'task_name',
156
- sqlalchemy.Text(),
157
- copy_from='job_name')
158
- db_utils.add_column_to_table_sqlalchemy(
159
- session,
160
- 'spot',
161
- 'specs',
162
- sqlalchemy.Text(),
163
- value_to_replace_existing_entries=json.dumps({
164
- 'max_restarts_on_errors': 0,
165
- }))
166
- db_utils.add_column_to_table_sqlalchemy(
167
- session,
168
- 'spot',
169
- 'local_log_file',
170
- sqlalchemy.Text(),
171
- default_statement='DEFAULT NULL')
172
-
173
- db_utils.add_column_to_table_sqlalchemy(
174
- session,
175
- 'spot',
176
- 'metadata',
177
- sqlalchemy.Text(),
178
- default_statement='DEFAULT \'{}\'',
179
- value_to_replace_existing_entries='{}')
180
-
181
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
182
- 'schedule_state',
183
- sqlalchemy.Text())
184
- db_utils.add_column_to_table_sqlalchemy(
185
- session,
186
- 'job_info',
187
- 'controller_pid',
188
- sqlalchemy.Integer(),
189
- default_statement='DEFAULT NULL')
190
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
191
- 'dag_yaml_path',
192
- sqlalchemy.Text())
193
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
194
- 'env_file_path',
195
- sqlalchemy.Text())
196
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
197
- 'user_hash', sqlalchemy.Text())
198
- db_utils.add_column_to_table_sqlalchemy(
199
- session,
200
- 'job_info',
201
- 'workspace',
202
- sqlalchemy.Text(),
203
- default_statement='DEFAULT NULL',
204
- value_to_replace_existing_entries='default')
205
- db_utils.add_column_to_table_sqlalchemy(
206
- session,
207
- 'job_info',
208
- 'priority',
209
- sqlalchemy.Integer(),
210
- value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
211
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
212
- 'entrypoint', sqlalchemy.Text())
213
- db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
214
- 'original_user_yaml_path',
215
- sqlalchemy.Text())
216
- session.commit()
133
+ # Get alembic config for spot jobs db and run migrations
134
+ alembic_config = migration_utils.get_alembic_config(
135
+ engine, migration_utils.SPOT_JOBS_DB_NAME)
136
+ alembic_config.config_ini_section = migration_utils.SPOT_JOBS_DB_NAME
137
+ alembic_command.upgrade(alembic_config, migration_utils.SPOT_JOBS_VERSION)
217
138
 
218
139
 
219
140
  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
220
141
  global _SQLALCHEMY_ENGINE
221
142
  if _SQLALCHEMY_ENGINE is not None:
222
143
  return _SQLALCHEMY_ENGINE
223
- with _DB_INIT_LOCK:
144
+ with migration_utils.db_lock(migration_utils.SPOT_JOBS_DB_NAME):
224
145
  if _SQLALCHEMY_ENGINE is None:
225
146
  conn_string = None
226
147
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
sky/jobs/utils.py CHANGED
@@ -67,6 +67,9 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
67
67
 
68
68
  _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
69
69
 
70
+ _JOB_STATUS_FETCH_MAX_RETRIES = 3
71
+ _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
72
+
70
73
  _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
71
74
  'Waiting for task to start[/]'
72
75
  '{status_str}. It may take a few minutes.\n'
@@ -250,19 +253,31 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
250
253
  logger.info(f'Cluster {cluster_name} not found.')
251
254
  return None
252
255
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
253
- status = None
254
- try:
255
- logger.info('=== Checking the job status... ===')
256
- statuses = backend.get_job_status(handle, stream_logs=False)
257
- status = list(statuses.values())[0]
258
- if status is None:
259
- logger.info('No job found.')
260
- else:
261
- logger.info(f'Job status: {status}')
262
- except exceptions.CommandError:
263
- logger.info('Failed to connect to the cluster.')
264
- logger.info('=' * 34)
265
- return status
256
+ for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
257
+ try:
258
+ logger.info('=== Checking the job status... ===')
259
+ statuses = backend.get_job_status(handle, stream_logs=False)
260
+ status = list(statuses.values())[0]
261
+ if status is None:
262
+ logger.info('No job found.')
263
+ else:
264
+ logger.info(f'Job status: {status}')
265
+ logger.info('=' * 34)
266
+ return status
267
+ except exceptions.CommandError as e:
268
+ # Retry on k8s transient network errors. This is useful when using
269
+ # coreweave which may have transient network issue sometimes.
270
+ if (e.detailed_reason is not None and
271
+ _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
272
+ logger.info('Failed to connect to the cluster. Retrying '
273
+ f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
274
+ logger.info('=' * 34)
275
+ time.sleep(1)
276
+ else:
277
+ logger.info(f'Failed to get job status: {e.detailed_reason}')
278
+ logger.info('=' * 34)
279
+ return None
280
+ return None
266
281
 
267
282
 
268
283
  def _controller_process_alive(pid: int, job_id: int) -> bool:
sky/schemas/db/README ADDED
@@ -0,0 +1,4 @@
1
+ Migrations for sqlalchemy databases. Currently includes:
2
+ global_user_state
3
+ spot_jobs (managed jobs state)
4
+ skypilot_config
sky/schemas/db/env.py ADDED
@@ -0,0 +1,90 @@
1
+ """Alembic environment configuration for state database migrations."""
2
+ from logging.config import fileConfig
3
+
4
+ from alembic import context
5
+ from sqlalchemy import engine_from_config
6
+ from sqlalchemy import pool
7
+
8
+ # this is the Alembic Config object, which provides
9
+ # access to the values within the .ini file in use.
10
+ config = context.config
11
+
12
+ # NOTE: We intentionally disable Alembic's logging configuration to prevent
13
+ # it from overriding SkyPilot's logging setup. Alembic's fileConfig() call
14
+ # globally reconfigures Python's logging system, which can suppress SkyPilot's
15
+ # output messages that tests expect to see.
16
+ #
17
+ # Original code (now disabled):
18
+ if config.config_file_name is not None:
19
+ fileConfig(config.config_file_name, disable_existing_loggers=False)
20
+
21
+ # add your model's MetaData object here
22
+ # for 'autogenerate' support
23
+ # from myapp import mymodel
24
+ # target_metadata = mymodel.Base.metadata
25
+ target_metadata = None
26
+
27
+ # other values from the config, defined by the needs of env.py,
28
+ # can be acquired:
29
+ # my_important_option = config.get_main_option("my_important_option")
30
+ # ... etc.
31
+
32
+
33
+ def run_migrations_offline() -> None:
34
+ """Run migrations in 'offline' mode.
35
+
36
+ This configures the context with just a URL
37
+ and not an Engine, though an Engine is acceptable
38
+ here as well. By skipping the Engine creation
39
+ we don't even need a DBAPI to be available.
40
+
41
+ Calls to context.execute() here emit the given string to the
42
+ script output.
43
+
44
+ """
45
+ url = config.get_main_option('sqlalchemy.url')
46
+ version_table = config.get_section_option(config.config_ini_section,
47
+ 'version_table',
48
+ 'alembic_version')
49
+ context.configure(
50
+ url=url,
51
+ target_metadata=target_metadata,
52
+ literal_binds=True,
53
+ dialect_opts={'paramstyle': 'named'},
54
+ version_table=version_table,
55
+ )
56
+
57
+ with context.begin_transaction():
58
+ context.run_migrations()
59
+
60
+
61
+ def run_migrations_online() -> None:
62
+ """Run migrations in 'online' mode.
63
+
64
+ In this scenario we need to create an Engine
65
+ and associate a connection with the context.
66
+
67
+ """
68
+ connectable = engine_from_config(
69
+ config.get_section(config.config_ini_section, {}),
70
+ prefix='sqlalchemy.',
71
+ poolclass=pool.NullPool,
72
+ )
73
+ version_table = config.get_section_option(config.config_ini_section,
74
+ 'version_table',
75
+ 'alembic_version')
76
+ with connectable.connect() as connection:
77
+ context.configure(
78
+ connection=connection,
79
+ target_metadata=target_metadata,
80
+ version_table=version_table,
81
+ )
82
+
83
+ with context.begin_transaction():
84
+ context.run_migrations()
85
+
86
+
87
+ if context.is_offline_mode():
88
+ run_migrations_offline()
89
+ else:
90
+ run_migrations_online()
sky/schemas/db/global_user_state/001_initial_schema.py ADDED
@@ -0,0 +1,124 @@
1
+ """Initial schema for state database with backwards compatibility columns
2
+
3
+ Revision ID: 001
4
+ Revises:
5
+ Create Date: 2024-01-01 12:00:00.000000
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from alembic import op
10
+ import sqlalchemy as sa
11
+
12
+ from sky.global_user_state import Base
13
+ from sky.utils.db import db_utils
14
+
15
+ # revision identifiers, used by Alembic.
16
+ revision = '001'
17
+ down_revision = None
18
+ branch_labels = None
19
+ depends_on = None
20
+
21
+
22
+ def upgrade():
23
+ with op.get_context().autocommit_block():
24
+ # Create any missing tables with current schema first
25
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
+
27
+ # Add all missing columns to clusters table
28
+ # This allows each column addition to fail independently without rolling
29
+ # back the entire migration, which is needed for backwards compatibility
30
+
31
+ # Add all missing columns to clusters table
32
+ db_utils.add_column_to_table_alembic('clusters',
33
+ 'autostop',
34
+ sa.Integer(),
35
+ server_default='-1')
36
+ db_utils.add_column_to_table_alembic('clusters',
37
+ 'metadata',
38
+ sa.Text(),
39
+ server_default='{}')
40
+ db_utils.add_column_to_table_alembic('clusters',
41
+ 'to_down',
42
+ sa.Integer(),
43
+ server_default='0')
44
+ db_utils.add_column_to_table_alembic('clusters',
45
+ 'owner',
46
+ sa.Text(),
47
+ server_default=None)
48
+ db_utils.add_column_to_table_alembic('clusters',
49
+ 'cluster_hash',
50
+ sa.Text(),
51
+ server_default=None)
52
+ db_utils.add_column_to_table_alembic('clusters',
53
+ 'launched_nodes',
54
+ sa.Integer(),
55
+ server_default='0')
56
+ db_utils.add_column_to_table_alembic('clusters',
57
+ 'disk_tier',
58
+ sa.Text(),
59
+ server_default=None)
60
+ db_utils.add_column_to_table_alembic('clusters',
61
+ 'config_hash',
62
+ sa.Text(),
63
+ server_default=None)
64
+ db_utils.add_column_to_table_alembic('clusters',
65
+ 'user_hash',
66
+ sa.Text(),
67
+ server_default=None)
68
+ db_utils.add_column_to_table_alembic('clusters',
69
+ 'workspace',
70
+ sa.Text(),
71
+ server_default='default')
72
+ db_utils.add_column_to_table_alembic('clusters',
73
+ 'last_creation_yaml',
74
+ sa.Text(),
75
+ server_default=None)
76
+ db_utils.add_column_to_table_alembic('clusters',
77
+ 'last_creation_command',
78
+ sa.Text(),
79
+ server_default=None)
80
+ db_utils.add_column_to_table_alembic('clusters',
81
+ 'config_hash_locked',
82
+ sa.Boolean(),
83
+ server_default='FALSE')
84
+ db_utils.add_column_to_table_alembic('clusters',
85
+ 'handle_locked',
86
+ sa.Boolean(),
87
+ server_default='FALSE')
88
+ db_utils.add_column_to_table_alembic('clusters',
89
+ 'num_failures',
90
+ sa.Integer(),
91
+ server_default='0')
92
+ db_utils.add_column_to_table_alembic('clusters',
93
+ 'configs',
94
+ sa.Text(),
95
+ server_default='[]')
96
+
97
+ # Add all missing columns to cluster_history table
98
+ db_utils.add_column_to_table_alembic('cluster_history',
99
+ 'user_hash',
100
+ sa.Text(),
101
+ server_default=None)
102
+ db_utils.add_column_to_table_alembic('cluster_history',
103
+ 'last_creation_yaml',
104
+ sa.Text(),
105
+ server_default=None)
106
+ db_utils.add_column_to_table_alembic('cluster_history',
107
+ 'last_creation_command',
108
+ sa.Text(),
109
+ server_default=None)
110
+
111
+ # Add all missing columns to users table
112
+ db_utils.add_column_to_table_alembic('users',
113
+ 'password',
114
+ sa.Text(),
115
+ server_default=None)
116
+ db_utils.add_column_to_table_alembic('users',
117
+ 'created_at',
118
+ sa.Integer(),
119
+ server_default=None)
120
+
121
+
122
+ def downgrade():
123
+ # Drop all tables
124
+ Base.metadata.drop_all(bind=op.get_bind())