skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/admin_policy.py +11 -4
- sky/backends/backend_utils.py +50 -24
- sky/backends/cloud_vm_ray_backend.py +41 -38
- sky/catalog/__init__.py +3 -1
- sky/catalog/aws_catalog.py +8 -5
- sky/catalog/azure_catalog.py +8 -5
- sky/catalog/common.py +8 -2
- sky/catalog/cudo_catalog.py +5 -2
- sky/catalog/do_catalog.py +4 -1
- sky/catalog/fluidstack_catalog.py +5 -2
- sky/catalog/gcp_catalog.py +8 -5
- sky/catalog/hyperbolic_catalog.py +5 -2
- sky/catalog/ibm_catalog.py +8 -5
- sky/catalog/lambda_catalog.py +8 -5
- sky/catalog/nebius_catalog.py +8 -5
- sky/catalog/oci_catalog.py +8 -5
- sky/catalog/paperspace_catalog.py +4 -1
- sky/catalog/runpod_catalog.py +5 -2
- sky/catalog/scp_catalog.py +8 -5
- sky/catalog/vast_catalog.py +5 -2
- sky/catalog/vsphere_catalog.py +4 -1
- sky/client/cli/command.py +63 -25
- sky/client/sdk.py +61 -11
- sky/clouds/aws.py +12 -7
- sky/clouds/azure.py +12 -7
- sky/clouds/cloud.py +9 -8
- sky/clouds/cudo.py +13 -7
- sky/clouds/do.py +12 -7
- sky/clouds/fluidstack.py +11 -6
- sky/clouds/gcp.py +12 -7
- sky/clouds/hyperbolic.py +11 -6
- sky/clouds/ibm.py +11 -6
- sky/clouds/kubernetes.py +7 -3
- sky/clouds/lambda_cloud.py +11 -6
- sky/clouds/nebius.py +14 -12
- sky/clouds/oci.py +12 -7
- sky/clouds/paperspace.py +12 -7
- sky/clouds/runpod.py +12 -7
- sky/clouds/scp.py +11 -6
- sky/clouds/vast.py +14 -8
- sky/clouds/vsphere.py +11 -6
- sky/core.py +6 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
- sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
- sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
- sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +93 -32
- sky/global_user_state.py +12 -143
- sky/jobs/state.py +9 -88
- sky/jobs/utils.py +28 -13
- sky/provision/nebius/utils.py +3 -6
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/serve/client/sdk.py +6 -2
- sky/serve/controller.py +7 -3
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +171 -75
- sky/serve/server/core.py +17 -6
- sky/server/common.py +4 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +1 -1
- sky/setup_files/MANIFEST.in +2 -0
- sky/setup_files/alembic.ini +148 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/configs.py +1 -1
- sky/skylet/constants.py +4 -0
- sky/skylet/job_lib.py +1 -1
- sky/skypilot_config.py +1 -1
- sky/users/permission.py +1 -1
- sky/utils/common_utils.py +85 -3
- sky/utils/config_utils.py +15 -0
- sky/utils/db/__init__.py +0 -0
- sky/utils/{db_utils.py → db/db_utils.py} +59 -0
- sky/utils/db/migration_utils.py +93 -0
- sky/utils/locks.py +319 -0
- sky/utils/schemas.py +38 -34
- sky/utils/timeline.py +41 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
- sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
- sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
- sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
- sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
- sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
- sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
- sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
- sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
- /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/utils/common_utils.py
CHANGED
|
@@ -369,6 +369,83 @@ def get_pretty_entrypoint_cmd() -> str:
|
|
|
369
369
|
return ' '.join(argv)
|
|
370
370
|
|
|
371
371
|
|
|
372
|
+
def read_last_n_lines(file_path: str,
                      n: int,
                      chunk_size: int = 8192,
                      encoding: str = 'utf-8',
                      errors: str = 'replace') -> List[str]:
    """Read the last N lines of a file.

    Args:
        file_path: Path to the file to read.
        n: Number of lines to read from the end of the file.
        chunk_size: Size of chunks in bytes.
        encoding: Encoding to use when decoding binary chunks.
        errors: Error handling for decode errors (e.g., 'replace', 'ignore').

    Returns:
        A list of the last N lines, preserving newlines where applicable.
    """
    assert n >= 0, f'n must be non-negative. Got {n}'
    assert chunk_size > 0, f'chunk_size must be positive. Got {chunk_size}'
    assert os.path.exists(file_path), f'File not found: {file_path}'

    if n == 0:
        return []

    try:
        with open(file_path, 'rb') as f:
            # Start reading from the end of the file.
            f.seek(0, os.SEEK_END)
            file_size = f.tell()
            if file_size == 0:
                return []

            pos = file_size
            lines_found = 0
            chunks = []

            # Read backwards in chunks until we've found at least n newlines.
            while pos > 0 and lines_found <= n:
                read_size = min(chunk_size, pos)
                pos -= read_size
                f.seek(pos)
                chunk = f.read(read_size)
                chunks.append(chunk)
                lines_found += chunk.count(b'\n')

            # Combine all chunks in reverse order since we read backwards.
            full_bytes = b''.join(reversed(chunks))

            # Split by newline byte. Note: this handles '\n' endings.
            all_lines = full_bytes.split(b'\n')

            # If the file ends with a newline, the final split element is
            # an empty b''. Drop it, but remember that the last real line
            # *did* end with a newline so we can restore it below.
            ends_with_newline = bool(all_lines) and all_lines[-1] == b''
            if ends_with_newline:
                all_lines = all_lines[:-1]
            result_bytes = all_lines[-n:]
            if not result_bytes:
                return []

            # Decode each line and normalize CR/LF endings.
            decoded_lines = [
                line.decode(encoding, errors=errors).rstrip('\r') + '\n'
                for line in result_bytes[:-1]
            ]

            # Decode the final line. Bug fix: the previous version dropped
            # the trailing newline of the last line whenever the file ended
            # with one; only omit it when the file truly lacked it.
            last_line = result_bytes[-1].decode(encoding,
                                                errors=errors).rstrip('\r')
            if ends_with_newline:
                last_line += '\n'
            decoded_lines.append(last_line)

            return decoded_lines

    except OSError as e:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to read last {n} lines from {file_path}: {e}') from e
|
|
372
449
|
def _redact_secrets_values(argv: List[str]) -> List[str]:
|
|
373
450
|
"""Redact sensitive values from --secret arguments.
|
|
374
451
|
|
|
@@ -485,8 +562,9 @@ def read_yaml_all(path: str) -> List[Dict[str, Any]]:
|
|
|
485
562
|
return read_yaml_all_str(f.read())
|
|
486
563
|
|
|
487
564
|
|
|
488
|
-
def dump_yaml(path: str,
              config: Union[List[Dict[str, Any]], Dict[str, Any]],
              blank: bool = False) -> None:
    """Dumps a YAML file.

    Args:
        path: the path to write the YAML file to.
        config: the configuration to dump.
        blank: if True and ``config`` is an empty dict, write an empty file
            instead of the literal ``{}``.
    """
    # An empty dict serializes to '{}'; optionally emit nothing instead.
    if blank and isinstance(config, dict) and not config:
        contents = ''
    else:
        contents = dump_yaml_str(config)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(contents)
|
|
|
500
582
|
def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
|
sky/utils/config_utils.py
CHANGED
|
@@ -248,6 +248,8 @@ def get_cloud_config_value_from_dict(
|
|
|
248
248
|
region_key = None
|
|
249
249
|
if cloud == 'kubernetes':
|
|
250
250
|
region_key = 'context_configs'
|
|
251
|
+
if cloud == 'nebius':
|
|
252
|
+
region_key = 'region_configs'
|
|
251
253
|
|
|
252
254
|
per_context_config = None
|
|
253
255
|
if region is not None and region_key is not None:
|
|
@@ -255,6 +257,19 @@ def get_cloud_config_value_from_dict(
|
|
|
255
257
|
keys=(cloud, region_key, region) + keys,
|
|
256
258
|
default_value=None,
|
|
257
259
|
override_configs=override_configs)
|
|
260
|
+
if not per_context_config and cloud == 'nebius':
|
|
261
|
+
# TODO (kyuds): Backward compatibility, remove after 0.11.0.
|
|
262
|
+
per_context_config = input_config.get_nested(
|
|
263
|
+
keys=(cloud, region) + keys,
|
|
264
|
+
default_value=None,
|
|
265
|
+
override_configs=override_configs)
|
|
266
|
+
if per_context_config is not None:
|
|
267
|
+
logger.info(
|
|
268
|
+
'Nebius configuration is using the legacy format. \n'
|
|
269
|
+
'This format will be deprecated after 0.11.0, refer to '
|
|
270
|
+
'`https://docs.skypilot.co/en/latest/reference/config.html#nebius` ' # pylint: disable=line-too-long
|
|
271
|
+
'for the new format. Please use `region_configs` to specify region specific configuration.'
|
|
272
|
+
)
|
|
258
273
|
# if no override found for specified region
|
|
259
274
|
general_config = input_config.get_nested(keys=(cloud,) + keys,
|
|
260
275
|
default_value=default_value,
|
sky/utils/db/__init__.py
ADDED
|
File without changes
|
|
@@ -9,6 +9,9 @@ from typing import Any, Callable, Optional
|
|
|
9
9
|
import sqlalchemy
|
|
10
10
|
from sqlalchemy import exc as sqlalchemy_exc
|
|
11
11
|
|
|
12
|
+
from sky import sky_logging
|
|
13
|
+
|
|
14
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
15
|
if typing.TYPE_CHECKING:
|
|
13
16
|
from sqlalchemy.orm import Session
|
|
14
17
|
|
|
@@ -146,6 +149,62 @@ def add_column_to_table_sqlalchemy(
|
|
|
146
149
|
session.commit()
|
|
147
150
|
|
|
148
151
|
|
|
152
|
+
def add_column_to_table_alembic(
    table_name: str,
    column_name: str,
    column_type: sqlalchemy.types.TypeEngine,
    server_default: Optional[str] = None,
    copy_from: Optional[str] = None,
    value_to_replace_existing_entries: Optional[Any] = None,
):
    """Add a column to a table using Alembic operations.

    This provides the same interface as add_column_to_table_sqlalchemy but
    uses Alembic's connection context for proper migration support.

    Args:
        table_name: Name of the table to add column to
        column_name: Name of the new column
        column_type: SQLAlchemy column type
        server_default: Server-side default value for the column
        copy_from: Column name to copy values from (for existing rows)
        value_to_replace_existing_entries: Default value for existing NULL
            entries
    """
    from alembic import op  # pylint: disable=import-outside-toplevel

    new_column = sqlalchemy.Column(column_name,
                                   column_type,
                                   server_default=server_default)
    try:
        op.add_column(table_name, new_column)

        # Backfill from an existing column, if requested.
        if copy_from is not None:
            op.execute(
                sqlalchemy.text(
                    f'UPDATE {table_name} SET {column_name} = {copy_from}'))

        # Fill remaining NULLs with a literal replacement value, bound as a
        # parameter for safety.
        if value_to_replace_existing_entries is not None:
            op.get_bind().execute(
                sqlalchemy.text(f'UPDATE {table_name} '
                                f'SET {column_name} = :replacement_value '
                                f'WHERE {column_name} IS NULL'),
                {'replacement_value': value_to_replace_existing_entries})
    except sqlalchemy_exc.ProgrammingError as e:
        # Postgres reports a pre-existing column as "already exists".
        if 'already exists' not in str(e).lower():
            raise
    except sqlalchemy_exc.OperationalError as e:
        # SQLite reports a pre-existing column as "duplicate column name".
        if 'duplicate column name' not in str(e).lower():
            raise
|
|
149
208
|
class SQLiteConn(threading.local):
|
|
150
209
|
"""Thread-local connection to the sqlite3 database."""
|
|
151
210
|
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Constants for the database schemas."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from alembic import command as alembic_command
|
|
8
|
+
from alembic.config import Config
|
|
9
|
+
from alembic.runtime import migration
|
|
10
|
+
import filelock
|
|
11
|
+
import sqlalchemy
|
|
12
|
+
|
|
13
|
+
DB_INIT_LOCK_TIMEOUT_SECONDS = 10
|
|
14
|
+
|
|
15
|
+
GLOBAL_USER_STATE_DB_NAME = 'state_db'
|
|
16
|
+
GLOBAL_USER_STATE_VERSION = '001'
|
|
17
|
+
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
|
|
18
|
+
|
|
19
|
+
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|
|
20
|
+
SPOT_JOBS_VERSION = '001'
|
|
21
|
+
SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@contextlib.contextmanager
def db_lock(db_name: str):
    """Hold a file lock serializing (re)initialization of ``db_name``.

    Args:
        db_name: Database name; the lock file lives at
            ``~/.sky/locks/.<db_name>.lock``.

    Raises:
        RuntimeError: if the lock cannot be acquired within
            DB_INIT_LOCK_TIMEOUT_SECONDS.
    """
    lock_path = os.path.expanduser(f'~/.sky/locks/.{db_name}.lock')
    # Bug fix: on a fresh machine ~/.sky/locks/ may not exist yet, and
    # filelock does not create parent directories for the lock file.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True)
    try:
        with filelock.FileLock(lock_path, timeout=DB_INIT_LOCK_TIMEOUT_SECONDS):
            yield
    except filelock.Timeout as e:
        raise RuntimeError(f'Failed to initialize database due to a timeout '
                           f'when trying to acquire the lock at '
                           f'{lock_path}. '
                           'Please try again or manually remove the lock '
                           f'file if you believe it is stale.') from e
+
|
|
38
|
+
def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
    """Get Alembic configuration for the given section"""
    # alembic.ini ships inside the wheel under sky/setup_files. This module
    # lives at sky/utils/db/migration_utils.py, so walk up three levels to
    # the 'sky' package root.
    sky_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    alembic_ini_path = os.path.join(sky_root, 'setup_files', 'alembic.ini')
    alembic_cfg = Config(alembic_ini_path, ini_section=section)

    # Point alembic at the same database SkyPilot is connected to.
    # render_as_string(hide_password=False) keeps credentials in the URL.
    db_url = engine.url.render_as_string(hide_password=False)
    alembic_cfg.set_section_option(section, 'sqlalchemy.url', db_url)

    return alembic_cfg
+
|
|
55
|
+
def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
                         alembic_config: Config, target_revision: str):
    """Only upgrade if current version is older than target.

    This handles the case where a database was created with a newer version of
    the code and we're now running older code. Since our migrations are purely
    additive, it's safe to run a newer database with older code.

    Args:
        engine: SQLAlchemy engine for the database
        alembic_config: Alembic configuration object
        target_revision: Target revision to upgrade to (e.g., '001')
    """
    # Quiet alembic's default INFO-level chatter.
    logging.getLogger('alembic').setLevel(logging.WARNING)

    # Each section may use its own version table; default to alembic's.
    version_table = alembic_config.get_section_option(
        alembic_config.config_ini_section, 'version_table', 'alembic_version')

    # Read the revision currently stamped in the database.
    with engine.connect() as connection:
        context = migration.MigrationContext.configure(
            connection, opts={'version_table': version_table})
        current_rev = context.get_current_revision()

    # Fresh database: upgrade straight to the target.
    if current_rev is None:
        alembic_command.upgrade(alembic_config, target_revision)
        return

    # Revisions are numeric strings like '001', '002'; upgrade only when
    # the database is behind the target.
    if int(current_rev) < int(target_revision):
        alembic_command.upgrade(alembic_config, target_revision)
|
sky/utils/locks.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""Lock for SkyPilot.
|
|
2
|
+
|
|
3
|
+
This module provides an abstraction for locking that can use
|
|
4
|
+
either local file locks or database-based distributed locks.
|
|
5
|
+
"""
|
|
6
|
+
import abc
|
|
7
|
+
import hashlib
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import time
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
import filelock
|
|
14
|
+
import sqlalchemy
|
|
15
|
+
|
|
16
|
+
from sky import global_user_state
|
|
17
|
+
from sky.skylet import constants
|
|
18
|
+
from sky.utils import common_utils
|
|
19
|
+
from sky.utils.db import db_utils
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LockTimeout(RuntimeError):
    """Raised when a lock acquisition times out."""
|
|
28
|
+
|
|
29
|
+
class AcquireReturnProxy:
    """Context manager returned by ``acquire()``.

    Wraps a lock so that ``with lock.acquire(): ...`` releases the lock
    automatically on exit.
    """

    def __init__(self, lock: 'DistributedLock') -> None:
        self.lock = lock

    def __enter__(self) -> 'DistributedLock':
        # Hand back the underlying lock so callers can inspect it.
        return self.lock

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        # Always release, whether or not the body raised.
        self.lock.release()
+
|
|
46
|
+
class DistributedLock(abc.ABC):
|
|
47
|
+
"""Abstract base class for a distributed lock.
|
|
48
|
+
|
|
49
|
+
Provides a context manager interface for acquiring and releasing locks
|
|
50
|
+
that can work across multiple processes and potentially multiple machines.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(self,
|
|
54
|
+
lock_id: str,
|
|
55
|
+
timeout: Optional[float] = None,
|
|
56
|
+
poll_interval: float = 0.1):
|
|
57
|
+
"""Initialize the lock.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
lock_id: Unique identifier for the lock.
|
|
61
|
+
timeout: Maximum time to wait for lock acquisition.
|
|
62
|
+
If None, wait indefinitely.
|
|
63
|
+
poll_interval: Interval in seconds to poll for lock acquisition.
|
|
64
|
+
"""
|
|
65
|
+
self.lock_id = lock_id
|
|
66
|
+
self.timeout = timeout
|
|
67
|
+
self.poll_interval = poll_interval
|
|
68
|
+
|
|
69
|
+
@abc.abstractmethod
|
|
70
|
+
def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
|
|
71
|
+
"""Acquire the lock.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
blocking: If True, block until lock is acquired or timeout.
|
|
75
|
+
If False, return immediately.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
AcquireReturnProxy that can be used as a context manager.
|
|
79
|
+
|
|
80
|
+
Raises:
|
|
81
|
+
LockTimeout: If lock cannot be acquired.
|
|
82
|
+
"""
|
|
83
|
+
pass
|
|
84
|
+
|
|
85
|
+
@abc.abstractmethod
|
|
86
|
+
def release(self) -> None:
|
|
87
|
+
"""Release the lock."""
|
|
88
|
+
pass
|
|
89
|
+
|
|
90
|
+
@abc.abstractmethod
|
|
91
|
+
def force_unlock(self) -> None:
|
|
92
|
+
"""Force unlock the lock if it is acquired."""
|
|
93
|
+
pass
|
|
94
|
+
|
|
95
|
+
@abc.abstractmethod
|
|
96
|
+
def is_locked(self) -> bool:
|
|
97
|
+
"""Check if the lock is acquired."""
|
|
98
|
+
pass
|
|
99
|
+
|
|
100
|
+
def __enter__(self) -> 'DistributedLock':
|
|
101
|
+
"""Context manager entry."""
|
|
102
|
+
self.acquire()
|
|
103
|
+
return self
|
|
104
|
+
|
|
105
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
|
106
|
+
"""Context manager exit."""
|
|
107
|
+
self.release()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class FileLock(DistributedLock):
    """Distributed lock backed by ``filelock.FileLock``.

    Works across multiple processes that share the same filesystem.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Initialize the file lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        super().__init__(lock_id, timeout, poll_interval)
        os.makedirs(constants.SKY_LOCKS_DIR, exist_ok=True)
        self.lock_path = os.path.join(constants.SKY_LOCKS_DIR,
                                      f'.{lock_id}.lock')
        # filelock uses -1 to mean "wait forever".
        if timeout is None:
            timeout = -1
        self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
                                                              timeout=timeout)

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the file lock."""
        try:
            acquired = self._filelock.acquire(blocking=blocking)
            if not acquired:
                raise LockTimeout(f'Failed to acquire file lock {self.lock_id}')
            return AcquireReturnProxy(self)
        except filelock.Timeout as e:
            raise LockTimeout(
                f'Failed to acquire file lock {self.lock_id}') from e

    def release(self) -> None:
        """Release the file lock."""
        self._filelock.release()

    def force_unlock(self) -> None:
        """Force unlock by removing the lock file."""
        common_utils.remove_file_if_exists(self.lock_path)

    def is_locked(self) -> bool:
        """Check whether this lock is currently held."""
        # Bug fix: filelock's `is_locked` is a property, not a method.
        # Calling it (`self._filelock.is_locked()`) raised
        # `TypeError: 'bool' object is not callable`.
        return self._filelock.is_locked
|
+
|
|
160
|
+
class PostgresLock(DistributedLock):
    """PostgreSQL advisory lock implementation.

    Uses PostgreSQL advisory locks to implement distributed locking
    that works across multiple machines sharing the same database.
    Reference:
    https://www.postgresql.org/docs/current/explicit-locking.html
    #ADVISORY-LOCKS
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 1):
        """Initialize the postgres lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Advisory locks are keyed by integers, not strings.
        self._lock_key = self._string_to_lock_key(lock_id)
        self._acquired = False
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None

    def _string_to_lock_key(self, s: str) -> int:
        """Convert string to a 64-bit integer for advisory lock key."""
        hash_digest = hashlib.sha256(s.encode('utf-8')).digest()
        # Take first 8 bytes and convert to int, ensure positive 64-bit.
        return int.from_bytes(hash_digest[:8], 'big') & ((1 << 63) - 1)

    def _get_connection(self) -> sqlalchemy.pool.PoolProxiedConnection:
        """Get database connection; requires a PostgreSQL backend."""
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
        return engine.raw_connection()

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the postgres advisory lock."""
        # Advisory locks are re-entrant per session; treat a second acquire
        # on an already-held instance as a no-op.
        if self._acquired:
            return AcquireReturnProxy(self)

        self._connection = self._get_connection()
        cursor = self._connection.cursor()

        start_time = time.time()

        try:
            while True:
                # Non-blocking attempt; returns True iff the lock was taken.
                cursor.execute('SELECT pg_try_advisory_lock(%s)',
                               (self._lock_key,))
                result = cursor.fetchone()[0]

                if result:
                    self._acquired = True
                    return AcquireReturnProxy(self)

                if not blocking:
                    raise LockTimeout(
                        f'Failed to immediately acquire postgres lock '
                        f'{self.lock_id}')

                if (self.timeout is not None and
                        time.time() - start_time > self.timeout):
                    raise LockTimeout(
                        f'Failed to acquire postgres lock {self.lock_id} '
                        f'within {self.timeout} seconds')

                time.sleep(self.poll_interval)

        except Exception:
            # Do not leak the connection on failure paths (including the
            # LockTimeout raises above).
            if self._connection:
                self._connection.close()
                self._connection = None
            raise

    def release(self) -> None:
        """Release the postgres advisory lock."""
        if not self._acquired or not self._connection:
            return

        try:
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            self._connection.commit()
        finally:
            # Bug fix: reset state even if the unlock query raises.
            # Previously _acquired stayed True while the connection was
            # closed, so later acquire() calls would no-op forever while
            # the server-side lock leaked.
            self._acquired = False
            if self._connection:
                self._connection.close()
                self._connection = None

    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock."""
        try:
            if not self._connection:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            self._connection.commit()
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
            if self._connection:
                self._connection.close()
                self._connection = None

    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired."""
        return self._acquired
+
|
|
277
|
+
|
|
278
|
+
def get_lock(lock_id: str,
             timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
             poll_interval: Optional[float] = None) -> DistributedLock:
    """Create a distributed lock instance.

    Args:
        lock_id: Unique identifier for the lock.
        timeout: Maximum time seconds to wait for lock acquisition,
            None means wait indefinitely.
        lock_type: Type of lock to create ('filelock' or 'postgres').
            If None, auto-detect based on database configuration.
        poll_interval: Interval in seconds to poll for lock acquisition.
            If None, the lock class's own default is used.

    Returns:
        DistributedLock instance.
    """
    if lock_type is None:
        lock_type = _detect_lock_type()

    if lock_type == 'postgres':
        lock_cls = PostgresLock
    elif lock_type == 'filelock':
        lock_cls = FileLock
    else:
        raise ValueError(f'Unknown lock type: {lock_type}')

    # Omit poll_interval entirely when unspecified so each class keeps
    # its own default (1s for postgres, 0.1s for filelock).
    if poll_interval is None:
        return lock_cls(lock_id, timeout)
    return lock_cls(lock_id, timeout, poll_interval)
+
|
|
309
|
+
def _detect_lock_type() -> str:
|
|
310
|
+
"""Auto-detect the appropriate lock type based on configuration."""
|
|
311
|
+
try:
|
|
312
|
+
engine = global_user_state.initialize_and_get_db()
|
|
313
|
+
if engine.dialect.name == db_utils.SQLAlchemyDialect.POSTGRESQL.value:
|
|
314
|
+
return 'postgres'
|
|
315
|
+
except Exception: # pylint: disable=broad-except
|
|
316
|
+
# Fall back to filelock if database detection fails
|
|
317
|
+
pass
|
|
318
|
+
|
|
319
|
+
return 'filelock'
|