skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; see the registry's advisory for more details.

Files changed (160)
  1. sky/__init__.py +4 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +50 -24
  4. sky/backends/cloud_vm_ray_backend.py +41 -38
  5. sky/catalog/__init__.py +3 -1
  6. sky/catalog/aws_catalog.py +8 -5
  7. sky/catalog/azure_catalog.py +8 -5
  8. sky/catalog/common.py +8 -2
  9. sky/catalog/cudo_catalog.py +5 -2
  10. sky/catalog/do_catalog.py +4 -1
  11. sky/catalog/fluidstack_catalog.py +5 -2
  12. sky/catalog/gcp_catalog.py +8 -5
  13. sky/catalog/hyperbolic_catalog.py +5 -2
  14. sky/catalog/ibm_catalog.py +8 -5
  15. sky/catalog/lambda_catalog.py +8 -5
  16. sky/catalog/nebius_catalog.py +8 -5
  17. sky/catalog/oci_catalog.py +8 -5
  18. sky/catalog/paperspace_catalog.py +4 -1
  19. sky/catalog/runpod_catalog.py +5 -2
  20. sky/catalog/scp_catalog.py +8 -5
  21. sky/catalog/vast_catalog.py +5 -2
  22. sky/catalog/vsphere_catalog.py +4 -1
  23. sky/client/cli/command.py +63 -25
  24. sky/client/sdk.py +61 -11
  25. sky/clouds/aws.py +12 -7
  26. sky/clouds/azure.py +12 -7
  27. sky/clouds/cloud.py +9 -8
  28. sky/clouds/cudo.py +13 -7
  29. sky/clouds/do.py +12 -7
  30. sky/clouds/fluidstack.py +11 -6
  31. sky/clouds/gcp.py +12 -7
  32. sky/clouds/hyperbolic.py +11 -6
  33. sky/clouds/ibm.py +11 -6
  34. sky/clouds/kubernetes.py +7 -3
  35. sky/clouds/lambda_cloud.py +11 -6
  36. sky/clouds/nebius.py +14 -12
  37. sky/clouds/oci.py +12 -7
  38. sky/clouds/paperspace.py +12 -7
  39. sky/clouds/runpod.py +12 -7
  40. sky/clouds/scp.py +11 -6
  41. sky/clouds/vast.py +14 -8
  42. sky/clouds/vsphere.py +11 -6
  43. sky/core.py +6 -1
  44. sky/dashboard/out/404.html +1 -1
  45. sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  48. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  52. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  56. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  57. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  59. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  61. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  62. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  64. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  66. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  67. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  68. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  71. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  73. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  74. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  75. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  76. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  77. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
  78. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  79. sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
  80. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  81. sky/dashboard/out/clusters/[cluster].html +1 -1
  82. sky/dashboard/out/clusters.html +1 -1
  83. sky/dashboard/out/config.html +1 -1
  84. sky/dashboard/out/index.html +1 -1
  85. sky/dashboard/out/infra/[context].html +1 -1
  86. sky/dashboard/out/infra.html +1 -1
  87. sky/dashboard/out/jobs/[job].html +1 -1
  88. sky/dashboard/out/jobs.html +1 -1
  89. sky/dashboard/out/users.html +1 -1
  90. sky/dashboard/out/volumes.html +1 -1
  91. sky/dashboard/out/workspace/new.html +1 -1
  92. sky/dashboard/out/workspaces/[name].html +1 -1
  93. sky/dashboard/out/workspaces.html +1 -1
  94. sky/data/mounting_utils.py +93 -32
  95. sky/global_user_state.py +12 -143
  96. sky/jobs/state.py +9 -88
  97. sky/jobs/utils.py +28 -13
  98. sky/provision/nebius/utils.py +3 -6
  99. sky/schemas/db/README +4 -0
  100. sky/schemas/db/env.py +90 -0
  101. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  102. sky/schemas/db/script.py.mako +28 -0
  103. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  104. sky/serve/client/sdk.py +6 -2
  105. sky/serve/controller.py +7 -3
  106. sky/serve/serve_state.py +1 -1
  107. sky/serve/serve_utils.py +171 -75
  108. sky/serve/server/core.py +17 -6
  109. sky/server/common.py +4 -3
  110. sky/server/requests/payloads.py +2 -0
  111. sky/server/requests/requests.py +1 -1
  112. sky/setup_files/MANIFEST.in +2 -0
  113. sky/setup_files/alembic.ini +148 -0
  114. sky/setup_files/dependencies.py +1 -0
  115. sky/skylet/configs.py +1 -1
  116. sky/skylet/constants.py +4 -0
  117. sky/skylet/job_lib.py +1 -1
  118. sky/skypilot_config.py +1 -1
  119. sky/users/permission.py +1 -1
  120. sky/utils/common_utils.py +85 -3
  121. sky/utils/config_utils.py +15 -0
  122. sky/utils/db/__init__.py +0 -0
  123. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  124. sky/utils/db/migration_utils.py +93 -0
  125. sky/utils/locks.py +319 -0
  126. sky/utils/schemas.py +38 -34
  127. sky/utils/timeline.py +41 -0
  128. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
  130. sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  132. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  135. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  137. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  139. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  140. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  142. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  143. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  146. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  147. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
  155. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  156. /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
  157. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
  158. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
  159. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
  160. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/utils/common_utils.py CHANGED
@@ -369,6 +369,83 @@ def get_pretty_entrypoint_cmd() -> str:
369
369
  return ' '.join(argv)
370
370
 
371
371
 
372
def read_last_n_lines(file_path: str,
                      n: int,
                      chunk_size: int = 8192,
                      encoding: str = 'utf-8',
                      errors: str = 'replace') -> List[str]:
    """Read the last N lines of a file.

    Reads the file backwards in fixed-size chunks so only the tail of the
    file is loaded into memory, which keeps large log files cheap to query.

    Args:
        file_path: Path to the file to read.
        n: Number of lines to read from the end of the file.
        chunk_size: Size of chunks in bytes.
        encoding: Encoding to use when decoding binary chunks.
        errors: Error handling for decode errors (e.g., 'replace', 'ignore').

    Returns:
        A list of the last N lines. Every line that was terminated by a
        newline in the file keeps its trailing newline; a final
        unterminated line is returned without one. CR/LF endings are
        normalized to plain newlines.

    Raises:
        RuntimeError: If the file cannot be read due to an OS-level error.
    """

    assert n >= 0, f'n must be non-negative. Got {n}'
    assert chunk_size > 0, f'chunk_size must be positive. Got {chunk_size}'
    assert os.path.exists(file_path), f'File not found: {file_path}'

    if n == 0:
        return []

    try:
        with open(file_path, 'rb') as f:
            # Start reading from the end of the file.
            f.seek(0, os.SEEK_END)
            file_size = f.tell()
            if file_size == 0:
                return []

            pos = file_size
            lines_found = 0
            chunks = []

            # Read backwards in chunks until we've found at least n newlines.
            while pos > 0 and lines_found <= n:
                read_size = min(chunk_size, pos)
                pos -= read_size
                f.seek(pos)
                chunk = f.read(read_size)
                chunks.append(chunk)
                lines_found += chunk.count(b'\n')

            # Combine all chunks in reverse order since we read backwards.
            full_bytes = b''.join(reversed(chunks))

            # Split by newline byte. Note: this handles '\n' endings.
            all_lines = full_bytes.split(b'\n')

            # If the file ends with a newline, the final split element is
            # b''. Drop it, but remember that the real last line was
            # newline-terminated so that newline can be preserved below.
            # (Previously the terminator of the last line was silently
            # dropped, contradicting the documented contract.)
            if all_lines and all_lines[-1] == b'':
                result_bytes = all_lines[-n - 1:-1]
                last_has_newline = True
            else:
                result_bytes = all_lines[-n:]
                last_has_newline = False

            # Decode each line and normalize CR/LF endings.
            decoded_lines = [
                line.decode(encoding, errors=errors).rstrip('\r') + '\n'
                for line in result_bytes[:-1]
            ]

            # Decode the final line; re-attach its newline only if it was
            # actually present in the file.
            last_line = result_bytes[-1].decode(encoding,
                                                errors=errors).rstrip('\r')
            if last_has_newline:
                last_line += '\n'
            decoded_lines.append(last_line)

            return decoded_lines

    except OSError as e:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to read last {n} lines from {file_path}: {e}') from e
447
+
448
+
372
449
  def _redact_secrets_values(argv: List[str]) -> List[str]:
373
450
  """Redact sensitive values from --secret arguments.
374
451
 
@@ -485,8 +562,9 @@ def read_yaml_all(path: str) -> List[Dict[str, Any]]:
485
562
  return read_yaml_all_str(f.read())
486
563
 
487
564
 
488
- def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
489
- Dict[str, Any]]) -> None:
565
+ def dump_yaml(path: str,
566
+ config: Union[List[Dict[str, Any]], Dict[str, Any]],
567
+ blank: bool = False) -> None:
490
568
  """Dumps a YAML file.
491
569
 
492
570
  Args:
@@ -494,7 +572,11 @@ def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
494
572
  config: the configuration to dump.
495
573
  """
496
574
  with open(path, 'w', encoding='utf-8') as f:
497
- f.write(dump_yaml_str(config))
575
+ contents = dump_yaml_str(config)
576
+ if blank and isinstance(config, dict) and len(config) == 0:
577
+ # when dumping to yaml, an empty dict will go in as {}.
578
+ contents = ''
579
+ f.write(contents)
498
580
 
499
581
 
500
582
  def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
sky/utils/config_utils.py CHANGED
@@ -248,6 +248,8 @@ def get_cloud_config_value_from_dict(
248
248
  region_key = None
249
249
  if cloud == 'kubernetes':
250
250
  region_key = 'context_configs'
251
+ if cloud == 'nebius':
252
+ region_key = 'region_configs'
251
253
 
252
254
  per_context_config = None
253
255
  if region is not None and region_key is not None:
@@ -255,6 +257,19 @@ def get_cloud_config_value_from_dict(
255
257
  keys=(cloud, region_key, region) + keys,
256
258
  default_value=None,
257
259
  override_configs=override_configs)
260
+ if not per_context_config and cloud == 'nebius':
261
+ # TODO (kyuds): Backward compatibility, remove after 0.11.0.
262
+ per_context_config = input_config.get_nested(
263
+ keys=(cloud, region) + keys,
264
+ default_value=None,
265
+ override_configs=override_configs)
266
+ if per_context_config is not None:
267
+ logger.info(
268
+ 'Nebius configuration is using the legacy format. \n'
269
+ 'This format will be deprecated after 0.11.0, refer to '
270
+ '`https://docs.skypilot.co/en/latest/reference/config.html#nebius` ' # pylint: disable=line-too-long
271
+ 'for the new format. Please use `region_configs` to specify region specific configuration.'
272
+ )
258
273
  # if no override found for specified region
259
274
  general_config = input_config.get_nested(keys=(cloud,) + keys,
260
275
  default_value=default_value,
File without changes
@@ -9,6 +9,9 @@ from typing import Any, Callable, Optional
9
9
  import sqlalchemy
10
10
  from sqlalchemy import exc as sqlalchemy_exc
11
11
 
12
+ from sky import sky_logging
13
+
14
+ logger = sky_logging.init_logger(__name__)
12
15
  if typing.TYPE_CHECKING:
13
16
  from sqlalchemy.orm import Session
14
17
 
@@ -146,6 +149,62 @@ def add_column_to_table_sqlalchemy(
146
149
  session.commit()
147
150
 
148
151
 
152
def add_column_to_table_alembic(
    table_name: str,
    column_name: str,
    column_type: sqlalchemy.types.TypeEngine,
    server_default: Optional[str] = None,
    copy_from: Optional[str] = None,
    value_to_replace_existing_entries: Optional[Any] = None,
):
    """Add a column to a table using Alembic operations.

    Mirrors the interface of add_column_to_table_sqlalchemy, but performs
    the DDL through Alembic's migration connection so it can be used from
    migration scripts.

    Args:
        table_name: Name of the table to add column to.
        column_name: Name of the new column.
        column_type: SQLAlchemy column type.
        server_default: Server-side default value for the column.
        copy_from: Column name to copy values from (for existing rows).
        value_to_replace_existing_entries: Default value for existing NULL
            entries.
    """
    from alembic import op  # pylint: disable=import-outside-toplevel

    try:
        # Create the column first; server_default (if any) applies to rows
        # inserted after this DDL runs.
        op.add_column(
            table_name,
            sqlalchemy.Column(column_name,
                              column_type,
                              server_default=server_default))

        # Backfill existing rows from another column, if requested.
        if copy_from is not None:
            op.execute(
                sqlalchemy.text(
                    f'UPDATE {table_name} SET {column_name} = {copy_from}'))

        # Then fill any remaining NULLs, binding the value as a parameter.
        if value_to_replace_existing_entries is not None:
            op.get_bind().execute(
                sqlalchemy.text(f'UPDATE {table_name} '
                                f'SET {column_name} = :replacement_value '
                                f'WHERE {column_name} IS NULL'),
                {'replacement_value': value_to_replace_existing_entries})
    except sqlalchemy_exc.ProgrammingError as e:
        # E.g. PostgreSQL reports an existing column as ProgrammingError;
        # treat that as a no-op and re-raise anything else.
        if 'already exists' not in str(e).lower():
            raise
    except sqlalchemy_exc.OperationalError as e:
        # E.g. SQLite reports 'duplicate column name'; treat that as a
        # no-op and re-raise anything else.
        if 'duplicate column name' not in str(e).lower():
            raise
206
+
207
+
149
208
  class SQLiteConn(threading.local):
150
209
  """Thread-local connection to the sqlite3 database."""
151
210
 
@@ -0,0 +1,93 @@
1
+ """Constants for the database schemas."""
2
+
3
+ import contextlib
4
+ import logging
5
+ import os
6
+
7
+ from alembic import command as alembic_command
8
+ from alembic.config import Config
9
+ from alembic.runtime import migration
10
+ import filelock
11
+ import sqlalchemy
12
+
13
+ DB_INIT_LOCK_TIMEOUT_SECONDS = 10
14
+
15
+ GLOBAL_USER_STATE_DB_NAME = 'state_db'
16
+ GLOBAL_USER_STATE_VERSION = '001'
17
+ GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
18
+
19
+ SPOT_JOBS_DB_NAME = 'spot_jobs_db'
20
+ SPOT_JOBS_VERSION = '001'
21
+ SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
22
+
23
+
24
@contextlib.contextmanager
def db_lock(db_name: str):
    """Serialize database initialization/migration across processes.

    Acquires a per-database file lock under ~/.sky/locks so concurrent
    processes do not run schema creation or migration at the same time.

    Args:
        db_name: Name of the database (e.g. 'state_db'), used to derive the
            lock file path.

    Raises:
        RuntimeError: If the lock cannot be acquired within
            DB_INIT_LOCK_TIMEOUT_SECONDS.
    """
    lock_path = os.path.expanduser(f'~/.sky/locks/.{db_name}.lock')
    # filelock creates the lock file but not its parent directory; without
    # this, the first use on a fresh machine fails with FileNotFoundError.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True)
    try:
        with filelock.FileLock(lock_path, timeout=DB_INIT_LOCK_TIMEOUT_SECONDS):
            yield
    except filelock.Timeout as e:
        raise RuntimeError(f'Failed to initialize database due to a timeout '
                           f'when trying to acquire the lock at '
                           f'{lock_path}. '
                           'Please try again or manually remove the lock '
                           f'file if you believe it is stale.') from e
36
+
37
+
38
def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
    """Build the Alembic configuration for the given ini section.

    Loads the alembic.ini bundled under sky/setup_files (included in the
    wheel) and points its sqlalchemy.url at the engine SkyPilot is
    currently connected to.
    """
    # sky/utils/db/migration_utils.py -> sky/ -> sky/setup_files/alembic.ini
    sky_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    alembic_ini_path = os.path.join(sky_root, 'setup_files', 'alembic.ini')
    config = Config(alembic_ini_path, ini_section=section)

    # render_as_string(hide_password=False) yields the complete URL,
    # credentials included, so Alembic connects to the same database.
    db_url = engine.url.render_as_string(hide_password=False)
    config.set_section_option(section, 'sqlalchemy.url', db_url)

    return config
53
+
54
+
55
def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
                         alembic_config: Config, target_revision: str):
    """Only upgrade if current version is older than target.

    This handles the case where a database was created with a newer version of
    the code and we're now running older code. Since our migrations are purely
    additive, it's safe to run a newer database with older code.

    Args:
        engine: SQLAlchemy engine for the database
        alembic_config: Alembic configuration object
        target_revision: Target revision to upgrade to (e.g., '001')
    """
    # set alembic logger to warning level
    # (suppresses alembic's chatty INFO output during normal startup)
    alembic_logger = logging.getLogger('alembic')
    alembic_logger.setLevel(logging.WARNING)

    current_rev = None

    # Get the current revision from the database
    # The version table name comes from the per-db ini section so each
    # database (state_db / spot_jobs_db) tracks its own revision.
    version_table = alembic_config.get_section_option(
        alembic_config.config_ini_section, 'version_table', 'alembic_version')

    # Short-lived connection purely to read the stamped revision; the
    # actual upgrade below opens its own connection via alembic.
    with engine.connect() as connection:
        context = migration.MigrationContext.configure(
            connection, opts={'version_table': version_table})
        current_rev = context.get_current_revision()

    # No stamped revision means a fresh (or pre-alembic) database: run the
    # full upgrade unconditionally.
    if current_rev is None:
        alembic_command.upgrade(alembic_config, target_revision)
        return

    # Compare revisions - assuming they are numeric strings like '001', '002'
    # NOTE(review): int() raises ValueError for non-numeric revision ids
    # (e.g. alembic's default hex ids) — this relies on the project always
    # naming revisions numerically; confirm before adding new migrations.
    current_rev_num = int(current_rev)
    target_rev_num = int(target_revision)

    # only upgrade if current revision is older than target revision
    # (a newer-than-target database is left untouched, per the docstring)
    if current_rev_num < target_rev_num:
        alembic_command.upgrade(alembic_config, target_revision)
sky/utils/locks.py ADDED
@@ -0,0 +1,319 @@
1
+ """Lock for SkyPilot.
2
+
3
+ This module provides an abstraction for locking that can use
4
+ either local file locks or database-based distributed locks.
5
+ """
6
+ import abc
7
+ import hashlib
8
+ import logging
9
+ import os
10
+ import time
11
+ from typing import Any, Optional
12
+
13
+ import filelock
14
+ import sqlalchemy
15
+
16
+ from sky import global_user_state
17
+ from sky.skylet import constants
18
+ from sky.utils import common_utils
19
+ from sky.utils.db import db_utils
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class LockTimeout(RuntimeError):
    """Raised when a lock cannot be acquired before its timeout expires."""
27
+
28
+
29
class AcquireReturnProxy:
    """Context-manager wrapper returned by acquire().

    Entering the wrapper hands back the underlying lock object; leaving it
    releases the lock, so ``with lock.acquire():`` always cleans up.
    """

    def __init__(self, lock: 'DistributedLock') -> None:
        self.lock = lock

    def __enter__(self) -> 'DistributedLock':
        return self.lock

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        del exc_type, exc_val, exc_tb  # Unused; release unconditionally.
        self.lock.release()
44
+
45
+
46
class DistributedLock(abc.ABC):
    """Interface for locks shared across processes (and possibly machines).

    Subclasses supply the acquisition/release mechanics; this base class
    stores the common configuration and layers the ``with`` protocol on
    top of acquire()/release().
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Store the lock configuration.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                If None, wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        self.lock_id = lock_id
        self.timeout = timeout
        self.poll_interval = poll_interval

    @abc.abstractmethod
    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the lock.

        Args:
            blocking: If True, block until the lock is acquired or the
                timeout elapses. If False, return immediately.

        Returns:
            AcquireReturnProxy usable as a context manager that releases
            the lock on exit.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """

    @abc.abstractmethod
    def release(self) -> None:
        """Release the lock."""

    @abc.abstractmethod
    def force_unlock(self) -> None:
        """Forcibly clear the lock, e.g. to recover from a stale holder."""

    @abc.abstractmethod
    def is_locked(self) -> bool:
        """Return whether the lock is currently acquired."""

    def __enter__(self) -> 'DistributedLock':
        """Acquire on entry so the lock itself works in a with-statement."""
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release on exit, regardless of exceptions in the body."""
        self.release()
108
+
109
+
110
class FileLock(DistributedLock):
    """A wrapper around filelock.FileLock.

    This implements a distributed lock that works across multiple processes
    when they share the same filesystem.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Initialize the file lock.

        Args:
            lock_id: Unique identifier for the lock; determines the lock
                file name under constants.SKY_LOCKS_DIR.
            timeout: Maximum time to wait for lock acquisition.
                None means wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        super().__init__(lock_id, timeout, poll_interval)
        os.makedirs(constants.SKY_LOCKS_DIR, exist_ok=True)
        self.lock_path = os.path.join(constants.SKY_LOCKS_DIR,
                                      f'.{lock_id}.lock')
        if timeout is None:
            # filelock uses -1 to mean "wait forever".
            timeout = -1
        self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
                                                              timeout=timeout)

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the file lock.

        Raises:
            LockTimeout: If the lock is held and blocking is False, or the
                configured timeout elapses.
        """
        try:
            acquired = self._filelock.acquire(blocking=blocking)
            if not acquired:
                raise LockTimeout(f'Failed to acquire file lock {self.lock_id}')
            return AcquireReturnProxy(self)
        except filelock.Timeout as e:
            raise LockTimeout(
                f'Failed to acquire file lock {self.lock_id}') from e

    def release(self) -> None:
        """Release the file lock."""
        self._filelock.release()

    def force_unlock(self) -> None:
        """Force unlock by deleting the lock file."""
        common_utils.remove_file_if_exists(self.lock_path)

    def is_locked(self) -> bool:
        """Check if the file lock is acquired.

        Bug fix: filelock.FileLock.is_locked is a property, not a method;
        the previous ``self._filelock.is_locked()`` raised
        TypeError ('bool' object is not callable) at runtime.
        """
        return self._filelock.is_locked
158
+
159
+
160
class PostgresLock(DistributedLock):
    """PostgreSQL advisory lock implementation.

    Uses PostgreSQL advisory locks to implement distributed locking
    that works across multiple machines sharing the same database.
    Reference:
    https://www.postgresql.org/docs/current/explicit-locking.html
    #ADVISORY-LOCKS
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 1):
        """Initialize the postgres lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Convert string lock_id to integer for postgres advisory locks
        # (pg advisory locks are keyed by integers, not strings).
        self._lock_key = self._string_to_lock_key(lock_id)
        self._acquired = False
        # Session-scoped advisory locks live as long as the connection that
        # took them, so the open connection effectively *is* the held lock.
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None

    def _string_to_lock_key(self, s: str) -> int:
        """Convert string to a 64-bit integer for advisory lock key."""
        hash_digest = hashlib.sha256(s.encode('utf-8')).digest()
        # Take first 8 bytes and convert to int, ensure positive 64-bit
        # (mask to 63 bits so it fits postgres's signed bigint key).
        return int.from_bytes(hash_digest[:8], 'big') & ((1 << 63) - 1)

    def _get_connection(self) -> sqlalchemy.pool.PoolProxiedConnection:
        """Get database connection.

        Raises:
            ValueError: If the configured state database is not PostgreSQL.
        """
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
        return engine.raw_connection()

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the postgres advisory lock.

        Polls pg_try_advisory_lock every poll_interval seconds until it
        succeeds, the timeout elapses, or (non-blocking) the first attempt
        fails.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """
        # Re-entrant fast path: already held by this instance.
        if self._acquired:
            return AcquireReturnProxy(self)

        self._connection = self._get_connection()
        # NOTE(review): the cursor is never closed explicitly; it is
        # reclaimed when the connection is closed in release(). Confirm
        # this is acceptable for the driver in use.
        cursor = self._connection.cursor()

        start_time = time.time()

        try:
            while True:
                # Non-blocking attempt; returns a boolean row.
                cursor.execute('SELECT pg_try_advisory_lock(%s)',
                               (self._lock_key,))
                result = cursor.fetchone()[0]

                if result:
                    # Keep the connection open: the advisory lock is tied
                    # to this session and would be lost if it closed.
                    self._acquired = True
                    return AcquireReturnProxy(self)

                if not blocking:
                    raise LockTimeout(
                        f'Failed to immediately acquire postgres lock '
                        f'{self.lock_id}')

                if (self.timeout is not None and
                        time.time() - start_time > self.timeout):
                    raise LockTimeout(
                        f'Failed to acquire postgres lock {self.lock_id} '
                        f'within {self.timeout} seconds')

                time.sleep(self.poll_interval)

        except Exception:
            # On any failure (including the LockTimeouts raised above),
            # close the connection so it is not leaked, then re-raise.
            if self._connection:
                self._connection.close()
            self._connection = None
            raise

    def release(self) -> None:
        """Release the postgres advisory lock.

        No-op if this instance does not currently hold the lock. Always
        closes the connection afterwards.
        """
        if not self._acquired or not self._connection:
            return

        try:
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            self._connection.commit()
            self._acquired = False
        finally:
            # Closing the session would drop the advisory lock anyway, but
            # unlocking first keeps the lock state explicit.
            if self._connection:
                self._connection.close()
            self._connection = None

    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock.

        NOTE(review): pg_advisory_unlock only releases locks held by the
        *current* session; when called without a prior acquire() this opens
        a fresh connection, so it cannot release a lock held by another
        session — confirm the intended semantics.

        Raises:
            RuntimeError: If the unlock attempt fails.
        """
        try:
            if not self._connection:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            self._connection.commit()
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
            if self._connection:
                self._connection.close()
            self._connection = None

    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired.

        Reflects only this instance's bookkeeping, not whether some other
        session holds the key.
        """
        return self._acquired
276
+
277
+
278
def get_lock(lock_id: str,
             timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
             poll_interval: Optional[float] = None) -> DistributedLock:
    """Create a distributed lock instance.

    Args:
        lock_id: Unique identifier for the lock.
        timeout: Maximum time in seconds to wait for lock acquisition;
            None means wait indefinitely.
        lock_type: Type of lock to create ('filelock' or 'postgres').
            If None, auto-detect based on database configuration.
        poll_interval: Seconds between acquisition attempts; if None, each
            lock class keeps its own default (1s postgres, 0.1s filelock).

    Returns:
        DistributedLock instance.

    Raises:
        ValueError: If lock_type names an unknown lock type.
    """
    chosen_type = lock_type if lock_type is not None else _detect_lock_type()

    if chosen_type == 'postgres':
        lock_cls = PostgresLock
    elif chosen_type == 'filelock':
        lock_cls = FileLock
    else:
        raise ValueError(f'Unknown lock type: {chosen_type}')

    # Omit poll_interval when unspecified so each class keeps its default.
    if poll_interval is None:
        return lock_cls(lock_id, timeout)
    return lock_cls(lock_id, timeout, poll_interval)
307
+
308
+
309
def _detect_lock_type() -> str:
    """Pick the lock backend from the configured state database.

    Returns 'postgres' when the global user state DB is PostgreSQL (so
    advisory locks can coordinate across machines); otherwise — including
    when the database cannot be reached or inspected — falls back to
    'filelock'.
    """
    try:
        engine = global_user_state.initialize_and_get_db()
        is_postgres = (engine.dialect.name ==
                       db_utils.SQLAlchemyDialect.POSTGRESQL.value)
    except Exception:  # pylint: disable=broad-except
        # Database detection failed; filesystem locking is the safe default.
        return 'filelock'
    return 'postgres' if is_postgres else 'filelock'