skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160)
  1. sky/__init__.py +4 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +50 -24
  4. sky/backends/cloud_vm_ray_backend.py +41 -38
  5. sky/catalog/__init__.py +3 -1
  6. sky/catalog/aws_catalog.py +8 -5
  7. sky/catalog/azure_catalog.py +8 -5
  8. sky/catalog/common.py +8 -2
  9. sky/catalog/cudo_catalog.py +5 -2
  10. sky/catalog/do_catalog.py +4 -1
  11. sky/catalog/fluidstack_catalog.py +5 -2
  12. sky/catalog/gcp_catalog.py +8 -5
  13. sky/catalog/hyperbolic_catalog.py +5 -2
  14. sky/catalog/ibm_catalog.py +8 -5
  15. sky/catalog/lambda_catalog.py +8 -5
  16. sky/catalog/nebius_catalog.py +8 -5
  17. sky/catalog/oci_catalog.py +8 -5
  18. sky/catalog/paperspace_catalog.py +4 -1
  19. sky/catalog/runpod_catalog.py +5 -2
  20. sky/catalog/scp_catalog.py +8 -5
  21. sky/catalog/vast_catalog.py +5 -2
  22. sky/catalog/vsphere_catalog.py +4 -1
  23. sky/client/cli/command.py +63 -25
  24. sky/client/sdk.py +61 -11
  25. sky/clouds/aws.py +12 -7
  26. sky/clouds/azure.py +12 -7
  27. sky/clouds/cloud.py +9 -8
  28. sky/clouds/cudo.py +13 -7
  29. sky/clouds/do.py +12 -7
  30. sky/clouds/fluidstack.py +11 -6
  31. sky/clouds/gcp.py +12 -7
  32. sky/clouds/hyperbolic.py +11 -6
  33. sky/clouds/ibm.py +11 -6
  34. sky/clouds/kubernetes.py +7 -3
  35. sky/clouds/lambda_cloud.py +11 -6
  36. sky/clouds/nebius.py +14 -12
  37. sky/clouds/oci.py +12 -7
  38. sky/clouds/paperspace.py +12 -7
  39. sky/clouds/runpod.py +12 -7
  40. sky/clouds/scp.py +11 -6
  41. sky/clouds/vast.py +14 -8
  42. sky/clouds/vsphere.py +11 -6
  43. sky/core.py +6 -1
  44. sky/dashboard/out/404.html +1 -1
  45. sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  48. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  52. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  56. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  57. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  59. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  61. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  62. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  64. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  66. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  67. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  68. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  71. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  73. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  74. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  75. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  76. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  77. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
  78. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  79. sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
  80. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  81. sky/dashboard/out/clusters/[cluster].html +1 -1
  82. sky/dashboard/out/clusters.html +1 -1
  83. sky/dashboard/out/config.html +1 -1
  84. sky/dashboard/out/index.html +1 -1
  85. sky/dashboard/out/infra/[context].html +1 -1
  86. sky/dashboard/out/infra.html +1 -1
  87. sky/dashboard/out/jobs/[job].html +1 -1
  88. sky/dashboard/out/jobs.html +1 -1
  89. sky/dashboard/out/users.html +1 -1
  90. sky/dashboard/out/volumes.html +1 -1
  91. sky/dashboard/out/workspace/new.html +1 -1
  92. sky/dashboard/out/workspaces/[name].html +1 -1
  93. sky/dashboard/out/workspaces.html +1 -1
  94. sky/data/mounting_utils.py +93 -32
  95. sky/global_user_state.py +12 -143
  96. sky/jobs/state.py +9 -88
  97. sky/jobs/utils.py +28 -13
  98. sky/provision/nebius/utils.py +3 -6
  99. sky/schemas/db/README +4 -0
  100. sky/schemas/db/env.py +90 -0
  101. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  102. sky/schemas/db/script.py.mako +28 -0
  103. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  104. sky/serve/client/sdk.py +6 -2
  105. sky/serve/controller.py +7 -3
  106. sky/serve/serve_state.py +1 -1
  107. sky/serve/serve_utils.py +171 -75
  108. sky/serve/server/core.py +17 -6
  109. sky/server/common.py +4 -3
  110. sky/server/requests/payloads.py +2 -0
  111. sky/server/requests/requests.py +1 -1
  112. sky/setup_files/MANIFEST.in +2 -0
  113. sky/setup_files/alembic.ini +148 -0
  114. sky/setup_files/dependencies.py +1 -0
  115. sky/skylet/configs.py +1 -1
  116. sky/skylet/constants.py +4 -0
  117. sky/skylet/job_lib.py +1 -1
  118. sky/skypilot_config.py +1 -1
  119. sky/users/permission.py +1 -1
  120. sky/utils/common_utils.py +85 -3
  121. sky/utils/config_utils.py +15 -0
  122. sky/utils/db/__init__.py +0 -0
  123. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  124. sky/utils/db/migration_utils.py +93 -0
  125. sky/utils/locks.py +319 -0
  126. sky/utils/schemas.py +38 -34
  127. sky/utils/timeline.py +41 -0
  128. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
  130. sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  132. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  135. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  137. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  139. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  140. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  142. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  143. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  146. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  147. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
  155. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  156. /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
  157. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
  158. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
  159. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
  160. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '663a28261fc98dfa69214e1d4f1b0bb7b02664e0'
+_SKYPILOT_COMMIT_SHA = '874bc28c3a4b7322d30cfc544b257647379b59ed'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250718'
+__version__ = '1.0.0.dev20250723'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -104,6 +104,7 @@ from sky.client.sdk import job_status
 from sky.client.sdk import launch
 from sky.client.sdk import optimize
 from sky.client.sdk import queue
+from sky.client.sdk import reload_config
 from sky.client.sdk import start
 from sky.client.sdk import status
 from sky.client.sdk import stop
@@ -185,6 +186,7 @@ __all__ = [
    'optimize',
    'launch',
    'exec',
+    'reload_config',
    # core APIs
    'status',
    'start',
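
The only API-surface change here is the new top-level export `reload_config`, re-exported from `sky.client.sdk`. A minimal usage sketch, assuming the function takes no required arguments and simply re-reads the client's SkyPilot config (its body is not part of this diff):

import sky

sky.reload_config()  # hypothetical call; signature assumed from the bare import above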
sky/admin_policy.py CHANGED
@@ -121,11 +121,17 @@ class MutatedUserRequest:
             dict(self.skypilot_config),)).model_dump_json()
 
     @classmethod
-    def decode(cls, mutated_user_request_body: str) -> 'MutatedUserRequest':
+    def decode(cls, mutated_user_request_body: str,
+               original_request: UserRequest) -> 'MutatedUserRequest':
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
-        return cls(task=sky.Task.from_yaml_config(
-            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0]),
+        task = sky.Task.from_yaml_config(
+            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
+        # Some internal Task fields are not serialized. We need to manually
+        # restore them from the original request.
+        task.managed_job_dag = original_request.task.managed_job_dag
+        task.service_name = original_request.task.service_name
+        return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
                        common_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))
@@ -243,7 +249,8 @@ class RestfulAdminPolicy(PolicyTemplate):
                     f'{self.policy_url}: {e}') from None
 
         try:
-            mutated_user_request = MutatedUserRequest.decode(response.json())
+            mutated_user_request = MutatedUserRequest.decode(
+                response.json(), user_request)
         except Exception as e:  # pylint: disable=broad-except
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.RestfulPolicyError(
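
The extra `original_request` argument exists because some `Task` fields (`managed_job_dag`, `service_name`) do not survive the YAML round trip, so `decode()` copies them back from the caller's copy. A self-contained sketch of this restore-transient-fields pattern, using illustrative names rather than the actual SkyPilot classes:

import dataclasses
import json
from typing import Any, Optional

@dataclasses.dataclass
class Task:
    name: str
    managed_job_dag: Optional[Any] = None  # transient: never serialized

def encode(task: Task) -> str:
    return json.dumps({'name': task.name})

def decode(body: str, original: Task) -> Task:
    task = Task(**json.loads(body))
    # Restore the transient field from the caller's original object,
    # mirroring what MutatedUserRequest.decode() now does.
    task.managed_job_dag = original.managed_job_dag
    return task

t = Task('train', managed_job_dag=object())
assert decode(encode(t), t).managed_job_dag is t.managed_job_dag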
sky/backends/backend_utils.py CHANGED
@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 import uuid
 
 import colorama
-import filelock
 from packaging import version
 from typing_extensions import Literal
 
@@ -45,6 +44,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
@@ -104,23 +104,18 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
 # machine with no internet connection.
 # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python  # pylint: disable=line-too-long
-_TEST_IP_LIST = ['https://1.1.1.1', 'https://8.8.8.8']
+_TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
 
 # Allow each CPU thread take 2 tasks.
 # Note: This value cannot be too small, otherwise OOM issue may occur.
 DEFAULT_TASK_CPU_DEMAND = 0.5
 
-# Filelocks for the cluster status change.
-CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
 CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 
 # Time that must elapse since the last status check before we should re-check if
 # the cluster has been terminated or autostopped.
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
-# Filelocks for updating cluster's file_mounts.
-CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
-    '~/.sky/.{}_file_mounts.lock')
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
@@ -1635,18 +1630,28 @@ def get_node_ips(cluster_yaml: str,
 
 def check_network_connection():
     # Tolerate 3 retries as it is observed that connections can fail.
-    adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
     http = requests.Session()
-    http.mount('https://', adapter)
-    http.mount('http://', adapter)
-    for i, ip in enumerate(_TEST_IP_LIST):
-        try:
-            http.head(ip, timeout=3)
-            return
-        except (requests.Timeout, requests.exceptions.ConnectionError) as e:
-            if i == len(_TEST_IP_LIST) - 1:
-                raise exceptions.NetworkError('Could not refresh the cluster. '
-                                              'Network seems down.') from e
+    http.mount('https://', adapters.HTTPAdapter())
+    http.mount('http://', adapters.HTTPAdapter())
+
+    # Alternate between IPs on each retry
+    max_retries = 3
+    timeout = 0.5
+
+    for _ in range(max_retries):
+        for ip in _TEST_IP_LIST:
+            try:
+                http.head(ip, timeout=timeout)
+                return
+            except (requests.Timeout, requests.exceptions.ConnectionError):
+                continue
+
+        timeout *= 2  # Double the timeout for next retry
+
+    # If we get here, all IPs failed
+    # Assume network connection is down
+    raise exceptions.NetworkError('Could not refresh the cluster. '
+                                  'Network seems down.')
 
 
 @timeline.event
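
A note on the rewritten probe loop: the per-request urllib3 retries are gone, replaced by three explicit rounds over both endpoints with a doubling timeout. Back-of-envelope worst case, assuming every probe burns its full timeout and connection setup is free:

total = sum(2 * 0.5 * (2 ** i) for i in range(3))  # 2 endpoints per round; 0.5s, 1s, 2s
print(total)  # 7.0 seconds of probing before NetworkError is raised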
@@ -1995,9 +2000,20 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
         total_nodes = handle.launched_nodes * handle.num_ips_per_node
 
+        cloud_name = repr(handle.launched_resources.cloud).lower()
         for i in range(5):
-            ready_head, ready_workers, output, stderr = (
-                get_node_counts_from_ray_status(head_runner))
+            try:
+                ready_head, ready_workers, output, stderr = (
+                    get_node_counts_from_ray_status(head_runner))
+            except RuntimeError as e:
+                logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+                             f' {i}: {common_utils.format_exception(e)}')
+                if cloud_name != 'kubernetes':
+                    raise e
+                # We retry for kubernetes because coreweave can have a
+                # transient network issue.
+                time.sleep(1)
+                continue
             if ready_head + ready_workers == total_nodes:
                 return True
             logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -2284,8 +2300,7 @@ def refresh_cluster_record(
 
     # The loop logic allows us to notice if the status was updated in the
    # global_user_state by another process and stop trying to get the lock.
-    # The core loop logic is adapted from FileLock's implementation.
-    lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+    lock = locks.get_lock(cluster_status_lock_id(cluster_name))
     start_time = time.perf_counter()
 
     # Loop until we have an up-to-date status or until we acquire the lock.
@@ -2309,7 +2324,8 @@ def refresh_cluster_record(
                 return record
             # Update and return the cluster status.
             return _update_cluster_status(cluster_name)
-        except filelock.Timeout:
+
+        except locks.LockTimeout:
             # lock.acquire() will throw a Timeout exception if the lock is not
             # available and we have blocking=False.
             pass
@@ -2610,7 +2626,7 @@ def is_controller_accessible(
             need_connection_check):
         # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
         # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
-        # `sky serve up`.  If we have controller's head_ip available and it is ssh-reachable,
+        # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
         # we can allow access to the controller.
         ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
                                                    handle.docker_user,
@@ -3187,3 +3203,13 @@ def get_endpoints(cluster: str,
     return {
         port_num: urls[0].url() for port_num, urls in port_details.items()
     }
+
+
+def cluster_status_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster status operations."""
+    return f'{cluster_name}_status'
+
+
+def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster file mounts operations."""
+    return f'{cluster_name}_file_mounts'
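
Throughout this release, the `CLUSTER_STATUS_LOCK_PATH`/`filelock` pair is replaced by the new `sky.utils.locks` module (`sky/utils/locks.py`, +319 lines in the file list) keyed by the string IDs from the two helpers above. A usage sketch inferred only from the call sites visible in this diff; anything beyond `get_lock(lock_id, timeout)`, `LockTimeout`, and `force_unlock()` is an assumption:

from sky.utils import locks

lock_id = 'my-cluster_status'  # what cluster_status_lock_id('my-cluster') yields
try:
    with locks.get_lock(lock_id, 20):  # timeout in seconds, passed positionally
        pass  # critical section: read/update the cluster record
except locks.LockTimeout:
    # Another operation holds the lock; callers here either retry or give up.
    locks.get_lock(lock_id).force_unlock()  # last resort, as teardown does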
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -20,7 +20,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)
 
 import colorama
-import filelock
 import yaml
 
 import sky
@@ -64,6 +63,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import registry
@@ -2916,9 +2916,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
         backend_utils.check_owner_identity(cluster_name)
-        lock_path = os.path.expanduser(
-            backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
-        with timeline.FileLockEvent(lock_path):
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        with timeline.DistributedLockEvent(lock_id):
             # Try to launch the exiting cluster first. If no existing cluster,
             # this function will create a to_provision_config with required
             # resources.
@@ -3065,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
                     self._update_after_cluster_provisioned(
                         handle, to_provision_config.prev_handle, task,
-                        prev_cluster_status, lock_path, config_hash)
+                        prev_cluster_status, lock_id, config_hash)
                     return handle, False
 
             cluster_config_file = config_dict['ray']
@@ -3137,7 +3136,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status, lock_path, config_hash)
+                prev_cluster_status, lock_id, config_hash)
             return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3155,7 +3154,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-            lock_path: str, config_hash: str) -> None:
+            lock_id: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3237,7 +3236,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
-        common_utils.remove_file_if_exists(lock_path)
+        locks.get_lock(lock_id).force_unlock()
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
                       workdir: Union[Path, Dict[str, Any]],
@@ -3819,8 +3818,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     is_identity_mismatch_and_purge = True
                 else:
                     raise
-        lock_path = os.path.expanduser(
-            backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        lock = locks.get_lock(lock_id)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
@@ -3828,7 +3827,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             n_attempts -= 1
             # In case other running cluster operations are still holding the
             # lock.
-            common_utils.remove_file_if_exists(lock_path)
+            lock.force_unlock()
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -3847,9 +3846,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'cluster {handle.cluster_name}: '
                            f'{common_utils.format_exception(e, use_bracket=True)}')
            try:
-                with filelock.FileLock(
-                        lock_path,
-                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                with lock:
                    self.teardown_no_lock(
                        handle,
                        terminate,
@@ -3862,14 +3859,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                        refresh_cluster_status=(
                            not is_identity_mismatch_and_purge))
                if terminate:
-                    common_utils.remove_file_if_exists(lock_path)
+                    lock.force_unlock()
                break
-            except locks.LockTimeout as e:
+            except locks.LockTimeout as e:
                logger.debug(f'Failed to acquire lock for {cluster_name}, '
                             f'retrying...')
                if n_attempts <= 0:
                    raise RuntimeError(
-                        f'Cluster {cluster_name!r} is locked by {lock_path}. '
+                        f'Cluster {cluster_name!r} is locked by {lock_id}. '
                        'Check to see if it is still being launched') from e
 
    # --- CloudVMRayBackend Specific APIs ---
@@ -3988,12 +3985,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         return dict(zip(job_ids, local_log_dirs))
 
     @context_utils.cancellation_guard
-    def tail_logs(self,
-                  handle: CloudVmRayResourceHandle,
-                  job_id: Optional[int],
-                  managed_job_id: Optional[int] = None,
-                  follow: bool = True,
-                  tail: int = 0) -> int:
+    def tail_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int],
+            managed_job_id: Optional[int] = None,
+            follow: bool = True,
+            tail: int = 0,
+            require_outputs: bool = False,
+            stream_logs: bool = True,
+            process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
         """Tail the logs of a job.
 
         Args:
@@ -4003,6 +4004,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             follow: Whether to follow the logs.
             tail: The number of lines to display from the end of the
                 log file. If 0, print all lines.
+            require_outputs: Whether to return the stdout/stderr of the command.
+            stream_logs: Whether to stream the logs to stdout/stderr.
+            process_stream: Whether to process the stream.
 
         Returns:
             The exit code of the tail command. Returns code 100 if the job has
@@ -4022,18 +4026,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
         try:
-            returncode = self.run_on_head(
+            final = self.run_on_head(
                 handle,
                 code,
-                stream_logs=True,
-                process_stream=False,
+                stream_logs=stream_logs,
+                process_stream=process_stream,
+                require_outputs=require_outputs,
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
             )
         except SystemExit as e:
-            returncode = e.code
-        return returncode
+            final = e.code
+        return final
 
     def tail_managed_job_logs(self,
                               handle: CloudVmRayResourceHandle,
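
With `require_outputs=True`, `tail_logs()` now returns whatever `run_on_head()` returns, i.e. a `(returncode, stdout, stderr)` tuple rather than a bare exit code. A hedged call sketch (`backend` and `handle` are assumed to refer to an existing `CloudVmRayBackend` and a live cluster):

code, stdout, stderr = backend.tail_logs(
    handle,
    job_id=1,
    follow=False,
    require_outputs=True,   # capture output instead of returning only the code
    stream_logs=False,      # don't also echo to this process's stdout/stderr
    process_stream=True)    # let the runner post-process the captured stream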
@@ -5237,18 +5242,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # reconstruct them during cluster restart.
                 continue
             storage_mounts_metadata[dst] = storage_obj.handle
-        lock_path = (
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with filelock.FileLock(lock_path, lock_timeout):
+            with locks.get_lock(lock_id, lock_timeout):
                 global_user_state.set_cluster_storage_mounts_metadata(
                     cluster_name, storage_mounts_metadata)
-        except filelock.Timeout as e:
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to store metadata for cluster {cluster_name!r} due to '
                 'a timeout when trying to access local database. Please '
-                f'try again or manually remove the lock at {lock_path}. '
+                f'try again or manually remove the lock at {lock_id}. '
                 f'{common_utils.format_exception(e)}') from None
 
     def get_storage_mounts_metadata(
@@ -5259,19 +5263,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         After retrieving storage_mounts_metadata, it converts back the
         StorageMetadata to Storage object and restores 'storage_mounts.'
         """
-        lock_path = (
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with filelock.FileLock(lock_path, lock_timeout):
+            with locks.get_lock(lock_id, lock_timeout):
                 storage_mounts_metadata = (
                     global_user_state.get_cluster_storage_mounts_metadata(
                         cluster_name))
-        except filelock.Timeout as e:
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to retrieve metadata for cluster {cluster_name!r} '
                 'due to a timeout when trying to access local database. '
-                f'Please try again or manually remove the lock at {lock_path}.'
+                f'Please try again or manually remove the lock at {lock_id}.'
                 f' {common_utils.format_exception(e)}') from None
 
         if storage_mounts_metadata is None:
sky/catalog/__init__.py CHANGED
@@ -221,6 +221,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
                               disk_tier: Optional[
                                   resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None,
                               clouds: CloudFilter = None) -> Optional[str]:
     """Returns the cloud's default instance type for given #vCPUs and memory.
 
@@ -234,7 +236,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
         the given CPU and memory requirement.
     """
     return _map_clouds_catalog(clouds, 'get_default_instance_type', cpus,
-                               memory, disk_tier)
+                               memory, disk_tier, region, zone)
 
 
 def get_accelerators_from_instance_type(
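
A call sketch for the extended lookup; the `clouds='aws'` filter value is an assumption about `CloudFilter` (this diff only shows the two parameters being threaded through to the per-cloud catalogs):

from sky import catalog

instance_type = catalog.get_default_instance_type(
    cpus='8+', memory='32+', region='us-east-1', zone='us-east-1a',
    clouds='aws')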
sky/catalog/aws_catalog.py CHANGED
@@ -230,10 +230,12 @@ def get_vcpus_mem_from_instance_type(
                                                         instance_type)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -247,7 +249,8 @@ def get_default_instance_type(
     df = _get_df()
     df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/azure_catalog.py CHANGED
@@ -114,10 +114,12 @@ def _get_instance_family(instance_type: str) -> str:
     return instance_family
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -133,7 +135,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/common.py CHANGED
@@ -476,8 +476,11 @@ def _filter_region_zone(df: 'pd.DataFrame', region: Optional[str],
 
 
 def get_instance_type_for_cpus_mem_impl(
-        df: 'pd.DataFrame', cpus: Optional[str],
-        memory_gb_or_ratio: Optional[str]) -> Optional[str]:
+        df: 'pd.DataFrame',
+        cpus: Optional[str],
+        memory_gb_or_ratio: Optional[str],
+        region: Optional[str] = None,
+        zone: Optional[str] = None) -> Optional[str]:
     """Returns the cheapest instance type that satisfies the requirements.
 
     Args:
@@ -490,7 +493,10 @@ def get_instance_type_for_cpus_mem_impl(
             returned instance type should have at least the given memory size.
             If the string ends with "x", then the returned instance type should
             have at least the given number of vCPUs times the given ratio.
+        region: The region to filter by.
+        zone: The zone to filter by.
     """
+    df = _filter_region_zone(df, region, zone)
     df = _filter_with_cpus(df, cpus)
     df = _filter_with_mem(df, memory_gb_or_ratio)
     if df.empty:
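
The new region/zone filter runs before the CPU and memory filters, so the result is the cheapest instance type actually offered in the requested location. A minimal pandas sketch of that ordering; the column names only loosely mirror the catalog CSVs, and the price-sorting step is assumed from the docstring ("cheapest instance type"):

import pandas as pd

df = pd.DataFrame({
    'InstanceType': ['m6i.large', 'm6i.xlarge'],
    'vCPUs': [2, 4],
    'MemoryGiB': [8, 16],
    'Region': ['us-east-1', 'us-west-2'],
    'Price': [0.096, 0.192],
})
df = df[df['Region'] == 'us-east-1']   # new: region/zone filter first
df = df[df['vCPUs'] >= 2]              # existing: CPU filter
df = df[df['MemoryGiB'] >= 8]          # existing: memory filter
best = None if df.empty else df.sort_values('Price')['InstanceType'].iloc[0]
print(best)  # 'm6i.large'

The same two parameters are threaded through every per-cloud catalog below (aws, azure, cudo, do, fluidstack, gcp, hyperbolic, ibm, ...), each forwarding them to this implementation.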
sky/catalog/cudo_catalog.py CHANGED
@@ -51,7 +51,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
@@ -62,7 +64,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     if memory is None:
         memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/do_catalog.py CHANGED
@@ -52,11 +52,14 @@ def get_default_instance_type(
     cpus: Optional[str] = None,
     memory: Optional[str] = None,
     disk_tier: Optional[str] = None,
+    region: Optional[str] = None,
+    zone: Optional[str] = None,
 ) -> Optional[str]:
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
     del disk_tier  # unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/fluidstack_catalog.py CHANGED
@@ -52,7 +52,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -61,7 +63,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     else:
         memory_gb_or_ratio = memory
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/gcp_catalog.py CHANGED
@@ -279,10 +279,12 @@ def get_vcpus_mem_from_instance_type(
     return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -300,7 +302,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/hyperbolic_catalog.py CHANGED
@@ -67,9 +67,12 @@ def get_zone_shell_cmd() -> Optional[str]:
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # Unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_instance_type_for_accelerator(
sky/catalog/ibm_catalog.py CHANGED
@@ -92,10 +92,12 @@ def list_accelerators(
                                                  case_sensitive, all_regions)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -107,7 +109,8 @@ def get_default_instance_type(
     instance_type_prefix = f'{_DEFAULT_INSTANCE_FAMILY}-'
     df = _df[_df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: