skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py CHANGED
@@ -72,7 +72,8 @@ class _ControllerSpec:
72
72
  """Spec for skypilot controllers."""
73
73
  controller_type: str
74
74
  name: str
75
- cluster_name: str
75
+ _cluster_name_func: Callable[[], str]
76
+ _cluster_name_from_server: Optional[str] # For client-side only
76
77
  in_progress_hint: Callable[[bool], str]
77
78
  decline_cancel_hint: str
78
79
  _decline_down_when_failed_to_fetch_status_hint: str
@@ -93,6 +94,24 @@ class _ControllerSpec:
93
94
  return self._check_cluster_name_hint.format(
94
95
  cluster_name=self.cluster_name)
95
96
 
97
+ @property
98
+ def cluster_name(self) -> str:
99
+ """The cluster name of the controller.
100
+
101
+ On the server-side, the cluster name is the actual cluster name,
102
+ which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
103
+
104
+ On the client-side, the cluster name may not be accurate,
105
+ as we may not know the exact name, because we are missing
106
+ the server-side common.SERVER_ID. We have to wait until
107
+ we get the actual cluster name from the server.
108
+ """
109
+ return (self._cluster_name_from_server if self._cluster_name_from_server
110
+ is not None else self._cluster_name_func())
111
+
112
+ def set_cluster_name_from_server(self, cluster_name: str) -> None:
113
+ self._cluster_name_from_server = cluster_name
114
+
96
115
 
97
116
  # TODO: refactor controller class to not be an enum.
98
117
  class Controllers(enum.Enum):
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
102
121
  JOBS_CONTROLLER = _ControllerSpec(
103
122
  controller_type='jobs',
104
123
  name='managed jobs controller',
105
- cluster_name=common.JOB_CONTROLLER_NAME,
124
+ _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
125
+ _cluster_name_from_server=None,
106
126
  in_progress_hint=lambda _:
107
127
  ('* {job_info}To see all managed jobs: '
108
128
  f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
133
153
  SKY_SERVE_CONTROLLER = _ControllerSpec(
134
154
  controller_type='serve',
135
155
  name='serve controller',
136
- cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
156
+ _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
157
+ _cluster_name_from_server=None,
137
158
  in_progress_hint=(
138
159
  lambda pool:
139
160
  (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
166
187
  default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
167
188
 
168
189
  @classmethod
169
- def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
190
+ def from_name(cls,
191
+ name: Optional[str],
192
+ expect_exact_match: bool = True) -> Optional['Controllers']:
170
193
  """Check if the cluster name is a controller name.
171
194
 
172
195
  Returns:
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
187
210
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
188
211
  controller = cls.JOBS_CONTROLLER
189
212
  prefix = common.JOB_CONTROLLER_PREFIX
190
- if controller is not None and name != controller.value.cluster_name:
213
+
214
+ if controller is not None and expect_exact_match:
215
+ assert name == controller.value.cluster_name, (
216
+ name, controller.value.cluster_name)
217
+ elif controller is not None and name != controller.value.cluster_name:
191
218
  # The client-side cluster_name is not accurate. Assume that `name`
192
219
  # is the actual cluster name, so need to set the controller's
193
220
  # cluster name to the input name.
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
201
228
  prefix)
202
229
 
203
230
  # Update the cluster name.
204
- controller.value.cluster_name = name
231
+ controller.value.set_cluster_name_from_server(name)
205
232
  return controller
206
233
 
207
234
  @classmethod
@@ -228,7 +255,7 @@ def get_controller_for_pool(pool: bool) -> Controllers:
228
255
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
229
256
  """Check if the controller high availability is specified in user config.
230
257
  """
231
- controller = Controllers.from_name(cluster_name)
258
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
232
259
  if controller is None:
233
260
  return False
234
261
 
@@ -411,7 +438,7 @@ def check_cluster_name_not_controller(
411
438
  Returns:
412
439
  None, if the cluster name is not a controller name.
413
440
  """
414
- controller = Controllers.from_name(cluster_name)
441
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
415
442
  if controller is not None:
416
443
  msg = controller.value.check_cluster_name_hint
417
444
  if operation_str is not None:
sky/utils/db/db_utils.py CHANGED
@@ -185,7 +185,7 @@ def add_column_to_table_sqlalchemy(
185
185
  pass
186
186
  else:
187
187
  raise
188
- #postgressql
188
+ #postgresql
189
189
  except sqlalchemy_exc.ProgrammingError as e:
190
190
  if 'already exists' in str(e):
191
191
  pass
@@ -358,6 +358,27 @@ class SQLiteConn(threading.local):
358
358
  conn = await self._get_async_conn()
359
359
  return await conn.execute_fetchall(sql, parameters)
360
360
 
361
+ async def execute_get_returning_value_async(
362
+ self,
363
+ sql: str,
364
+ parameters: Optional[Iterable[Any]] = None
365
+ ) -> Optional[sqlite3.Row]:
366
+ conn = await self._get_async_conn()
367
+
368
+ if parameters is None:
369
+ parameters = []
370
+
371
+ def exec_and_get_returning_value(sql: str,
372
+ parameters: Optional[Iterable[Any]]):
373
+ # pylint: disable=protected-access
374
+ row = conn._conn.execute(sql, parameters).fetchone()
375
+ conn._conn.commit()
376
+ return row
377
+
378
+ # pylint: disable=protected-access
379
+ return await conn._execute(exec_and_get_returning_value, sql,
380
+ parameters)
381
+
361
382
  async def close(self):
362
383
  if self._async_conn is not None:
363
384
  await self._async_conn.close()
@@ -382,21 +403,28 @@ def get_max_connections():
382
403
 
383
404
  @typing.overload
384
405
  def get_engine(
385
- db_name: str,
406
+ db_name: Optional[str],
386
407
  async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
387
408
  ...
388
409
 
389
410
 
390
411
  @typing.overload
391
- def get_engine(db_name: str,
412
+ def get_engine(db_name: Optional[str],
392
413
  async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
393
414
  ...
394
415
 
395
416
 
396
417
  def get_engine(
397
- db_name: str,
418
+ db_name: Optional[str],
398
419
  async_engine: bool = False
399
420
  ) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
421
+ """Get the engine for the given database name.
422
+
423
+ Args:
424
+ db_name: The name of the database. ONLY used for SQLite. On Postgres,
425
+ we use a single database, which we get from the connection string.
426
+ async_engine: Whether to return an async engine.
427
+ """
400
428
  conn_string = None
401
429
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
402
430
  conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
@@ -416,19 +444,18 @@ def get_engine(
416
444
  _postgres_engine_cache[conn_string] = (
417
445
  sqlalchemy.create_engine(
418
446
  conn_string, poolclass=sqlalchemy.pool.NullPool))
419
- elif _max_connections == 1:
420
- _postgres_engine_cache[conn_string] = (
421
- sqlalchemy.create_engine(
422
- conn_string, poolclass=sqlalchemy.pool.StaticPool))
423
447
  else:
424
448
  _postgres_engine_cache[conn_string] = (
425
449
  sqlalchemy.create_engine(
426
450
  conn_string,
427
451
  poolclass=sqlalchemy.pool.QueuePool,
428
- size=_max_connections,
429
- max_overflow=0))
452
+ pool_size=_max_connections,
453
+ max_overflow=max(0, 5 - _max_connections),
454
+ pool_pre_ping=True,
455
+ pool_recycle=1800))
430
456
  engine = _postgres_engine_cache[conn_string]
431
457
  else:
458
+ assert db_name is not None, 'db_name must be provided for SQLite'
432
459
  db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
433
460
  pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
434
461
  if async_engine:
sky/utils/db/migration_utils.py CHANGED
@@ -19,15 +19,19 @@ DB_INIT_LOCK_TIMEOUT_SECONDS = 10
19
19
 
20
20
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
21
21
  GLOBAL_USER_STATE_VERSION = '010'
22
- GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
22
+ GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
23
23
 
24
24
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
25
- SPOT_JOBS_VERSION = '003'
26
- SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
25
+ SPOT_JOBS_VERSION = '005'
26
+ SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
27
27
 
28
28
  SERVE_DB_NAME = 'serve_db'
29
29
  SERVE_VERSION = '001'
30
- SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
30
+ SERVE_LOCK_PATH = f'~/.sky/locks/.{SERVE_DB_NAME}.lock'
31
+
32
+ SKYPILOT_CONFIG_DB_NAME = 'sky_config_db'
33
+ SKYPILOT_CONFIG_VERSION = '001'
34
+ SKYPILOT_CONFIG_LOCK_PATH = f'~/.sky/locks/.{SKYPILOT_CONFIG_DB_NAME}.lock'
31
35
 
32
36
 
33
37
  @contextlib.contextmanager
sky/utils/locks.py CHANGED
@@ -243,6 +243,7 @@ class PostgresLock(DistributedLock):
243
243
  if not self._acquired or not self._connection:
244
244
  return
245
245
 
246
+ connection_lost = False
246
247
  try:
247
248
  cursor = self._connection.cursor()
248
249
  cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
@@ -252,8 +253,11 @@ class PostgresLock(DistributedLock):
252
253
  # Lost connection to the database, likely the lock is force unlocked
253
254
  # by other routines.
254
255
  logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
256
+ connection_lost = True
255
257
  finally:
256
- self._close_connection()
258
+ # Invalidate if connection was lost to prevent SQLAlchemy from
259
+ # trying to reset a dead connection
260
+ self._close_connection(invalidate=connection_lost)
257
261
 
258
262
  def force_unlock(self) -> None:
259
263
  """Force unlock the postgres advisory lock."""
@@ -270,7 +274,7 @@ class PostgresLock(DistributedLock):
270
274
  cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
271
275
  result = cursor.fetchone()[0]
272
276
  if result:
273
- # The lock is held by current routine and unlock suceed
277
+ # The lock is held by current routine and unlock succeed
274
278
  self._connection.commit()
275
279
  self._acquired = False
276
280
  return
@@ -292,13 +296,27 @@ class PostgresLock(DistributedLock):
292
296
  finally:
293
297
  self._close_connection()
294
298
 
295
- def _close_connection(self) -> None:
296
- """Close the postgres connection."""
299
+ def _close_connection(self, invalidate: bool = False) -> None:
300
+ """Close the postgres connection.
301
+
302
+ Args:
303
+ invalidate: If True, invalidate connection instead of closing it.
304
+ Use this when the connection might be broken (e.g., after
305
+ pg_terminate_backend) to prevent SQLAlchemy from trying to
306
+ reset it (which would result in an error being logged).
307
+ """
297
308
  if self._connection:
298
309
  try:
299
- self._connection.close()
310
+ if invalidate:
311
+ self._connection.invalidate()
312
+ else:
313
+ self._connection.close()
300
314
  except Exception as e: # pylint: disable=broad-except
301
- logger.debug(f'Failed to close postgres connection: {e}')
315
+ if invalidate:
316
+ logger.debug(
317
+ f'Failed to invalidate postgres connection: {e}')
318
+ else:
319
+ logger.debug(f'Failed to close postgres connection: {e}')
302
320
  self._connection = None
303
321
 
304
322
  def is_locked(self) -> bool:
sky/utils/resource_checker.py CHANGED
@@ -278,7 +278,10 @@ def _get_active_resources(
278
278
  from sky.jobs.server import core as managed_jobs_core
279
279
  try:
280
280
  filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
281
- refresh=False, skip_finished=True, all_users=True)
281
+ refresh=False,
282
+ skip_finished=True,
283
+ all_users=True,
284
+ fields=['job_id', 'user_hash', 'workspace'])
282
285
  return filtered_jobs
283
286
  except exceptions.ClusterNotUpError:
284
287
  logger.warning('All jobs should be finished.')
sky/utils/resources_utils.py CHANGED
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
181
181
 
182
182
 
183
183
  def format_resource(resource: 'resources_lib.Resources',
184
- simplify: bool = False) -> str:
184
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
185
  resource = resource.assert_launchable()
186
- vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
187
- resource.instance_type)
186
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
187
+ if resource.accelerators is None or is_k8s or not simplified_only:
188
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
+ resource.instance_type)
188
190
 
189
- components = []
191
+ elements_simple = []
192
+ elements_full = []
190
193
 
191
194
  if resource.accelerators is not None:
192
195
  acc, count = list(resource.accelerators.items())[0]
193
- components.append(f'gpus={acc}:{count}')
196
+ elements_simple.append(f'gpus={acc}:{count}')
197
+ elements_full.append(f'gpus={acc}:{count}')
194
198
 
195
- is_k8s = str(resource.cloud).lower() == 'kubernetes'
196
- if (resource.accelerators is None or is_k8s or not simplify):
199
+ if (resource.accelerators is None or is_k8s):
200
+ if vcpu is not None:
201
+ elements_simple.append(f'cpus={int(vcpu)}')
202
+ elements_full.append(f'cpus={int(vcpu)}')
203
+ if mem is not None:
204
+ elements_simple.append(f'mem={int(mem)}')
205
+ elements_full.append(f'mem={int(mem)}')
206
+ elif not simplified_only:
197
207
  if vcpu is not None:
198
- components.append(f'cpus={int(vcpu)}')
208
+ elements_full.append(f'cpus={int(vcpu)}')
199
209
  if mem is not None:
200
- components.append(f'mem={int(mem)}')
210
+ elements_full.append(f'mem={int(mem)}')
201
211
 
202
- instance_type = resource.instance_type
203
- if simplify:
204
- instance_type = common_utils.truncate_long_string(instance_type, 15)
205
212
  if not is_k8s:
206
- components.append(instance_type)
207
- if simplify:
208
- components.append('...')
209
- else:
213
+ instance_type_full = resource.instance_type
214
+ instance_type_simple = common_utils.truncate_long_string(
215
+ instance_type_full, 15)
216
+ elements_simple.append(instance_type_simple)
217
+ elements_full.append(instance_type_full)
218
+ elements_simple.append('...')
219
+ if not simplified_only:
210
220
  image_id = resource.image_id
211
221
  if image_id is not None:
212
222
  if None in image_id:
213
- components.append(f'image_id={image_id[None]}')
223
+ elements_full.append(f'image_id={image_id[None]}')
214
224
  else:
215
- components.append(f'image_id={image_id}')
216
- components.append(f'disk={resource.disk_size}')
225
+ elements_full.append(f'image_id={image_id}')
226
+ elements_full.append(f'disk={resource.disk_size}')
217
227
  disk_tier = resource.disk_tier
218
228
  if disk_tier is not None:
219
- components.append(f'disk_tier={disk_tier.value}')
229
+ elements_full.append(f'disk_tier={disk_tier.value}')
220
230
  ports = resource.ports
221
231
  if ports is not None:
222
- components.append(f'ports={ports}')
232
+ elements_full.append(f'ports={ports}')
223
233
 
224
234
  spot = '[spot]' if resource.use_spot else ''
225
- return f'{spot}({"" if not components else ", ".join(components)})'
226
-
227
-
228
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
229
- simplify: bool = False) -> str:
235
+ resources_str_simple = (
236
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
237
+ if simplified_only:
238
+ return resources_str_simple, None
239
+ else:
240
+ resources_str_full = (
241
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
242
+ return resources_str_simple, resources_str_full
243
+
244
+
245
+ def get_readable_resources_repr(
246
+ handle: 'backends.CloudVmRayResourceHandle',
247
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
248
+ resource_str_simple, resource_str_full = format_resource(
249
+ handle.launched_resources, simplified_only)
250
+ if not simplified_only:
251
+ assert resource_str_full is not None
230
252
  if (handle.launched_nodes is not None and
231
253
  handle.launched_resources is not None):
232
- return (f'{handle.launched_nodes}x'
233
- f'{format_resource(handle.launched_resources, simplify)}')
234
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
254
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
255
+ None if simplified_only else
256
+ f'{handle.launched_nodes}x{resource_str_full}')
257
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
258
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
235
259
 
236
260
 
237
261
  def make_ray_custom_resources_str(
sky/utils/schemas.py CHANGED
@@ -1190,7 +1190,13 @@ def get_config_schema():
1190
1190
  'consolidation_mode': {
1191
1191
  'type': 'boolean',
1192
1192
  'default': False,
1193
- }
1193
+ },
1194
+ 'controller_logs_gc_retention_hours': {
1195
+ 'type': 'integer',
1196
+ },
1197
+ 'task_logs_gc_retention_hours': {
1198
+ 'type': 'integer',
1199
+ },
1194
1200
  },
1195
1201
  },
1196
1202
  'bucket': {
@@ -1592,10 +1598,10 @@ def get_config_schema():
1592
1598
 
1593
1599
  allowed_workspace_cloud_names = list(constants.ALL_CLOUDS) + ['cloudflare']
1594
1600
  # Create pattern for not supported clouds, i.e.
1595
- # all clouds except gcp, kubernetes, ssh
1601
+ # all clouds except aws, gcp, kubernetes, ssh, nebius
1596
1602
  not_supported_clouds = [
1597
1603
  cloud for cloud in allowed_workspace_cloud_names
1598
- if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
1604
+ if cloud.lower() not in ['aws', 'gcp', 'kubernetes', 'ssh', 'nebius']
1599
1605
  ]
1600
1606
  not_supported_cloud_regex = '|'.join(not_supported_clouds)
1601
1607
  workspaces_schema = {
@@ -1606,7 +1612,8 @@ def get_config_schema():
1606
1612
  'type': 'object',
1607
1613
  'additionalProperties': False,
1608
1614
  'patternProperties': {
1609
- # Pattern for non-GCP clouds - only allows 'disabled' property
1615
+ # Pattern for clouds with no workspace-specific config -
1616
+ # only allow 'disabled' property.
1610
1617
  f'^({not_supported_cloud_regex})$': {
1611
1618
  'type': 'object',
1612
1619
  'additionalProperties': False,
@@ -1641,6 +1648,18 @@ def get_config_schema():
1641
1648
  },
1642
1649
  'additionalProperties': False,
1643
1650
  },
1651
+ 'aws': {
1652
+ 'type': 'object',
1653
+ 'properties': {
1654
+ 'profile': {
1655
+ 'type': 'string'
1656
+ },
1657
+ 'disabled': {
1658
+ 'type': 'boolean'
1659
+ },
1660
+ },
1661
+ 'additionalProperties': False,
1662
+ },
1644
1663
  'ssh': {
1645
1664
  'type': 'object',
1646
1665
  'required': [],
sky/utils/subprocess_utils.py CHANGED
@@ -10,7 +10,8 @@ import sys
10
10
  import threading
11
11
  import time
12
12
  import typing
13
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
13
+ from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
14
+ Union)
14
15
 
15
16
  import colorama
16
17
 
@@ -18,6 +19,7 @@ from sky import exceptions
18
19
  from sky import sky_logging
19
20
  from sky.adaptors import common as adaptors_common
20
21
  from sky.skylet import log_lib
22
+ from sky.skylet import subprocess_daemon
21
23
  from sky.utils import common_utils
22
24
  from sky.utils import timeline
23
25
  from sky.utils import ux_utils
@@ -107,7 +109,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
107
109
 
108
110
 
109
111
  def run_in_parallel(func: Callable,
110
- args: List[Any],
112
+ args: Union[List[Any], Set[Any]],
111
113
  num_threads: Optional[int] = None) -> List[Any]:
112
114
  """Run a function in parallel on a list of arguments.
113
115
 
@@ -128,7 +130,7 @@ def run_in_parallel(func: Callable,
128
130
  if len(args) == 0:
129
131
  return []
130
132
  if len(args) == 1:
131
- return [func(args[0])]
133
+ return [func(list(args)[0])]
132
134
 
133
135
  processes = (num_threads
134
136
  if num_threads is not None else get_parallel_threads())
@@ -305,11 +307,17 @@ def run_with_retries(
305
307
  return returncode, stdout, stderr
306
308
 
307
309
 
308
- def kill_process_daemon(process_pid: int) -> None:
310
+ def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
309
311
  """Start a daemon as a safety net to kill the process.
310
312
 
311
313
  Args:
312
314
  process_pid: The PID of the process to kill.
315
+ use_kill_pg: Whether to kill the target by process group. If True,
316
+ os.killpg() is used to kill the target process group on UNIX
317
+ systems, which is more efficient than having the daemon refresh
318
+ the process tree. Note that both
319
+ implementations have corner cases where subprocesses might not be
320
+ killed. Refer to subprocess_daemon.py for more details.
313
321
  """
314
322
  # Get initial children list
315
323
  try:
@@ -336,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
336
344
  ','.join(map(str, initial_children)),
337
345
  ]
338
346
 
347
+ env = os.environ.copy()
348
+ if use_kill_pg:
349
+ env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
350
+
339
351
  # We do not need to set `start_new_session=True` here, as the
340
352
  # daemon script will detach itself from the parent process with
341
353
  # fork to avoid being killed by parent process. See the reason we
@@ -347,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
347
359
  stderr=subprocess.DEVNULL,
348
360
  # Disable input
349
361
  stdin=subprocess.DEVNULL,
362
+ env=env,
350
363
  )
351
364
 
352
365
 
@@ -7,6 +7,7 @@ from sky import exceptions
7
7
  from sky import sky_logging
8
8
  from sky.server.requests import executor
9
9
  from sky.server.requests import payloads
10
+ from sky.server.requests import request_names
10
11
  from sky.server.requests import requests as requests_lib
11
12
  from sky.utils import registry
12
13
  from sky.utils import volume as volume_utils
@@ -25,9 +26,9 @@ async def volume_list(request: fastapi.Request) -> None:
25
26
  'env_vars': auth_user.to_env_vars()
26
27
  } if auth_user else {}
27
28
  request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
28
- executor.schedule_request(
29
+ await executor.schedule_request_async(
29
30
  request_id=request.state.request_id,
30
- request_name='volume_list',
31
+ request_name=request_names.RequestName.VOLUME_LIST,
31
32
  request_body=request_body,
32
33
  func=core.volume_list,
33
34
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -38,9 +39,9 @@ async def volume_list(request: fastapi.Request) -> None:
38
39
  async def volume_delete(request: fastapi.Request,
39
40
  volume_delete_body: payloads.VolumeDeleteBody) -> None:
40
41
  """Deletes a volume."""
41
- executor.schedule_request(
42
+ await executor.schedule_request_async(
42
43
  request_id=request.state.request_id,
43
- request_name='volume_delete',
44
+ request_name=request_names.RequestName.VOLUME_DELETE,
44
45
  request_body=volume_delete_body,
45
46
  func=core.volume_delete,
46
47
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -112,9 +113,9 @@ async def volume_apply(request: fastapi.Request,
112
113
  raise fastapi.HTTPException(
113
114
  status_code=400,
114
115
  detail='Runpod network volume is only supported on Runpod')
115
- executor.schedule_request(
116
+ await executor.schedule_request_async(
116
117
  request_id=request.state.request_id,
117
- request_name='volume_apply',
118
+ request_name=request_names.RequestName.VOLUME_APPLY,
118
119
  request_body=volume_apply_body,
119
120
  func=core.volume_apply,
120
121
  schedule_type=requests_lib.ScheduleType.LONG,
sky/workspaces/server.py CHANGED
@@ -4,6 +4,7 @@ import fastapi
4
4
 
5
5
  from sky.server.requests import executor
6
6
  from sky.server.requests import payloads
7
+ from sky.server.requests import request_names
7
8
  from sky.server.requests import requests as api_requests
8
9
  from sky.workspaces import core
9
10
 
@@ -22,9 +23,9 @@ async def get(request: fastapi.Request) -> None:
22
23
  } if auth_user else {}
23
24
  request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
24
25
 
25
- executor.schedule_request(
26
+ await executor.schedule_request_async(
26
27
  request_id=request.state.request_id,
27
- request_name='workspaces.get',
28
+ request_name=request_names.RequestName.WORKSPACES_GET,
28
29
  request_body=request_body,
29
30
  func=core.get_workspaces,
30
31
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -35,9 +36,9 @@ async def get(request: fastapi.Request) -> None:
35
36
  async def update(request: fastapi.Request,
36
37
  update_workspace_body: payloads.UpdateWorkspaceBody) -> None:
37
38
  """Updates a specific workspace configuration."""
38
- executor.schedule_request(
39
+ await executor.schedule_request_async(
39
40
  request_id=request.state.request_id,
40
- request_name='workspaces.update',
41
+ request_name=request_names.RequestName.WORKSPACES_UPDATE,
41
42
  request_body=update_workspace_body,
42
43
  func=core.update_workspace,
43
44
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -48,9 +49,9 @@ async def update(request: fastapi.Request,
48
49
  async def create(request: fastapi.Request,
49
50
  create_workspace_body: payloads.CreateWorkspaceBody) -> None:
50
51
  """Creates a new workspace configuration."""
51
- executor.schedule_request(
52
+ await executor.schedule_request_async(
52
53
  request_id=request.state.request_id,
53
- request_name='workspaces.create',
54
+ request_name=request_names.RequestName.WORKSPACES_CREATE,
54
55
  request_body=create_workspace_body,
55
56
  func=core.create_workspace,
56
57
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -61,9 +62,9 @@ async def create(request: fastapi.Request,
61
62
  async def delete(request: fastapi.Request,
62
63
  delete_workspace_body: payloads.DeleteWorkspaceBody) -> None:
63
64
  """Deletes a workspace configuration."""
64
- executor.schedule_request(
65
+ await executor.schedule_request_async(
65
66
  request_id=request.state.request_id,
66
- request_name='workspaces.delete',
67
+ request_name=request_names.RequestName.WORKSPACES_DELETE,
67
68
  request_body=delete_workspace_body,
68
69
  func=core.delete_workspace,
69
70
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -78,9 +79,9 @@ async def get_config(request: fastapi.Request) -> None:
78
79
  'env_vars': auth_user.to_env_vars()
79
80
  } if auth_user else {}
80
81
  get_config_body = payloads.GetConfigBody(**auth_user_env_vars_kwargs)
81
- executor.schedule_request(
82
+ await executor.schedule_request_async(
82
83
  request_id=request.state.request_id,
83
- request_name='workspaces.get_config',
84
+ request_name=request_names.RequestName.WORKSPACES_GET_CONFIG,
84
85
  request_body=get_config_body,
85
86
  func=core.get_config,
86
87
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -91,9 +92,9 @@ async def get_config(request: fastapi.Request) -> None:
91
92
  async def update_config(request: fastapi.Request,
92
93
  update_config_body: payloads.UpdateConfigBody) -> None:
93
94
  """Updates the entire SkyPilot configuration."""
94
- executor.schedule_request(
95
+ await executor.schedule_request_async(
95
96
  request_id=request.state.request_id,
96
- request_name='workspaces.update_config',
97
+ request_name=request_names.RequestName.WORKSPACES_UPDATE_CONFIG,
97
98
  request_body=update_config_body,
98
99
  func=core.update_config,
99
100
  schedule_type=api_requests.ScheduleType.SHORT,