skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -4,6 +4,7 @@
4
4
  import enum
5
5
  import functools
6
6
  import json
7
+ import threading
7
8
  import time
8
9
  import typing
9
10
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
33
34
  logger = sky_logging.init_logger(__name__)
34
35
 
35
36
  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
37
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
36
38
 
37
39
  Base = declarative.declarative_base()
38
40
 
@@ -98,6 +100,13 @@ job_info_table = sqlalchemy.Table(
98
100
  sqlalchemy.Column('original_user_yaml_path',
99
101
  sqlalchemy.Text,
100
102
  server_default=None),
103
+ sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
104
+ sqlalchemy.Column('current_cluster_name',
105
+ sqlalchemy.Text,
106
+ server_default=None),
107
+ sqlalchemy.Column('job_id_on_pool_cluster',
108
+ sqlalchemy.Integer,
109
+ server_default=None),
101
110
  )
102
111
 
103
112
  ha_recovery_script_table = sqlalchemy.Table(
@@ -131,21 +140,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
131
140
  migration_utils.SPOT_JOBS_VERSION)
132
141
 
133
142
 
143
+ # We wrap the sqlalchemy engine initialization in a thread
144
+ # lock to ensure that multiple threads do not initialize the
145
+ # engine which could result in a rare race condition where
146
+ # a session has already been created with _SQLALCHEMY_ENGINE = e1,
147
+ # and then another thread overwrites _SQLALCHEMY_ENGINE = e2
148
+ # which could result in e1 being garbage collected unexpectedly.
134
149
  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
135
150
  global _SQLALCHEMY_ENGINE
136
151
 
137
152
  if _SQLALCHEMY_ENGINE is not None:
138
153
  return _SQLALCHEMY_ENGINE
139
154
 
140
- # get an engine to the db
141
- engine = migration_utils.get_engine('spot_jobs')
155
+ with _SQLALCHEMY_ENGINE_LOCK:
156
+ if _SQLALCHEMY_ENGINE is not None:
157
+ return _SQLALCHEMY_ENGINE
158
+ # get an engine to the db
159
+ engine = migration_utils.get_engine('spot_jobs')
142
160
 
143
- # run migrations if needed
144
- create_table(engine)
161
+ # run migrations if needed
162
+ create_table(engine)
145
163
 
146
- # return engine
147
- _SQLALCHEMY_ENGINE = engine
148
- return _SQLALCHEMY_ENGINE
164
+ # return engine
165
+ _SQLALCHEMY_ENGINE = engine
166
+ return _SQLALCHEMY_ENGINE
149
167
 
150
168
 
151
169
  def _init_db(func):
@@ -204,6 +222,9 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
204
222
  'priority': r['priority'],
205
223
  'entrypoint': r['entrypoint'],
206
224
  'original_user_yaml_path': r['original_user_yaml_path'],
225
+ 'pool': r['pool'],
226
+ 'current_cluster_name': r['current_cluster_name'],
227
+ 'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
207
228
  }
208
229
 
209
230
 
@@ -440,8 +461,8 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
440
461
 
441
462
 
442
463
  @_init_db
443
- def set_job_info_without_job_id(name: str, workspace: str,
444
- entrypoint: str) -> int:
464
+ def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
465
+ pool: Optional[str]) -> int:
445
466
  assert _SQLALCHEMY_ENGINE is not None
446
467
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
447
468
  if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -458,6 +479,7 @@ def set_job_info_without_job_id(name: str, workspace: str,
458
479
  schedule_state=ManagedJobScheduleState.INACTIVE.value,
459
480
  workspace=workspace,
460
481
  entrypoint=entrypoint,
482
+ pool=pool,
461
483
  )
462
484
 
463
485
  if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1045,6 +1067,23 @@ def _get_all_task_ids_statuses(
1045
1067
  return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
1046
1068
 
1047
1069
 
1070
+ @_init_db
1071
+ def get_all_task_ids_names_statuses_logs(
1072
+ job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
1073
+ assert _SQLALCHEMY_ENGINE is not None
1074
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1075
+ id_names = session.execute(
1076
+ sqlalchemy.select(
1077
+ spot_table.c.task_id,
1078
+ spot_table.c.task_name,
1079
+ spot_table.c.status,
1080
+ spot_table.c.local_log_file,
1081
+ ).where(spot_table.c.spot_job_id == job_id).order_by(
1082
+ spot_table.c.task_id.asc())).fetchall()
1083
+ return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
1084
+ for row in id_names]
1085
+
1086
+
1048
1087
  @_init_db
1049
1088
  def get_job_status_with_task_id(job_id: int,
1050
1089
  task_id: int) -> Optional[ManagedJobStatus]:
@@ -1250,6 +1289,56 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
1250
1289
  return updated_count == 0
1251
1290
 
1252
1291
 
1292
+ @_init_db
1293
+ def get_pool_from_job_id(job_id: int) -> Optional[str]:
1294
+ """Get the pool from the job id."""
1295
+ assert _SQLALCHEMY_ENGINE is not None
1296
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1297
+ pool = session.execute(
1298
+ sqlalchemy.select(job_info_table.c.pool).where(
1299
+ job_info_table.c.spot_job_id == job_id)).fetchone()
1300
+ return pool[0] if pool else None
1301
+
1302
+
1303
+ @_init_db
1304
+ def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
1305
+ """Set the current cluster name for a job."""
1306
+ assert _SQLALCHEMY_ENGINE is not None
1307
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1308
+ session.query(job_info_table).filter(
1309
+ job_info_table.c.spot_job_id == job_id).update(
1310
+ {job_info_table.c.current_cluster_name: current_cluster_name})
1311
+ session.commit()
1312
+
1313
+
1314
+ @_init_db
1315
+ def set_job_id_on_pool_cluster(job_id: int,
1316
+ job_id_on_pool_cluster: int) -> None:
1317
+ """Set the job id on the pool cluster for a job."""
1318
+ assert _SQLALCHEMY_ENGINE is not None
1319
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1320
+ session.query(job_info_table).filter(
1321
+ job_info_table.c.spot_job_id == job_id).update({
1322
+ job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
1323
+ })
1324
+ session.commit()
1325
+
1326
+
1327
+ @_init_db
1328
+ def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
1329
+ """Get the cluster name and job id on the pool from the managed job id."""
1330
+ assert _SQLALCHEMY_ENGINE is not None
1331
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1332
+ info = session.execute(
1333
+ sqlalchemy.select(
1334
+ job_info_table.c.current_cluster_name,
1335
+ job_info_table.c.job_id_on_pool_cluster).where(
1336
+ job_info_table.c.spot_job_id == job_id)).fetchone()
1337
+ if info is None:
1338
+ return None, None
1339
+ return info[0], info[1]
1340
+
1341
+
1253
1342
  @_init_db
1254
1343
  def scheduler_set_launching(job_id: int,
1255
1344
  current_state: ManagedJobScheduleState) -> None:
@@ -1370,28 +1459,68 @@ def get_num_launching_jobs() -> int:
1370
1459
  sqlalchemy.select(
1371
1460
  sqlalchemy.func.count() # pylint: disable=not-callable
1372
1461
  ).select_from(job_info_table).where(
1373
- job_info_table.c.schedule_state ==
1374
- ManagedJobScheduleState.LAUNCHING.value)).fetchone()[0]
1462
+ sqlalchemy.and_(
1463
+ job_info_table.c.schedule_state ==
1464
+ ManagedJobScheduleState.LAUNCHING.value,
1465
+ # We only count jobs that are not in the pool, because the
1466
+ # job in the pool does not actually call sky.launch.
1467
+ job_info_table.c.pool.is_(None)))).fetchone()[0]
1375
1468
 
1376
1469
 
1377
1470
  @_init_db
1378
- def get_num_alive_jobs() -> int:
1471
+ def get_num_alive_jobs(pool: Optional[str] = None) -> int:
1379
1472
  assert _SQLALCHEMY_ENGINE is not None
1380
1473
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1474
+ where_conditions = [
1475
+ job_info_table.c.schedule_state.in_([
1476
+ ManagedJobScheduleState.ALIVE_WAITING.value,
1477
+ ManagedJobScheduleState.LAUNCHING.value,
1478
+ ManagedJobScheduleState.ALIVE.value,
1479
+ ManagedJobScheduleState.ALIVE_BACKOFF.value,
1480
+ ])
1481
+ ]
1482
+
1483
+ if pool is not None:
1484
+ where_conditions.append(job_info_table.c.pool == pool)
1485
+
1381
1486
  return session.execute(
1382
1487
  sqlalchemy.select(
1383
1488
  sqlalchemy.func.count() # pylint: disable=not-callable
1384
1489
  ).select_from(job_info_table).where(
1385
- job_info_table.c.schedule_state.in_([
1386
- ManagedJobScheduleState.ALIVE_WAITING.value,
1387
- ManagedJobScheduleState.LAUNCHING.value,
1388
- ManagedJobScheduleState.ALIVE.value,
1389
- ManagedJobScheduleState.ALIVE_BACKOFF.value,
1390
- ]))).fetchone()[0]
1490
+ sqlalchemy.and_(*where_conditions))).fetchone()[0]
1391
1491
 
1392
1492
 
1393
1493
  @_init_db
1394
- def get_waiting_job() -> Optional[Dict[str, Any]]:
1494
+ def get_nonterminal_job_ids_by_pool(pool: str,
1495
+ cluster_name: Optional[str] = None
1496
+ ) -> List[int]:
1497
+ """Get nonterminal job ids in a pool."""
1498
+ assert _SQLALCHEMY_ENGINE is not None
1499
+
1500
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1501
+ query = sqlalchemy.select(
1502
+ spot_table.c.spot_job_id.distinct()).select_from(
1503
+ spot_table.outerjoin(
1504
+ job_info_table,
1505
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1506
+ and_conditions = [
1507
+ ~spot_table.c.status.in_([
1508
+ status.value for status in ManagedJobStatus.terminal_statuses()
1509
+ ]),
1510
+ job_info_table.c.pool == pool,
1511
+ ]
1512
+ if cluster_name is not None:
1513
+ and_conditions.append(
1514
+ job_info_table.c.current_cluster_name == cluster_name)
1515
+ query = query.where(sqlalchemy.and_(*and_conditions)).order_by(
1516
+ spot_table.c.spot_job_id.asc())
1517
+ rows = session.execute(query).fetchall()
1518
+ job_ids = [row[0] for row in rows if row[0] is not None]
1519
+ return job_ids
1520
+
1521
+
1522
+ @_init_db
1523
+ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
1395
1524
  """Get the next job that should transition to LAUNCHING.
1396
1525
 
1397
1526
  Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1414,23 +1543,26 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
1414
1543
  ManagedJobScheduleState.ALIVE_BACKOFF.value,
1415
1544
  ])).scalar_subquery()
1416
1545
  # Main query for waiting jobs
1546
+ select_conds = [
1547
+ job_info_table.c.schedule_state.in_([
1548
+ ManagedJobScheduleState.WAITING.value,
1549
+ ManagedJobScheduleState.ALIVE_WAITING.value,
1550
+ ]),
1551
+ job_info_table.c.priority >= sqlalchemy.func.coalesce(
1552
+ max_priority_subquery, 0),
1553
+ ]
1554
+ if pool is not None:
1555
+ select_conds.append(job_info_table.c.pool == pool)
1417
1556
  query = sqlalchemy.select(
1418
1557
  job_info_table.c.spot_job_id,
1419
1558
  job_info_table.c.schedule_state,
1420
1559
  job_info_table.c.dag_yaml_path,
1421
1560
  job_info_table.c.env_file_path,
1422
- ).where(
1423
- sqlalchemy.and_(
1424
- job_info_table.c.schedule_state.in_([
1425
- ManagedJobScheduleState.WAITING.value,
1426
- ManagedJobScheduleState.ALIVE_WAITING.value,
1427
- ]),
1428
- job_info_table.c.priority >= sqlalchemy.func.coalesce(
1429
- max_priority_subquery, 0),
1430
- )).order_by(
1431
- job_info_table.c.priority.desc(),
1432
- job_info_table.c.spot_job_id.asc(),
1433
- ).limit(1)
1561
+ job_info_table.c.pool,
1562
+ ).where(sqlalchemy.and_(*select_conds)).order_by(
1563
+ job_info_table.c.priority.desc(),
1564
+ job_info_table.c.spot_job_id.asc(),
1565
+ ).limit(1)
1434
1566
  waiting_job_row = session.execute(query).fetchone()
1435
1567
  if waiting_job_row is None:
1436
1568
  return None
@@ -1440,6 +1572,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
1440
1572
  'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
1441
1573
  'dag_yaml_path': waiting_job_row[2],
1442
1574
  'env_file_path': waiting_job_row[3],
1575
+ 'pool': waiting_job_row[4],
1443
1576
  }
1444
1577
 
1445
1578