skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -30,7 +30,6 @@ from sky.backends import backend_utils
  from sky.jobs import constants as managed_job_constants
  from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
- from sky.server import common as server_common
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.skylet import log_lib
@@ -39,7 +38,6 @@ from sky.utils import annotations
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import controller_utils
- from sky.utils import env_options
  from sky.utils import infra_utils
  from sky.utils import log_utils
  from sky.utils import message_utils
@@ -136,12 +134,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
  def _validate_consolidation_mode_config(
  current_is_consolidation_mode: bool) -> None:
  """Validate the consolidation mode config."""
- if (current_is_consolidation_mode and
- not env_options.Options.IS_DEVELOPER.get() and
- server_common.is_api_server_local()):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- 'Consolidation mode is not supported when running locally.')
  # Check whether the consolidation mode config is changed.
  if current_is_consolidation_mode:
  controller_cn = (
@@ -239,8 +231,8 @@ def ha_recovery_for_consolidation_mode():
  f.write(f'Total recovery time: {time.time() - start} seconds\n')


- def get_job_status(backend: 'backends.CloudVmRayBackend',
- cluster_name: str) -> Optional['job_lib.JobStatus']:
+ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
+ job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
  """Check the status of the job running on a managed job cluster.

  It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -253,10 +245,13 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
  logger.info(f'Cluster {cluster_name} not found.')
  return None
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
+ job_ids = None if job_id is None else [job_id]
  for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
  try:
  logger.info('=== Checking the job status... ===')
- statuses = backend.get_job_status(handle, stream_logs=False)
+ statuses = backend.get_job_status(handle,
+ job_ids=job_ids,
+ stream_logs=False)
  status = list(statuses.values())[0]
  if status is None:
  logger.info('No job found.')
@@ -323,13 +318,20 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
  error_msg = None
  tasks = managed_job_state.get_managed_jobs(job_id)
  for task in tasks:
- task_name = task['job_name']
- cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+ pool = task.get('pool', None)
+ if pool is None:
+ task_name = task['job_name']
+ cluster_name = generate_managed_job_cluster_name(
+ task_name, job_id)
+ else:
+ cluster_name, _ = (
+ managed_job_state.get_pool_submit_info(job_id))
  handle = global_user_state.get_handle_from_cluster_name(
  cluster_name)
  if handle is not None:
  try:
- terminate_cluster(cluster_name)
+ if pool is None:
+ terminate_cluster(cluster_name)
  except Exception as e: # pylint: disable=broad-except
  error_msg = (
  f'Failed to terminate cluster {cluster_name}: '
@@ -510,10 +512,10 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
- get_end_time: bool) -> float:
+ job_id: Optional[int], get_end_time: bool) -> float:
  """Get the submitted/ended time of the job."""
  code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
- job_id=None, get_ended_time=get_end_time)
+ job_id=job_id, get_ended_time=get_end_time)
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
  returncode, stdout, stderr = backend.run_on_head(handle,
  code,
@@ -527,14 +529,17 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


  def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
- cluster_name: str) -> float:
+ cluster_name: str, job_id: Optional[int]) -> float:
  """Try to get the end time of the job.

  If the job is preempted or we can't connect to the instance for whatever
  reason, fall back to the current time.
  """
  try:
- return get_job_timestamp(backend, cluster_name, get_end_time=True)
+ return get_job_timestamp(backend,
+ cluster_name,
+ job_id=job_id,
+ get_end_time=True)
  except exceptions.CommandError as e:
  if e.returncode == 255:
  # Failed to connect - probably the instance was preempted since the
@@ -556,8 +561,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
  if event_callback is None or task is None:
  return
  event_callback = event_callback.strip()
- cluster_name = generate_managed_job_cluster_name(
- task.name, job_id) if task.name else None
+ pool = managed_job_state.get_pool_from_job_id(job_id)
+ if pool is not None:
+ cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+ else:
+ cluster_name = generate_managed_job_cluster_name(
+ task.name, job_id) if task.name else None
  logger.info(f'=== START: event callback for {status!r} ===')
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
  'managed_job_event',
@@ -684,6 +693,15 @@ def cancel_job_by_name(job_name: str,
  return f'{job_name!r} {msg}'


+ def cancel_jobs_by_pool(pool_name: str,
+ current_workspace: Optional[str] = None) -> str:
+ """Cancel all jobs in a pool."""
+ job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+ if not job_ids:
+ return f'No running job found in pool {pool_name!r}.'
+ return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
  def stream_logs_by_id(job_id: int,
  follow: bool = True,
  tail: Optional[int] = None) -> Tuple[str, int]:
@@ -716,23 +734,41 @@ def stream_logs_by_id(job_id: int,
  if managed_job_status.is_failed():
  job_msg = ('\nFailure reason: '
  f'{managed_job_state.get_failure_reason(job_id)}')
- log_file = managed_job_state.get_local_log_file(job_id, None)
- if log_file is not None:
- with open(os.path.expanduser(log_file), 'r',
- encoding='utf-8') as f:
- # Stream the logs to the console without reading the whole
- # file into memory.
- start_streaming = False
- read_from: Union[TextIO, Deque[str]] = f
- if tail is not None:
- assert tail > 0
- # Read only the last 'tail' lines using deque
- read_from = collections.deque(f, maxlen=tail)
- for line in read_from:
- if log_lib.LOG_FILE_START_STREAMING_AT in line:
- start_streaming = True
- if start_streaming:
- print(line, end='', flush=True)
+ log_file_exists = False
+ task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+ job_id)
+ num_tasks = len(task_info)
+ for task_id, task_name, task_status, log_file in task_info:
+ if log_file:
+ log_file_exists = True
+ task_str = (f'Task {task_name}({task_id})'
+ if task_name else f'Task {task_id}')
+ if num_tasks > 1:
+ print(f'=== {task_str} ===')
+ with open(os.path.expanduser(log_file),
+ 'r',
+ encoding='utf-8') as f:
+ # Stream the logs to the console without reading the
+ # whole file into memory.
+ start_streaming = False
+ read_from: Union[TextIO, Deque[str]] = f
+ if tail is not None:
+ assert tail > 0
+ # Read only the last 'tail' lines using deque
+ read_from = collections.deque(f, maxlen=tail)
+ for line in read_from:
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
+ start_streaming = True
+ if start_streaming:
+ print(line, end='', flush=True)
+ if num_tasks > 1:
+ # Add the "Task finished" message for terminal states
+ if task_status.is_terminal():
+ print(ux_utils.finishing_message(
+ f'{task_str} finished '
+ f'(status: {task_status.value}).'),
+ flush=True)
+ if log_file_exists:
  # Add the "Job finished" message for terminal states
  if managed_job_status.is_terminal():
  print(ux_utils.finishing_message(
@@ -759,12 +795,19 @@ def stream_logs_by_id(job_id: int,

  while should_keep_logging(managed_job_status):
  handle = None
+ job_id_to_tail = None
  if task_id is not None:
- task_name = managed_job_state.get_task_name(job_id, task_id)
- cluster_name = generate_managed_job_cluster_name(
- task_name, job_id)
- handle = global_user_state.get_handle_from_cluster_name(
- cluster_name)
+ pool = managed_job_state.get_pool_from_job_id(job_id)
+ if pool is not None:
+ cluster_name, job_id_to_tail = (
+ managed_job_state.get_pool_submit_info(job_id))
+ else:
+ task_name = managed_job_state.get_task_name(job_id, task_id)
+ cluster_name = generate_managed_job_cluster_name(
+ task_name, job_id)
+ if cluster_name is not None:
+ handle = global_user_state.get_handle_from_cluster_name(
+ cluster_name)

  # Check the handle: The cluster can be preempted and removed from
  # the table before the managed job state is updated by the
@@ -796,7 +839,7 @@ def stream_logs_by_id(job_id: int,
  status_display.stop()
  tail_param = tail if tail is not None else 0
  returncode = backend.tail_logs(handle,
- job_id=None,
+ job_id=job_id_to_tail,
  managed_job_id=job_id,
  follow=follow,
  tail=tail_param)
@@ -1114,9 +1157,15 @@ def dump_managed_job_queue() -> str:
  job['status'] = job['status'].value
  job['schedule_state'] = job['schedule_state'].value

- cluster_name = generate_managed_job_cluster_name(
- job['task_name'], job['job_id'])
- handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+ pool = managed_job_state.get_pool_from_job_id(job['job_id'])
+ if pool is not None:
+ cluster_name, _ = managed_job_state.get_pool_submit_info(
+ job['job_id'])
+ else:
+ cluster_name = generate_managed_job_cluster_name(
+ job['task_name'], job['job_id'])
+ handle = global_user_state.get_handle_from_cluster_name(
+ cluster_name) if cluster_name is not None else None
  if isinstance(handle, backends.CloudVmRayResourceHandle):
  resources_str = resources_utils.get_readable_resources_repr(
  handle, simplify=True)
@@ -1127,6 +1176,11 @@ def dump_managed_job_queue() -> str:
  job['cloud'] = str(handle.launched_resources.cloud)
  job['region'] = handle.launched_resources.region
  job['zone'] = handle.launched_resources.zone
+ job['infra'] = infra_utils.InfraInfo(
+ str(handle.launched_resources.cloud),
+ handle.launched_resources.region,
+ handle.launched_resources.zone).formatted_str()
+ job['accelerators'] = handle.launched_resources.accelerators
  else:
  # FIXME(zongheng): display the last cached values for these.
  job['cluster_resources'] = '-'
@@ -1134,6 +1188,7 @@ def dump_managed_job_queue() -> str:
  job['cloud'] = '-'
  job['region'] = '-'
  job['zone'] = '-'
+ job['infra'] = '-'

  # Add details about schedule state / backoff.
  state_details = None
@@ -1274,10 +1329,13 @@ def format_job_table(
  'JOB DURATION',
  '#RECOVERIES',
  'STATUS',
+ 'WORKER_POOL',
  ]
  if show_all:
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
  columns += [
+ 'WORKER_CLUSTER',
+ 'WORKER_JOB_ID',
  'STARTED',
  'INFRA',
  'RESOURCES',
@@ -1387,11 +1445,14 @@ def format_job_table(
  job_duration,
  recovery_cnt,
  status_str,
+ job_tasks[0].get('pool', '-'),
  ]
  if show_all:
  details = job_tasks[current_task_id].get('details')
  failure_reason = job_tasks[current_task_id]['failure_reason']
  job_values.extend([
+ '-',
+ '-',
  '-',
  '-',
  '-',
@@ -1427,37 +1488,43 @@ def format_job_table(
  job_duration,
  task['recovery_count'],
  task['status'].colored_str(),
+ task.get('pool', '-'),
  ]
  if show_all:
  # schedule_state is only set at the job level, so if we have
  # more than one task, only display on the aggregated row.
  schedule_state = (task['schedule_state']
  if len(job_tasks) == 1 else '-')
- cloud = task.get('cloud')
- if cloud is None:
- # Backward compatibility for old jobs controller without
- # cloud info returned, we parse it from the cluster
- # resources
- # TODO(zhwu): remove this after 0.12.0
- cloud = task['cluster_resources'].split('(')[0].split(
- 'x')[-1]
- task['cluster_resources'] = task[
- 'cluster_resources'].replace(f'{cloud}(',
- '(').replace('x ', 'x')
- region = task['region']
- zone = task.get('zone')
- if cloud == '-':
- cloud = None
- if region == '-':
- region = None
- if zone == '-':
- zone = None
-
- infra = infra_utils.InfraInfo(cloud, region, zone)
+ infra_str = task.get('infra')
+ if infra_str is None:
+ cloud = task.get('cloud')
+ if cloud is None:
+ # Backward compatibility for old jobs controller without
+ # cloud info returned, we parse it from the cluster
+ # resources
+ # TODO(zhwu): remove this after 0.12.0
+ cloud = task['cluster_resources'].split('(')[0].split(
+ 'x')[-1]
+ task['cluster_resources'] = task[
+ 'cluster_resources'].replace(f'{cloud}(',
+ '(').replace(
+ 'x ', 'x')
+ region = task['region']
+ zone = task.get('zone')
+ if cloud == '-':
+ cloud = None
+ if region == '-':
+ region = None
+ if zone == '-':
+ zone = None
+ infra_str = infra_utils.InfraInfo(cloud, region,
+ zone).formatted_str()
  values.extend([
+ task.get('current_cluster_name', '-'),
+ task.get('job_id_on_pool_cluster', '-'),
  # STARTED
  log_utils.readable_time_duration(task['start_at']),
- infra.formatted_str(),
+ infra_str,
  task['cluster_resources'],
  schedule_state,
  generate_details(task.get('details'),
@@ -1549,6 +1616,15 @@ class ManagedJobCodeGen:
  """)
  return cls._build(code)

+ @classmethod
+ def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+ active_workspace = skypilot_config.get_active_workspace()
+ code = textwrap.dedent(f"""\
+ msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+ print(msg, end="", flush=True)
+ """)
+ return cls._build(code)
+
  @classmethod
  def get_version_and_job_table(cls) -> str:
  """Generate code to get controller version and raw job table."""
sky/logs/aws.py CHANGED
@@ -9,6 +9,8 @@ from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import resources_utils

+ EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+

  class _CloudwatchLoggingConfig(pydantic.BaseModel):
  """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
  # Check if we're running on EC2 with an IAM role or if
  # AWS credentials are available in the environment
  pre_cmd = (
- 'if ! curl -s -m 1 http://169.254.169.254'
- '/latest/meta-data/iam/security-credentials/ > /dev/null; '
+ f'if ! curl -s -m 1 {EC2_MD_URL}'
+ 'latest/meta-data/iam/security-credentials/ > /dev/null; '
  'then '
  # failed EC2 check, look for env vars
  'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
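The pre_cmd above now builds the IMDS probe from EC2_MD_URL, so AWS_EC2_METADATA_SERVICE_ENDPOINT can override the hard-coded http://169.254.169.254/ address; the shell expansion ${VAR:-default} falls back to the default when the variable is unset or empty. A rough Python equivalent of that lookup (illustrative only, not SkyPilot code):

import os

# Mirrors ${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}:
# fall back when the variable is unset *or* empty.
endpoint = (os.environ.get('AWS_EC2_METADATA_SERVICE_ENDPOINT') or
            'http://169.254.169.254/')
probe_url = endpoint + 'latest/meta-data/iam/security-credentials/'
print(probe_url)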
sky/provision/kubernetes/utils.py CHANGED
@@ -3179,10 +3179,12 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
  return pods


- def is_tpu_on_gke(accelerator: str) -> bool:
+ def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
  """Determines if the given accelerator is a TPU supported on GKE."""
- normalized, _ = normalize_tpu_accelerator_name(accelerator)
- return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+ if normalize:
+ normalized, _ = normalize_tpu_accelerator_name(accelerator)
+ return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+ return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION


  def get_node_accelerator_count(context: Optional[str],
@@ -3384,7 +3386,7 @@ def process_skypilot_pods(

  def _gpu_resource_key_helper(context: Optional[str]) -> str:
  """Helper function to get the GPU resource key."""
- gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['amd']
+ gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
  try:
  nodes = kubernetes.core_api(context).list_node().items
  for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
sky/provision/nebius/constants.py CHANGED
@@ -15,6 +15,9 @@ INFINIBAND_ENV_VARS = {
  'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
  }

+ # pylint: disable=line-too-long
+ INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'
+
  # Docker run options for InfiniBand support
  INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']

sky/provision/vast/instance.py CHANGED
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
  region=region,
  disk_size=config.node_config['DiskSize'],
  preemptible=config.node_config['Preemptible'],
- image_name=config.node_config['ImageId'])
+ image_name=config.node_config['ImageId'],
+ ports=config.ports_to_open_on_launch)
  except Exception as e: # pylint: disable=broad-except
  logger.warning(f'run_instances error: {e}')
  raise
sky/provision/vast/utils.py CHANGED
@@ -5,7 +5,7 @@
  # python sdk.
  #
  """Vast library wrapper for SkyPilot."""
- from typing import Any, Dict, List
+ from typing import Any, Dict, List, Optional

  from sky import sky_logging
  from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


  def launch(name: str, instance_type: str, region: str, disk_size: int,
- image_name: str, preemptible: bool) -> str:
+ image_name: str, ports: Optional[List[int]],
+ preemptible: bool) -> str:
  """Launches an instance with the given parameters.

  Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
  The disk size {xx} GB is not exactly matched the requested
  size {yy} GB. It is possible to charge extra cost on disk.

+ * `ports`: This is a feature flag to expose ports to the internet.
+
  * `geolocation`: Geolocation on Vast can be as specific as the
  host chooses to be. They can say, for instance, "Yutakachō,
  Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

  * Vast instance types are an invention for skypilot. Refer to
  catalog/vast_catalog.py for the current construction
- of the type.
-
- """
+ of the type."""
  cpu_ram = float(instance_type.split('-')[-1]) / 1024
  gpu_name = instance_type.split('-')[1].replace('_', ' ')
  num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

  instance_touse = instance_list[0]

+ port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
  launch_params = {
  'id': instance_touse['id'],
  'direct': True,
  'ssh': True,
- 'env': '-e __SOURCE=skypilot',
+ 'env': f'-e __SOURCE=skypilot {port_map}',
  'onstart_cmd': ';'.join([
  'touch ~/.no_auto_tmux',
  f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
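The new ports argument is turned into a repeated -p <port>:<port> option string and appended to the instance's env launch parameter. A quick illustration of the mapping (port values are made up):

ports = [8080, 443]
# Same expression as in launch() above.
port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
print(port_map)                            # -p 8080:8080 -p 443:443
print(f'-e __SOURCE=skypilot {port_map}')  # value passed as 'env'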
sky/py.typed ADDED
File without changes
sky/resources.py CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
  from sky.provision import docker_utils
  from sky.provision.gcp import constants as gcp_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.provision.nebius import constants as nebius_constants
  from sky.skylet import constants
  from sky.utils import accelerator_registry
  from sky.utils import annotations
@@ -797,8 +798,13 @@ class Resources:

  acc, _ = list(accelerators.items())[0]
  if 'tpu' in acc.lower():
+ # TODO(syang): GCP TPU names are supported on both GCP and
+ # kubernetes (GKE), but this logic automatically assumes
+ # GCP TPUs can only be used on GCP.
+ # Fix the logic such that GCP TPU names can failover between
+ # GCP and kubernetes.
  if self.cloud is None:
- if kubernetes_utils.is_tpu_on_gke(acc):
+ if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
  self._cloud = clouds.Kubernetes()
  else:
  self._cloud = clouds.GCP()
@@ -813,7 +819,8 @@ class Resources:

  use_tpu_vm = accelerator_args.get('tpu_vm', True)
  if (self.cloud.is_same_cloud(clouds.GCP()) and
- not kubernetes_utils.is_tpu_on_gke(acc)):
+ not kubernetes_utils.is_tpu_on_gke(acc,
+ normalize=False)):
  if 'runtime_version' not in accelerator_args:

  def _get_default_runtime_version() -> str:
@@ -1254,15 +1261,19 @@ class Resources:
  ValueError: if the attribute is invalid.
  """

- if (self._network_tier == resources_utils.NetworkTier.BEST and
- isinstance(self._cloud, clouds.GCP)):
- # Handle GPU Direct TCPX requirement for docker images
- if self._image_id is None:
- # No custom image specified - use the default GPU Direct image
- self._image_id = {
- self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
- }
- else:
+ if self._network_tier == resources_utils.NetworkTier.BEST:
+ if isinstance(self._cloud, clouds.GCP):
+ # Handle GPU Direct TCPX requirement for docker images
+ if self._image_id is None:
+ self._image_id = {
+ self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
+ }
+ elif isinstance(self._cloud, clouds.Nebius):
+ if self._image_id is None:
+ self._image_id = {
+ self._region: nebius_constants.INFINIBAND_IMAGE_ID
+ }
+ elif self._image_id:
  # Custom image specified - validate it's a docker image
  # Check if any of the specified images are not docker images
  non_docker_images = []
@@ -1274,14 +1285,13 @@ class Resources:
  if non_docker_images:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
- f'When using network_tier=BEST on GCP, image_id '
+ f'When using network_tier=BEST, image_id '
  f'must be a docker image. '
  f'Found non-docker images: '
  f'{", ".join(non_docker_images)}. '
  f'Please either: (1) use a docker image '
  f'(prefix with "docker:"), or '
- f'(2) leave image_id empty to use the default '
- f'GPU Direct TCPX image.')
+ f'(2) leave image_id empty to use the default')

  if self._image_id is None:
  return
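The resources.py change above generalizes the network_tier=BEST image defaulting from GCP-only to GCP and Nebius, and relaxes the error message accordingly. A simplified, standalone sketch of the resulting branching (the string cloud names and placeholder image IDs are illustrative; the real code dispatches on cloud classes and uses gcp_constants.GCP_GPU_DIRECT_IMAGE_ID and nebius_constants.INFINIBAND_IMAGE_ID):

from typing import Optional

# Placeholder defaults standing in for the real constants referenced above.
_DEFAULT_BEST_TIER_IMAGE = {
    'gcp': 'docker:example-gpu-direct-tcpx-image',
    'nebius': 'docker:example-infiniband-nccl-image',
}

def resolve_best_tier_image(cloud: Optional[str],
                            image_id: Optional[str]) -> Optional[str]:
    """Pick a default docker image for network_tier=BEST, else validate."""
    if image_id is None:
        return _DEFAULT_BEST_TIER_IMAGE.get(cloud)  # may stay None
    if not image_id.startswith('docker:'):
        raise ValueError('When using network_tier=BEST, image_id must be a '
                         'docker image (prefix with "docker:"), or leave '
                         'image_id empty to use the default.')
    return image_id

print(resolve_best_tier_image('nebius', None))
print(resolve_best_tier_image('gcp', 'docker:my-image'))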
sky/schemas/db/spot_jobs/002_cluster_pool.py ADDED
@@ -0,0 +1,42 @@
+ """Columns for cluster pool.
+
+ Revision ID: 002
+ Revises: 001
+ Create Date: 2025-07-18
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '002'
+ down_revision: Union[str, Sequence[str], None] = '001'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+ """Add columns for cluster pool."""
+ with op.get_context().autocommit_block():
+ db_utils.add_column_to_table_alembic('job_info',
+ 'pool',
+ sa.Text(),
+ server_default=None)
+ db_utils.add_column_to_table_alembic('job_info',
+ 'current_cluster_name',
+ sa.Text(),
+ server_default=None)
+ db_utils.add_column_to_table_alembic('job_info',
+ 'job_id_on_pool_cluster',
+ sa.Integer(),
+ server_default=None)
+
+
+ def downgrade():
+ """Remove columns for cluster pool."""
+ pass
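Migration 002 above adds three nullable columns (pool, current_cluster_name, job_id_on_pool_cluster) to the spot-jobs job_info table; its effect is roughly three ALTER TABLE statements. A throwaway SQLite sketch of that effect (the base table schema here is invented for illustration; only the three column names and types come from the migration):

import sqlite3

conn = sqlite3.connect(':memory:')
# Minimal stand-in for the existing job_info table.
conn.execute('CREATE TABLE job_info (spot_job_id INTEGER PRIMARY KEY, name TEXT)')
for stmt in (
        'ALTER TABLE job_info ADD COLUMN pool TEXT DEFAULT NULL',
        'ALTER TABLE job_info ADD COLUMN current_cluster_name TEXT DEFAULT NULL',
        'ALTER TABLE job_info ADD COLUMN job_id_on_pool_cluster INTEGER DEFAULT NULL',
):
    conn.execute(stmt)
# Show the resulting column names.
print([row[1] for row in conn.execute('PRAGMA table_info(job_info)')])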
sky/serve/autoscalers.py CHANGED
@@ -175,6 +175,14 @@ class Autoscaler:
  """Collect request information from aggregator for autoscaling."""
  raise NotImplementedError

+ def info(self) -> Dict[str, Any]:
+ """Get information about the autoscaler."""
+ return {
+ 'target_num_replicas': self.target_num_replicas,
+ 'min_replicas': self.min_replicas,
+ 'max_replicas': self.max_replicas,
+ }
+
  def _generate_scaling_decisions(
  self,
  replica_infos: List['replica_managers.ReplicaInfo'],
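The new Autoscaler.info() hook simply exposes the current replica targets as a plain dict, with a shape like the following (values invented for illustration):

# Illustrative shape of the dict returned by Autoscaler.info(); numbers are made up.
example_info = {
    'target_num_replicas': 2,
    'min_replicas': 1,
    'max_replicas': 5,
}
assert set(example_info) == {'target_num_replicas', 'min_replicas', 'max_replicas'}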