skypilot-nightly 1.0.0.dev20250728__py3-none-any.whl → 1.0.0.dev20250730__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (152)
  1. sky/__init__.py +2 -2
  2. sky/catalog/kubernetes_catalog.py +2 -2
  3. sky/client/cli/command.py +0 -7
  4. sky/client/common.py +12 -9
  5. sky/clouds/kubernetes.py +2 -1
  6. sky/clouds/nebius.py +1 -1
  7. sky/clouds/utils/gcp_utils.py +1 -1
  8. sky/clouds/vast.py +1 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/_r2LwCFLjlWjZDUIJQG_V/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  13. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  14. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  25. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  26. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  28. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  30. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  31. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  33. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  36. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  40. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  43. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  46. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  62. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  63. sky/dashboard/out/clusters/[cluster].html +1 -1
  64. sky/dashboard/out/clusters.html +1 -1
  65. sky/dashboard/out/config.html +1 -1
  66. sky/dashboard/out/index.html +1 -1
  67. sky/dashboard/out/infra/[context].html +1 -1
  68. sky/dashboard/out/infra.html +1 -1
  69. sky/dashboard/out/jobs/[job].html +1 -1
  70. sky/dashboard/out/jobs.html +1 -1
  71. sky/dashboard/out/users.html +1 -1
  72. sky/dashboard/out/volumes.html +1 -1
  73. sky/dashboard/out/workspace/new.html +1 -1
  74. sky/dashboard/out/workspaces/[name].html +1 -1
  75. sky/dashboard/out/workspaces.html +1 -1
  76. sky/data/data_utils.py +25 -0
  77. sky/data/storage.py +1219 -1775
  78. sky/global_user_state.py +18 -8
  79. sky/jobs/server/core.py +4 -1
  80. sky/jobs/state.py +35 -7
  81. sky/jobs/utils.py +35 -17
  82. sky/logs/agent.py +0 -14
  83. sky/logs/aws.py +4 -30
  84. sky/provision/kubernetes/instance.py +4 -3
  85. sky/provision/kubernetes/utils.py +56 -31
  86. sky/provision/vast/instance.py +2 -1
  87. sky/provision/vast/utils.py +9 -6
  88. sky/resources.py +8 -2
  89. sky/serve/server/core.py +21 -2
  90. sky/serve/service.py +22 -2
  91. sky/server/server.py +7 -2
  92. sky/templates/sky-serve-controller.yaml.j2 +3 -0
  93. sky/utils/kubernetes/gpu_labeler.py +2 -2
  94. sky/utils/schemas.py +5 -1
  95. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/METADATA +1 -1
  96. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/RECORD +101 -100
  97. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  99. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  100. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  102. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  105. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  111. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  112. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  114. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  116. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  118. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  121. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  123. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  125. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  126. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  128. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  131. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  140. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  146. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  147. sky/dashboard/out/_next/static/ucBqsWPN0A5D2kXj8-FqQ/_buildManifest.js +0 -1
  148. /sky/dashboard/out/_next/static/{ucBqsWPN0A5D2kXj8-FqQ → _r2LwCFLjlWjZDUIJQG_V}/_ssgManifest.js +0 -0
  149. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/WHEEL +0 -0
  150. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/entry_points.txt +0 -0
  151. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/licenses/LICENSE +0 -0
  152. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.GLOBAL_USER_STATE_VERSION)


+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE

     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')

-    # get an engine to the db
-    engine = migration_utils.get_engine('state')
+        # run migrations if needed
+        create_table(engine)

-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE


 def _init_db(func):
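
The locking change in `initialize_and_get_db` above (mirrored in `sky/jobs/state.py` below) is the classic double-checked locking pattern for lazy singleton initialization. A minimal standalone sketch of the pattern, with a hypothetical `_make_engine()` standing in for `migration_utils.get_engine('state')`:

```python
import threading
from typing import Optional

_engine: Optional[object] = None
_engine_lock = threading.Lock()


def _make_engine() -> object:
    # Hypothetical stand-in for an expensive, create-once resource.
    return object()


def get_engine() -> object:
    global _engine
    # Unlocked fast path: cheap once the singleton exists.
    if _engine is not None:
        return _engine
    with _engine_lock:
        # Re-check under the lock: another thread may have initialized
        # the engine between our first check and acquiring the lock.
        if _engine is not None:
            return _engine
        _engine = _make_engine()
        return _engine
```

Without the second check, two threads could both pass the unlocked check and each build an engine, with the loser's engine silently overwritten, which is exactly the race the new comment describes.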
sky/jobs/server/core.py CHANGED
@@ -59,7 +59,10 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     # as uploading to the controller is only a local copy.
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
-    if not managed_job_utils.is_consolidation_mode() and storage_clouds:
+    force_disable_cloud_bucket = skypilot_config.get_nested(
+        ('jobs', 'force_disable_cloud_bucket'), False)
+    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
+            not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
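
For context, `skypilot_config.get_nested(('jobs', 'force_disable_cloud_bucket'), False)` reads a nested key from the user's SkyPilot config and returns the default when any level is missing. A simplified sketch of those lookup semantics (the real helper reads the loaded config rather than taking a dict argument):

```python
from typing import Any, Dict, Tuple


def get_nested(config: Dict[str, Any], keys: Tuple[str, ...],
               default: Any) -> Any:
    """Walk a nested dict by key path; return `default` on any miss."""
    cur: Any = config
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return default
        cur = cur[key]
    return cur


# A user config with `jobs.force_disable_cloud_bucket: true` disables the
# bucket-backed upload path in _upload_files_to_controller above.
assert get_nested({'jobs': {'force_disable_cloud_bucket': True}},
                  ('jobs', 'force_disable_cloud_bucket'), False) is True
```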
sky/jobs/state.py CHANGED
@@ -4,6 +4,7 @@
 import enum
 import functools
 import json
+import threading
 import time
 import typing
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -131,21 +133,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.SPOT_JOBS_VERSION)


+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE

     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE

-    # get an engine to the db
-    engine = migration_utils.get_engine('spot_jobs')
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('spot_jobs')

-    # run migrations if needed
-    create_table(engine)
+        # run migrations if needed
+        create_table(engine)

-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE


 def _init_db(func):
@@ -1045,6 +1056,23 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]


+@_init_db
+def get_all_task_ids_names_statuses_logs(
+        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        id_names = session.execute(
+            sqlalchemy.select(
+                spot_table.c.task_id,
+                spot_table.c.task_name,
+                spot_table.c.status,
+                spot_table.c.local_log_file,
+            ).where(spot_table.c.spot_job_id == job_id).order_by(
+                spot_table.c.task_id.asc())).fetchall()
+    return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+            for row in id_names]
+
+
 @_init_db
 def get_job_status_with_task_id(job_id: int,
                                 task_id: int) -> Optional[ManagedJobStatus]:
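
The new `get_all_task_ids_names_statuses_logs` uses SQLAlchemy Core's `select(...).where(...).order_by(...)` chain. A self-contained sketch of the same query shape against an in-memory SQLite table (table and column names are illustrative, not the real schema):

```python
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
meta = sqlalchemy.MetaData()
spot = sqlalchemy.Table(
    'spot', meta,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_name', sqlalchemy.String),
    sqlalchemy.Column('status', sqlalchemy.String),
    sqlalchemy.Column('local_log_file', sqlalchemy.String),
)
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(spot.insert(), [
        {'spot_job_id': 1, 'task_id': 0, 'task_name': 'train',
         'status': 'RUNNING', 'local_log_file': '/tmp/t0.log'},
    ])
    # Same shape as the query above: select several columns, filter by
    # job id, order tasks by id.
    rows = conn.execute(
        sqlalchemy.select(spot.c.task_id, spot.c.task_name, spot.c.status,
                          spot.c.local_log_file)
        .where(spot.c.spot_job_id == 1)
        .order_by(spot.c.task_id.asc())).fetchall()
    print(rows)
```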
sky/jobs/utils.py CHANGED
@@ -716,23 +716,41 @@ def stream_logs_by_id(job_id: int,
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
                            f'{managed_job_state.get_failure_reason(job_id)}')
-        log_file = managed_job_state.get_local_log_file(job_id, None)
-        if log_file is not None:
-            with open(os.path.expanduser(log_file), 'r',
-                      encoding='utf-8') as f:
-                # Stream the logs to the console without reading the whole
-                # file into memory.
-                start_streaming = False
-                read_from: Union[TextIO, Deque[str]] = f
-                if tail is not None:
-                    assert tail > 0
-                    # Read only the last 'tail' lines using deque
-                    read_from = collections.deque(f, maxlen=tail)
-                for line in read_from:
-                    if log_lib.LOG_FILE_START_STREAMING_AT in line:
-                        start_streaming = True
-                    if start_streaming:
-                        print(line, end='', flush=True)
+        log_file_exists = False
+        task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+            job_id)
+        num_tasks = len(task_info)
+        for task_id, task_name, task_status, log_file in task_info:
+            if log_file:
+                log_file_exists = True
+                task_str = (f'Task {task_name}({task_id})'
+                            if task_name else f'Task {task_id}')
+                if num_tasks > 1:
+                    print(f'=== {task_str} ===')
+                with open(os.path.expanduser(log_file),
+                          'r',
+                          encoding='utf-8') as f:
+                    # Stream the logs to the console without reading the
+                    # whole file into memory.
+                    start_streaming = False
+                    read_from: Union[TextIO, Deque[str]] = f
+                    if tail is not None:
+                        assert tail > 0
+                        # Read only the last 'tail' lines using deque
+                        read_from = collections.deque(f, maxlen=tail)
+                    for line in read_from:
+                        if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                            start_streaming = True
+                        if start_streaming:
+                            print(line, end='', flush=True)
+                if num_tasks > 1:
+                    # Add the "Task finished" message for terminal states
+                    if task_status.is_terminal():
+                        print(ux_utils.finishing_message(
+                            f'{task_str} finished '
+                            f'(status: {task_status.value}).'),
+                              flush=True)
+        if log_file_exists:
             # Add the "Job finished" message for terminal states
             if managed_job_status.is_terminal():
                 print(ux_utils.finishing_message(
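
The `collections.deque(f, maxlen=tail)` idiom retained above is worth calling out: iterating a file object into a bounded deque keeps only the last `tail` lines in memory (the whole file is still read once, but never materialized as a list). A minimal sketch:

```python
import collections


def tail_lines(path: str, tail: int) -> None:
    """Print only the last `tail` lines of a file. Memory stays bounded
    at `tail` lines because the deque discards older lines as the file
    object is consumed line by line."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in collections.deque(f, maxlen=tail):
            print(line, end='')
```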
sky/logs/agent.py CHANGED
@@ -67,20 +67,6 @@ class FluentbitAgent(LoggingAgent):
         }
         return common_utils.dump_yaml_str(cfg_dict)

-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This method can be overridden by subclasses to add fallback outputs
-        in case the primary output fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        return cfg_dict
-
     @abc.abstractmethod
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
sky/logs/aws.py CHANGED
@@ -9,6 +9,8 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils

+EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+

 class _CloudwatchLoggingConfig(pydantic.BaseModel):
     """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
         # Check if we're running on EC2 with an IAM role or if
         # AWS credentials are available in the environment
         pre_cmd = (
-            'if ! curl -s -m 1 http://169.254.169.254'
-            '/latest/meta-data/iam/security-credentials/ > /dev/null; '
+            f'if ! curl -s -m 1 {EC2_MD_URL}'
+            'latest/meta-data/iam/security-credentials/ > /dev/null; '
             'then '
             # failed EC2 check, look for env vars
             'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
@@ -211,36 +213,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
             }
         }

-        # Add fallback outputs for graceful failure handling
-        cfg_dict = self.add_fallback_outputs(cfg_dict)
-
         return common_utils.dump_yaml_str(cfg_dict)

-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This adds a local file output as a fallback in case
-        CloudWatch logging fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        # Add a local file output as a fallback
-        fallback_output = {
-            'name': 'file',
-            'match': '*',
-            'path': '/tmp/skypilot_logs_fallback.log',
-            'format': 'out_file',
-        }
-
-        # Add the fallback output to the configuration
-        cfg_dict['pipeline']['outputs'].append(fallback_output)
-
-        return cfg_dict
-
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
         """Get the Fluent Bit output configuration for CloudWatch.
sky/provision/kubernetes/instance.py CHANGED
@@ -210,7 +210,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
             # case we will need to update this logic.
             # TODO(Doyoung): Update the error message raised
             # with the multi-host TPU support.
-            gpu_resource_key = kubernetes_utils.get_gpu_resource_key()  # pylint: disable=line-too-long
+            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
             if 'Insufficient google.com/tpu' in event_message:
                 extra_msg = (
                     f'Verify if '
@@ -797,7 +797,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     limits = pod_spec['spec']['containers'][0].get('resources',
                                                    {}).get('limits')
     if limits is not None:
-        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
+        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
+                                0) > 0

     # TPU pods provisioned on GKE use the default containerd runtime.
     # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
@@ -900,7 +901,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         # to the non-DWS case.
         if needs_gpus:
             gpu_toleration = {
-                'key': kubernetes_utils.get_gpu_resource_key(),
+                'key': kubernetes_utils.get_gpu_resource_key(context),
                 'operator': 'Exists',
                 'effect': 'NoSchedule'
             }
sky/provision/kubernetes/utils.py CHANGED
@@ -147,12 +147,14 @@ MEMORY_SIZE_UNITS = {
 # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
 # nodes. These keys are typically used in the node's status.allocatable
 # or status.capacity fields to indicate the available resources on the node.
-GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}
 TPU_RESOURCE_KEY = 'google.com/tpu'

 NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    f'one of {SUPPORTED_GPU_RESOURCE_KEYS["amd"]}, '
+    f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]} or '
+    f'{TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
     '(e.g., skypilot.co/accelerator) are setup correctly. ')

@@ -391,6 +393,8 @@ def get_gke_accelerator_name(accelerator: str) -> str:
         return 'nvidia-h200-141gb'
     elif accelerator.startswith('tpu-'):
         return accelerator
+    elif accelerator.startswith('amd-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())

@@ -1098,10 +1102,10 @@ def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
     """Checks if the Kubernetes cluster has GPU/TPU resource.

-    Two types of accelerator resources are available which are each checked
-    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    Three types of accelerator resources are available which are each checked
+    with amd.com/gpu, nvidia.com/gpu and google.com/tpu. If amd.com/gpu or nvidia.com/gpu resource is
     missing, that typically means that the Kubernetes cluster does not have
-    GPUs or the nvidia GPU operator and/or device drivers are not installed.
+    GPUs or the amd/nvidia GPU operator and/or device drivers are not installed.

     Returns:
         bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
@@ -1112,7 +1116,7 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context=context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_accelerator = (get_gpu_resource_key() in cluster_resources or
+    has_accelerator = (get_gpu_resource_key(context) in cluster_resources or
                        TPU_RESOURCE_KEY in cluster_resources)

     return has_accelerator, cluster_resources
@@ -1262,8 +1266,8 @@ def check_instance_fits(context: Optional[str],
     else:
         # Check if any of the GPU nodes have sufficient number of GPUs.
         gpu_nodes = [
-            node for node in gpu_nodes if
-            get_node_accelerator_count(node.status.allocatable) >= acc_count
+            node for node in gpu_nodes if get_node_accelerator_count(
+                context, node.status.allocatable) >= acc_count
         ]
         if not gpu_nodes:
             return False, (
@@ -1325,14 +1329,14 @@ def get_accelerator_label_key_values(
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
             - The cluster does not have GPU/TPU resources
-                (nvidia.com/gpu, google.com/tpu)
+                (amd.com/gpu, nvidia.com/gpu, google.com/tpu)
             - The cluster has GPU/TPU resources, but no node in the cluster has
               an accelerator label.
             - The cluster has a node with an invalid accelerator label value.
             - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
-    # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
+    # TODO(romilb): This assumes the accelerator is a amd/nvidia GPU. We
     # need to support TPUs and other accelerators as well.
     # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling
     # is configured in config.yaml since the cluster may be scaling up from
@@ -1496,12 +1500,15 @@ def get_accelerator_label_key_values(
                 f'`sky ssh up --infra {context_display_name}`. {suffix}')
         else:
             msg = (
-                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'Could not detect GPU/TPU resources ({SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
                 f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
                 ' contains GPUs, please ensure GPU drivers are installed on '
                 'the node. Check if the GPUs are setup correctly by running '
                 '`kubectl describe nodes` and looking for the '
-                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
+                f'{TPU_RESOURCE_KEY!r} resource. '
                 'Please refer to the documentation on how to set up GPUs.'
                 f'{suffix}')
             raise exceptions.ResourcesUnavailableError(msg)
@@ -2861,7 +2868,7 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
     nodes = get_kubernetes_nodes(context=context)
     nodes_with_accelerator = []
     for node in nodes:
-        if get_gpu_resource_key() in node.status.capacity:
+        if get_gpu_resource_key(context) in node.status.capacity:
             nodes_with_accelerator.append(node)

     label_formatter, _ = detect_gpu_label_formatter(context)
@@ -2950,7 +2957,8 @@ def get_kubernetes_node_info(
             break

         allocated_qty = 0
-        accelerator_count = get_node_accelerator_count(node.status.allocatable)
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)

         if pods is None:
             accelerators_available = -1
@@ -2965,7 +2973,7 @@ def get_kubernetes_node_info(
                 for container in pod.spec.containers:
                     if container.resources.requests:
                         allocated_qty += get_node_accelerator_count(
-                            container.resources.requests)
+                            context, container.resources.requests)

             accelerators_available = accelerator_count - allocated_qty

@@ -3171,13 +3179,16 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods


-def is_tpu_on_gke(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
-    normalized, _ = normalize_tpu_accelerator_name(accelerator)
-    return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    if normalize:
+        normalized, _ = normalize_tpu_accelerator_name(accelerator)
+        return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION


-def get_node_accelerator_count(attribute_dict: dict) -> int:
+def get_node_accelerator_count(context: Optional[str],
+                               attribute_dict: dict) -> int:
     """Retrieves the count of accelerators from a node's resource dictionary.

     This method checks the node's allocatable resources or the accelerators
@@ -3192,7 +3203,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
-    gpu_resource_name = get_gpu_resource_key()
+    gpu_resource_name = get_gpu_resource_key(context)
     assert not (gpu_resource_name in attribute_dict and
                 TPU_RESOURCE_KEY in attribute_dict)
     if gpu_resource_name in attribute_dict:
@@ -3318,7 +3329,7 @@ def process_skypilot_pods(
             unit='G')
         gpu_count = parse_cpu_or_gpu_resource(
             pod.spec.containers[0].resources.requests.get(
-                'nvidia.com/gpu', '0'))
+                get_gpu_resource_key(context), '0'))
         gpu_name = None
         if gpu_count > 0:
             label_formatter, _ = (detect_gpu_label_formatter(context))
@@ -3373,19 +3384,33 @@ def process_skypilot_pods(
     return list(clusters.values()), jobs_controllers, serve_controllers


-def get_gpu_resource_key():
-    """Get the GPU resource name to use in kubernetes.
-    The function first checks for an environment variable.
-    If defined, it uses its value; otherwise, it returns the default value.
-    Args:
-        name (str): Default GPU resource name, default is "nvidia.com/gpu".
+def _gpu_resource_key_helper(context: Optional[str]) -> str:
+    """Helper function to get the GPU resource key."""
+    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
+    try:
+        nodes = kubernetes.core_api(context).list_node().items
+        for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
+            if any(gpu_key in node.status.capacity for node in nodes):
+                return gpu_key
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(f'Failed to load kube config or query nodes: {e}. '
+                       'Falling back to default GPU resource key.')
+    return gpu_resource_key
+
+
+@annotations.lru_cache(scope='request')
+def get_gpu_resource_key(context: Optional[str] = None) -> str:
+    """Get the GPU resource name to use in Kubernetes.
+
+    The function auto-detects the GPU resource key by querying the Kubernetes
+    node API. If detection fails, it falls back to a default value.
+    An environment variable can override the detected or default value.
+
     Returns:
         str: The selected GPU resource name.
     """
-    # Retrieve GPU resource name from environment variable, if set.
-    # Else use default.
-    # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
-    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+    gpu_resource_key = _gpu_resource_key_helper(context)
+    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=gpu_resource_key)


 def get_kubeconfig_paths() -> List[str]:
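
The detection in `_gpu_resource_key_helper` can be reproduced with the plain `kubernetes` Python client. A standalone sketch of the same idea (it calls `load_kube_config` directly, whereas SkyPilot goes through its own `kubernetes.core_api` adaptor):

```python
from typing import Optional

from kubernetes import client, config

SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}


def detect_gpu_resource_key(context: Optional[str] = None) -> str:
    """Return the first supported GPU resource key advertised by any node,
    defaulting to nvidia.com/gpu when nothing matches or the query fails."""
    try:
        config.load_kube_config(context=context)
        nodes = client.CoreV1Api().list_node().items
        for key in SUPPORTED_GPU_RESOURCE_KEYS.values():
            if any(key in (node.status.capacity or {}) for node in nodes):
                return key
    except Exception:
        pass  # fall through to the default below
    return SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
```

Note the fallback order matters: the result is cached per request via `annotations.lru_cache(scope='request')` above, so a transient API failure only pins the default key for the current request rather than forever.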
sky/provision/vast/instance.py CHANGED
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 region=region,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
-                image_name=config.node_config['ImageId'])
+                image_name=config.node_config['ImageId'],
+                ports=config.ports_to_open_on_launch)
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
             raise
sky/provision/vast/utils.py CHANGED
@@ -5,7 +5,7 @@
 # python sdk.
 #
 """Vast library wrapper for SkyPilot."""
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from sky import sky_logging
 from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]],
+           preemptible: bool) -> str:
     """Launches an instance with the given parameters.

     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
           The disk size {xx} GB is not exactly matched the requested
           size {yy} GB. It is possible to charge extra cost on disk.

+    * `ports`: This is a feature flag to expose ports to the internet.
+
     * `geolocation`: Geolocation on Vast can be as specific as the
       host chooses to be. They can say, for instance, "Yutakachō,
       Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     * Vast instance types are an invention for skypilot. Refer to
       catalog/vast_catalog.py for the current construction
-      of the type.
-
-    """
+      of the type."""
     cpu_ram = float(instance_type.split('-')[-1]) / 1024
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     instance_touse = instance_list[0]

+    port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
     launch_params = {
         'id': instance_touse['id'],
         'direct': True,
         'ssh': True,
-        'env': '-e __SOURCE=skypilot',
+        'env': f'-e __SOURCE=skypilot {port_map}',
         'onstart_cmd': ';'.join([
             'touch ~/.no_auto_tmux',
             f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
sky/resources.py CHANGED
@@ -797,8 +797,13 @@ class Resources:

         acc, _ = list(accelerators.items())[0]
         if 'tpu' in acc.lower():
+            # TODO(syang): GCP TPU names are supported on both GCP and
+            # kubernetes (GKE), but this logic automatically assumes
+            # GCP TPUs can only be used on GCP.
+            # Fix the logic such that GCP TPU names can failover between
+            # GCP and kubernetes.
             if self.cloud is None:
-                if kubernetes_utils.is_tpu_on_gke(acc):
+                if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
                     self._cloud = clouds.Kubernetes()
                 else:
                     self._cloud = clouds.GCP()
@@ -813,7 +818,8 @@ class Resources:

             use_tpu_vm = accelerator_args.get('tpu_vm', True)
             if (self.cloud.is_same_cloud(clouds.GCP()) and
-                    not kubernetes_utils.is_tpu_on_gke(acc)):
+                    not kubernetes_utils.is_tpu_on_gke(acc,
+                                                       normalize=False)):
                 if 'runtime_version' not in accelerator_args:

                     def _get_default_runtime_version() -> str:
sky/serve/server/core.py CHANGED
@@ -18,6 +18,7 @@ from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
+from sky.data import storage as storage_lib
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -151,8 +152,25 @@ def up(

     with rich_utils.safe_status(
             ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
+        # Handle file mounts using two-hop approach when cloud storage
+        # unavailable
+        storage_clouds = (
+            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+        force_disable_cloud_bucket = skypilot_config.get_nested(
+            ('serve', 'force_disable_cloud_bucket'), False)
+        if storage_clouds and not force_disable_cloud_bucket:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task, task_type='serve')
+            local_to_controller_file_mounts = {}
+        else:
+            # Fall back to two-hop file_mount uploading when no cloud storage
+            if task.storage_mounts:
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+            local_to_controller_file_mounts = (
+                controller_utils.translate_local_file_mounts_to_two_hop(task))

     tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
         service_name, task)
@@ -183,6 +201,7 @@ def up(
         'service_name': service_name,
         'controller_log_file': controller_log_file,
         'remote_user_config_path': remote_config_yaml_path,
+        'local_to_controller_file_mounts': local_to_controller_file_mounts,
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         **tls_template_vars,