skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/client/cli/flags.py CHANGED
@@ -284,8 +284,10 @@ def config_option(expose_value: bool):
284
284
  return return_option_decorator
285
285
 
286
286
 
287
- def yes_option():
287
+ def yes_option(helptext: Optional[str] = None):
288
288
  """A decorator for the --yes/-y option."""
289
+ if helptext is None:
290
+ helptext = 'Skip confirmation prompt.'
289
291
 
290
292
  def return_option_decorator(func):
291
293
  return click.option('--yes',
@@ -293,7 +295,7 @@ def yes_option():
293
295
  is_flag=True,
294
296
  default=False,
295
297
  required=False,
296
- help='Skip confirmation prompt.')(func)
298
+ help=helptext)(func)
297
299
 
298
300
  return return_option_decorator
299
301
 
sky/client/cli/table_utils.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Utilities for formatting tables for CLI output."""
2
2
  import abc
3
3
  from datetime import datetime
4
- from typing import Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional
5
5
 
6
6
  import prettytable
7
7
 
@@ -88,15 +88,23 @@ def format_storage_table(storages: List[responses.StorageRecord],
88
88
  return 'No existing storage.'
89
89
 
90
90
 
91
- def format_job_table(jobs: List[responses.ManagedJobRecord],
92
- show_all: bool,
93
- show_user: bool,
94
- max_jobs: Optional[int] = None):
91
+ def format_job_table(
92
+ jobs: List[responses.ManagedJobRecord],
93
+ show_all: bool,
94
+ show_user: bool,
95
+ pool_status: Optional[List[Dict[str, Any]]] = None,
96
+ max_jobs: Optional[int] = None,
97
+ status_counts: Optional[Dict[str, int]] = None,
98
+ ):
95
99
  jobs = [job.model_dump() for job in jobs]
96
- return managed_jobs.format_job_table(jobs,
97
- show_all=show_all,
98
- show_user=show_user,
99
- max_jobs=max_jobs)
100
+ return managed_jobs.format_job_table(
101
+ jobs,
102
+ pool_status=pool_status,
103
+ show_all=show_all,
104
+ show_user=show_user,
105
+ max_jobs=max_jobs,
106
+ job_status_counts=status_counts,
107
+ )
100
108
 
101
109
 
102
110
  _BASIC_COLUMNS = [
sky/client/sdk.py CHANGED
@@ -37,6 +37,7 @@ from sky.server import common as server_common
37
37
  from sky.server import rest
38
38
  from sky.server import versions
39
39
  from sky.server.requests import payloads
40
+ from sky.server.requests import request_names
40
41
  from sky.server.requests import requests as requests_lib
41
42
  from sky.skylet import autostop_lib
42
43
  from sky.skylet import constants
@@ -98,6 +99,9 @@ def reload_config() -> None:
98
99
  skypilot_config.safe_reload_config()
99
100
 
100
101
 
102
+ # The overloads are not comprehensive - e.g. get_result Literal[False] could be
103
+ # specified to return None. We can add more overloads if needed. To do that see
104
+ # https://github.com/python/mypy/issues/8634#issuecomment-609411104
101
105
  @typing.overload
102
106
  def stream_response(request_id: None,
103
107
  response: 'requests.Response',
@@ -112,7 +116,16 @@ def stream_response(request_id: server_common.RequestId[T],
112
116
  response: 'requests.Response',
113
117
  output_stream: Optional['io.TextIOBase'] = None,
114
118
  resumable: bool = False,
115
- get_result: bool = True) -> T:
119
+ get_result: Literal[True] = True) -> T:
120
+ ...
121
+
122
+
123
+ @typing.overload
124
+ def stream_response(request_id: server_common.RequestId[T],
125
+ response: 'requests.Response',
126
+ output_stream: Optional['io.TextIOBase'] = None,
127
+ resumable: bool = False,
128
+ get_result: bool = True) -> Optional[T]:
116
129
  ...
117
130
 
118
131
 
@@ -591,7 +604,10 @@ def launch(
591
604
  down=down,
592
605
  dryrun=dryrun)
593
606
  with admin_policy_utils.apply_and_use_config_in_current_request(
594
- dag, request_options=request_options, at_client_side=True) as dag:
607
+ dag,
608
+ request_name=request_names.AdminPolicyRequestName.CLUSTER_LAUNCH,
609
+ request_options=request_options,
610
+ at_client_side=True) as dag:
595
611
  return _launch(
596
612
  dag,
597
613
  cluster_name,
@@ -913,6 +929,7 @@ def tail_logs(
913
929
  @annotations.client_api
914
930
  @rest.retry_transient_errors()
915
931
  def tail_provision_logs(cluster_name: str,
932
+ worker: Optional[int] = None,
916
933
  follow: bool = True,
917
934
  tail: int = 0,
918
935
  output_stream: Optional['io.TextIOBase'] = None) -> int:
@@ -920,17 +937,31 @@ def tail_provision_logs(cluster_name: str,
920
937
 
921
938
  Args:
922
939
  cluster_name: name of the cluster.
940
+ worker: worker id in multi-node cluster.
941
+ If None, stream the logs of the head node.
923
942
  follow: follow the logs.
924
943
  tail: lines from end to tail.
925
944
  output_stream: optional stream to write logs.
926
945
  Returns:
927
946
  Exit code 0 on streaming success; raises on HTTP error.
928
947
  """
929
- body = payloads.ClusterNameBody(cluster_name=cluster_name)
948
+ body = payloads.ProvisionLogsBody(cluster_name=cluster_name)
949
+
950
+ if worker is not None:
951
+ remote_api_version = versions.get_remote_api_version()
952
+ if remote_api_version is not None and remote_api_version >= 21:
953
+ if worker < 1:
954
+ raise ValueError('Worker must be a positive integer.')
955
+ body.worker = worker
956
+ else:
957
+ raise exceptions.APINotSupportedError(
958
+ 'Worker node provision logs are not supported in your API '
959
+ 'server. Please upgrade to a newer API server to use it.')
930
960
  params = {
931
961
  'follow': str(follow).lower(),
932
962
  'tail': tail,
933
963
  }
964
+
934
965
  response = server_common.make_authenticated_request(
935
966
  'POST',
936
967
  '/provision_logs',
@@ -939,13 +970,21 @@ def tail_provision_logs(cluster_name: str,
939
970
  stream=True,
940
971
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
941
972
  None))
973
+ # Check for HTTP errors before streaming the response
974
+ if response.status_code != 200:
975
+ with ux_utils.print_exception_no_traceback():
976
+ raise exceptions.CommandError(response.status_code,
977
+ 'tail_provision_logs',
978
+ 'Failed to stream provision logs',
979
+ response.text)
980
+
942
981
  # Log request is idempotent when tail is 0, thus can resume previous
943
982
  # streaming point on retry.
944
983
  # request_id=None here because /provision_logs does not create an async
945
984
  # request. Instead, it streams a plain file from the server. This does NOT
946
985
  # violate the stream_response doc warning about None in multi-user
947
- # environments: we are not asking stream_response to select the latest
948
- # request”. We already have the HTTP response to stream; request_id=None
986
+ # environments: we are not asking stream_response to select "the latest
987
+ # request". We already have the HTTP response to stream; request_id=None
949
988
  # merely disables the follow-up GET. It is also necessary for --no-follow
950
989
  # to return cleanly after printing the tailed lines. If we provided a
951
990
  # non-None request_id here, the get(request_id) in stream_response(
@@ -2064,16 +2103,16 @@ def stream_and_get(
2064
2103
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
2065
2104
  None),
2066
2105
  stream=True)
2106
+ if response.status_code in [404, 400]:
2107
+ detail = response.json().get('detail')
2108
+ with ux_utils.print_exception_no_traceback():
2109
+ raise exceptions.ClientError(f'Failed to stream logs: {detail}')
2067
2110
  stream_request_id: Optional[server_common.RequestId[
2068
2111
  T]] = server_common.get_stream_request_id(response)
2069
2112
  if request_id is not None and stream_request_id is not None:
2070
2113
  assert request_id == stream_request_id
2071
2114
  if request_id is None:
2072
2115
  request_id = stream_request_id
2073
- if response.status_code in [404, 400]:
2074
- detail = response.json().get('detail')
2075
- with ux_utils.print_exception_no_traceback():
2076
- raise exceptions.ClientError(f'Failed to stream logs: {detail}')
2077
2116
  elif response.status_code != 200:
2078
2117
  # TODO(syang): handle the case where the requestID is not provided
2079
2118
  # see https://github.com/skypilot-org/skypilot/issues/6549
@@ -2158,7 +2197,9 @@ def _local_api_server_running(kill: bool = False) -> bool:
2158
2197
  def api_status(
2159
2198
  request_ids: Optional[List[Union[server_common.RequestId[T], str]]] = None,
2160
2199
  # pylint: disable=redefined-builtin
2161
- all_status: bool = False
2200
+ all_status: bool = False,
2201
+ limit: Optional[int] = None,
2202
+ fields: Optional[List[str]] = None,
2162
2203
  ) -> List[payloads.RequestPayload]:
2163
2204
  """Lists all requests.
2164
2205
 
@@ -2167,6 +2208,8 @@ def api_status(
2167
2208
  If None, all requests are queried.
2168
2209
  all_status: Whether to list all finished requests as well. This argument
2169
2210
  is ignored if request_ids is not None.
2211
+ limit: The number of requests to show. If None, show all requests.
2212
+ fields: The fields to get. If None, get all fields.
2170
2213
 
2171
2214
  Returns:
2172
2215
  A list of request payloads.
@@ -2175,8 +2218,12 @@ def api_status(
2175
2218
  logger.info('SkyPilot API server is not running.')
2176
2219
  return []
2177
2220
 
2178
- body = payloads.RequestStatusBody(request_ids=request_ids,
2179
- all_status=all_status)
2221
+ body = payloads.RequestStatusBody(
2222
+ request_ids=request_ids,
2223
+ all_status=all_status,
2224
+ limit=limit,
2225
+ fields=fields,
2226
+ )
2180
2227
  response = server_common.make_authenticated_request(
2181
2228
  'GET',
2182
2229
  '/api/status',
sky/cloud_stores.py CHANGED
@@ -18,6 +18,7 @@ from sky import sky_logging
18
18
  from sky.adaptors import aws
19
19
  from sky.adaptors import azure
20
20
  from sky.adaptors import cloudflare
21
+ from sky.adaptors import coreweave
21
22
  from sky.adaptors import ibm
22
23
  from sky.adaptors import nebius
23
24
  from sky.adaptors import oci
@@ -602,6 +603,77 @@ class NebiusCloudStorage(CloudStorage):
602
603
  return ' && '.join(all_commands)
603
604
 
604
605
 
606
+ class CoreWeaveCloudStorage(CloudStorage):
607
+ """CoreWeave Cloud Storage."""
608
+
609
+ # List of commands to install AWS CLI
610
+ _GET_AWSCLI = [
611
+ 'aws --version >/dev/null 2>&1 || '
612
+ f'{constants.SKY_UV_PIP_CMD} install awscli',
613
+ ]
614
+
615
+ def is_directory(self, url: str) -> bool:
616
+ """Checks if the coreweave object is a directory.
617
+
618
+ In cloud object stores, a "directory" refers to a regular object whose
619
+ name is a prefix of other objects.
620
+
621
+ Args:
622
+ url: coreweave object URL.
623
+ """
624
+ cw = coreweave.resource('s3')
625
+ bucket_name, path = data_utils.split_coreweave_path(url)
626
+ bucket = cw.Bucket(bucket_name)
627
+
628
+ num_objects = 0
629
+ for obj in bucket.objects.filter(Prefix=path):
630
+ num_objects += 1
631
+ if obj.key == path:
632
+ return False
633
+ # If there are more than 1 object in filter, then it is a directory
634
+ if num_objects == 3:
635
+ return True
636
+
637
+ # A directory with few or no items
638
+ return True
639
+
640
+ def make_sync_dir_command(self, source: str, destination: str) -> str:
641
+ """Downloads using AWS CLI."""
642
+ # AWS Sync by default uses 10 threads to upload files to the bucket.
643
+ # To increase parallelism, modify max_concurrent_requests in your
644
+ # aws config file (Default path: ~/.coreweave/cw.config).
645
+ assert 'cw://' in source, 'cw:// is not in source'
646
+ source = source.replace('cw://', 's3://')
647
+ download_via_awscli = (
648
+ 'AWS_SHARED_CREDENTIALS_FILE='
649
+ f'{coreweave.COREWEAVE_CREDENTIALS_PATH} '
650
+ f'AWS_CONFIG_FILE={coreweave.COREWEAVE_CONFIG_PATH} '
651
+ f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
652
+ 'sync --no-follow-symlinks '
653
+ f'{source} {destination} '
654
+ f'--profile={coreweave.COREWEAVE_PROFILE_NAME}')
655
+
656
+ all_commands = list(self._GET_AWSCLI)
657
+ all_commands.append(download_via_awscli)
658
+ return ' && '.join(all_commands)
659
+
660
+ def make_sync_file_command(self, source: str, destination: str) -> str:
661
+ """Downloads a file using AWS CLI."""
662
+ assert 'cw://' in source, 'cw:// is not in source'
663
+ source = source.replace('cw://', 's3://')
664
+ download_via_awscli = (
665
+ 'AWS_SHARED_CREDENTIALS_FILE='
666
+ f'{coreweave.COREWEAVE_CREDENTIALS_PATH} '
667
+ f'AWS_CONFIG_FILE={coreweave.COREWEAVE_CONFIG_PATH} '
668
+ f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
669
+ f'cp {source} {destination} '
670
+ f'--profile={coreweave.COREWEAVE_PROFILE_NAME}')
671
+
672
+ all_commands = list(self._GET_AWSCLI)
673
+ all_commands.append(download_via_awscli)
674
+ return ' && '.join(all_commands)
675
+
676
+
605
677
  def get_storage_from_path(url: str) -> CloudStorage:
606
678
  """Returns a CloudStorage by identifying the scheme:// in a URL."""
607
679
  result = urllib.parse.urlsplit(url)
@@ -619,6 +691,7 @@ _REGISTRY = {
619
691
  'cos': IBMCosCloudStorage(),
620
692
  'oci': OciCloudStorage(),
621
693
  'nebius': NebiusCloudStorage(),
694
+ 'cw': CoreWeaveCloudStorage(),
622
695
  # TODO: This is a hack, as Azure URL starts with https://, we should
623
696
  # refactor the registry to be able to take regex, so that Azure blob can
624
697
  # be identified with `https://(.*?)\.blob\.core\.windows\.net`
sky/clouds/__init__.py CHANGED
@@ -30,6 +30,7 @@ from sky.clouds.primeintellect import PrimeIntellect
30
30
  from sky.clouds.runpod import RunPod
31
31
  from sky.clouds.scp import SCP
32
32
  from sky.clouds.seeweb import Seeweb
33
+ from sky.clouds.shadeform import Shadeform
33
34
  from sky.clouds.ssh import SSH
34
35
  from sky.clouds.vast import Vast
35
36
  from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
48
49
  'PrimeIntellect',
49
50
  'SCP',
50
51
  'RunPod',
52
+ 'Shadeform',
51
53
  'Vast',
52
54
  'OCI',
53
55
  'Vsphere',
sky/clouds/aws.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Amazon Web Services."""
2
2
  import enum
3
3
  import fnmatch
4
+ import functools
4
5
  import hashlib
5
6
  import json
6
7
  import os
@@ -8,7 +9,10 @@ import re
8
9
  import subprocess
9
10
  import time
10
11
  import typing
11
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
12
+ from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
13
+ Tuple, TypeVar, Union)
14
+
15
+ from typing_extensions import ParamSpec
12
16
 
13
17
  from sky import catalog
14
18
  from sky import clouds
@@ -113,6 +117,37 @@ _EFA_DOCKER_RUN_OPTIONS = [
113
117
  _EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
114
118
  ' (Ubuntu 22.04) 20250808'
115
119
 
120
+ # For functions that needs caching per AWS profile.
121
+ _AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
122
+
123
+ T = TypeVar('T')
124
+ P = ParamSpec('P')
125
+
126
+
127
+ def aws_profile_aware_lru_cache(*lru_cache_args,
128
+ scope: Literal['global', 'request'] = 'request',
129
+ **lru_cache_kwargs) -> Callable:
130
+ """Similar to annotations.lru_cache, but automatically includes the
131
+ AWS profile (if set in the workspace config) in the cache key.
132
+ """
133
+
134
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
135
+
136
+ @annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
137
+ def cached_impl(aws_profile, *args, **kwargs):
138
+ del aws_profile # Only used as part of the cache key.
139
+ return func(*args, **kwargs)
140
+
141
+ @functools.wraps(func)
142
+ def wrapper(*args, **kwargs):
143
+ aws_profile = aws.get_workspace_profile()
144
+ return cached_impl(aws_profile, *args, **kwargs)
145
+
146
+ wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
147
+ return wrapper
148
+
149
+ return decorator
150
+
116
151
 
117
152
  def _is_efa_instance_type(instance_type: str) -> bool:
118
153
  """Check if the instance type is in EFA supported instance family."""
@@ -264,7 +299,9 @@ class AWS(clouds.Cloud):
264
299
 
265
300
  @classmethod
266
301
  def _unsupported_features_for_resources(
267
- cls, resources: 'resources_lib.Resources'
302
+ cls,
303
+ resources: 'resources_lib.Resources',
304
+ region: Optional[str] = None,
268
305
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
269
306
  unsupported_features = {}
270
307
  if resources.use_spot:
@@ -306,10 +343,15 @@ class AWS(clouds.Cloud):
306
343
  #### Regions/Zones ####
307
344
 
308
345
  @classmethod
309
- def regions_with_offering(cls, instance_type: str,
310
- accelerators: Optional[Dict[str, int]],
311
- use_spot: bool, region: Optional[str],
312
- zone: Optional[str]) -> List[clouds.Region]:
346
+ def regions_with_offering(
347
+ cls,
348
+ instance_type: str,
349
+ accelerators: Optional[Dict[str, int]],
350
+ use_spot: bool,
351
+ region: Optional[str],
352
+ zone: Optional[str],
353
+ resources: Optional['resources_lib.Resources'] = None,
354
+ ) -> List[clouds.Region]:
313
355
  del accelerators # unused
314
356
  regions = catalog.get_region_zones_for_instance_type(
315
357
  instance_type, use_spot, 'aws')
@@ -462,7 +504,8 @@ class AWS(clouds.Cloud):
462
504
  return image_size
463
505
 
464
506
  @classmethod
465
- @annotations.lru_cache(scope='request', maxsize=1)
507
+ @aws_profile_aware_lru_cache(scope='request',
508
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
466
509
  def get_image_root_device_name(cls, image_id: str,
467
510
  region: Optional[str]) -> str:
468
511
  if image_id.startswith('skypilot:'):
@@ -788,8 +831,9 @@ class AWS(clouds.Cloud):
788
831
  return cls._check_credentials()
789
832
 
790
833
  @classmethod
791
- @annotations.lru_cache(scope='request',
792
- maxsize=1) # Cache since getting identity is slow.
834
+ # Cache since getting identity is slow.
835
+ @aws_profile_aware_lru_cache(scope='request',
836
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
793
837
  def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
794
838
  """Checks if the user has access credentials to AWS."""
795
839
 
@@ -924,9 +968,16 @@ class AWS(clouds.Cloud):
924
968
  return AWSIdentityType.SHARED_CREDENTIALS_FILE
925
969
 
926
970
  @classmethod
927
- @annotations.lru_cache(scope='request', maxsize=1)
971
+ @aws_profile_aware_lru_cache(scope='request',
972
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
928
973
  def _aws_configure_list(cls) -> Optional[bytes]:
929
- proc = subprocess.run('aws configure list',
974
+ cmd = 'aws configure list'
975
+ # Profile takes precedence over default configs.
976
+ profile = aws.get_workspace_profile()
977
+ if profile is not None:
978
+ # If profile does not exist, we will get returncode 255.
979
+ cmd += f' --profile {profile}'
980
+ proc = subprocess.run(cmd,
930
981
  shell=True,
931
982
  check=False,
932
983
  stdout=subprocess.PIPE,
@@ -936,8 +987,9 @@ class AWS(clouds.Cloud):
936
987
  return proc.stdout
937
988
 
938
989
  @classmethod
939
- @annotations.lru_cache(scope='request',
940
- maxsize=1) # Cache since getting identity is slow.
990
+ # Cache since getting identity is slow.
991
+ @aws_profile_aware_lru_cache(scope='request',
992
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
941
993
  def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
942
994
  try:
943
995
  sts = aws.client('sts', check_credentials=False)
@@ -1018,8 +1070,9 @@ class AWS(clouds.Cloud):
1018
1070
  return [user_ids]
1019
1071
 
1020
1072
  @classmethod
1021
- @annotations.lru_cache(scope='request',
1022
- maxsize=1) # Cache since getting identity is slow.
1073
+ # Cache since getting identity is slow.
1074
+ @aws_profile_aware_lru_cache(scope='request',
1075
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
1023
1076
  def get_user_identities(cls) -> Optional[List[List[str]]]:
1024
1077
  """Returns a [UserId, Account] list that uniquely identifies the user.
1025
1078
 
@@ -1114,6 +1167,7 @@ class AWS(clouds.Cloud):
1114
1167
  # provider of the cluster to be launched in this function and make sure
1115
1168
  # the cluster will not be used for launching clusters in other clouds,
1116
1169
  # e.g. jobs controller.
1170
+
1117
1171
  if self._current_identity_type(
1118
1172
  ) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
1119
1173
  return {}
@@ -1123,7 +1177,8 @@ class AWS(clouds.Cloud):
1123
1177
  if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
1124
1178
  }
1125
1179
 
1126
- @annotations.lru_cache(scope='request', maxsize=1)
1180
+ @aws_profile_aware_lru_cache(scope='request',
1181
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
1127
1182
  def can_credential_expire(self) -> bool:
1128
1183
  identity_type = self._current_identity_type()
1129
1184
  return (identity_type is not None and
sky/clouds/azure.py CHANGED
@@ -87,7 +87,9 @@ class Azure(clouds.Cloud):
87
87
 
88
88
  @classmethod
89
89
  def _unsupported_features_for_resources(
90
- cls, resources: 'resources.Resources'
90
+ cls,
91
+ resources: 'resources.Resources',
92
+ region: Optional[str] = None,
91
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
92
94
  features = {
93
95
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
@@ -264,10 +266,15 @@ class Azure(clouds.Cloud):
264
266
  return _DEFAULT_GPU_IMAGE_ID
265
267
 
266
268
  @classmethod
267
- def regions_with_offering(cls, instance_type: str,
268
- accelerators: Optional[Dict[str, int]],
269
- use_spot: bool, region: Optional[str],
270
- zone: Optional[str]) -> List[clouds.Region]:
269
+ def regions_with_offering(
270
+ cls,
271
+ instance_type: str,
272
+ accelerators: Optional[Dict[str, int]],
273
+ use_spot: bool,
274
+ region: Optional[str],
275
+ zone: Optional[str],
276
+ resources: Optional['resources.Resources'] = None,
277
+ ) -> List[clouds.Region]:
271
278
  del accelerators # unused
272
279
  assert zone is None, 'Azure does not support zones'
273
280
  regions = catalog.get_region_zones_for_instance_type(
sky/clouds/cloud.py CHANGED
@@ -185,10 +185,15 @@ class Cloud:
185
185
  #### Regions/Zones ####
186
186
 
187
187
  @classmethod
188
- def regions_with_offering(cls, instance_type: str,
189
- accelerators: Optional[Dict[str, int]],
190
- use_spot: bool, region: Optional[str],
191
- zone: Optional[str]) -> List[Region]:
188
+ def regions_with_offering(
189
+ cls,
190
+ instance_type: str,
191
+ accelerators: Optional[Dict[str, int]],
192
+ use_spot: bool,
193
+ region: Optional[str],
194
+ zone: Optional[str],
195
+ resources: Optional['resources_lib.Resources'] = None,
196
+ ) -> List[Region]:
192
197
  """Returns the regions that offer the specified resources.
193
198
 
194
199
  The order of the regions follow the order of the regions returned by
@@ -674,8 +679,11 @@ class Cloud:
674
679
 
675
680
  @classmethod
676
681
  def check_features_are_supported(
677
- cls, resources: 'resources_lib.Resources',
678
- requested_features: Set[CloudImplementationFeatures]) -> None:
682
+ cls,
683
+ resources: 'resources_lib.Resources',
684
+ requested_features: Set[CloudImplementationFeatures],
685
+ region: Optional[str] = None,
686
+ ) -> None:
679
687
  """Errors out if the cloud does not support all requested features.
680
688
 
681
689
  For instance, Lambda Cloud does not support stop, so
@@ -693,7 +701,7 @@ class Cloud:
693
701
  requested features.
694
702
  """
695
703
  unsupported_features2reason = cls._unsupported_features_for_resources(
696
- resources)
704
+ resources, region)
697
705
 
698
706
  # Docker image is not compatible with ssh proxy command.
699
707
  if skypilot_config.get_effective_region_config(
@@ -723,7 +731,9 @@ class Cloud:
723
731
 
724
732
  @classmethod
725
733
  def _unsupported_features_for_resources(
726
- cls, resources: 'resources_lib.Resources'
734
+ cls,
735
+ resources: 'resources_lib.Resources',
736
+ region: Optional[str] = None,
727
737
  ) -> Dict[CloudImplementationFeatures, str]:
728
738
  """The features not supported based on the resources provided.
729
739
 
@@ -734,7 +744,7 @@ class Cloud:
734
744
  A dict of {feature: reason} for the features not supported by the
735
745
  cloud implementation.
736
746
  """
737
- del resources
747
+ del resources, region
738
748
  raise NotImplementedError
739
749
 
740
750
  @classmethod
sky/clouds/cudo.py CHANGED
@@ -87,7 +87,9 @@ class Cudo(clouds.Cloud):
87
87
 
88
88
  @classmethod
89
89
  def _unsupported_features_for_resources(
90
- cls, resources: 'resources_lib.Resources'
90
+ cls,
91
+ resources: 'resources_lib.Resources',
92
+ region: Optional[str] = None,
91
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
92
94
  """The features not supported based on the resources provided.
93
95
 
@@ -106,10 +108,15 @@ class Cudo(clouds.Cloud):
106
108
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
107
109
 
108
110
  @classmethod
109
- def regions_with_offering(cls, instance_type,
110
- accelerators: Optional[Dict[str, int]],
111
- use_spot: bool, region: Optional[str],
112
- zone: Optional[str]) -> List[clouds.Region]:
111
+ def regions_with_offering(
112
+ cls,
113
+ instance_type,
114
+ accelerators: Optional[Dict[str, int]],
115
+ use_spot: bool,
116
+ region: Optional[str],
117
+ zone: Optional[str],
118
+ resources: Optional['resources_lib.Resources'] = None,
119
+ ) -> List[clouds.Region]:
113
120
  assert zone is None, 'Cudo does not support zones.'
114
121
  del accelerators, zone # unused
115
122
  if use_spot:
sky/clouds/do.py CHANGED
@@ -57,7 +57,9 @@ class DO(clouds.Cloud):
57
57
 
58
58
  @classmethod
59
59
  def _unsupported_features_for_resources(
60
- cls, resources: 'resources_lib.Resources'
60
+ cls,
61
+ resources: 'resources_lib.Resources',
62
+ region: Optional[str] = None,
61
63
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
62
64
  """The features not supported based on the resources provided.
63
65
 
@@ -83,6 +85,7 @@ class DO(clouds.Cloud):
83
85
  use_spot: bool,
84
86
  region: Optional[str],
85
87
  zone: Optional[str],
88
+ resources: Optional['resources_lib.Resources'] = None,
86
89
  ) -> List[clouds.Region]:
87
90
  assert zone is None, 'DO does not support zones.'
88
91
  del accelerators, zone # unused