skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/data/storage.py CHANGED
@@ -23,6 +23,7 @@ from sky import skypilot_config
23
23
  from sky.adaptors import aws
24
24
  from sky.adaptors import azure
25
25
  from sky.adaptors import cloudflare
26
+ from sky.adaptors import coreweave
26
27
  from sky.adaptors import gcp
27
28
  from sky.adaptors import ibm
28
29
  from sky.adaptors import nebius
@@ -62,6 +63,7 @@ STORE_ENABLED_CLOUDS: List[str] = [
62
63
  str(clouds.OCI()),
63
64
  str(clouds.Nebius()),
64
65
  cloudflare.NAME,
66
+ coreweave.NAME,
65
67
  ]
66
68
 
67
69
  # Maximum number of concurrent rsync upload processes
@@ -93,6 +95,12 @@ def get_cached_enabled_storage_cloud_names_or_refresh(
93
95
  r2_is_enabled, _ = cloudflare.check_storage_credentials()
94
96
  if r2_is_enabled:
95
97
  enabled_clouds.append(cloudflare.NAME)
98
+
99
+ # Similarly, handle CoreWeave storage credentials
100
+ coreweave_is_enabled, _ = coreweave.check_storage_credentials()
101
+ if coreweave_is_enabled:
102
+ enabled_clouds.append(coreweave.NAME)
103
+
96
104
  if raise_if_no_cloud_access and not enabled_clouds:
97
105
  raise exceptions.NoCloudAccessError(
98
106
  'No cloud access available for storage. '
@@ -126,6 +134,7 @@ class StoreType(enum.Enum):
126
134
  IBM = 'IBM'
127
135
  OCI = 'OCI'
128
136
  NEBIUS = 'NEBIUS'
137
+ COREWEAVE = 'COREWEAVE'
129
138
  VOLUME = 'VOLUME'
130
139
 
131
140
  @classmethod
@@ -883,7 +892,7 @@ class Storage(object):
883
892
  f'{source} in the file_mounts section of your YAML')
884
893
  is_local_source = True
885
894
  elif split_path.scheme in [
886
- 's3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius'
895
+ 's3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius', 'cw'
887
896
  ]:
888
897
  is_local_source = False
889
898
  # Storage mounting does not support mounting specific files from
@@ -908,7 +917,8 @@ class Storage(object):
908
917
  with ux_utils.print_exception_no_traceback():
909
918
  raise exceptions.StorageSourceError(
910
919
  f'Supported paths: local, s3://, gs://, https://, '
911
- f'r2://, cos://, oci://, nebius://. Got: {source}')
920
+ f'r2://, cos://, oci://, nebius://, cw://. '
921
+ f'Got: {source}')
912
922
  return source, is_local_source
913
923
 
914
924
  def _validate_storage_spec(self, name: Optional[str]) -> None:
@@ -923,7 +933,16 @@ class Storage(object):
923
933
  """
924
934
  prefix = name.split('://')[0]
925
935
  prefix = prefix.lower()
926
- if prefix in ['s3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius']:
936
+ if prefix in [
937
+ 's3',
938
+ 'gs',
939
+ 'https',
940
+ 'r2',
941
+ 'cos',
942
+ 'oci',
943
+ 'nebius',
944
+ 'cw',
945
+ ]:
927
946
  with ux_utils.print_exception_no_traceback():
928
947
  raise exceptions.StorageNameError(
929
948
  'Prefix detected: `name` cannot start with '
@@ -1062,6 +1081,12 @@ class Storage(object):
1062
1081
  source=self.source,
1063
1082
  sync_on_reconstruction=self.sync_on_reconstruction,
1064
1083
  _bucket_sub_path=self._bucket_sub_path)
1084
+ elif s_type == StoreType.COREWEAVE:
1085
+ store = CoreWeaveStore.from_metadata(
1086
+ s_metadata,
1087
+ source=self.source,
1088
+ sync_on_reconstruction=self.sync_on_reconstruction,
1089
+ _bucket_sub_path=self._bucket_sub_path)
1065
1090
  else:
1066
1091
  with ux_utils.print_exception_no_traceback():
1067
1092
  raise ValueError(f'Unknown store type: {s_type}')
@@ -1417,6 +1442,7 @@ class S3CompatibleConfig:
1417
1442
  aws_profile: Optional[str] = None
1418
1443
  get_endpoint_url: Optional[Callable[[], str]] = None
1419
1444
  credentials_file: Optional[str] = None
1445
+ config_file: Optional[str] = None
1420
1446
  extra_cli_args: Optional[List[str]] = None
1421
1447
 
1422
1448
  # Provider-specific settings
@@ -1437,8 +1463,8 @@ class S3CompatibleStore(AbstractStore):
1437
1463
  """Base class for S3-compatible object storage providers.
1438
1464
 
1439
1465
  This class provides a unified interface for all S3-compatible storage
1440
- providers (AWS S3, Cloudflare R2, Nebius, MinIO, etc.) by leveraging
1441
- a configuration-driven approach that eliminates code duplication.
1466
+ providers (AWS S3, Cloudflare R2, Nebius, MinIO, CoreWeave, etc.) by
1467
+ leveraging a configuration-driven approach that eliminates code duplication
1442
1468
 
1443
1469
  ## Adding a New S3-Compatible Store
1444
1470
 
@@ -1864,6 +1890,9 @@ class S3CompatibleStore(AbstractStore):
1864
1890
  if self.config.credentials_file:
1865
1891
  cmd = 'AWS_SHARED_CREDENTIALS_FILE=' + \
1866
1892
  f'{self.config.credentials_file} {cmd}'
1893
+ if self.config.config_file:
1894
+ cmd = 'AWS_CONFIG_FILE=' + \
1895
+ f'{self.config.config_file} {cmd}'
1867
1896
 
1868
1897
  return cmd
1869
1898
 
@@ -1909,6 +1938,9 @@ class S3CompatibleStore(AbstractStore):
1909
1938
  if self.config.credentials_file:
1910
1939
  cmd = 'AWS_SHARED_CREDENTIALS_FILE=' + \
1911
1940
  f'{self.config.credentials_file} {cmd}'
1941
+ if self.config.config_file:
1942
+ cmd = 'AWS_CONFIG_FILE=' + \
1943
+ f'{self.config.config_file} {cmd}'
1912
1944
 
1913
1945
  return cmd
1914
1946
 
@@ -1962,6 +1994,9 @@ class S3CompatibleStore(AbstractStore):
1962
1994
  if self.config.credentials_file:
1963
1995
  command = (f'AWS_SHARED_CREDENTIALS_FILE='
1964
1996
  f'{self.config.credentials_file} {command}')
1997
+ if self.config.config_file:
1998
+ command = 'AWS_CONFIG_FILE=' + \
1999
+ f'{self.config.config_file} {command}'
1965
2000
  with ux_utils.print_exception_no_traceback():
1966
2001
  raise exceptions.StorageBucketGetError(
1967
2002
  _BUCKET_FAIL_TO_CONNECT_MESSAGE.format(name=self.name) +
@@ -2015,7 +2050,7 @@ class S3CompatibleStore(AbstractStore):
2015
2050
  except aws.botocore_exceptions().ClientError as e:
2016
2051
  with ux_utils.print_exception_no_traceback():
2017
2052
  raise exceptions.StorageBucketCreateError(
2018
- f'Attempted to create a bucket {self.name} but failed.'
2053
+ f'Attempted to create S3 bucket {self.name} but failed.'
2019
2054
  ) from e
2020
2055
  return self.config.resource_factory(bucket_name)
2021
2056
 
@@ -2034,7 +2069,9 @@ class S3CompatibleStore(AbstractStore):
2034
2069
  remove_command = (f'AWS_SHARED_CREDENTIALS_FILE='
2035
2070
  f'{self.config.credentials_file} '
2036
2071
  f'{remove_command}')
2037
-
2072
+ if self.config.config_file:
2073
+ remove_command = 'AWS_CONFIG_FILE=' + \
2074
+ f'{self.config.config_file} {remove_command}'
2038
2075
  return self._execute_remove_command(
2039
2076
  remove_command, bucket_name,
2040
2077
  f'Deleting {self.config.store_type} bucket {bucket_name}',
@@ -2047,8 +2084,9 @@ class S3CompatibleStore(AbstractStore):
2047
2084
  try:
2048
2085
  with rich_utils.safe_status(
2049
2086
  ux_utils.spinner_message(hint_operating)):
2050
- subprocess.check_output(command.split(' '),
2051
- stderr=subprocess.STDOUT)
2087
+ subprocess.check_output(command,
2088
+ stderr=subprocess.STDOUT,
2089
+ shell=True)
2052
2090
  except subprocess.CalledProcessError as e:
2053
2091
  if 'NoSuchBucket' in e.output.decode('utf-8'):
2054
2092
  logger.debug(
@@ -2091,7 +2129,9 @@ class S3CompatibleStore(AbstractStore):
2091
2129
  remove_command = (f'AWS_SHARED_CREDENTIALS_FILE='
2092
2130
  f'{self.config.credentials_file} '
2093
2131
  f'{remove_command}')
2094
-
2132
+ if self.config.config_file:
2133
+ remove_command = 'AWS_CONFIG_FILE=' + \
2134
+ f'{self.config.config_file} {remove_command}'
2095
2135
  return self._execute_remove_command(
2096
2136
  remove_command, bucket_name,
2097
2137
  (f'Removing objects from {self.config.store_type} bucket '
@@ -2168,6 +2208,10 @@ class GcsStore(AbstractStore):
2168
2208
  elif self.source.startswith('oci://'):
2169
2209
  raise NotImplementedError(
2170
2210
  'Moving data from OCI to GCS is currently not supported.')
2211
+ elif self.source.startswith('cw://'):
2212
+ raise NotImplementedError(
2213
+ 'Moving data from CoreWeave Object Storage to GCS is'
2214
+ ' currently not supported.')
2171
2215
  # Validate name
2172
2216
  self.name = self.validate_name(self.name)
2173
2217
  # Check if the storage is enabled
@@ -2554,7 +2598,7 @@ class GcsStore(AbstractStore):
2554
2598
  except Exception as e: # pylint: disable=broad-except
2555
2599
  with ux_utils.print_exception_no_traceback():
2556
2600
  raise exceptions.StorageBucketCreateError(
2557
- f'Attempted to create a bucket {self.name} but failed.'
2601
+ f'Attempted to create GCS bucket {self.name} but failed.'
2558
2602
  ) from e
2559
2603
  logger.info(
2560
2604
  f' {colorama.Style.DIM}Created GCS bucket {new_bucket.name!r} in '
@@ -2783,6 +2827,10 @@ class AzureBlobStore(AbstractStore):
2783
2827
  elif self.source.startswith('oci://'):
2784
2828
  raise NotImplementedError(
2785
2829
  'Moving data from OCI to AZureBlob is not supported.')
2830
+ elif self.source.startswith('cw://'):
2831
+ raise NotImplementedError(
2832
+ 'Moving data from CoreWeave Object Storage to AzureBlob is'
2833
+ ' currently not supported.')
2786
2834
  # Validate name
2787
2835
  self.name = self.validate_name(self.name)
2788
2836
 
@@ -3154,6 +3202,8 @@ class AzureBlobStore(AbstractStore):
3154
3202
  raise NotImplementedError(error_message.format('OCI'))
3155
3203
  elif self.source.startswith('nebius://'):
3156
3204
  raise NotImplementedError(error_message.format('NEBIUS'))
3205
+ elif self.source.startswith('cw://'):
3206
+ raise NotImplementedError(error_message.format('CoreWeave'))
3157
3207
  else:
3158
3208
  self.batch_az_blob_sync([self.source])
3159
3209
  except exceptions.StorageUploadError:
@@ -3572,6 +3622,10 @@ class IBMCosStore(AbstractStore):
3572
3622
  assert self.name == data_utils.split_cos_path(self.source)[0], (
3573
3623
  'COS Bucket is specified as path, the name should be '
3574
3624
  'the same as COS bucket.')
3625
+ elif self.source.startswith('cw://'):
3626
+ raise NotImplementedError(
3627
+ 'Moving data from CoreWeave Object Storage to COS is '
3628
+ 'currently not supported.')
3575
3629
  # Validate name
3576
3630
  self.name = IBMCosStore.validate_name(self.name)
3577
3631
 
@@ -3670,6 +3724,9 @@ class IBMCosStore(AbstractStore):
3670
3724
  elif self.source.startswith('r2://'):
3671
3725
  raise Exception('IBM COS currently not supporting'
3672
3726
  'data transfers between COS and r2')
3727
+ elif self.source.startswith('cw://'):
3728
+ raise Exception('IBM COS currently not supporting'
3729
+ 'data transfers between COS and CoreWeave')
3673
3730
  else:
3674
3731
  self.batch_ibm_rsync([self.source])
3675
3732
 
@@ -4595,3 +4652,103 @@ class NebiusStore(S3CompatibleStore):
4595
4652
  rclone_config, rclone_profile_name, self.bucket.name, mount_path)
4596
4653
  return mounting_utils.get_mounting_command(mount_path, install_cmd,
4597
4654
  mount_cached_cmd)
4655
+
4656
+
4657
+ @register_s3_compatible_store
4658
+ class CoreWeaveStore(S3CompatibleStore):
4659
+ """CoreWeaveStore inherits from S3CompatibleStore and represents the backend
4660
+ for CoreWeave Object Storage buckets.
4661
+ """
4662
+
4663
+ @classmethod
4664
+ def get_config(cls) -> S3CompatibleConfig:
4665
+ """Return the configuration for CoreWeave Object Storage."""
4666
+ return S3CompatibleConfig(
4667
+ store_type='COREWEAVE',
4668
+ url_prefix='cw://',
4669
+ client_factory=lambda region: data_utils.create_coreweave_client(),
4670
+ resource_factory=lambda name: coreweave.resource('s3').Bucket(name),
4671
+ split_path=data_utils.split_coreweave_path,
4672
+ verify_bucket=data_utils.verify_coreweave_bucket,
4673
+ aws_profile=coreweave.COREWEAVE_PROFILE_NAME,
4674
+ get_endpoint_url=coreweave.get_endpoint,
4675
+ credentials_file=coreweave.COREWEAVE_CREDENTIALS_PATH,
4676
+ config_file=coreweave.COREWEAVE_CONFIG_PATH,
4677
+ cloud_name=coreweave.NAME,
4678
+ default_region=coreweave.DEFAULT_REGION,
4679
+ mount_cmd_factory=cls._get_coreweave_mount_cmd,
4680
+ )
4681
+
4682
+ def _get_bucket(self) -> Tuple[StorageHandle, bool]:
4683
+ """Get or create bucket using CoreWeave's S3 API"""
4684
+ bucket = self.config.resource_factory(self.name)
4685
+
4686
+ # Use our custom bucket verification instead of head_bucket
4687
+ if data_utils.verify_coreweave_bucket(self.name):
4688
+ self._validate_existing_bucket()
4689
+ return bucket, False
4690
+
4691
+ # TODO(hailong): Enable the bucket creation for CoreWeave
4692
+ # Disable this to avoid waiting too long until the following
4693
+ # issue is resolved:
4694
+ # https://github.com/skypilot-org/skypilot/issues/7736
4695
+ raise exceptions.StorageBucketGetError(
4696
+ f'Bucket {self.name!r} does not exist. CoreWeave buckets can take'
4697
+ ' a long time to become accessible after creation, so SkyPilot'
4698
+ ' does not create them automatically. Please create the bucket'
4699
+ ' manually in CoreWeave and wait for it to be accessible before'
4700
+ ' using it.')
4701
+
4702
+ # # Check if this is a source with URL prefix (existing bucket case)
4703
+ # if isinstance(self.source, str) and self.source.startswith(
4704
+ # self.config.url_prefix):
4705
+ # with ux_utils.print_exception_no_traceback():
4706
+ # raise exceptions.StorageBucketGetError(
4707
+ # 'Attempted to use a non-existent bucket as a source: '
4708
+ # f'{self.source}.')
4709
+
4710
+ # # If bucket cannot be found, create it if needed
4711
+ # if self.sync_on_reconstruction:
4712
+ # bucket = self._create_bucket(self.name)
4713
+ # return bucket, True
4714
+ # else:
4715
+ # raise exceptions.StorageExternalDeletionError(
4716
+ # 'Attempted to fetch a non-existent bucket: '
4717
+ # f'{self.name}')
4718
+
4719
+ @classmethod
4720
+ def _get_coreweave_mount_cmd(cls, bucket_name: str, mount_path: str,
4721
+ bucket_sub_path: Optional[str]) -> str:
4722
+ """Factory method for CoreWeave mount command."""
4723
+ endpoint_url = coreweave.get_endpoint()
4724
+ return mounting_utils.get_coreweave_mount_cmd(
4725
+ coreweave.COREWEAVE_CREDENTIALS_PATH,
4726
+ coreweave.COREWEAVE_PROFILE_NAME, bucket_name, endpoint_url,
4727
+ mount_path, bucket_sub_path)
4728
+
4729
+ def mount_cached_command(self, mount_path: str) -> str:
4730
+ """CoreWeave-specific cached mount implementation using rclone."""
4731
+ install_cmd = mounting_utils.get_rclone_install_cmd()
4732
+ rclone_profile_name = (
4733
+ data_utils.Rclone.RcloneStores.COREWEAVE.get_profile_name(
4734
+ self.name))
4735
+ rclone_config = data_utils.Rclone.RcloneStores.COREWEAVE.get_config(
4736
+ rclone_profile_name=rclone_profile_name)
4737
+ mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
4738
+ rclone_config, rclone_profile_name, self.bucket.name, mount_path)
4739
+ return mounting_utils.get_mounting_command(mount_path, install_cmd,
4740
+ mount_cached_cmd)
4741
+
4742
+ def _create_bucket(self, bucket_name: str) -> StorageHandle:
4743
+ """Create bucket using S3 API with timing handling for CoreWeave."""
4744
+ result = super()._create_bucket(bucket_name)
4745
+ # Ensure bucket is created
4746
+ # The newly created bucket ever takes about 18min to be accessible,
4747
+ # here we just retry for 36 times (5s * 36 = 180s) to avoid waiting
4748
+ # too long
4749
+ # TODO(hailong): Update the logic here when the following
4750
+ # issue is resolved:
4751
+ # https://github.com/skypilot-org/skypilot/issues/7736
4752
+ data_utils.verify_coreweave_bucket(bucket_name, retry=36)
4753
+
4754
+ return result
sky/exceptions.py CHANGED
@@ -649,7 +649,14 @@ class VolumeTopologyConflictError(Exception):
649
649
 
650
650
  class ServerTemporarilyUnavailableError(Exception):
651
651
  """Raised when the server is temporarily unavailable."""
652
- pass
652
+
653
+ def __init__(self, message: str):
654
+ super().__init__(message)
655
+ self.message = message
656
+
657
+ def __str__(self):
658
+ return ('SkyPilot API server is temporarily unavailable: '
659
+ f'{self.message}. Please try again later.')
653
660
 
654
661
 
655
662
  class RestfulPolicyError(Exception):
@@ -686,3 +693,8 @@ class ClientError(Exception):
686
693
  If a request encounters a ClientError, it will not be retried to the server.
687
694
  """
688
695
  pass
696
+
697
+
698
+ class ConcurrentWorkerExhaustedError(Exception):
699
+ """Raised when the concurrent worker is exhausted."""
700
+ pass
sky/execution.py CHANGED
@@ -16,6 +16,7 @@ from sky import global_user_state
16
16
  from sky import optimizer
17
17
  from sky import sky_logging
18
18
  from sky.backends import backend_utils
19
+ from sky.server.requests import request_names
19
20
  from sky.skylet import autostop_lib
20
21
  from sky.usage import usage_lib
21
22
  from sky.utils import admin_policy_utils
@@ -116,8 +117,10 @@ def _execute(
116
117
  no_setup: bool = False,
117
118
  clone_disk_from: Optional[str] = None,
118
119
  skip_unnecessary_provisioning: bool = False,
120
+ *, #keyword only separator
119
121
  # Internal only:
120
122
  # pylint: disable=invalid-name
123
+ _request_name: request_names.AdminPolicyRequestName,
121
124
  _quiet_optimizer: bool = False,
122
125
  _is_launched_by_jobs_controller: bool = False,
123
126
  _is_launched_by_sky_serve_controller: bool = False,
@@ -187,6 +190,7 @@ def _execute(
187
190
  idle_minutes_to_autostop = resource.autostop_config.idle_minutes
188
191
  with admin_policy_utils.apply_and_use_config_in_current_request(
189
192
  dag,
193
+ request_name=_request_name,
190
194
  request_options=admin_policy.RequestOptions(
191
195
  cluster_name=cluster_name,
192
196
  idle_minutes_to_autostop=idle_minutes_to_autostop,
@@ -535,12 +539,15 @@ def launch(
535
539
  no_setup: bool = False,
536
540
  clone_disk_from: Optional[str] = None,
537
541
  fast: bool = False,
542
+ *, #keyword only separator
538
543
  # Internal only:
539
544
  # pylint: disable=invalid-name
540
545
  _quiet_optimizer: bool = False,
541
546
  _is_launched_by_jobs_controller: bool = False,
542
547
  _is_launched_by_sky_serve_controller: bool = False,
543
548
  _disable_controller_check: bool = False,
549
+ _request_name: request_names.AdminPolicyRequestName = request_names.
550
+ AdminPolicyRequestName.CLUSTER_LAUNCH,
544
551
  job_logger: logging.Logger = logger,
545
552
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
546
553
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -707,9 +714,14 @@ def launch(
707
714
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
708
715
  _is_launched_by_sky_serve_controller=
709
716
  _is_launched_by_sky_serve_controller,
717
+ _request_name=_request_name,
710
718
  job_logger=job_logger)
711
719
 
712
720
 
721
+ # needed for backward compatibility. Remove by v0.10.7 or v0.11.0
722
+ cluster_launch = launch
723
+
724
+
713
725
  @usage_lib.entrypoint
714
726
  def exec( # pylint: disable=redefined-builtin
715
727
  task: Union['sky.Task', 'sky.Dag'],
@@ -794,4 +806,5 @@ def exec( # pylint: disable=redefined-builtin
794
806
  ],
795
807
  cluster_name=cluster_name,
796
808
  job_logger=job_logger,
809
+ _request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
797
810
  )