skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  import base64
3
3
  import pickle
4
4
  import typing
5
- from typing import Any, Dict, List, Optional, Tuple
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky import jobs as managed_jobs
8
8
  from sky import models
@@ -56,10 +56,10 @@ def decode_status(
56
56
  clusters = return_value
57
57
  response = []
58
58
  for cluster in clusters:
59
- cluster['handle'] = decode_and_unpickle(cluster['handle'])
59
+ # handle may not always be present in the response.
60
+ if 'handle' in cluster and cluster['handle'] is not None:
61
+ cluster['handle'] = decode_and_unpickle(cluster['handle'])
60
62
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
61
- cluster['storage_mounts_metadata'] = decode_and_unpickle(
62
- cluster['storage_mounts_metadata'])
63
63
  if 'is_managed' not in cluster:
64
64
  cluster['is_managed'] = False
65
65
  response.append(responses.StatusResponse.model_validate(cluster))
@@ -116,22 +116,35 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
116
116
 
117
117
 
118
118
  @register_decoders('jobs.queue_v2')
119
- def decode_jobs_queue_v2(return_value) -> List[responses.ManagedJobRecord]:
119
+ def decode_jobs_queue_v2(
120
+ return_value
121
+ ) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
122
+ List[responses.ManagedJobRecord]]:
120
123
  """Decode jobs queue response.
121
124
 
122
- Supports legacy list, or a dict {jobs, total}.
123
- - Returns list[job]
125
+ Supports legacy list, or a dict {jobs, total, total_no_filter,
126
+ status_counts}.
127
+
128
+ - Returns either list[job] or tuple(list[job], total, status_counts,
129
+ total_no_filter)
124
130
  """
125
- # Case 1: dict shape {jobs, total}
126
- if isinstance(return_value, dict) and 'jobs' in return_value:
131
+ # Case 1: dict shape {jobs, total, total_no_filter, status_counts}
132
+ if isinstance(return_value, dict):
127
133
  jobs = return_value.get('jobs', [])
134
+ total = return_value.get('total', len(jobs))
135
+ total_no_filter = return_value.get('total_no_filter', total)
136
+ status_counts = return_value.get('status_counts', {})
137
+ for job in jobs:
138
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
139
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
140
+ return jobs, total, status_counts, total_no_filter
128
141
  else:
129
142
  # Case 2: legacy list
130
143
  jobs = return_value
131
- for job in jobs:
132
- job['status'] = managed_jobs.ManagedJobStatus(job['status'])
133
- jobs = [responses.ManagedJobRecord(**job) for job in jobs]
134
- return jobs
144
+ for job in jobs:
145
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
146
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
147
+ return jobs
135
148
 
136
149
 
137
150
  def _decode_serve_status(
@@ -8,6 +8,8 @@ import pickle
8
8
  import typing
9
9
  from typing import Any, Dict, List, Optional, Tuple, Union
10
10
 
11
+ from sky import models
12
+ from sky.catalog import common
11
13
  from sky.schemas.api import responses
12
14
  from sky.server import constants as server_constants
13
15
  from sky.utils import serialize_utils
@@ -15,7 +17,6 @@ from sky.utils import serialize_utils
15
17
  if typing.TYPE_CHECKING:
16
18
  from sky import backends
17
19
  from sky import clouds
18
- from sky import models
19
20
  from sky.provision.kubernetes import utils as kubernetes_utils
20
21
 
21
22
  handlers: Dict[str, Any] = {}
@@ -60,13 +61,23 @@ def encode_status(
60
61
  clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
61
62
  response = []
62
63
  for cluster in clusters:
63
- response_cluster = cluster.model_dump()
64
+ response_cluster = cluster.model_dump(exclude_none=True)
65
+ # These default settings are needed because last_use and status_updated_at
66
+ # used to be not optional.
67
+ # TODO(syang): remove this after v0.10.7 or v0.11.0
68
+ if 'last_use' not in response_cluster:
69
+ response_cluster['last_use'] = ''
70
+ if 'status_updated_at' not in response_cluster:
71
+ response_cluster['status_updated_at'] = 0
64
72
  response_cluster['status'] = cluster['status'].value
65
73
  handle = serialize_utils.prepare_handle_for_backwards_compatibility(
66
74
  cluster['handle'])
67
75
  response_cluster['handle'] = pickle_and_encode(handle)
76
+ # TODO (syang) We still need to return this field for backwards
77
+ # compatibility.
78
+ # Remove this field at or after v0.10.7 or v0.11.0
68
79
  response_cluster['storage_mounts_metadata'] = pickle_and_encode(
69
- response_cluster['storage_mounts_metadata'])
80
+ None) # Always returns None.
70
81
  response.append(response_cluster)
71
82
  return response
72
83
 
@@ -121,7 +132,7 @@ def encode_status_kubernetes(
121
132
  encoded_cluster = dataclasses.asdict(cluster)
122
133
  encoded_cluster['status'] = encoded_cluster['status'].value
123
134
  encoded_unmanaged_clusters.append(encoded_cluster)
124
- all_jobs = [job.model_dump() for job in all_jobs]
135
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
125
136
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
126
137
 
127
138
 
@@ -148,12 +159,13 @@ def encode_jobs_queue_v2(
148
159
  else:
149
160
  jobs = jobs_or_tuple
150
161
  total = None
151
- for job in jobs:
162
+ jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
163
+ for job in jobs_dict:
152
164
  job['status'] = job['status'].value
153
165
  if total is None:
154
- return [job.model_dump() for job in jobs]
166
+ return jobs_dict
155
167
  return {
156
- 'jobs': [job.model_dump() for job in jobs],
168
+ 'jobs': jobs_dict,
157
169
  'total': total,
158
170
  'total_no_filter': total_no_filter,
159
171
  'status_counts': status_counts
@@ -205,10 +217,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
205
217
  @register_encoder('storage_ls')
206
218
  def encode_storage_ls(
207
219
  return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
208
- for storage_info in return_value:
220
+ response_list = [storage_info.model_dump() for storage_info in return_value]
221
+ for storage_info in response_list:
209
222
  storage_info['status'] = storage_info['status'].value
210
223
  storage_info['store'] = [store.value for store in storage_info['store']]
211
- return [storage_info.model_dump() for storage_info in return_value]
224
+ return response_list
212
225
 
213
226
 
214
227
  @register_encoder('volume_list')
@@ -218,11 +231,11 @@ def encode_volume_list(
218
231
 
219
232
 
220
233
  @register_encoder('job_status')
221
- def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
234
+ def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
222
235
  for job_id in return_value.keys():
223
236
  if return_value[job_id] is not None:
224
237
  return_value[job_id] = return_value[job_id].value
225
- return return_value
238
+ return {str(k): v for k, v in return_value.items()}
226
239
 
227
240
 
228
241
  @register_encoder('kubernetes_node_info')
@@ -234,3 +247,35 @@ def encode_kubernetes_node_info(
234
247
  @register_encoder('endpoints')
235
248
  def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
236
249
  return {str(k): v for k, v in return_value.items()}
250
+
251
+
252
+ @register_encoder('realtime_kubernetes_gpu_availability')
253
+ def encode_realtime_gpu_availability(
254
+ return_value: List[Tuple[str,
255
+ List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
256
+ # Convert RealtimeGpuAvailability namedtuples to lists
257
+ # for JSON serialization.
258
+ encoded = []
259
+ for context, gpu_list in return_value:
260
+ converted_gpu_list = []
261
+ for gpu in gpu_list:
262
+ assert isinstance(gpu, models.RealtimeGpuAvailability), (
263
+ f'Expected RealtimeGpuAvailability, got {type(gpu)}')
264
+ converted_gpu_list.append(list(gpu))
265
+ encoded.append((context, converted_gpu_list))
266
+ return encoded
267
+
268
+
269
+ @register_encoder('list_accelerators')
270
+ def encode_list_accelerators(
271
+ return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
272
+ encoded: Dict[str, Any] = {}
273
+ for accelerator_name, instances in return_value.items():
274
+ # Convert InstanceTypeInfo namedtuples to lists for JSON serialization.
275
+ converted_instances: List[Any] = []
276
+ for instance in instances:
277
+ assert isinstance(instance, common.InstanceTypeInfo), (
278
+ f'Expected InstanceTypeInfo, got {type(instance)}')
279
+ converted_instances.append(list(instance))
280
+ encoded[accelerator_name] = converted_instances
281
+ return encoded
@@ -0,0 +1,106 @@
1
+ """Request execution threads management."""
2
+
3
+ import concurrent.futures
4
+ import threading
5
+ from typing import Callable, Set
6
+
7
+ from sky import exceptions
8
+ from sky import sky_logging
9
+ from sky.utils import atomic
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ class OnDemandThreadExecutor(concurrent.futures.Executor):
15
+ """An executor that creates a new thread for each task and destroys it
16
+ after the task is completed.
17
+
18
+ Note(dev):
19
+ We raise an error instead of queuing the request if the limit is reached, so
20
+ that:
21
+ 1. the request might be handled by other processes that have idle workers
22
+ upon retry;
23
+ 2. if not, then users can be clearly hinted that they need to scale the API
24
+ server to support higher concurrency.
25
+ So this executor is only suitable for carefully selected cases where the
26
+ error can be properly handled by caller. To make this executor general, we
27
+ need to support configuring the queuing behavior (exception or queueing).
28
+ """
29
+
30
+ def __init__(self, name: str, max_workers: int):
31
+ self.name: str = name
32
+ self.max_workers: int = max_workers
33
+ self.running: atomic.AtomicInt = atomic.AtomicInt(0)
34
+ self._shutdown: bool = False
35
+ self._shutdown_lock: threading.Lock = threading.Lock()
36
+ self._threads: Set[threading.Thread] = set()
37
+ self._threads_lock: threading.Lock = threading.Lock()
38
+
39
+ def _cleanup_thread(self, thread: threading.Thread):
40
+ with self._threads_lock:
41
+ self._threads.discard(thread)
42
+
43
+ def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
44
+ *args, **kwargs):
45
+ try:
46
+ result = fn(*args, **kwargs)
47
+ fut.set_result(result)
48
+ except Exception as e: # pylint: disable=broad-except
49
+ logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
50
+ fut.set_exception(e)
51
+ finally:
52
+ self.running.decrement()
53
+ self._cleanup_thread(threading.current_thread())
54
+
55
+ def check_available(self, borrow: bool = False) -> int:
56
+ """Check if there are available workers.
57
+
58
+ Args:
59
+ borrow: If True, the caller borrows a worker from the executor.
60
+ The caller is responsible for returning the worker to the
61
+ executor after the task is completed.
62
+ """
63
+ count = self.running.increment()
64
+ if count > self.max_workers:
65
+ self.running.decrement()
66
+ raise exceptions.ConcurrentWorkerExhaustedError(
67
+ f'Maximum concurrent workers {self.max_workers} of threads '
68
+ f'executor [{self.name}] reached')
69
+ if not borrow:
70
+ self.running.decrement()
71
+ return count
72
+
73
+ def submit(self, fn, /, *args, **kwargs):
74
+ with self._shutdown_lock:
75
+ if self._shutdown:
76
+ raise RuntimeError(
77
+ 'Cannot submit task after executor is shutdown')
78
+ count = self.check_available(borrow=True)
79
+ fut: concurrent.futures.Future = concurrent.futures.Future()
80
+ # Name is assigned for debugging purpose, duplication is fine
81
+ thread = threading.Thread(target=self._task_wrapper,
82
+ name=f'{self.name}-{count}',
83
+ args=(fn, fut, *args),
84
+ kwargs=kwargs,
85
+ daemon=True)
86
+ with self._threads_lock:
87
+ self._threads.add(thread)
88
+ try:
89
+ thread.start()
90
+ except Exception as e:
91
+ self.running.decrement()
92
+ self._cleanup_thread(thread)
93
+ fut.set_exception(e)
94
+ raise
95
+ assert thread.ident is not None, 'Thread should be started'
96
+ return fut
97
+
98
+ def shutdown(self, wait=True):
99
+ with self._shutdown_lock:
100
+ self._shutdown = True
101
+ if not wait:
102
+ return
103
+ with self._threads_lock:
104
+ threads = list(self._threads)
105
+ for t in threads:
106
+ t.join()
sky/server/rest.py CHANGED
@@ -178,14 +178,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
178
178
  Notes(dev):
179
179
  """
180
180
 
181
+ def _readable_error_msg(message: str) -> str:
182
+ return (f'{colorama.Fore.YELLOW}API server is temporarily '
183
+ f'unavailable: {message}.\nRetrying...'
184
+ f'{colorama.Style.RESET_ALL}')
185
+
181
186
  def decorator(func: F) -> F:
182
187
 
183
188
  @functools.wraps(func)
184
189
  def wrapper(*args, **kwargs) -> Any:
185
- msg = (
186
- f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
187
- 'upgrade in progress. Waiting to resume...'
188
- f'{colorama.Style.RESET_ALL}')
190
+
189
191
  backoff = common_utils.Backoff(
190
192
  initial_backoff=initial_backoff,
191
193
  max_backoff_factor=max_backoff_factor)
@@ -203,7 +205,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
203
205
  # stop the status spinner before retrying func() to
204
206
  # avoid the status spinner get stuck if the func() runs
205
207
  # for a long time without update status, e.g. sky logs.
206
- with rich_utils.client_status(msg):
208
+ with rich_utils.client_status(
209
+ _readable_error_msg(e.message)):
207
210
  if time.time() - start_time > max_wait_seconds:
208
211
  # pylint: disable=line-too-long
209
212
  raise exceptions.ServerTemporarilyUnavailableError(
@@ -224,14 +227,67 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
224
227
 
225
228
 
226
229
  def handle_server_unavailable(response: 'requests.Response') -> None:
227
- if response.status_code == 503:
228
- # TODO(aylei): Hacky, depends on how nginx controller handles backends
229
- # with no ready endpoints. Should use self-defined status code or header
230
- # to distinguish retryable server error from general 503 errors.
231
- with ux_utils.print_exception_no_traceback():
232
- raise exceptions.ServerTemporarilyUnavailableError(
233
- 'SkyPilot API server is temporarily unavailable. '
234
- 'Please try again later.')
230
+ """Handle 503 (Service Unavailable) error
231
+
232
+ The client gets a 503 error in the following cases:
233
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
234
+ request, e.g. when there is a rolling update.
235
+ 2. The skypilot API server has temporary resource issue, e.g. when the
236
+ concurrency of the handling process is exhausted.
237
+
238
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
239
+ wait message so that the user can decide whether to keep waiting or abort the
240
+ request.
241
+ """
242
+ if response.status_code != 503:
243
+ return
244
+
245
+ # error_msg = 'SkyPilot API server is temporarily unavailable. '
246
+ error_msg = ''
247
+ try:
248
+ response_data = response.json()
249
+ if 'detail' in response_data:
250
+ error_msg = response_data['detail']
251
+ except Exception: # pylint: disable=broad-except
252
+ if response.text:
253
+ error_msg = response.text
254
+
255
+ with ux_utils.print_exception_no_traceback():
256
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
257
+
258
+
259
+ async def handle_server_unavailable_async(
260
+ response: 'aiohttp.ClientResponse') -> None:
261
+ """Async version: Handle 503 (Service Unavailable) error
262
+
263
+ The client gets a 503 error in the following cases:
264
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
265
+ request, e.g. when there is a rolling update.
266
+ 2. The skypilot API server has temporary resource issue, e.g. when the
267
+ concurrency of the handling process is exhausted.
268
+
269
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
270
+ wait message so that the user can decide whether to keep waiting or abort the
271
+ request.
272
+ """
273
+ if response.status != 503:
274
+ return
275
+
276
+ error_msg = ''
277
+ try:
278
+ response_data = await response.json()
279
+ if 'detail' in response_data:
280
+ error_msg = response_data['detail']
281
+ except Exception: # pylint: disable=broad-except
282
+ try:
283
+ text = await response.text()
284
+ if text:
285
+ error_msg = text
286
+ except Exception: # pylint: disable=broad-except
287
+ pass
288
+
289
+ with ux_utils.print_exception_no_traceback():
290
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
235
291
 
236
292
 
237
293
  @_retry_on_server_unavailable()
@@ -310,11 +366,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
310
366
  response = await session.request(method, url, **kwargs)
311
367
 
312
368
  # Handle server unavailability (503 status) - same as sync version
313
- if response.status == 503:
314
- with ux_utils.print_exception_no_traceback():
315
- raise exceptions.ServerTemporarilyUnavailableError(
316
- 'SkyPilot API server is temporarily unavailable. '
317
- 'Please try again later.')
369
+ await handle_server_unavailable_async(response)
318
370
 
319
371
  # Set remote API version and version from headers - same as sync version
320
372
  remote_api_version = response.headers.get(constants.API_VERSION_HEADER)