skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,322 @@
1
+ """Utilities for formatting tables for CLI output."""
2
+ import abc
3
+ from datetime import datetime
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import prettytable
7
+
8
+ from sky import sky_logging
9
+ from sky.jobs import utils as managed_jobs
10
+ from sky.schemas.api import responses
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
13
+ from sky.utils import log_utils
14
+ from sky.utils import volume
15
+
16
# Module-level logger; used below to warn about unrecognized volume types.
logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def format_job_queue(jobs: List[responses.ClusterJobRecord]):
    """Build the cluster job queue table, one row per job record.

    Columns, in order: ID, NAME, USER, SUBMITTED, STARTED, DURATION,
    RESOURCES, STATUS, LOG and GIT COMMIT.

    Usage:
        jobs = get_job_queue()
        print(format_job_queue(jobs))

    Args:
        jobs: The job records to display.

    Returns:
        The table object created by ``log_utils.create_table`` with every
        job appended as a row.
    """
    headers = [
        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
        'STATUS', 'LOG', 'GIT COMMIT'
    ]
    table = log_utils.create_table(headers)
    for record in jobs:
        submitted = log_utils.readable_time_duration(record.submitted_at)
        started = log_utils.readable_time_duration(record.start_at)
        # DURATION covers start_at..end_at and is requested in absolute form.
        duration = log_utils.readable_time_duration(record.start_at,
                                                    record.end_at,
                                                    absolute=True)
        # Jobs without a recorded commit show a dash.
        git_commit = record.metadata.get('git_commit', '-')
        table.add_row([
            record.job_id,
            record.job_name,
            record.username,
            submitted,
            started,
            duration,
            record.resources,
            record.status.colored_str(),
            record.log_path,
            git_commit,
        ])
    return table
46
+
47
+
48
def format_storage_table(storages: List[responses.StorageRecord],
                         show_all: bool = False) -> str:
    """Render storage records as a table for display.

    Args:
        storages: The storage records to display.
        show_all: If True, show the full last-use command; otherwise the
            command is truncated to ``constants.LAST_USE_TRUNC_LENGTH``.

    Returns:
        str: The formatted storage table, or 'No existing storage.' when
        ``storages`` is empty.
    """
    table = log_utils.create_table(
        ['NAME', 'UPDATED', 'STORE', 'COMMAND', 'STATUS'])

    for record in storages:
        if show_all:
            last_use = record.last_use
        else:
            last_use = common_utils.truncate_long_string(
                record.last_use, constants.LAST_USE_TRUNC_LENGTH)
        updated = log_utils.readable_time_duration(record.launched_at)
        store_names = ', '.join([store.value for store in record.store])
        table.add_row([
            record.name,  # NAME
            updated,  # UPDATED (derived from launched_at)
            store_names,  # STORE
            last_use,  # COMMAND
            record.status.value,  # STATUS
        ])

    return str(table) if storages else 'No existing storage.'
89
+
90
+
91
def format_job_table(
    jobs: List[responses.ManagedJobRecord],
    show_all: bool,
    show_user: bool,
    pool_status: Optional[List[Dict[str, Any]]] = None,
    max_jobs: Optional[int] = None,
    status_counts: Optional[Dict[str, int]] = None,
):
    """Format managed job records via ``managed_jobs.format_job_table``.

    Each record is dumped with ``model_dump()`` before being handed over.

    Args:
        jobs: The managed job records to display.
        show_all: Forwarded as ``show_all``.
        show_user: Forwarded as ``show_user``.
        pool_status: Forwarded as ``pool_status``.
        max_jobs: Forwarded as ``max_jobs``.
        status_counts: Forwarded as ``job_status_counts``.

    Returns:
        Whatever ``managed_jobs.format_job_table`` returns.
    """
    dumped_jobs = [record.model_dump() for record in jobs]
    return managed_jobs.format_job_table(
        dumped_jobs,
        pool_status=pool_status,
        show_all=show_all,
        show_user=show_user,
        max_jobs=max_jobs,
        job_status_counts=status_counts,
    )
108
+
109
+
110
# Column headers shared by every volume table. The order matches the list
# returned by VolumeTable._get_row_base_columns; the per-type tables append
# their own extra columns after these.
_BASIC_COLUMNS = [
    'NAME',
    'TYPE',
    'INFRA',
    'SIZE',
    'USER',
    'WORKSPACE',
    'AGE',
    'STATUS',
    'LAST_USE',
    'USED_BY',
]
122
+
123
+
124
def _get_infra_str(cloud: Optional[str], region: Optional[str],
                   zone: Optional[str]) -> str:
    """Get the infrastructure string for the volume.

    The components that are present are joined with '/' in
    cloud/region/zone order. Missing (None or empty) components are
    skipped, so the result never starts with a stray '/' when ``cloud`` is
    absent.

    Args:
        cloud: The cloud name, if any.
        region: The region name, if any.
        zone: The zone name, if any.

    Returns:
        The joined string, e.g. 'aws/us-east-1/us-east-1a', or '' when no
        component is set.
    """
    return '/'.join(part for part in (cloud, region, zone) if part)
135
+
136
+
137
class VolumeTable(abc.ABC):
    """Base class for the per-type volume tables.

    Construction creates the table through ``_create_table`` and fills it
    through ``_add_rows``; subclasses provide both hooks plus ``format``.
    """

    def __init__(self,
                 volumes: List[responses.VolumeRecord],
                 show_all: bool = False):
        super().__init__()
        self.table = self._create_table(show_all)
        self._add_rows(volumes, show_all)

    def _get_row_base_columns(self,
                              row: responses.VolumeRecord,
                              show_all: bool = False) -> List[str]:
        """Build the cells for the columns shared by all volume tables.

        Args:
            row: The volume record to render.
            show_all: If False, the USED_BY cell is truncated to
                ``constants.USED_BY_TRUNC_LENGTH``.

        Returns:
            The cells in ``_BASIC_COLUMNS`` order.
        """
        # LAST_USE: the attach timestamp as a readable string, or a dash
        # when the volume has no recorded attach time.
        attached_ts = row.get('last_attached_at')
        if attached_ts is None:
            last_use = '-'
        else:
            last_use = datetime.fromtimestamp(attached_ts).strftime(
                '%Y-%m-%d %H:%M:%S')

        size = row.get('size', '')
        if size:
            size = f'{size}Gi'

        # USED_BY: clusters take precedence over pods.
        clusters = row.get('usedby_clusters')
        pods = row.get('usedby_pods')
        consumers = clusters or pods
        used_by = ', '.join(consumers) if consumers else '-'
        if not show_all:
            used_by = common_utils.truncate_long_string(
                used_by, constants.USED_BY_TRUNC_LENGTH)

        infra = _get_infra_str(row.get('cloud'), row.get('region'),
                               row.get('zone'))
        return [
            row.get('name', ''),
            row.get('type', ''),
            infra,
            size,
            row.get('user_name', '-'),
            row.get('workspace', '-'),
            log_utils.human_duration(row.get('launched_at', 0)),
            row.get('status', ''),
            last_use,
            used_by,
        ]

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the empty table with its headers; subclasses override."""
        raise NotImplementedError

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one row per volume to ``self.table``; subclasses override."""
        raise NotImplementedError

    @abc.abstractmethod
    def format(self) -> str:
        """Return the table rendered as display text."""
        raise NotImplementedError
202
+
203
+
204
class PVCVolumeTable(VolumeTable):
    """Volume table for Kubernetes PVCs."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the PVC table.

        Columns are the basic columns followed by IS_EPHEMERAL; with
        ``show_all`` the table additionally gets NAME_ON_CLOUD,
        STORAGE_CLASS and ACCESS_MODE.
        """
        headers = _BASIC_COLUMNS + ['IS_EPHEMERAL']
        if show_all:
            headers = headers + [
                'NAME_ON_CLOUD', 'STORAGE_CLASS', 'ACCESS_MODE'
            ]
        return log_utils.create_table(headers)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one row per PVC, matching the ``_create_table`` columns."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            cells.append(record.get('is_ephemeral', False))
            if show_all:
                name_on_cloud = record.get('name_on_cloud', '')
                storage_class = record.get('config',
                                           {}).get('storage_class_name', '-')
                access_mode = record.get('config', {}).get('access_mode', '')
                cells.extend([name_on_cloud, storage_class, access_mode])
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the PVC table preceded by its section title."""
        return 'Kubernetes PVCs:\n' + str(self.table)
248
+
249
+
250
class RunPodVolumeTable(VolumeTable):
    """Volume table for RunPod network volumes."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the RunPod table.

        Columns are the basic columns; with ``show_all`` a trailing
        NAME_ON_CLOUD column is added.
        """
        headers = (_BASIC_COLUMNS +
                   ['NAME_ON_CLOUD'] if show_all else _BASIC_COLUMNS)
        return log_utils.create_table(headers)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one row per volume, matching the ``_create_table`` columns."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            if show_all:
                cells.append(record.get('name_on_cloud', ''))
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the RunPod table preceded by its section title."""
        return 'RunPod Network Volumes:\n' + str(self.table)
284
+
285
+
286
def format_volume_table(volumes: List[responses.VolumeRecord],
                        show_all: bool = False) -> str:
    """Format the volume tables for display, one table per volume type.

    Volumes are grouped by their 'type' field. Records whose type is not a
    ``volume.VolumeType`` value are skipped with a warning. Each group that
    has a table class is rendered, and the rendered tables are separated by
    a blank line.

    Args:
        volumes: The volume records to display.
        show_all: Passed through to the per-type table classes.

    Returns:
        str: The formatted volume tables, or 'No existing volumes.' when
        nothing was rendered.
    """
    table_classes = {
        volume.VolumeType.PVC.value: PVCVolumeTable,
        volume.VolumeType.RUNPOD_NETWORK_VOLUME.value: RunPodVolumeTable,
    }
    supported_volume_types = {
        volume_type.value for volume_type in volume.VolumeType
    }

    volumes_per_type: Dict[str, List[responses.VolumeRecord]] = {}
    for row in volumes:
        volume_type = row.get('type', '')
        if volume_type not in supported_volume_types:
            logger.warning(f'Unknown volume type: {volume_type}')
            continue
        volumes_per_type.setdefault(volume_type, []).append(row)

    sections = []
    for volume_type, volume_list in volumes_per_type.items():
        table_cls = table_classes.get(volume_type)
        if table_cls is None:
            # A supported type without a table class renders nothing. Skip
            # it before any separator is emitted so the output never ends
            # up with dangling blank lines.
            continue
        sections.append(table_cls(volume_list, show_all).format())

    if not sections:
        return 'No existing volumes.'
    return '\n\n'.join(sections)
@@ -0,0 +1,79 @@
1
+ """Utility functions for the CLI."""
2
+ import enum
3
+ import typing
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+
6
+ from sky import exceptions
7
+ from sky import jobs as managed_jobs
8
+ from sky.schemas.api import responses
9
+ from sky.server import common as server_common
10
+
11
+
12
class QueueResultVersion(enum.Enum):
    """Which shape a managed job queue result has.

    V1 (old): the result is only
        - job_records (List[responses.ManagedJobRecord]): one record per
          job.
    V2 (new): the result carries, in order,
        - job_records (List[responses.ManagedJobRecord]): one record per
          job.
        - total (int): Total number of jobs after filter.
        - status_counts (Dict[str, int]): Status counts after filter.
        - total_no_filter (int): Total number of jobs before filter.
    """
    V1 = 'v1'
    V2 = 'v2'

    def v2(self) -> bool:
        """Return True if this member is ``V2``."""
        return self is QueueResultVersion.V2
30
+
31
+
32
def get_managed_job_queue(
    refresh: bool,
    skip_finished: bool = False,
    all_users: bool = False,
    job_ids: Optional[List[int]] = None,
    limit: Optional[int] = None,
    fields: Optional[List[str]] = None,
) -> Tuple[server_common.RequestId[Union[List[responses.ManagedJobRecord],
                                         Tuple[List[responses.ManagedJobRecord],
                                               int, Dict[str, int], int]]],
           QueueResultVersion]:
    """Gets statuses of managed jobs.

    ``managed_jobs.queue_v2`` is tried first. If it raises
    ``exceptions.APINotSupportedError``, the call falls back to
    ``managed_jobs.queue``, which is not given ``limit`` or ``fields``.

    Please refer to sky.cli.job_queue for documentation.

    Args:
        refresh: Whether to restart the jobs controller if it is stopped.
        skip_finished: Whether to skip finished jobs.
        all_users: Whether to show all users' jobs.
        job_ids: IDs of the managed jobs to show.
        limit: Number of jobs to show.
        fields: Fields to get for the managed jobs.

    Returns:
        - the request ID of the queue request
        - the version of the queue result

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
            does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
    try:
        request_id = managed_jobs.queue_v2(refresh, skip_finished, all_users,
                                           job_ids, limit, fields)
        version = QueueResultVersion.V2
    except exceptions.APINotSupportedError:
        request_id = managed_jobs.queue(refresh, skip_finished, all_users,
                                        job_ids)
        version = QueueResultVersion.V1
    # Both entry points are cast to the same request-ID type; the returned
    # version tells the caller which result shape to expect.
    return typing.cast(
        server_common.RequestId[Union[List[responses.ManagedJobRecord],
                                      Tuple[List[responses.ManagedJobRecord],
                                            int, Dict[str, int], int]]],
        request_id), version
sky/client/common.py CHANGED
@@ -16,8 +16,10 @@ import zipfile
16
16
 
17
17
  from sky import sky_logging
18
18
  from sky.adaptors import common as adaptors_common
19
+ from sky.client import service_account_auth
19
20
  from sky.data import data_utils
20
21
  from sky.data import storage_utils
22
+ from sky.schemas.api import responses as api_responses
21
23
  from sky.server import common as server_common
22
24
  from sky.server.requests import payloads
23
25
  from sky.skylet import constants
@@ -31,7 +33,7 @@ if typing.TYPE_CHECKING:
31
33
  import requests
32
34
 
33
35
  import sky
34
- import sky.dag as dag_lib
36
+ from sky import dag as dag_lib
35
37
  else:
36
38
  httpx = adaptors_common.LazyImport('httpx')
37
39
  requests = adaptors_common.LazyImport('requests')
@@ -42,8 +44,10 @@ logger = sky_logging.init_logger(__name__)
42
44
  _DOWNLOAD_CHUNK_BYTES = 8192
43
45
  # The chunk size for the zip file to be uploaded to the API server. We split
44
46
  # the zip file into chunks to avoid network issues for large request body that
45
- # can be caused by NGINX's client_max_body_size.
46
- _UPLOAD_CHUNK_BYTES = 512 * 1024 * 1024
47
+ # can be caused by NGINX's client_max_body_size or Cloudflare's upload limit.
48
+ # As of 09/25/2025, the upload limit for Cloudflare's free plan is 100MiB:
49
+ # https://developers.cloudflare.com/support/troubleshooting/http-status-codes/4xx-client-error/error-413/
50
+ _UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024
47
51
 
48
52
  FILE_UPLOAD_LOGS_DIR = os.path.join(constants.SKY_LOGS_DIRECTORY,
49
53
  'file_uploads')
@@ -79,11 +83,20 @@ def download_logs_from_api_server(
79
83
  remote_machine_prefix,
80
84
  local_machine_prefix) for remote_path in paths_on_api_server
81
85
  }
86
+ # Check if any local log directories already exist before downloading
87
+ for local_path in remote2local_path_dict.values():
88
+ expanded_path = os.path.expanduser(local_path)
89
+ if os.path.exists(expanded_path):
90
+ logger.warning(
91
+ f'Log directory {local_path} already exists. '
92
+ f'This may overwrite logs from a previous cluster with the '
93
+ f'same name and job ID.')
82
94
  body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
83
- response = requests.post(f'{server_common.get_server_url()}/download',
84
- json=json.loads(body.model_dump_json()),
85
- stream=True,
86
- cookies=server_common.get_api_cookie_jar())
95
+ response = server_common.make_authenticated_request(
96
+ 'POST',
97
+ '/download',
98
+ json=json.loads(body.model_dump_json()),
99
+ stream=True)
87
100
  if response.status_code == 200:
88
101
  remote_home_path = response.headers.get('X-Home-Path')
89
102
  assert remote_home_path is not None, response.headers
@@ -164,14 +177,19 @@ class UploadChunkParams:
164
177
  log_file: str
165
178
 
166
179
 
167
- def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
168
- """Uploads a chunk of a zip file to the API server."""
180
+ def _upload_chunk_with_retry(params: UploadChunkParams) -> str:
181
+ """Uploads a chunk of a zip file to the API server.
182
+
183
+ Returns:
184
+ Status of the upload.
185
+ """
169
186
  upload_logger = params.upload_logger
170
187
  upload_logger.info(
171
188
  f'Uploading chunk: {params.chunk_index + 1} / {params.total_chunks}')
172
189
 
173
190
  server_url = server_common.get_server_url()
174
191
  max_attempts = 3
192
+ sa_headers = service_account_auth.get_service_account_headers()
175
193
  with open(params.file_path, 'rb') as f:
176
194
  for attempt in range(max_attempts):
177
195
  response = params.client.post(
@@ -184,19 +202,23 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
184
202
  },
185
203
  content=FileChunkIterator(f, _UPLOAD_CHUNK_BYTES,
186
204
  params.chunk_index),
187
- headers={'Content-Type': 'application/octet-stream'},
205
+ headers={
206
+ 'Content-Type': 'application/octet-stream',
207
+ **sa_headers,
208
+ },
188
209
  cookies=server_common.get_api_cookie_jar())
189
210
  if response.status_code == 200:
190
211
  data = response.json()
191
212
  status = data.get('status')
192
213
  msg = ('Uploaded chunk: '
193
- f'{params.chunk_index + 1} / {params.total_chunks}')
194
- if status == 'uploading':
214
+ f'{params.chunk_index + 1} / {params.total_chunks} '
215
+ f'(Status: {status})')
216
+ if status == api_responses.UploadStatus.UPLOADING.value:
195
217
  missing_chunks = data.get('missing_chunks')
196
218
  if missing_chunks:
197
219
  msg += f' - Waiting for chunks: {missing_chunks}'
198
220
  upload_logger.info(msg)
199
- return
221
+ return status
200
222
  elif attempt < max_attempts - 1:
201
223
  upload_logger.error(
202
224
  f'Failed to upload chunk: '
@@ -204,17 +226,29 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
204
226
  f'{response.content.decode("utf-8")}')
205
227
  upload_logger.info(
206
228
  f'Retrying... ({attempt + 1} / {max_attempts})')
207
- time.sleep(1)
229
+ if response.status_code == 503:
230
+ # If the server is temporarily unavailable,
231
+ # wait a little longer before retrying.
232
+ time.sleep(10)
233
+ else:
234
+ time.sleep(1)
208
235
  else:
236
+ try:
237
+ response_details = response.json().get('detail')
238
+ except Exception: # pylint: disable=broad-except
239
+ response_details = response.content
209
240
  error_msg = (
210
241
  f'Failed to upload chunk: {params.chunk_index + 1} / '
211
- f'{params.total_chunks}: {response.json().get("detail")}')
242
+ f'{params.total_chunks}: {response_details} '
243
+ f'(Status code: {response.status_code})')
212
244
  upload_logger.error(error_msg)
213
245
  with ux_utils.print_exception_no_traceback():
214
246
  raise RuntimeError(
215
247
  ux_utils.error_message(error_msg + '\n',
216
248
  params.log_file,
217
249
  is_local=True))
250
+ # If we reach here, the upload failed.
251
+ return 'failed'
218
252
 
219
253
 
220
254
  @contextlib.contextmanager
@@ -267,7 +301,7 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
267
301
  upload_list = []
268
302
  for task_ in dag.tasks:
269
303
  task_.file_mounts_mapping = {}
270
- if task_.workdir:
304
+ if task_.workdir and isinstance(task_.workdir, str):
271
305
  workdir = task_.workdir
272
306
  assert os.path.isabs(workdir)
273
307
  upload_list.append(workdir)
@@ -299,14 +333,12 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
299
333
  task_.file_mounts_mapping[src] = _full_path(src)
300
334
  if (task_.service is not None and
301
335
  task_.service.tls_credential is not None):
302
- upload_list.append(task_.service.tls_credential.keyfile)
303
- upload_list.append(task_.service.tls_credential.certfile)
304
- task_.file_mounts_mapping[
305
- task_.service.tls_credential.
306
- keyfile] = task_.service.tls_credential.keyfile
307
- task_.file_mounts_mapping[
308
- task_.service.tls_credential.
309
- certfile] = task_.service.tls_credential.certfile
336
+ keyfile = task_.service.tls_credential.keyfile
337
+ certfile = task_.service.tls_credential.certfile
338
+ upload_list.append(_full_path(keyfile))
339
+ upload_list.append(_full_path(certfile))
340
+ task_.file_mounts_mapping[keyfile] = _full_path(keyfile)
341
+ task_.file_mounts_mapping[certfile] = _full_path(certfile)
310
342
 
311
343
  if upload_list:
312
344
  os.makedirs(os.path.expanduser(FILE_UPLOAD_LOGS_DIR), exist_ok=True)
@@ -339,15 +371,29 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
339
371
  log_file,
340
372
  is_local=True))
341
373
 
374
+ upload_completed = False
342
375
  with httpx.Client(timeout=timeout) as client:
343
- chunk_params = [
344
- UploadChunkParams(client, upload_id, chunk_index,
345
- total_chunks, temp_zip_file.name,
346
- upload_logger, log_file)
347
- for chunk_index in range(total_chunks)
348
- ]
349
- subprocess_utils.run_in_parallel(_upload_chunk_with_retry,
350
- chunk_params)
376
+ total_retries = 3
377
+ for retry in range(total_retries):
378
+ chunk_params = [
379
+ UploadChunkParams(client, upload_id, chunk_index,
380
+ total_chunks, temp_zip_file.name,
381
+ upload_logger, log_file)
382
+ for chunk_index in range(total_chunks)
383
+ ]
384
+ statuses = subprocess_utils.run_in_parallel(
385
+ _upload_chunk_with_retry, chunk_params)
386
+ if any(status == api_responses.UploadStatus.COMPLETED.value
387
+ for status in statuses):
388
+ upload_completed = True
389
+ break
390
+ else:
391
+ upload_logger.info(
392
+ f'No chunk upload returned completed status. '
393
+ 'Retrying entire upload... '
394
+ f'({retry + 1} / {total_retries})')
395
+ if not upload_completed:
396
+ raise RuntimeError('Failed to upload files to API server.')
351
397
  os.unlink(temp_zip_file.name)
352
398
  upload_logger.info(f'Uploaded files: {upload_list}')
353
399
  logger.info(
sky/client/oauth.py ADDED
@@ -0,0 +1,82 @@
1
+ """Client-side OAuth module."""
2
+ from http.server import BaseHTTPRequestHandler
3
+ from http.server import HTTPServer
4
+ import threading
5
+ import time
6
+ from typing import Dict, Optional
7
+
8
# Default number of seconds start_local_auth_server keeps serving while it
# waits for the OAuth callback.
AUTH_TIMEOUT = 300  # 5 minutes
9
+
10
+
11
class _AuthCallbackHandler(BaseHTTPRequestHandler):
    """HTTP request handler for OAuth callback.

    The token is taken from the raw body of a POST request, decoded as
    UTF-8, and stored under the 'token' key of ``token_container``. Every
    response carries an ``Access-Control-Allow-Origin`` header set to
    ``remote_endpoint``.
    """

    def __init__(self, token_container: Dict[str, Optional[str]],
                 remote_endpoint: str, *args, **kwargs):
        # These must be assigned before calling the base constructor, which
        # processes the request (and therefore may call do_POST) itself.
        self.token_container = token_container
        self.remote_endpoint = remote_endpoint
        super().__init__(*args, **kwargs)

    def _read_body(self) -> bytes:
        """Read the request body, or return b'' if its length is unusable.

        A missing, non-numeric or non-positive Content-Length is treated as
        an empty body instead of raising or blocking on the socket.
        """
        try:
            length = int(self.headers.get('Content-Length'))
        except (TypeError, ValueError):
            return b''
        if length <= 0:
            return b''
        return self.rfile.read(length)

    def _respond(self, status: int) -> None:
        """Send a body-less response with the CORS header."""
        self.send_response(status)
        self.send_header('Content-type', 'text/html')
        self.send_header('Access-Control-Allow-Origin', self.remote_endpoint)
        self.end_headers()

    def do_POST(self):  # pylint: disable=invalid-name
        """Handle POST request for OAuth callback.

        Responds 200 and stores the token when the body is non-empty valid
        UTF-8; otherwise responds 400 and leaves the container untouched.
        """
        data = self._read_body()
        token = None
        if data:
            try:
                token = data.decode('utf-8')
            except UnicodeDecodeError:
                # Reject undecodable bodies instead of crashing the handler
                # without a response.
                token = None

        if token:
            self.token_container['token'] = token
            self._respond(200)
        else:
            self._respond(400)

    def log_message(self, *args):  # pylint: disable=unused-argument
        """Suppress default HTTP server logging."""
        pass
45
+
46
+
47
def start_local_auth_server(port: int,
                            token_store: Dict[str, Optional[str]],
                            remote_endpoint: str,
                            timeout: int = AUTH_TIMEOUT) -> HTTPServer:
    """Start a local HTTP server to handle OAuth callback.

    The server is bound to localhost and served from a daemon thread. The
    thread stops handling requests once ``token_store`` holds a token or
    ``timeout`` seconds have elapsed in total, whichever comes first.

    Args:
        port: Port to bind the server to.
        token_store: Dict to store the received token under the 'token'
            key. A missing key is treated the same as a None token.
        remote_endpoint: The endpoint of the SkyPilot API server that will send
            the token, needed for CORS.
        timeout: Timeout in seconds to wait for the callback.

    Returns:
        The HTTP server instance.
    """

    def handler_factory(*args, **kwargs):
        return _AuthCallbackHandler(token_store, remote_endpoint, *args,
                                    **kwargs)

    server = HTTPServer(('localhost', port), handler_factory)
    server.timeout = timeout

    def serve_until_token():
        """Serve requests until token is received or timeout."""
        deadline = time.monotonic() + timeout
        while token_store.get('token') is None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # handle_request() waits up to server.timeout for one request.
            # Shrink it to the time left so that a rejected request does not
            # restart a full timeout window and overshoot the deadline.
            server.timeout = remaining
            server.handle_request()

    # Start server in a separate thread
    server_thread = threading.Thread(target=serve_until_token, daemon=True)
    server_thread.start()

    return server