skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
1
+ """Utilities for formatting tables for CLI output."""
2
+ import abc
3
+ from datetime import datetime
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import prettytable
7
+
8
+ from sky import sky_logging
9
+ from sky.jobs import utils as managed_jobs
10
+ from sky.schemas.api import responses
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
13
+ from sky.utils import log_utils
14
+ from sky.utils import volume
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def format_job_queue(jobs: List[responses.ClusterJobRecord]):
    """Build the table used to display a cluster's job queue.

    One row is added per job record with the cells ID, NAME, USER,
    SUBMITTED, STARTED, DURATION, RESOURCES, STATUS, LOG and GIT COMMIT.
    The GIT COMMIT cell falls back to '-' when the job metadata has no
    'git_commit' entry.

    Usage:
        jobs = get_job_queue()
        print(format_job_queue(jobs))

    Args:
        jobs: Job records to list, in display order.

    Returns:
        The table object produced by log_utils.create_table, with the rows
        filled in.
    """
    headers = [
        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
        'STATUS', 'LOG', 'GIT COMMIT'
    ]
    job_table = log_utils.create_table(headers)
    for record in jobs:
        submitted = log_utils.readable_time_duration(record.submitted_at)
        started = log_utils.readable_time_duration(record.start_at)
        # DURATION spans start -> end rather than start -> now.
        duration = log_utils.readable_time_duration(record.start_at,
                                                    record.end_at,
                                                    absolute=True)
        cells = [
            record.job_id,
            record.job_name,
            record.username,
            submitted,
            started,
            duration,
            record.resources,
            record.status.colored_str(),
            record.log_path,
            record.metadata.get('git_commit', '-'),
        ]
        job_table.add_row(cells)
    return job_table
46
+
47
+
48
def format_storage_table(storages: List[responses.StorageRecord],
                         show_all: bool = False) -> str:
    """Render storage records as a table string for display.

    Args:
        storages: Storage records to show, one table row per record.
        show_all: If True, the COMMAND cell holds the full last-use string;
            otherwise it is passed through
            common_utils.truncate_long_string with
            constants.LAST_USE_TRUNC_LENGTH.

    Returns:
        str: The formatted table, or 'No existing storage.' when
        ``storages`` is empty.
    """
    storage_table = log_utils.create_table(
        ['NAME', 'UPDATED', 'STORE', 'COMMAND', 'STATUS'])

    for record in storages:
        updated_at = record.launched_at
        last_use = record.last_use
        if not show_all:
            last_use = common_utils.truncate_long_string(
                last_use, constants.LAST_USE_TRUNC_LENGTH)
        storage_table.add_row([
            # NAME
            record.name,
            # UPDATED (derived from the record's launched_at)
            log_utils.readable_time_duration(updated_at),
            # STORE
            ', '.join(store.value for store in record.store),
            # COMMAND
            last_use,
            # STATUS
            record.status.value,
        ])

    if not storages:
        return 'No existing storage.'
    return str(storage_table)
89
+
90
+
91
def format_job_table(
    jobs: List[responses.ManagedJobRecord],
    show_all: bool,
    show_user: bool,
    pool_status: Optional[List[Dict[str, Any]]] = None,
    max_jobs: Optional[int] = None,
    status_counts: Optional[Dict[str, int]] = None,
):
    """Format managed job records via managed_jobs.format_job_table.

    Every record is converted with model_dump() before being handed to the
    managed-jobs formatter; the remaining arguments are forwarded unchanged.

    Args:
        jobs: Managed job records to display.
        show_all: Forwarded as ``show_all``.
        show_user: Forwarded as ``show_user``.
        pool_status: Forwarded as ``pool_status``.
        max_jobs: Forwarded as ``max_jobs``.
        status_counts: Forwarded under the keyword ``job_status_counts``.

    Returns:
        Whatever managed_jobs.format_job_table returns.
    """
    job_dicts = [record.model_dump() for record in jobs]
    return managed_jobs.format_job_table(
        job_dicts,
        show_all=show_all,
        show_user=show_user,
        pool_status=pool_status,
        max_jobs=max_jobs,
        job_status_counts=status_counts,
    )
108
+
109
+
110
+ _BASIC_COLUMNS = [
111
+ 'NAME',
112
+ 'TYPE',
113
+ 'INFRA',
114
+ 'SIZE',
115
+ 'USER',
116
+ 'WORKSPACE',
117
+ 'AGE',
118
+ 'STATUS',
119
+ 'LAST_USE',
120
+ 'USED_BY',
121
+ ]
122
+
123
+
124
+ def _get_infra_str(cloud: Optional[str], region: Optional[str],
125
+ zone: Optional[str]) -> str:
126
+ """Get the infrastructure string for the volume."""
127
+ infra = ''
128
+ if cloud:
129
+ infra += cloud
130
+ if region:
131
+ infra += f'/{region}'
132
+ if zone:
133
+ infra += f'/{zone}'
134
+ return infra
135
+
136
+
137
class VolumeTable(abc.ABC):
    """Base class for the volume tables printed by the CLI.

    Constructing an instance creates the table through _create_table() and
    populates it through _add_rows(); subclasses supply both hooks as well
    as format().
    """

    def __init__(self,
                 volumes: List[responses.VolumeRecord],
                 show_all: bool = False):
        """Create the table and fill it with one row per volume.

        Args:
            volumes: Volume records to display.
            show_all: Passed to both _create_table() and _add_rows().
        """
        super().__init__()
        # The table must exist before rows are added to it.
        self.table = self._create_table(show_all)
        self._add_rows(volumes, show_all)

    def _get_row_base_columns(self,
                              row: responses.VolumeRecord,
                              show_all: bool = False) -> List[str]:
        """Return the cells of one row for the _BASIC_COLUMNS headers.

        Args:
            row: Volume record; fields are read through its get() accessor.
            show_all: If True, the USED_BY cell is kept in full; otherwise
                it is passed through common_utils.truncate_long_string with
                constants.USED_BY_TRUNC_LENGTH.

        Returns:
            Cell values ordered as NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
            AGE, STATUS, LAST_USE, USED_BY.
        """
        # LAST_USE: render the attach timestamp, or '-' if never recorded.
        attached_ts = row.get('last_attached_at')
        last_use_cell = ('-' if attached_ts is None else
                         datetime.fromtimestamp(attached_ts).strftime(
                             '%Y-%m-%d %H:%M:%S'))

        # SIZE: only a truthy size gets the 'Gi' suffix.
        raw_size = row.get('size', '')
        size_cell = f'{raw_size}Gi' if raw_size else raw_size

        # USED_BY: clusters take precedence over pods; '-' if neither is set.
        clusters = row.get('usedby_clusters')
        pods = row.get('usedby_pods')
        consumers = clusters or pods
        used_by_cell = ', '.join(consumers) if consumers else '-'
        if not show_all:
            used_by_cell = common_utils.truncate_long_string(
                used_by_cell, constants.USED_BY_TRUNC_LENGTH)

        infra_cell = _get_infra_str(row.get('cloud'), row.get('region'),
                                    row.get('zone'))
        return [
            row.get('name', ''),
            row.get('type', ''),
            infra_cell,
            size_cell,
            row.get('user_name', '-'),
            row.get('workspace', '-'),
            log_utils.human_duration(row.get('launched_at', 0)),
            row.get('status', ''),
            last_use_cell,
            used_by_cell,
        ]

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Build the empty table with its headers; subclasses override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one row per volume to self.table; subclasses override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def format(self) -> str:
        """Return the table as a display string; subclasses must implement.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
202
+
203
+
204
class PVCVolumeTable(VolumeTable):
    """Table of Kubernetes PVC volumes."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the PVC table.

        The columns are _BASIC_COLUMNS; with show_all, NAME_ON_CLOUD,
        STORAGE_CLASS and ACCESS_MODE are appended.
        """
        columns = _BASIC_COLUMNS
        if show_all:
            columns = columns + [
                'NAME_ON_CLOUD',
                'STORAGE_CLASS',
                'ACCESS_MODE',
            ]
        return log_utils.create_table(columns)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Add one row per PVC volume to the table."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            if show_all:
                # Extra cells, matching the extra columns of _create_table.
                config = record.get('config', {})
                cells.extend([
                    record.get('name_on_cloud', ''),
                    config.get('storage_class_name', '-'),
                    config.get('access_mode', ''),
                ])
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the PVC table preceded by its heading line."""
        return 'Kubernetes PVCs:\n' + str(self.table)
246
+
247
+
248
class RunPodVolumeTable(VolumeTable):
    """Table of RunPod network volumes."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the RunPod table.

        The columns are _BASIC_COLUMNS; with show_all, NAME_ON_CLOUD is
        appended.
        """
        columns = _BASIC_COLUMNS
        if show_all:
            columns = columns + ['NAME_ON_CLOUD']
        return log_utils.create_table(columns)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Add one row per RunPod volume to the table."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            if show_all:
                # Extra cell, matching the extra column of _create_table.
                cells.append(record.get('name_on_cloud', ''))
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the RunPod table preceded by its heading line."""
        return 'RunPod Network Volumes:\n' + str(self.table)
282
+
283
+
284
def format_volume_table(volumes: List[responses.VolumeRecord],
                        show_all: bool = False) -> str:
    """Format volume records for display, one table per volume type.

    Args:
        volumes: The volume records to display. A record whose 'type' is
            not a value of `volume.VolumeType` is skipped with a warning.
        show_all: Passed through to the per-type table classes, which add
            their type-specific columns when it is True.

    Returns:
        The rendered tables separated by a blank line, in the order each
        volume type first appears in `volumes`, or 'No existing volumes.'
        if nothing was rendered.
    """
    # Maps each volume type value to the class that renders its table.
    table_classes = {
        volume.VolumeType.PVC.value: PVCVolumeTable,
        volume.VolumeType.RUNPOD_NETWORK_VOLUME.value: RunPodVolumeTable,
    }
    supported_volume_types = [
        volume_type.value for volume_type in volume.VolumeType
    ]

    # Group the records by type, preserving first-seen order.
    volumes_per_type: Dict[str, List[responses.VolumeRecord]] = {}
    for row in volumes:
        volume_type = row.get('type', '')
        if volume_type not in supported_volume_types:
            logger.warning(f'Unknown volume type: {volume_type}')
            continue
        volumes_per_type.setdefault(volume_type, []).append(row)

    rendered_tables = []
    for volume_type, volume_list in volumes_per_type.items():
        table_cls = table_classes.get(volume_type)
        if table_cls is None:
            # A valid VolumeType without a table class: skip it so that no
            # dangling separator ends up in the output.
            logger.warning(
                f'No table format defined for volume type: {volume_type}')
            continue
        rendered_tables.append(table_cls(volume_list, show_all).format())

    if not rendered_tables:
        return 'No existing volumes.'
    return '\n\n'.join(rendered_tables)
sky/client/common.py CHANGED
@@ -16,8 +16,10 @@ import zipfile
16
16
 
17
17
  from sky import sky_logging
18
18
  from sky.adaptors import common as adaptors_common
19
+ from sky.client import service_account_auth
19
20
  from sky.data import data_utils
20
21
  from sky.data import storage_utils
22
+ from sky.schemas.api import responses as api_responses
21
23
  from sky.server import common as server_common
22
24
  from sky.server.requests import payloads
23
25
  from sky.skylet import constants
@@ -31,7 +33,7 @@ if typing.TYPE_CHECKING:
31
33
  import requests
32
34
 
33
35
  import sky
34
- import sky.dag as dag_lib
36
+ from sky import dag as dag_lib
35
37
  else:
36
38
  httpx = adaptors_common.LazyImport('httpx')
37
39
  requests = adaptors_common.LazyImport('requests')
@@ -42,8 +44,10 @@ logger = sky_logging.init_logger(__name__)
42
44
  _DOWNLOAD_CHUNK_BYTES = 8192
43
45
  # The chunk size for the zip file to be uploaded to the API server. We split
44
46
  # the zip file into chunks to avoid network issues for large request body that
45
- # can be caused by NGINX's client_max_body_size.
46
- _UPLOAD_CHUNK_BYTES = 512 * 1024 * 1024
47
+ # can be caused by NGINX's client_max_body_size or Cloudflare's upload limit.
48
+ # As of 09/25/2025, the upload limit for Cloudflare's free plan is 100MiB:
49
+ # https://developers.cloudflare.com/support/troubleshooting/http-status-codes/4xx-client-error/error-413/
50
+ _UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024
47
51
 
48
52
  FILE_UPLOAD_LOGS_DIR = os.path.join(constants.SKY_LOGS_DIRECTORY,
49
53
  'file_uploads')
@@ -80,10 +84,11 @@ def download_logs_from_api_server(
80
84
  local_machine_prefix) for remote_path in paths_on_api_server
81
85
  }
82
86
  body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
83
- response = requests.post(f'{server_common.get_server_url()}/download',
84
- json=json.loads(body.model_dump_json()),
85
- stream=True,
86
- cookies=server_common.get_api_cookie_jar())
87
+ response = server_common.make_authenticated_request(
88
+ 'POST',
89
+ '/download',
90
+ json=json.loads(body.model_dump_json()),
91
+ stream=True)
87
92
  if response.status_code == 200:
88
93
  remote_home_path = response.headers.get('X-Home-Path')
89
94
  assert remote_home_path is not None, response.headers
@@ -164,14 +169,19 @@ class UploadChunkParams:
164
169
  log_file: str
165
170
 
166
171
 
167
- def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
168
- """Uploads a chunk of a zip file to the API server."""
172
+ def _upload_chunk_with_retry(params: UploadChunkParams) -> str:
173
+ """Uploads a chunk of a zip file to the API server.
174
+
175
+ Returns:
176
+ Status of the upload.
177
+ """
169
178
  upload_logger = params.upload_logger
170
179
  upload_logger.info(
171
180
  f'Uploading chunk: {params.chunk_index + 1} / {params.total_chunks}')
172
181
 
173
182
  server_url = server_common.get_server_url()
174
183
  max_attempts = 3
184
+ sa_headers = service_account_auth.get_service_account_headers()
175
185
  with open(params.file_path, 'rb') as f:
176
186
  for attempt in range(max_attempts):
177
187
  response = params.client.post(
@@ -184,19 +194,23 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
184
194
  },
185
195
  content=FileChunkIterator(f, _UPLOAD_CHUNK_BYTES,
186
196
  params.chunk_index),
187
- headers={'Content-Type': 'application/octet-stream'},
197
+ headers={
198
+ 'Content-Type': 'application/octet-stream',
199
+ **sa_headers,
200
+ },
188
201
  cookies=server_common.get_api_cookie_jar())
189
202
  if response.status_code == 200:
190
203
  data = response.json()
191
204
  status = data.get('status')
192
205
  msg = ('Uploaded chunk: '
193
- f'{params.chunk_index + 1} / {params.total_chunks}')
194
- if status == 'uploading':
206
+ f'{params.chunk_index + 1} / {params.total_chunks} '
207
+ f'(Status: {status})')
208
+ if status == api_responses.UploadStatus.UPLOADING.value:
195
209
  missing_chunks = data.get('missing_chunks')
196
210
  if missing_chunks:
197
211
  msg += f' - Waiting for chunks: {missing_chunks}'
198
212
  upload_logger.info(msg)
199
- return
213
+ return status
200
214
  elif attempt < max_attempts - 1:
201
215
  upload_logger.error(
202
216
  f'Failed to upload chunk: '
@@ -204,17 +218,29 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
204
218
  f'{response.content.decode("utf-8")}')
205
219
  upload_logger.info(
206
220
  f'Retrying... ({attempt + 1} / {max_attempts})')
207
- time.sleep(1)
221
+ if response.status_code == 503:
222
+ # If the server is temporarily unavailable,
223
+ # wait a little longer before retrying.
224
+ time.sleep(10)
225
+ else:
226
+ time.sleep(1)
208
227
  else:
228
+ try:
229
+ response_details = response.json().get('detail')
230
+ except Exception: # pylint: disable=broad-except
231
+ response_details = response.content
209
232
  error_msg = (
210
233
  f'Failed to upload chunk: {params.chunk_index + 1} / '
211
- f'{params.total_chunks}: {response.json().get("detail")}')
234
+ f'{params.total_chunks}: {response_details} '
235
+ f'(Status code: {response.status_code})')
212
236
  upload_logger.error(error_msg)
213
237
  with ux_utils.print_exception_no_traceback():
214
238
  raise RuntimeError(
215
239
  ux_utils.error_message(error_msg + '\n',
216
240
  params.log_file,
217
241
  is_local=True))
242
+ # If we reach here, the upload failed.
243
+ return 'failed'
218
244
 
219
245
 
220
246
  @contextlib.contextmanager
@@ -267,7 +293,7 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
267
293
  upload_list = []
268
294
  for task_ in dag.tasks:
269
295
  task_.file_mounts_mapping = {}
270
- if task_.workdir:
296
+ if task_.workdir and isinstance(task_.workdir, str):
271
297
  workdir = task_.workdir
272
298
  assert os.path.isabs(workdir)
273
299
  upload_list.append(workdir)
@@ -299,14 +325,12 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
299
325
  task_.file_mounts_mapping[src] = _full_path(src)
300
326
  if (task_.service is not None and
301
327
  task_.service.tls_credential is not None):
302
- upload_list.append(task_.service.tls_credential.keyfile)
303
- upload_list.append(task_.service.tls_credential.certfile)
304
- task_.file_mounts_mapping[
305
- task_.service.tls_credential.
306
- keyfile] = task_.service.tls_credential.keyfile
307
- task_.file_mounts_mapping[
308
- task_.service.tls_credential.
309
- certfile] = task_.service.tls_credential.certfile
328
+ keyfile = task_.service.tls_credential.keyfile
329
+ certfile = task_.service.tls_credential.certfile
330
+ upload_list.append(_full_path(keyfile))
331
+ upload_list.append(_full_path(certfile))
332
+ task_.file_mounts_mapping[keyfile] = _full_path(keyfile)
333
+ task_.file_mounts_mapping[certfile] = _full_path(certfile)
310
334
 
311
335
  if upload_list:
312
336
  os.makedirs(os.path.expanduser(FILE_UPLOAD_LOGS_DIR), exist_ok=True)
@@ -339,15 +363,29 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
339
363
  log_file,
340
364
  is_local=True))
341
365
 
366
+ upload_completed = False
342
367
  with httpx.Client(timeout=timeout) as client:
343
- chunk_params = [
344
- UploadChunkParams(client, upload_id, chunk_index,
345
- total_chunks, temp_zip_file.name,
346
- upload_logger, log_file)
347
- for chunk_index in range(total_chunks)
348
- ]
349
- subprocess_utils.run_in_parallel(_upload_chunk_with_retry,
350
- chunk_params)
368
+ total_retries = 3
369
+ for retry in range(total_retries):
370
+ chunk_params = [
371
+ UploadChunkParams(client, upload_id, chunk_index,
372
+ total_chunks, temp_zip_file.name,
373
+ upload_logger, log_file)
374
+ for chunk_index in range(total_chunks)
375
+ ]
376
+ statuses = subprocess_utils.run_in_parallel(
377
+ _upload_chunk_with_retry, chunk_params)
378
+ if any(status == api_responses.UploadStatus.COMPLETED.value
379
+ for status in statuses):
380
+ upload_completed = True
381
+ break
382
+ else:
383
+ upload_logger.info(
384
+ f'No chunk upload returned completed status. '
385
+ 'Retrying entire upload... '
386
+ f'({retry + 1} / {total_retries})')
387
+ if not upload_completed:
388
+ raise RuntimeError('Failed to upload files to API server.')
351
389
  os.unlink(temp_zip_file.name)
352
390
  upload_logger.info(f'Uploaded files: {upload_list}')
353
391
  logger.info(
sky/client/oauth.py ADDED
@@ -0,0 +1,82 @@
1
+ """Client-side OAuth module."""
2
+ from http.server import BaseHTTPRequestHandler
3
+ from http.server import HTTPServer
4
+ import threading
5
+ import time
6
+ from typing import Dict, Optional
7
+
8
+ AUTH_TIMEOUT = 300 # 5 minutes
9
+
10
+
11
class _AuthCallbackHandler(BaseHTTPRequestHandler):
    """HTTP request handler for the OAuth callback.

    A POST whose body is non-empty, valid UTF-8 is stored as the token in
    `token_container['token']` and answered with 200; any other POST is
    answered with 400.
    """

    def __init__(self, token_container: Dict[str, Optional[str]],
                 remote_endpoint: str, *args, **kwargs):
        # Set the attributes first: the base class constructor already
        # handles the request, so do_POST runs inside super().__init__().
        self.token_container = token_container
        self.remote_endpoint = remote_endpoint
        super().__init__(*args, **kwargs)

    def do_POST(self):  # pylint: disable=invalid-name
        """Handle POST request for OAuth callback."""
        # A missing, malformed or negative Content-Length is treated as an
        # empty body instead of crashing the handler (or, for a negative
        # value, reading until EOF).
        try:
            content_length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            content_length = 0
        data = self.rfile.read(content_length) if content_length > 0 else b''

        token = None
        if data:
            try:
                token = data.decode('utf-8')
            except UnicodeDecodeError:
                # Not a valid token payload; reject it below.
                token = None

        if token:
            self.token_container['token'] = token
            status_code = 200
        else:
            status_code = 400

        self.send_response(status_code)
        self.send_header('Content-type', 'text/html')
        # Allow the page served by the remote endpoint to read the response.
        self.send_header('Access-Control-Allow-Origin', self.remote_endpoint)
        self.end_headers()

    def log_message(self, *args):  # pylint: disable=unused-argument
        """Suppress default HTTP server logging."""
        pass
45
+
46
+
47
def start_local_auth_server(port: int,
                            token_store: Dict[str, Optional[str]],
                            remote_endpoint: str,
                            timeout: int = AUTH_TIMEOUT) -> HTTPServer:
    """Start a local HTTP server to handle OAuth callback.

    The server listens on localhost and is served from a daemon thread that
    stops handling requests once `token_store['token']` is set or `timeout`
    seconds have passed. The server socket is not closed by this function.

    Args:
        port: Port to bind the server to.
        token_store: Dict to store the received token under the 'token' key.
        remote_endpoint: The endpoint of the SkyPilot API server that will send
            the token, needed for CORS.
        timeout: Timeout in seconds to wait for the callback.

    Returns:
        The HTTP server instance.
    """

    def handler_factory(*args, **kwargs):
        return _AuthCallbackHandler(token_store, remote_endpoint, *args,
                                    **kwargs)

    server = HTTPServer(('localhost', port), handler_factory)
    server.timeout = timeout

    def serve_until_token():
        """Serve requests until token is received or timeout."""
        deadline = time.monotonic() + timeout
        # .get() so that a store without a pre-seeded 'token' key does not
        # kill this thread with a KeyError.
        while token_store.get('token') is None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # handle_request() waits up to server.timeout for one request.
            # Cap it by the remaining time so that requests that carry no
            # token cannot extend the overall wait beyond `timeout`.
            server.timeout = remaining
            server.handle_request()

    # Start server in a separate thread
    server_thread = threading.Thread(target=serve_until_token, daemon=True)
    server_thread.start()

    return server