skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/config.py CHANGED
@@ -2,9 +2,11 @@
2
2
 
3
3
  import dataclasses
4
4
  import enum
5
+ from typing import Optional
5
6
 
6
7
  from sky import sky_logging
7
8
  from sky.server import constants as server_constants
9
+ from sky.server import daemons
8
10
  from sky.utils import common_utils
9
11
 
10
12
  # Constants based on profiling the peak memory usage while serving various
@@ -18,8 +20,9 @@ from sky.utils import common_utils
18
20
  # TODO(aylei): maintaining these constants is error-prone, we may need to
19
21
  # automatically tune parallelism at runtime according to system usage stats
20
22
  # in the future.
21
- _LONG_WORKER_MEM_GB = 0.4
22
- _SHORT_WORKER_MEM_GB = 0.25
23
+ # TODO(luca): The future is now! ^^^
24
+ LONG_WORKER_MEM_GB = 0.4
25
+ SHORT_WORKER_MEM_GB = 0.3
23
26
  # To control the number of long workers.
24
27
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
25
28
  # Limit the number of long workers of local API server, since local server is
@@ -34,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
34
37
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
35
38
  # Minimal number of long workers to ensure responsiveness.
36
39
  _MIN_LONG_WORKERS = 1
37
- # Minimal number of short workers, there is a daemon task running on short
38
- # workers so at least 2 workers are needed to ensure responsiveness.
39
- _MIN_SHORT_WORKERS = 2
40
+ # Minimal number of idle short workers to ensure responsiveness.
41
+ _MIN_IDLE_SHORT_WORKERS = 1
40
42
 
41
43
  # Default number of burstable workers for local API server. A heuristic number
42
44
  # that is large enough for most local cases.
@@ -61,6 +63,7 @@ class QueueBackend(enum.Enum):
61
63
  class WorkerConfig:
62
64
  garanteed_parallelism: int
63
65
  burstable_parallelism: int
66
+ num_db_connections_per_worker: int
64
67
 
65
68
 
66
69
  @dataclasses.dataclass
@@ -68,10 +71,13 @@ class ServerConfig:
68
71
  num_server_workers: int
69
72
  long_worker_config: WorkerConfig
70
73
  short_worker_config: WorkerConfig
74
+ num_db_connections_per_worker: int
71
75
  queue_backend: QueueBackend
72
76
 
73
77
 
74
- def compute_server_config(deploy: bool) -> ServerConfig:
78
+ def compute_server_config(deploy: bool,
79
+ max_db_connections: Optional[int] = None,
80
+ quiet: bool = False) -> ServerConfig:
75
81
  """Compute the server config based on environment.
76
82
 
77
83
  We have different assumptions for the resources in different deployment
@@ -105,7 +111,9 @@ def compute_server_config(deploy: bool) -> ServerConfig:
105
111
  process after API server was introduced.
106
112
  """
107
113
  cpu_count = common_utils.get_cpu_count()
114
+ logger.debug(f'CPU count: {cpu_count}')
108
115
  mem_size_gb = common_utils.get_mem_size_gb()
116
+ logger.debug(f'Memory size: {mem_size_gb}GB')
109
117
  max_parallel_for_long = _max_long_worker_parallism(cpu_count,
110
118
  mem_size_gb,
111
119
  local=not deploy)
@@ -114,7 +122,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
114
122
  queue_backend = QueueBackend.MULTIPROCESSING
115
123
  burstable_parallel_for_long = 0
116
124
  burstable_parallel_for_short = 0
125
+ # if num_db_connections_per_worker is 0, server will use NullPool
126
+ # to conserve the number of concurrent db connections.
127
+ # This could lead to performance degradation.
128
+ num_db_connections_per_worker = 0
117
129
  num_server_workers = cpu_count
130
+
131
+ # +1 for the event loop running the main process
132
+ # and gc daemons in the '__main__' body of sky/server/server.py
133
+ max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
134
+ num_server_workers + 1)
135
+
118
136
  if not deploy:
119
137
  # For local mode, use local queue backend since we only run 1 uvicorn
120
138
  # worker in local mode and no multiprocessing is needed.
@@ -125,7 +143,12 @@ def compute_server_config(deploy: bool) -> ServerConfig:
125
143
  burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
126
144
  # Runs in low resource mode if the available memory is less than
127
145
  # server_constants.MIN_AVAIL_MEM_GB.
128
- if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
146
+ # pylint: disable=import-outside-toplevel
147
+ import sky.jobs.utils as job_utils
148
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
149
+ if job_utils.is_consolidation_mode() else
150
+ server_constants.MIN_AVAIL_MEM_GB)
151
+ if not deploy and mem_size_gb < max_memory:
129
152
  # Permanent worker process may have significant memory consumption
130
153
  # (~350MB per worker) after running commands like `sky check`, so we
131
154
  # don't start any permanent workers in low resource local mode. This
@@ -136,24 +159,41 @@ def compute_server_config(deploy: bool) -> ServerConfig:
136
159
  # permanently because it never exits.
137
160
  max_parallel_for_long = 0
138
161
  max_parallel_for_short = 0
139
- logger.warning(
140
- 'SkyPilot API server will run in low resource mode because '
141
- 'the available memory is less than '
142
- f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
143
- logger.info(
144
- f'SkyPilot API server will start {num_server_workers} server processes '
145
- f'with {max_parallel_for_long} background workers for long requests '
146
- f'and will allow at max {max_parallel_for_short} short requests in '
147
- f'parallel.')
162
+ if not quiet:
163
+ logger.warning(
164
+ 'SkyPilot API server will run in low resource mode because '
165
+ 'the available memory is less than '
166
+ f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
167
+ elif max_db_connections is not None:
168
+ if max_parallel_all_workers > max_db_connections:
169
+ if not quiet:
170
+ logger.warning(
171
+ f'Max parallel all workers ({max_parallel_all_workers}) '
172
+ 'is greater than max db connections '
173
+ f'({max_db_connections}). Increase the number of max db '
174
+ f'connections to at least {max_parallel_all_workers} for '
175
+ 'optimal performance.')
176
+ else:
177
+ num_db_connections_per_worker = 1
178
+
179
+ if not quiet:
180
+ logger.info(
181
+ f'SkyPilot API server will start {num_server_workers} server '
182
+ f'processes with {max_parallel_for_long} background workers for '
183
+ f'long requests and will allow at max {max_parallel_for_short} '
184
+ 'short requests in parallel.')
148
185
  return ServerConfig(
149
186
  num_server_workers=num_server_workers,
150
187
  queue_backend=queue_backend,
151
188
  long_worker_config=WorkerConfig(
152
189
  garanteed_parallelism=max_parallel_for_long,
153
- burstable_parallelism=burstable_parallel_for_long),
190
+ burstable_parallelism=burstable_parallel_for_long,
191
+ num_db_connections_per_worker=num_db_connections_per_worker),
154
192
  short_worker_config=WorkerConfig(
155
193
  garanteed_parallelism=max_parallel_for_short,
156
- burstable_parallelism=burstable_parallel_for_short),
194
+ burstable_parallelism=burstable_parallel_for_short,
195
+ num_db_connections_per_worker=num_db_connections_per_worker),
196
+ num_db_connections_per_worker=num_db_connections_per_worker,
157
197
  )
158
198
 
159
199
 
@@ -162,10 +202,15 @@ def _max_long_worker_parallism(cpu_count: int,
162
202
  local=False) -> int:
163
203
  """Max parallelism for long workers."""
164
204
  # Reserve min available memory to avoid OOM.
165
- available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
205
+ # pylint: disable=import-outside-toplevel
206
+ import sky.jobs.utils as job_utils
207
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
208
+ if job_utils.is_consolidation_mode() else
209
+ server_constants.MIN_AVAIL_MEM_GB)
210
+ available_mem = max(0, mem_size_gb - max_memory)
166
211
  cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
167
212
  mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
168
- _LONG_WORKER_MEM_GB)
213
+ LONG_WORKER_MEM_GB)
169
214
  n = max(_MIN_LONG_WORKERS,
170
215
  min(cpu_based_max_parallel, mem_based_max_parallel))
171
216
  if local:
@@ -173,12 +218,25 @@ def _max_long_worker_parallism(cpu_count: int,
173
218
  return n
174
219
 
175
220
 
221
def _get_min_short_workers() -> int:
    """Return the lower bound on the number of short workers.

    The bound is the configured number of idle short workers plus one
    worker for every internal request daemon that is not skipped.
    """
    active_daemons = sum(1 for daemon in daemons.INTERNAL_REQUEST_DAEMONS
                         if not daemon.should_skip())
    return _MIN_IDLE_SHORT_WORKERS + active_daemons
228
+
229
+
176
230
def _max_short_worker_parallism(mem_size_gb: float,
                                long_worker_parallism: int) -> int:
    """Return the max parallelism for short workers.

    Memory set aside for the long workers and the minimum memory that must
    stay available are subtracted from the total; whatever is left is divided
    by the per-short-worker memory budget. The result never drops below the
    value returned by `_get_min_short_workers()`.
    """
    # pylint: disable=import-outside-toplevel
    import sky.jobs.utils as job_utils

    # Consolidation mode keeps a larger amount of memory untouched.
    if job_utils.is_consolidation_mode():
        min_avail_mem = server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
    else:
        min_avail_mem = server_constants.MIN_AVAIL_MEM_GB

    reserved_mem = min_avail_mem + (long_worker_parallism * LONG_WORKER_MEM_GB)
    leftover_mem = max(0, mem_size_gb - reserved_mem)
    min_workers = _get_min_short_workers()
    mem_based_workers = int(leftover_mem / SHORT_WORKER_MEM_GB)
    return max(min_workers, mem_based_workers)
sky/server/constants.py CHANGED
@@ -4,17 +4,37 @@ import os
4
4
 
5
5
  from sky.skylet import constants
6
6
 
7
- # API server version, whenever there is a change in API server that requires a
8
- # restart of the local API server or error out when the client does not match
9
- # the server version.
10
- API_VERSION = '5'
7
+ # pylint: disable=line-too-long
8
+ # The SkyPilot API version that the code currently uses.
9
+ # Bump this version when the API is changed and special compatibility handling
10
+ # based on version info is needed.
11
+ # For more details and code guidelines, refer to:
12
+ # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
+ API_VERSION = 22
14
+
15
+ # The minimum peer API version that the code should still work with.
16
+ # Notes (dev):
17
+ # - This value is maintained by the CI pipeline, DO NOT EDIT this manually.
18
+ # - Compatibility code for versions lower than this can be safely removed.
19
+ # Refer to API_VERSION for more details.
20
+ MIN_COMPATIBLE_API_VERSION = 11
21
+
22
+ # The semantic version of the minimum compatible API version.
23
+ # Refer to MIN_COMPATIBLE_API_VERSION for more details.
24
+ # Note (dev): DO NOT EDIT this constant manually.
25
+ MIN_COMPATIBLE_VERSION = '0.10.0'
26
+
27
+ # The HTTP header name for the API version of the sender.
28
+ API_VERSION_HEADER = 'X-SkyPilot-API-Version'
29
+
30
+ # The HTTP header name for the SkyPilot version of the sender.
31
+ VERSION_HEADER = 'X-SkyPilot-Version'
11
32
 
12
33
  # Prefix for API request names.
13
34
  REQUEST_NAME_PREFIX = 'sky.'
14
- # The user ID of the SkyPilot system.
15
- SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
16
35
  # The memory (GB) that SkyPilot tries to not use to prevent OOM.
17
36
  MIN_AVAIL_MEM_GB = 2
37
+ MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
18
38
  # Default encoder/decoder handler name.
19
39
  DEFAULT_HANDLER_NAME = 'default'
20
40
  # The path to the API request database.
@@ -24,9 +44,27 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
24
44
  # background.
25
45
  CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
26
46
 
47
+ # The interval (seconds) for the volume status to be refreshed in the
48
+ # background.
49
+ VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
50
+
27
51
  # Environment variable for a file path to the API cookie file.
52
+ # Keep in sync with websocket_proxy.py
28
53
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
54
+ # Default file if unset.
55
+ # Keep in sync with websocket_proxy.py
56
+ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
29
57
 
30
58
  # The path to the dashboard build output
31
59
  DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
32
60
  'out')
61
+
62
+ # The interval (seconds) for the event to be restarted in the background.
63
+ DAEMON_RESTART_INTERVAL_SECONDS = 20
64
+
65
+ # Cookie header for stream request id.
66
+ STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
67
+
68
+ # Valid empty values for pickled fields (base64-encoded pickled None)
69
+ # base64.b64encode(pickle.dumps(None)).decode('utf-8')
70
+ EMPTY_PICKLED_VALUE = 'gAROLg=='
sky/server/daemons.py ADDED
@@ -0,0 +1,229 @@
1
+ """Internal server daemons that run in the background."""
2
+ import dataclasses
3
+ import os
4
+ import time
5
+ from typing import Callable
6
+
7
+ from sky import sky_logging
8
+ from sky import skypilot_config
9
+ from sky.server import constants as server_constants
10
+ from sky.server.requests import request_names
11
+ from sky.utils import annotations
12
+ from sky.utils import common_utils
13
+ from sky.utils import env_options
14
+ from sky.utils import subprocess_utils
15
+ from sky.utils import timeline
16
+ from sky.utils import ux_utils
17
+
18
+ logger = sky_logging.init_logger(__name__)
19
+
20
+
21
+ def _default_should_skip():
22
+ return False
23
+
24
+
25
@dataclasses.dataclass
class InternalRequestDaemon:
    """Internal daemon that runs an event in the background."""

    # Identifier of the daemon; also the key under `daemons` in the SkyPilot
    # config from which the daemon's `log_level` is read.
    id: str
    # Request name of the daemon; shown in the error log when the event fails.
    name: request_names.RequestName
    # Callable executed on every iteration of `run_event`.
    event_fn: Callable[[], None]
    # Log level name used when the config does not set one for this daemon.
    default_log_level: str = 'INFO'
    # Predicate returning True when the daemon should be skipped; the default
    # never skips.
    should_skip: Callable[[], bool] = _default_should_skip

    def refresh_log_level(self) -> int:
        """Reload the config and return this daemon's numeric log level.

        Returns:
            The `logging` level constant named by the `daemons.<id>.log_level`
            config entry (or by `default_log_level` when unset). Falls back to
            `logging.DEBUG` if the name is not a valid level or if anything
            else goes wrong while refreshing.
        """
        # pylint: disable=import-outside-toplevel
        import logging

        # Bind the name before the try block so that the AttributeError
        # handler below can always format it, even when the error is raised
        # before the config lookup completes.
        level_str = self.default_log_level
        try:
            # Refresh config within the while loop.
            # Since this is a long running daemon,
            # reload_for_new_request()
            # is not called in between the event runs.
            # We don't need to grab the lock here because each of the daemons
            # run in their own process and thus have their own request context.
            skypilot_config.reload_config()
            # Get the configured log level for the daemon inside the event loop
            # in case the log level changes after the API server is started.
            level_str = skypilot_config.get_nested(
                ('daemons', self.id, 'log_level'), self.default_log_level)
            return getattr(logging, level_str.upper())
        except AttributeError:
            # Bad level should be rejected by
            # schema validation, just in case.
            logger.warning(f'Invalid log level: {level_str}, using DEBUG')
            return logging.DEBUG
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(f'Error refreshing log level for {self.id}: {e}')
            return logging.DEBUG

    def run_event(self):
        """Run the event forever, logging and retrying on failure.

        Each iteration re-reads the log level, calls `event_fn`, and then
        releases per-run resources. When an iteration raises, the error is
        logged and the loop sleeps for
        `server_constants.DAEMON_RESTART_INTERVAL_SECONDS` before retrying.
        """

        # Disable logging for periodic refresh to avoid the usage message being
        # sent multiple times.
        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

        level = self.refresh_log_level()
        while True:
            try:
                with ux_utils.enable_traceback(), \
                    sky_logging.set_sky_logging_levels(level):
                    sky_logging.reload_logger()
                    # The refreshed level takes effect on the next iteration.
                    level = self.refresh_log_level()
                    self.event_fn()
                    # Clear request level cache after each run to avoid
                    # using too much memory.
                    annotations.clear_request_level_cache()
                    timeline.save_timeline()
                    # Kill all children processes related to this request.
                    # Each executor handles a single request, so we can safely
                    # kill all children processes related to this request.
                    subprocess_utils.kill_children_processes()
                    common_utils.release_memory()
            except Exception:  # pylint: disable=broad-except
                # It is OK to fail to run the event, as the event is not
                # critical, but we should log the error.
                logger.exception(
                    f'Error running {self.name} event. '
                    f'Restarting in '
                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                    'seconds...')
                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
94
+
95
+
96
def refresh_cluster_status_event():
    """Refresh the cluster records once, then sleep until the next round."""
    # pylint: disable=import-outside-toplevel
    from sky.backends import backend_utils

    logger.info('=== Refreshing cluster status ===')
    # The refresh holds the lock of the cluster being refreshed. That is
    # acceptable: other operations simply wait for the lock and then read the
    # freshly updated status instead of refreshing again.
    backend_utils.refresh_cluster_records()
    done_message = (
        'Status refreshed. Sleeping '
        f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
        ' seconds for the next refresh...\n')
    logger.info(done_message)
    time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
110
+
111
+
112
def refresh_volume_status_event():
    """Refresh the volume status once, then sleep until the next round."""
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Keep usage logging off for the periodic refresh so the usage message is
    # not sent multiple times.
    disable_logging_key = env_options.Options.DISABLE_LOGGING.env_key
    os.environ[disable_logging_key] = '1'

    logger.info('=== Refreshing volume status ===')
    core.volume_refresh()
    done_message = (
        'Volume status refreshed. Sleeping '
        f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
        ' seconds for the next refresh...\n')
    logger.info(done_message)
    time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
127
+
128
+
129
def managed_job_status_refresh_event():
    """Run one round of the managed job event for consolidation mode.

    HA recovery is executed first, then a `ManagedJobEvent` is run, and
    finally the function sleeps for `events.EVENT_CHECKING_INTERVAL_SECONDS`.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils

    # Recovery and the event loop conflict with each other, so recovery has to
    # complete before the event is started. Check
    # PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    managed_job_utils.ha_recovery_for_consolidation_mode()

    # Recovery is done; the event can run now.
    from sky.skylet import events
    job_event = events.ManagedJobEvent()
    logger.info('=== Running managed job event ===')
    job_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
144
+
145
+
146
def should_skip_managed_job_status_refresh():
    """Return True when the managed job status refresh should be skipped.

    The refresh is skipped whenever managed jobs are not in consolidation
    mode.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    consolidated = managed_job_utils.is_consolidation_mode()
    return not consolidated
151
+
152
+
153
def _serve_status_refresh_event(pool: bool):
    """Run one round of the serve/pool status update for consolidation mode.

    Args:
        pool: Forwarded to the recovery routine and to `ServiceUpdateEvent`;
            also selects the noun ('pool' or 'serve') used in the log line.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils

    # Recovery and the event loop conflict with each other, so recovery has to
    # complete before the event is started. Check
    # PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

    # Recovery is done; the event can run now.
    from sky.skylet import events
    update_event = events.ServiceUpdateEvent(pool=pool)
    if pool:
        noun = 'pool'
    else:
        noun = 'serve'
    logger.info(f'=== Running {noun} status refresh event ===')
    update_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
169
+
170
+
171
def _should_skip_serve_status_refresh_event(pool: bool):
    """Return True when the serve/pool status refresh should be skipped.

    The refresh is skipped whenever `serve_utils.is_consolidation_mode`
    reports False for the given `pool` flag.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils
    consolidated = serve_utils.is_consolidation_mode(pool=pool)
    return not consolidated
176
+
177
+
178
def sky_serve_status_refresh_event():
    """Run the status refresh event with `pool=False` (sky serve)."""
    is_pool = False
    _serve_status_refresh_event(pool=is_pool)
180
+
181
+
182
def should_skip_sky_serve_status_refresh():
    """Return whether the sky serve (`pool=False`) refresh is skipped."""
    is_pool = False
    return _should_skip_serve_status_refresh_event(pool=is_pool)
184
+
185
+
186
def pool_status_refresh_event():
    """Run the status refresh event with `pool=True` (pools)."""
    is_pool = True
    _serve_status_refresh_event(pool=is_pool)
188
+
189
+
190
def should_skip_pool_status_refresh():
    """Return whether the pool (`pool=True`) refresh is skipped."""
    is_pool = True
    return _should_skip_serve_status_refresh_event(pool=is_pool)
192
+
193
+
194
# Daemons registered to run their events in the background.
INTERNAL_REQUEST_DAEMONS = [
    # Cluster status refresh daemon. It lets an autostopped/autodowned
    # cluster get its status updated automatically, without showing users the
    # hint of the cluster being stopped or down when `sky status -r` is
    # called.
    InternalRequestDaemon(
        id='skypilot-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
        event_fn=refresh_cluster_status_event,
        default_log_level='DEBUG',
    ),
    # Volume status refresh daemon to update the volume status periodically.
    InternalRequestDaemon(
        id='skypilot-volume-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
        event_fn=refresh_volume_status_event,
    ),
    # Managed job status refresh daemon; skipped according to
    # `should_skip_managed_job_status_refresh`.
    InternalRequestDaemon(
        id='managed-job-status-refresh-daemon',
        name=(request_names.RequestName.
              REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH),
        event_fn=managed_job_status_refresh_event,
        should_skip=should_skip_managed_job_status_refresh,
    ),
    # Sky serve status refresh daemon; skipped according to
    # `should_skip_sky_serve_status_refresh`.
    InternalRequestDaemon(
        id='sky-serve-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
        event_fn=sky_serve_status_refresh_event,
        should_skip=should_skip_sky_serve_status_refresh,
    ),
    # Pool status refresh daemon; skipped according to
    # `should_skip_pool_status_refresh`.
    InternalRequestDaemon(
        id='pool-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
        event_fn=pool_status_refresh_event,
        should_skip=should_skip_pool_status_refresh,
    ),
]
225
+
226
+
227
def is_daemon_request_id(request_id: str) -> bool:
    """Return whether `request_id` is the id of an internal daemon.

    Args:
        request_id: The request id to look up.

    Returns:
        True if any entry of `INTERNAL_REQUEST_DAEMONS` has this id.
    """
    # A generator lets `any` stop at the first match instead of building the
    # whole list of comparisons first.
    return any(daemon.id == request_id for daemon in INTERNAL_REQUEST_DAEMONS)
@@ -0,0 +1,185 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>SkyPilot API Server Login</title>
7
+ <style>
8
+ body {
9
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
10
+ display: flex;
11
+ flex-direction: column;
12
+ align-items: center;
13
+ justify-content: center;
14
+ min-height: 100vh;
15
+ margin: 0;
16
+ background-color: #f8f9fa;
17
+ color: #202124;
18
+ padding: 20px;
19
+ box-sizing: border-box;
20
+ }
21
+ .container {
22
+ background-color: #ffffff;
23
+ padding: 48px;
24
+ border-radius: 8px;
25
+ box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24);
26
+ text-align: center;
27
+ max-width: 600px;
28
+ width: 100%;
29
+ }
30
+ .logo {
31
+ width: 64px;
32
+ height: 64px;
33
+ margin-bottom: 20px;
34
+ display: inline-block;
35
+ }
36
+ .logo svg {
37
+ width: 100%;
38
+ height: 100%;
39
+ }
40
+ h1 {
41
+ font-size: 24px;
42
+ font-weight: 500;
43
+ margin-bottom: 20px;
44
+ color: #202124;
45
+ }
46
+ p {
47
+ font-size: 14px;
48
+ line-height: 1.5;
49
+ margin-bottom: 20px;
50
+ color: #5f6368;
51
+ }
52
+ .user-identifier {
53
+ font-size: 12px; /* Smaller font size */
54
+ color: #80868b; /* Lighter color */
55
+ margin-bottom: 8px; /* Adjusted margin */
56
+ }
57
+ .code-block {
58
+ background-color: #f1f3f4;
59
+ border: 1px solid #dadce0;
60
+ border-radius: 4px;
61
+ padding: 16px;
62
+ margin-top: 24px;
63
+ margin-bottom: 24px;
64
+ margin-left: auto;
65
+ margin-right: auto;
66
+ text-align: left;
67
+ word-break: break-all;
68
+ white-space: pre-wrap;
69
+ font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
70
+ font-size: 13px;
71
+ line-height: 1.4;
72
+ max-width: 480px;
73
+ }
74
+ #token-box { /* Specifically for the token */
75
+ height: auto;
76
+ min-height: 6em; /* Ensure it's a reasonable size */
77
+ max-height: 15em; /* Prevent it from getting too large */
78
+ overflow-y: auto;
79
+ }
80
+ .copy-button {
81
+ background-color: #1a73e8;
82
+ color: white;
83
+ border: none;
84
+ border-radius: 4px;
85
+ padding: 10px 24px;
86
+ font-size: 14px;
87
+ font-weight: 500;
88
+ cursor: pointer;
89
+ transition: background-color 0.3s;
90
+ margin-top: 10px;
91
+ }
92
+ .copy-button:hover {
93
+ background-color: #287ae6;
94
+ }
95
+ .copy-button:active {
96
+ background-color: #1b66c9;
97
+ }
98
+ .footer-text {
99
+ font-size: 12px;
100
+ color: #5f6368;
101
+ margin-top: 30px;
102
+ }
103
+ .local-port-info {
104
+ display: none;
105
+ }
106
+ </style>
107
+ </head>
108
+ <body>
109
+ <div class="container">
110
+ <div class="logo">
111
+ <!-- SkyPilot Logo Icon -->
112
+ <svg viewBox="0 0 50 50" fill="none" xmlns="http://www.w3.org/2000/svg">
113
+ <path d="M25.1258 30.8274L19.2842 31.6783L33.8316 46.2268L31.492 37.1925L25.1258 30.8274Z" fill="#372F8A"/>
114
+ <path d="M46.9433 0.000976562L0.719727 13.1148L15.2661 27.6601L16.633 21.3925L10.3728 15.1323L40.183 6.74118C40.183 6.74118 46.102 0.855027 46.9444 0.00203721L46.9433 0.000976562Z" fill="#372F8A"/>
115
+ <path d="M40.1821 6.74021L31.4922 37.1925L33.8318 46.2257L46.9445 0C46.1022 0.85299 40.1831 6.73915 40.1831 6.73915L40.1821 6.74021Z" fill="#372F8A"/>
116
+ <path d="M21.3356 25.6089L19.2842 31.6783L25.1258 30.8275L30.3741 16.6011L30.3275 16.617L21.3356 25.6089Z" fill="#195D7F"/>
117
+ <path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
118
+ </svg>
119
+ </div>
120
+ <h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
121
+ <h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
122
+ <p class="user-identifier">USER_PLACEHOLDER</p>
123
+ <!-- display token info by default -->
124
+ <p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
125
+ <p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
126
+ <div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
127
+ <button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
128
+ <p class="footer-text no-local-port">You can close this tab after copying the token.</p>
129
+
130
+ <!-- don't display local port info unless successful -->
131
+ <p class="local-port-info">You can now close this tab.</p>
132
+ </div>
133
+
134
+ <script>
135
+ const tokenBox = document.getElementById('token-box');
136
+ const copyBtn = document.getElementById('copy-btn');
137
+
138
// Highlight the full contents of the token box so it can be copied.
function selectToken() {
    // A <pre> or <div> has no select() method, so build a range over its
    // contents and make that range the only active selection.
    const tokenRange = document.createRange();
    tokenRange.selectNodeContents(tokenBox);

    const selection = window.getSelection();
    selection.removeAllRanges();
    selection.addRange(tokenRange);
}
146
+
147
+ // Optional: Select the token when the page loads or when token box is clicked
148
+ tokenBox.addEventListener('click', selectToken);
149
+ window.addEventListener('load', selectToken);
150
+
151
// Copy the token to the clipboard and show the outcome on the button for
// two seconds.
copyBtn.addEventListener('click', () => {
    selectToken(); // Select the text
    // execCommand reports an unsupported or disabled command by returning
    // false rather than by throwing, so the return value has to be checked
    // in addition to catching exceptions.
    let copied = false;
    try {
        copied = document.execCommand('copy');
    } catch (err) {
        console.error('Failed to copy text: ', err);
    }
    copyBtn.textContent = copied ? 'Copied!' : 'Error!';
    setTimeout(() => {
        copyBtn.textContent = 'Copy Token';
    }, 2000);
});
164
+
165
// Swap the page from the "copy this token" view to the "signed in" view.
function hideTokenInfo() {
    // Hide everything that belongs to the manual copy-and-paste flow.
    for (const elem of document.querySelectorAll('.no-local-port')) {
        elem.style.display = 'none';
    }
    // Dropping the class removes the `display: none` rule that hides the
    // success elements.
    for (const elem of document.querySelectorAll('.local-port-info')) {
        elem.classList.remove('local-port-info');
    }
}
175
+
176
// When the page is opened with ?local_port=<port>, send the token directly
// to that port on localhost instead of asking the user to copy it.
const localPort = new URLSearchParams(window.location.search).get('local_port');
// Accept digits only: the value is interpolated into the URL that receives
// the token, so anything else (extra query parameters, or a value such as
// ".example.com" that would change the host) must not reach fetch().
if (localPort !== null && /^\d+$/.test(localPort)) {
    const uri = `http://localhost:${localPort}`;
    fetch(uri, {
        method: 'POST',
        body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
    }).then(hideTokenInfo).catch((err) => {
        // Leave the token visible so the user can still copy it manually.
        console.error('Failed to send token to local port: ', err);
    });
}
183
+ </script>
184
+ </body>
185
+ </html>