skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
1
1
  """Autostop utilities."""
2
+ import enum
2
3
  import pickle
3
4
  import shlex
5
+ import subprocess
4
6
  import time
5
7
  import typing
6
8
  from typing import List, Optional
@@ -10,11 +12,17 @@ from sky.adaptors import common as adaptors_common
10
12
  from sky.skylet import configs
11
13
  from sky.skylet import constants
12
14
  from sky.utils import message_utils
15
+ from sky.utils import ux_utils
13
16
 
14
17
  if typing.TYPE_CHECKING:
15
18
  import psutil
19
+
20
+ from sky.schemas.generated import autostopv1_pb2
16
21
  else:
17
22
  psutil = adaptors_common.LazyImport('psutil')
23
+ # To avoid requiring protobuf to be installed on the client side.
24
+ autostopv1_pb2 = adaptors_common.LazyImport(
25
+ 'sky.schemas.generated.autostopv1_pb2')
18
26
 
19
27
  logger = sky_logging.init_logger(__name__)
20
28
 
@@ -30,6 +38,83 @@ _AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
30
38
  _AUTOSTOP_INDICATOR = 'autostop_indicator'
31
39
 
32
40
 
41
+ class AutostopWaitFor(enum.Enum):
42
+ """Enum for the Autostop behaviour.
43
+
44
+ JOBS: Wait for jobs to finish.
45
+ JOBS_AND_SSH: Wait for jobs to finish and all SSH sessions to be closed.
46
+ NONE: Unconditionally stop the cluster after the idle time.
47
+ """
48
+ JOBS_AND_SSH = 'jobs_and_ssh'
49
+ JOBS = 'jobs'
50
+ NONE = 'none'
51
+
52
+ @classmethod
53
+ def supported_modes(cls) -> List[str]:
54
+ return [mode.value for mode in cls]
55
+
56
+ @classmethod
57
+ def cli_help_message(cls, pair: str) -> str:
58
+ return f"""\
59
+ Determines the condition for resetting the idleness timer.
60
+ This option works in conjunction with ``--{pair}``. Options:
61
+
62
+ \b
63
+ 1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
64
+ 2. ``jobs``: Only wait for in-progress jobs.
65
+ 3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
66
+
67
+ @classmethod
68
+ def from_str(cls, mode: str) -> 'AutostopWaitFor':
69
+ """Returns the enum value for the given string."""
70
+ if mode.lower() == cls.JOBS.value:
71
+ return cls.JOBS
72
+ elif mode.lower() == cls.JOBS_AND_SSH.value:
73
+ return cls.JOBS_AND_SSH
74
+ elif mode.lower() == cls.NONE.value:
75
+ return cls.NONE
76
+ else:
77
+ with ux_utils.print_exception_no_traceback():
78
+ raise ValueError(f'Unsupported autostop wait mode: '
79
+ f'{mode}. The mode must be either '
80
+ f'\'{cls.JOBS_AND_SSH.value}\', '
81
+ f'\'{cls.JOBS.value}\', or '
82
+ f'\'{cls.NONE.value}\'. ')
83
+
84
+ @classmethod
85
+ def from_protobuf(
86
+ cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
87
+ ) -> Optional['AutostopWaitFor']:
88
+ """Convert protobuf AutostopWaitFor enum to Python enum value."""
89
+ protobuf_to_enum = {
90
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
91
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
92
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
93
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
94
+ }
95
+ if protobuf_value not in protobuf_to_enum:
96
+ with ux_utils.print_exception_no_traceback():
97
+ raise ValueError(
98
+ f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
99
+ return protobuf_to_enum[protobuf_value]
100
+
101
+ def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
102
+ """Convert this Python enum value to protobuf enum value."""
103
+ enum_to_protobuf = {
104
+ AutostopWaitFor.JOBS_AND_SSH:
105
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
106
+ AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
107
+ AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
108
+ }
109
+ if self not in enum_to_protobuf:
110
+ with ux_utils.print_exception_no_traceback():
111
+ raise ValueError(f'Unknown AutostopWaitFor value: {self}')
112
+ return enum_to_protobuf[self]
113
+
114
+
115
+ DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
116
+
117
+
33
118
  class AutostopConfig:
34
119
  """Autostop configuration."""
35
120
 
@@ -37,12 +122,14 @@ class AutostopConfig:
37
122
  autostop_idle_minutes: int,
38
123
  boot_time: float,
39
124
  backend: Optional[str],
125
+ wait_for: AutostopWaitFor,
40
126
  down: bool = False):
41
127
  assert autostop_idle_minutes < 0 or backend is not None, (
42
128
  autostop_idle_minutes, backend)
43
129
  self.autostop_idle_minutes = autostop_idle_minutes
44
130
  self.boot_time = boot_time
45
131
  self.backend = backend
132
+ self.wait_for = wait_for
46
133
  self.down = down
47
134
 
48
135
  def __setstate__(self, state: dict):
@@ -53,15 +140,18 @@ class AutostopConfig:
53
140
  def get_autostop_config() -> AutostopConfig:
54
141
  config_str = configs.get_config(_AUTOSTOP_CONFIG_KEY)
55
142
  if config_str is None:
56
- return AutostopConfig(-1, -1, None)
143
+ return AutostopConfig(-1, -1, None, DEFAULT_AUTOSTOP_WAIT_FOR)
57
144
  return pickle.loads(config_str)
58
145
 
59
146
 
60
- def set_autostop(idle_minutes: int, backend: Optional[str], down: bool) -> None:
147
+ def set_autostop(idle_minutes: int, backend: Optional[str],
148
+ wait_for: AutostopWaitFor, down: bool) -> None:
61
149
  boot_time = psutil.boot_time()
62
- autostop_config = AutostopConfig(idle_minutes, boot_time, backend, down)
150
+ autostop_config = AutostopConfig(idle_minutes, boot_time, backend, wait_for,
151
+ down)
63
152
  configs.set_config(_AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config))
64
- logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}.')
153
+ logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}, '
154
+ f'wait_for {wait_for.value}.')
65
155
  # Reset timer whenever an autostop setting is submitted, i.e. the idle
66
156
  # time will be counted from now.
67
157
  set_last_active_time_to_now()
@@ -107,6 +197,28 @@ def set_last_active_time_to_now() -> None:
107
197
  configs.set_config(_AUTOSTOP_LAST_ACTIVE_TIME, str(time.time()))
108
198
 
109
199
 
200
+ def has_active_ssh_sessions() -> bool:
201
+ """Returns True if there are any active SSH sessions on the node."""
202
+ try:
203
+ # /dev/pts is a virtual filesystem that contains the pseudo-terminal
204
+ # devices. ptmx is the pseudo-terminal multiplexer, which is the
205
+ # "master" device that creates new pseudo-terminal devices, so we
206
+ # exclude it from the count.
207
+ proc = subprocess.run('ls /dev/pts | grep -v ptmx | wc -l',
208
+ capture_output=True,
209
+ text=True,
210
+ check=False,
211
+ shell=True)
212
+ if proc.returncode != 0:
213
+ logger.warning(f'SSH session check command failed with return code '
214
+ f'{proc.returncode}.')
215
+ return False
216
+ return int(proc.stdout.strip()) > 0
217
+ except Exception as e: # pylint: disable=broad-except
218
+ logger.warning(f'Error checking active SSH sessions: {e}.')
219
+ return False
220
+
221
+
110
222
  class AutostopCodeGen:
111
223
  """Code generator for autostop utility functions.
112
224
 
@@ -114,13 +226,22 @@ class AutostopCodeGen:
114
226
 
115
227
  >> codegen = AutostopCodeGen.set_autostop(...)
116
228
  """
117
- _PREFIX = ['from sky.skylet import autostop_lib']
229
+ _PREFIX = ['from sky.skylet import autostop_lib, constants']
118
230
 
119
231
  @classmethod
120
- def set_autostop(cls, idle_minutes: int, backend: str, down: bool) -> str:
232
+ def set_autostop(cls,
233
+ idle_minutes: int,
234
+ backend: str,
235
+ wait_for: Optional[AutostopWaitFor],
236
+ down: bool = False) -> str:
237
+ if wait_for is None:
238
+ wait_for = DEFAULT_AUTOSTOP_WAIT_FOR
121
239
  code = [
122
- f'autostop_lib.set_autostop({idle_minutes}, {backend!r},'
123
- f' {down})',
240
+ f'\nif getattr(constants, "SKYLET_LIB_VERSION", 1) < 4: '
241
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, {down})'
242
+ f'\nelse: '
243
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, '
244
+ f'autostop_lib.{wait_for}, {down})',
124
245
  ]
125
246
  return cls._build(code)
126
247
 
sky/skylet/configs.py CHANGED
@@ -2,17 +2,16 @@
2
2
  import functools
3
3
  import os
4
4
  import pathlib
5
+ import threading
5
6
  from typing import Callable, Optional, Union
6
7
 
7
- from sky.utils import db_utils
8
+ from sky.utils.db import db_utils
8
9
 
9
- _DB_PATH = os.path.expanduser('~/.sky/skylet_config.db')
10
- os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
10
+ _DB_PATH = None
11
+ _db_init_lock = threading.Lock()
11
12
 
12
- _table_created = False
13
13
 
14
-
15
- def ensure_table(func: Callable):
14
+ def init_db(func: Callable):
16
15
  """Ensure the table exists before calling the function.
17
16
 
18
17
  Since this module will be imported whenever `sky` is imported (due to
@@ -24,25 +23,32 @@ def ensure_table(func: Callable):
24
23
 
25
24
  @functools.wraps(func)
26
25
  def wrapper(*args, **kwargs):
27
- global _table_created
28
- if not _table_created:
29
- with db_utils.safe_cursor(
30
- _DB_PATH) as c: # Call it 'c' to avoid pylint complaining.
31
- # Use WAL mode to avoid locking problem in #1507.
32
- # Reference: https://stackoverflow.com/a/39265148
33
- c.execute('PRAGMA journal_mode=WAL')
34
- c.execute("""\
35
- CREATE TABLE IF NOT EXISTS config (
36
- key TEXT PRIMARY KEY,
37
- value TEXT)""")
38
- _table_created = True
26
+ global _DB_PATH
27
+ if _DB_PATH is not None:
28
+ return func(*args, **kwargs)
29
+
30
+ with _db_init_lock:
31
+ if _DB_PATH is None:
32
+ _DB_PATH = os.path.expanduser('~/.sky/skylet_config.db')
33
+ os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
34
+ with db_utils.safe_cursor(
35
+ _DB_PATH
36
+ ) as c: # Call it 'c' to avoid pylint complaining.
37
+ # Use WAL mode to avoid locking problem in #1507.
38
+ # Reference: https://stackoverflow.com/a/39265148
39
+ c.execute('PRAGMA journal_mode=WAL')
40
+ c.execute("""\
41
+ CREATE TABLE IF NOT EXISTS config (
42
+ key TEXT PRIMARY KEY,
43
+ value TEXT)""")
39
44
  return func(*args, **kwargs)
40
45
 
41
46
  return wrapper
42
47
 
43
48
 
44
- @ensure_table
49
+ @init_db
45
50
  def get_config(key: str) -> Optional[bytes]:
51
+ assert _DB_PATH is not None
46
52
  with db_utils.safe_cursor(_DB_PATH) as cursor:
47
53
  rows = cursor.execute('SELECT value FROM config WHERE key = ?', (key,))
48
54
  for (value,) in rows:
@@ -50,8 +56,9 @@ def get_config(key: str) -> Optional[bytes]:
50
56
  return None
51
57
 
52
58
 
53
- @ensure_table
59
+ @init_db
54
60
  def set_config(key: str, value: Union[bytes, str]) -> None:
61
+ assert _DB_PATH is not None
55
62
  with db_utils.safe_cursor(_DB_PATH) as cursor:
56
63
  cursor.execute(
57
64
  """\
sky/skylet/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants for SkyPilot."""
2
+ import os
2
3
  from typing import List, Tuple
3
4
 
4
5
  from packaging import version
@@ -28,6 +29,7 @@ SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
28
29
  SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
29
30
  SKY_REMOTE_RAY_VERSION = '2.9.3'
30
31
 
32
+ SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
31
33
  # We store the absolute path of the python executable (/opt/conda/bin/python3)
32
34
  # in this file, so that any future internal commands that need to use python
33
35
  # can use this path. This is useful for the case where the user has a custom
@@ -39,7 +41,7 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
39
41
  f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
40
42
  'which python3')
41
43
  # Python executable, e.g., /opt/conda/bin/python3
42
- SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
44
+ SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
43
45
  # Prefer SKY_UV_PIP_CMD, which is faster.
44
46
  # TODO(cooperc): remove remaining usage (GCP TPU setup).
45
47
  SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -55,20 +57,29 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
55
57
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
56
58
  # uv is used for venv and pip, much faster than python implementations.
57
59
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
58
- SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
60
+ # set UV_SYSTEM_PYTHON to false in case the
61
+ # user provided docker image set it to true.
62
+ # unset PYTHONPATH in case the user provided docker image set it.
63
+ SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
64
+ f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
59
65
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
60
66
  SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
61
67
  'curl -LsSf https://astral.sh/uv/install.sh '
62
68
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
63
69
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
64
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
65
- # environment. `deactivate` command does not work when conda is used.
70
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
71
+ '--no-project --no-config')
72
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
73
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
74
+ # not work when conda is used.
66
75
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
67
76
  'export PATH='
68
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
77
+ f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
78
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
69
79
 
70
80
  # Prefix for SkyPilot environment variables
71
81
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
82
+ SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
72
83
 
73
84
  # The name for the environment variable that stores the unique ID of the
74
85
  # current task. This will stay the same across multiple recoveries of the
@@ -89,17 +100,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
89
100
  # cluster yaml is updated.
90
101
  #
91
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
92
- SKYLET_VERSION = '12'
103
+ SKYLET_VERSION = '25'
93
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
94
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
95
106
  # user can be notified to update their SkyPilot version on the remote cluster.
96
- SKYLET_LIB_VERSION = 3
107
+ SKYLET_LIB_VERSION = 4
97
108
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
98
-
99
- # `sky jobs dashboard`-related
100
- #
101
- # Port on the remote jobs controller that the dashboard is running on.
102
- SPOT_DASHBOARD_REMOTE_PORT = 5000
109
+ SKYLET_GRPC_PORT = 46590
110
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
103
111
 
104
112
  # Docker default options
105
113
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -151,7 +159,7 @@ CONDA_INSTALLATION_COMMANDS = (
151
159
  # because for some images, conda is already installed, but not initialized.
152
160
  # In this case, we need to initialize conda and set auto_activate_base to
153
161
  # true.
154
- '{ bash Miniconda3-Linux.sh -b; '
162
+ '{ bash Miniconda3-Linux.sh -b || true; '
155
163
  'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
156
164
  # Caller should replace {conda_auto_activate} with either true or false.
157
165
  'conda config --set auto_activate_base {conda_auto_activate} && '
@@ -218,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
218
226
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
219
227
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
220
228
  f'|| {RAY_STATUS} || '
221
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
229
+ # The pydantic-core==2.41.3 for arm seems corrupted
230
+ # so we need to avoid that specific version.
231
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
222
232
  # In some envs, e.g. pip does not have permission to write under /opt/conda
223
233
  # ray package will be installed under ~/.local/bin. If the user's PATH does
224
234
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -230,7 +240,7 @@ RAY_INSTALLATION_COMMANDS = (
230
240
  'export PATH=$PATH:$HOME/.local/bin; '
231
241
  # Writes ray path to file if it does not exist or the file is empty.
232
242
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
233
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
243
+ f'{{ {SKY_UV_RUN_CMD} '
234
244
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
235
245
 
236
246
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
@@ -323,6 +333,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
323
333
  # controller_utils.translate_local_file_mounts_to_two_hop().
324
334
  FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
325
335
 
336
+ # For passing in CPU and memory limits to the controller pod when running
337
+ # in k8s. Right now, we only use this for the jobs controller, but we may
338
+ # use this for the serve controller as well in the future.
339
+ # These files are written to disk by the skylet, who reads it from env vars
340
+ # passed by the backend when starting the skylet (start_skylet_on_head_node).
341
+ CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
342
+ CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
343
+
326
344
  # Used when an managed jobs are created and
327
345
  # files are synced up to the cloud.
328
346
  FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
@@ -346,6 +364,11 @@ API_SERVER_CREATION_LOCK_PATH = '~/.sky/api_server/.creation.lock'
346
364
  # API server.
347
365
  SKY_API_SERVER_URL_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_ENDPOINT'
348
366
 
367
+ # The name for the environment variable that stores the SkyPilot service
368
+ # account token on client side.
369
+ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
370
+ f'{SKYPILOT_ENV_VAR_PREFIX}SERVICE_ACCOUNT_TOKEN')
371
+
349
372
  # SkyPilot environment variables
350
373
  SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
351
374
  SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
@@ -358,7 +381,7 @@ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
358
381
 
359
382
  RCLONE_CONFIG_DIR = '~/.config/rclone'
360
383
  RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf'
361
- RCLONE_LOG_DIR = '~/.sky/rclone_log'
384
+ RCLONE_MOUNT_CACHED_LOG_DIR = '~/.sky/rclone_log'
362
385
  RCLONE_CACHE_DIR = '~/.cache/rclone'
363
386
  RCLONE_CACHE_REFRESH_INTERVAL = 10
364
387
 
@@ -367,15 +390,41 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
367
390
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
368
391
  ('docker', 'run_options'),
369
392
  ('nvidia_gpus', 'disable_ecc'),
393
+ ('ssh', 'pod_config'),
394
+ ('kubernetes', 'custom_metadata'),
370
395
  ('kubernetes', 'pod_config'),
371
396
  ('kubernetes', 'provision_timeout'),
397
+ ('kubernetes', 'dws'),
398
+ ('kubernetes', 'kueue'),
372
399
  ('gcp', 'managed_instance_group'),
400
+ ('gcp', 'enable_gvnic'),
401
+ ('gcp', 'enable_gpu_direct'),
402
+ ('gcp', 'placement_policy'),
403
+ ('active_workspace',),
373
404
  ]
374
405
  # When overriding the SkyPilot configs on the API server with the client one,
375
406
  # we skip the following keys because they are meant to be client-side configs.
376
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
377
- ('api_server',),
378
- ('allowed_clouds',)]
407
+ # Also, we skip the consolidation mode config as those should be only set on
408
+ # the API server side.
409
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
410
+ ('api_server',),
411
+ ('allowed_clouds',),
412
+ ('workspaces',),
413
+ ('db',),
414
+ ('daemons',),
415
+ # TODO(kevin,tian): Override the whole controller config once our test
416
+ # infrastructure supports setting dynamic server side configs.
417
+ # Tests that are affected:
418
+ # - test_managed_jobs_ha_kill_starting
419
+ # - test_managed_jobs_ha_kill_running
420
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
421
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
422
+ # but the configs won't be applied)
423
+ ('jobs', 'controller', 'consolidation_mode'),
424
+ ('serve', 'controller', 'consolidation_mode'),
425
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
426
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
427
+ ]
379
428
 
380
429
  # Constants for Azure blob storage
381
430
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -392,6 +441,12 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
392
441
  # persistent through PVC. See kubernetes-ray.yml.j2.
393
442
  PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
394
443
  PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
444
+ # Signal file to indicate that the controller is recovering from a failure.
445
+ # See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
446
+ PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
447
+ '~/.sky/.controller_recovery_restarting_signal')
448
+
449
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
395
450
 
396
451
  # The placeholder for the local skypilot config path in file mounts for
397
452
  # controllers.
@@ -400,5 +455,102 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
400
455
  # Path to the generated cluster config yamls and ssh configs.
401
456
  SKY_USER_FILE_PATH = '~/.sky/generated'
402
457
 
458
+ # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
403
459
  # Environment variable that is set to 'true' if this is a skypilot server.
404
460
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
461
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
462
+
463
+ # Environment variable that is set to 'true' if metrics are enabled.
464
+ ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
465
+
466
+ # If set, overrides the header that we can use to get the user name.
467
+ ENV_VAR_SERVER_AUTH_USER_HEADER = f'{SKYPILOT_ENV_VAR_PREFIX}AUTH_USER_HEADER'
468
+
469
+ # Environment variable that is used as the DB connection string for the
470
+ # skypilot server.
471
+ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
472
+
473
+ # Environment variable that is set to 'true' if basic
474
+ # authentication is enabled in the API server.
475
+ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
476
+ SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
477
+ SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
478
+ ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
479
+
480
+ # Enable debug logging for requests.
481
+ ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
482
+ f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
483
+
484
+ SKYPILOT_DEFAULT_WORKSPACE = 'default'
485
+
486
+ # BEGIN constants used for service catalog.
487
+ HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
488
+ HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
489
+ CATALOG_SCHEMA_VERSION = 'v8'
490
+ CATALOG_DIR = '~/.sky/catalogs'
491
+ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
492
+ 'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
493
+ 'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
494
+ 'hyperbolic', 'seeweb', 'shadeform')
495
+ # END constants used for service catalog.
496
+
497
+ # The user ID of the SkyPilot system.
498
+ SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
499
+
500
+ # The directory to store the logging configuration.
501
+ LOGGING_CONFIG_DIR = '~/.sky/logging'
502
+
503
+ # Resources constants
504
+ TIME_UNITS = {
505
+ 'm': 1,
506
+ 'h': 60,
507
+ 'd': 24 * 60,
508
+ 'w': 7 * 24 * 60,
509
+ }
510
+
511
+ TIME_PATTERN: str = ('^[0-9]+('
512
+ f'{"|".join([unit.lower() for unit in TIME_UNITS])}|'
513
+ f'{"|".join([unit.upper() for unit in TIME_UNITS])}|'
514
+ ')?$')
515
+
516
+ MEMORY_SIZE_UNITS = {
517
+ 'kb': 2**10,
518
+ 'ki': 2**10,
519
+ 'mb': 2**20,
520
+ 'mi': 2**20,
521
+ 'gb': 2**30,
522
+ 'gi': 2**30,
523
+ 'tb': 2**40,
524
+ 'ti': 2**40,
525
+ 'pb': 2**50,
526
+ 'pi': 2**50,
527
+ }
528
+
529
+ MEMORY_SIZE_PATTERN = (
530
+ '^[0-9]+('
531
+ f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
532
+ f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
533
+ f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
534
+ ')?$')
535
+
536
+ LAST_USE_TRUNC_LENGTH = 25
537
+ USED_BY_TRUNC_LENGTH = 25
538
+
539
+ MIN_PRIORITY = -1000
540
+ MAX_PRIORITY = 1000
541
+ DEFAULT_PRIORITY = 0
542
+
543
+ GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
544
+ COST_REPORT_DEFAULT_DAYS = 30
545
+
546
+ # The directory for file locks.
547
+ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
548
+
549
+ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
550
+ 'DEBUG_LOOP_LAG_THRESHOLD_MS')
551
+
552
+ ARM64_ARCH = 'arm64'
553
+ X86_64_ARCH = 'x86_64'
554
+
555
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
556
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')