skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,51 +1,133 @@
1
1
  """Restarts skylet if version does not match"""
2
2
 
3
3
  import os
4
+ import signal
4
5
  import subprocess
6
+ from typing import List, Optional, Tuple
7
+
8
+ import psutil
5
9
 
6
10
  from sky.skylet import constants
11
+ from sky.skylet import runtime_utils
12
+
13
+ VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
14
+ SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
15
+ PID_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PID_FILE)
16
+ PORT_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PORT_FILE)
17
+
18
+
19
+ def _is_running_skylet_process(pid: int) -> bool:
20
+ if pid <= 0:
21
+ return False
22
+ try:
23
+ process = psutil.Process(pid)
24
+ if not process.is_running():
25
+ return False
26
+ # Check if command line contains the skylet module identifier
27
+ cmdline = process.cmdline()
28
+ return any('sky.skylet.skylet' in arg for arg in cmdline)
29
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
30
+ OSError) as e:
31
+ print(f'Error checking if skylet process {pid} is running: {e}')
32
+ return False
33
+
34
+
35
+ def _find_running_skylet_pids() -> List[int]:
36
+ if os.path.exists(PID_FILE):
37
+ try:
38
+ with open(PID_FILE, 'r', encoding='utf-8') as pid_file:
39
+ pid = int(pid_file.read().strip())
40
+ if _is_running_skylet_process(pid):
41
+ return [pid]
42
+ except (OSError, ValueError, IOError) as e:
43
+ # Don't fallback to grep-based detection as the existence of the
44
+ # PID file implies that we are on the new version, and there is
45
+ # possibility of there being multiple skylet processes running,
46
+ # and we don't want to accidentally kill the wrong skylet(s).
47
+ print(f'Error reading PID file {PID_FILE}: {e}')
48
+ return []
49
+ else:
50
+ # Fall back to grep-based detection for backward compatibility.
51
+ pids = []
52
+ # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
53
+ # because need to handle the backward compatibility of the old skylet
54
+ # started before #3326, which does not use the full path to python.
55
+ proc = subprocess.run(
56
+ 'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
57
+ shell=True,
58
+ check=False,
59
+ capture_output=True,
60
+ text=True)
61
+ if proc.returncode == 0:
62
+ # Parse the output to extract PIDs (column 2)
63
+ for line in proc.stdout.strip().split('\n'):
64
+ if line:
65
+ parts = line.split()
66
+ if len(parts) >= 2:
67
+ try:
68
+ pids.append(int(parts[1]))
69
+ except ValueError:
70
+ continue
71
+ return pids
72
+
73
+
74
+ def _check_version_match() -> Tuple[bool, Optional[str]]:
75
+ """Check if the version file matches the current skylet version.
7
76
 
8
- VERSION_FILE = os.path.expanduser(constants.SKYLET_VERSION_FILE)
77
+ Returns:
78
+ Tuple of (version_match: bool, version: str or None)
79
+ """
80
+ version: Optional[str] = None
81
+ if os.path.exists(VERSION_FILE):
82
+ try:
83
+ with open(VERSION_FILE, 'r', encoding='utf-8') as f:
84
+ version = f.read().strip()
85
+ return version == constants.SKYLET_VERSION, version
86
+ except (OSError, IOError):
87
+ pass
88
+ return False, version
9
89
 
10
90
 
11
91
  def restart_skylet():
12
92
  # Kills old skylet if it is running.
13
93
  # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
14
94
  # skylet to exit, instead of directly killing it.
15
- subprocess.run(
16
- # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
17
- # because need to handle the backward compatibility of the old skylet
18
- # started before #3326, which does not use the full path to python.
19
- 'ps aux | grep "sky.skylet.skylet" | grep " -m "'
20
- '| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1',
21
- shell=True,
22
- check=False)
95
+
96
+ # Find and kill running skylet processes
97
+ for pid in _find_running_skylet_pids():
98
+ try:
99
+ os.kill(pid, signal.SIGKILL)
100
+ except (OSError, ProcessLookupError):
101
+ # Process died between detection and kill
102
+ pass
103
+ # Clean up the PID file
104
+ try:
105
+ os.remove(PID_FILE)
106
+ except OSError:
107
+ pass # Best effort cleanup
108
+
109
+ port = constants.SKYLET_GRPC_PORT
23
110
  subprocess.run(
24
111
  # We have made sure that `attempt_skylet.py` is executed with the
25
112
  # skypilot runtime env activated, so that skylet can access the cloud
26
113
  # CLI tools.
27
- f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet'
28
- ' >> ~/.sky/skylet.log 2>&1 &',
114
+ f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet '
115
+ f'--port={port} '
116
+ f'>> {SKYLET_LOG_FILE} 2>&1 & echo $! > {PID_FILE}',
29
117
  shell=True,
30
118
  check=True)
119
+
120
+ with open(PORT_FILE, 'w', encoding='utf-8') as pf:
121
+ pf.write(str(port))
122
+
31
123
  with open(VERSION_FILE, 'w', encoding='utf-8') as v_f:
32
124
  v_f.write(constants.SKYLET_VERSION)
33
125
 
34
126
 
35
- proc = subprocess.run(
36
- 'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
37
- shell=True,
38
- check=False)
39
-
40
- running = (proc.returncode == 0)
127
+ # Check if our skylet is running
128
+ running = bool(_find_running_skylet_pids())
41
129
 
42
- version_match = False
43
- found_version = None
44
- if os.path.exists(VERSION_FILE):
45
- with open(VERSION_FILE, 'r', encoding='utf-8') as f:
46
- found_version = f.read().strip()
47
- if found_version == constants.SKYLET_VERSION:
48
- version_match = True
130
+ version_match, found_version = _check_version_match()
49
131
 
50
132
  version_string = (f' (found version {found_version}, new version '
51
133
  f'{constants.SKYLET_VERSION})')
@@ -1,6 +1,8 @@
1
1
  """Autostop utilities."""
2
+ import enum
2
3
  import pickle
3
4
  import shlex
5
+ import subprocess
4
6
  import time
5
7
  import typing
6
8
  from typing import List, Optional
@@ -10,11 +12,17 @@ from sky.adaptors import common as adaptors_common
10
12
  from sky.skylet import configs
11
13
  from sky.skylet import constants
12
14
  from sky.utils import message_utils
15
+ from sky.utils import ux_utils
13
16
 
14
17
  if typing.TYPE_CHECKING:
15
18
  import psutil
19
+
20
+ from sky.schemas.generated import autostopv1_pb2
16
21
  else:
17
22
  psutil = adaptors_common.LazyImport('psutil')
23
+ # To avoid requiring protobuf to be installed on the client side.
24
+ autostopv1_pb2 = adaptors_common.LazyImport(
25
+ 'sky.schemas.generated.autostopv1_pb2')
18
26
 
19
27
  logger = sky_logging.init_logger(__name__)
20
28
 
@@ -30,6 +38,83 @@ _AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
30
38
  _AUTOSTOP_INDICATOR = 'autostop_indicator'
31
39
 
32
40
 
41
+ class AutostopWaitFor(enum.Enum):
42
+ """Enum for the Autostop behaviour.
43
+
44
+ JOBS: Wait for jobs to finish.
45
+ JOBS_AND_SSH: Wait for jobs to finish and all SSH sessions to be closed.
46
+ NONE: Unconditionally stop the cluster after the idle time.
47
+ """
48
+ JOBS_AND_SSH = 'jobs_and_ssh'
49
+ JOBS = 'jobs'
50
+ NONE = 'none'
51
+
52
+ @classmethod
53
+ def supported_modes(cls) -> List[str]:
54
+ return [mode.value for mode in cls]
55
+
56
+ @classmethod
57
+ def cli_help_message(cls, pair: str) -> str:
58
+ return f"""\
59
+ Determines the condition for resetting the idleness timer.
60
+ This option works in conjunction with ``--{pair}``. Options:
61
+
62
+ \b
63
+ 1. ``jobs_and_ssh`` (default): Wait for in-progress jobs and SSH connections to finish.
64
+ 2. ``jobs``: Only wait for in-progress jobs.
65
+ 3. ``none``: Wait for nothing; autostop right after ``{pair}``."""
66
+
67
+ @classmethod
68
+ def from_str(cls, mode: str) -> 'AutostopWaitFor':
69
+ """Returns the enum value for the given string."""
70
+ if mode.lower() == cls.JOBS.value:
71
+ return cls.JOBS
72
+ elif mode.lower() == cls.JOBS_AND_SSH.value:
73
+ return cls.JOBS_AND_SSH
74
+ elif mode.lower() == cls.NONE.value:
75
+ return cls.NONE
76
+ else:
77
+ with ux_utils.print_exception_no_traceback():
78
+ raise ValueError(f'Unsupported autostop wait mode: '
79
+ f'{mode}. The mode must be either '
80
+ f'\'{cls.JOBS_AND_SSH.value}\', '
81
+ f'\'{cls.JOBS.value}\', or '
82
+ f'\'{cls.NONE.value}\'. ')
83
+
84
+ @classmethod
85
+ def from_protobuf(
86
+ cls, protobuf_value: 'autostopv1_pb2.AutostopWaitFor'
87
+ ) -> Optional['AutostopWaitFor']:
88
+ """Convert protobuf AutostopWaitFor enum to Python enum value."""
89
+ protobuf_to_enum = {
90
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: cls.JOBS_AND_SSH,
91
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS: cls.JOBS,
92
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE: cls.NONE,
93
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED: None,
94
+ }
95
+ if protobuf_value not in protobuf_to_enum:
96
+ with ux_utils.print_exception_no_traceback():
97
+ raise ValueError(
98
+ f'Unknown protobuf AutostopWaitFor value: {protobuf_value}')
99
+ return protobuf_to_enum[protobuf_value]
100
+
101
+ def to_protobuf(self) -> 'autostopv1_pb2.AutostopWaitFor':
102
+ """Convert this Python enum value to protobuf enum value."""
103
+ enum_to_protobuf = {
104
+ AutostopWaitFor.JOBS_AND_SSH:
105
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS_AND_SSH,
106
+ AutostopWaitFor.JOBS: autostopv1_pb2.AUTOSTOP_WAIT_FOR_JOBS,
107
+ AutostopWaitFor.NONE: autostopv1_pb2.AUTOSTOP_WAIT_FOR_NONE,
108
+ }
109
+ if self not in enum_to_protobuf:
110
+ with ux_utils.print_exception_no_traceback():
111
+ raise ValueError(f'Unknown AutostopWaitFor value: {self}')
112
+ return enum_to_protobuf[self]
113
+
114
+
115
+ DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
116
+
117
+
33
118
  class AutostopConfig:
34
119
  """Autostop configuration."""
35
120
 
@@ -37,12 +122,14 @@ class AutostopConfig:
37
122
  autostop_idle_minutes: int,
38
123
  boot_time: float,
39
124
  backend: Optional[str],
125
+ wait_for: AutostopWaitFor,
40
126
  down: bool = False):
41
127
  assert autostop_idle_minutes < 0 or backend is not None, (
42
128
  autostop_idle_minutes, backend)
43
129
  self.autostop_idle_minutes = autostop_idle_minutes
44
130
  self.boot_time = boot_time
45
131
  self.backend = backend
132
+ self.wait_for = wait_for
46
133
  self.down = down
47
134
 
48
135
  def __setstate__(self, state: dict):
@@ -53,15 +140,18 @@ class AutostopConfig:
53
140
  def get_autostop_config() -> AutostopConfig:
54
141
  config_str = configs.get_config(_AUTOSTOP_CONFIG_KEY)
55
142
  if config_str is None:
56
- return AutostopConfig(-1, -1, None)
143
+ return AutostopConfig(-1, -1, None, DEFAULT_AUTOSTOP_WAIT_FOR)
57
144
  return pickle.loads(config_str)
58
145
 
59
146
 
60
- def set_autostop(idle_minutes: int, backend: Optional[str], down: bool) -> None:
147
+ def set_autostop(idle_minutes: int, backend: Optional[str],
148
+ wait_for: AutostopWaitFor, down: bool) -> None:
61
149
  boot_time = psutil.boot_time()
62
- autostop_config = AutostopConfig(idle_minutes, boot_time, backend, down)
150
+ autostop_config = AutostopConfig(idle_minutes, boot_time, backend, wait_for,
151
+ down)
63
152
  configs.set_config(_AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config))
64
- logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}.')
153
+ logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}, '
154
+ f'wait_for {wait_for.value}.')
65
155
  # Reset timer whenever an autostop setting is submitted, i.e. the idle
66
156
  # time will be counted from now.
67
157
  set_last_active_time_to_now()
@@ -107,6 +197,28 @@ def set_last_active_time_to_now() -> None:
107
197
  configs.set_config(_AUTOSTOP_LAST_ACTIVE_TIME, str(time.time()))
108
198
 
109
199
 
200
+ def has_active_ssh_sessions() -> bool:
201
+ """Returns True if there are any active SSH sessions on the node."""
202
+ try:
203
+ # /dev/pts is a virtual filesystem that contains the pseudo-terminal
204
+ # devices. ptmx is the pseudo-terminal multiplexer, which is the
205
+ # "master" device that creates new pseudo-terminal devices, so we
206
+ # exclude it from the count.
207
+ proc = subprocess.run('ls /dev/pts | grep -v ptmx | wc -l',
208
+ capture_output=True,
209
+ text=True,
210
+ check=False,
211
+ shell=True)
212
+ if proc.returncode != 0:
213
+ logger.warning(f'SSH session check command failed with return code '
214
+ f'{proc.returncode}.')
215
+ return False
216
+ return int(proc.stdout.strip()) > 0
217
+ except Exception as e: # pylint: disable=broad-except
218
+ logger.warning(f'Error checking active SSH sessions: {e}.')
219
+ return False
220
+
221
+
110
222
  class AutostopCodeGen:
111
223
  """Code generator for autostop utility functions.
112
224
 
@@ -114,13 +226,22 @@ class AutostopCodeGen:
114
226
 
115
227
  >> codegen = AutostopCodeGen.set_autostop(...)
116
228
  """
117
- _PREFIX = ['from sky.skylet import autostop_lib']
229
+ _PREFIX = ['from sky.skylet import autostop_lib, constants']
118
230
 
119
231
  @classmethod
120
- def set_autostop(cls, idle_minutes: int, backend: str, down: bool) -> str:
232
+ def set_autostop(cls,
233
+ idle_minutes: int,
234
+ backend: str,
235
+ wait_for: Optional[AutostopWaitFor],
236
+ down: bool = False) -> str:
237
+ if wait_for is None:
238
+ wait_for = DEFAULT_AUTOSTOP_WAIT_FOR
121
239
  code = [
122
- f'autostop_lib.set_autostop({idle_minutes}, {backend!r},'
123
- f' {down})',
240
+ f'\nif getattr(constants, "SKYLET_LIB_VERSION", 1) < 4: '
241
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, {down})'
242
+ f'\nelse: '
243
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, '
244
+ f'autostop_lib.{wait_for}, {down})',
124
245
  ]
125
246
  return cls._build(code)
126
247
 
sky/skylet/configs.py CHANGED
@@ -2,17 +2,17 @@
2
2
  import functools
3
3
  import os
4
4
  import pathlib
5
+ import threading
5
6
  from typing import Callable, Optional, Union
6
7
 
7
- from sky.utils import db_utils
8
+ from sky.skylet import runtime_utils
9
+ from sky.utils.db import db_utils
8
10
 
9
- _DB_PATH = os.path.expanduser('~/.sky/skylet_config.db')
10
- os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
11
+ _DB_PATH = None
12
+ _db_init_lock = threading.Lock()
11
13
 
12
- _table_created = False
13
14
 
14
-
15
- def ensure_table(func: Callable):
15
+ def init_db(func: Callable):
16
16
  """Ensure the table exists before calling the function.
17
17
 
18
18
  Since this module will be imported whenever `sky` is imported (due to
@@ -24,25 +24,33 @@ def ensure_table(func: Callable):
24
24
 
25
25
  @functools.wraps(func)
26
26
  def wrapper(*args, **kwargs):
27
- global _table_created
28
- if not _table_created:
29
- with db_utils.safe_cursor(
30
- _DB_PATH) as c: # Call it 'c' to avoid pylint complaining.
31
- # Use WAL mode to avoid locking problem in #1507.
32
- # Reference: https://stackoverflow.com/a/39265148
33
- c.execute('PRAGMA journal_mode=WAL')
34
- c.execute("""\
35
- CREATE TABLE IF NOT EXISTS config (
36
- key TEXT PRIMARY KEY,
37
- value TEXT)""")
38
- _table_created = True
27
+ global _DB_PATH
28
+ if _DB_PATH is not None:
29
+ return func(*args, **kwargs)
30
+
31
+ with _db_init_lock:
32
+ if _DB_PATH is None:
33
+ _DB_PATH = runtime_utils.get_runtime_dir_path(
34
+ '.sky/skylet_config.db')
35
+ os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
36
+ with db_utils.safe_cursor(
37
+ _DB_PATH
38
+ ) as c: # Call it 'c' to avoid pylint complaining.
39
+ # Use WAL mode to avoid locking problem in #1507.
40
+ # Reference: https://stackoverflow.com/a/39265148
41
+ c.execute('PRAGMA journal_mode=WAL')
42
+ c.execute("""\
43
+ CREATE TABLE IF NOT EXISTS config (
44
+ key TEXT PRIMARY KEY,
45
+ value TEXT)""")
39
46
  return func(*args, **kwargs)
40
47
 
41
48
  return wrapper
42
49
 
43
50
 
44
- @ensure_table
51
+ @init_db
45
52
  def get_config(key: str) -> Optional[bytes]:
53
+ assert _DB_PATH is not None
46
54
  with db_utils.safe_cursor(_DB_PATH) as cursor:
47
55
  rows = cursor.execute('SELECT value FROM config WHERE key = ?', (key,))
48
56
  for (value,) in rows:
@@ -50,8 +58,9 @@ def get_config(key: str) -> Optional[bytes]:
50
58
  return None
51
59
 
52
60
 
53
- @ensure_table
61
+ @init_db
54
62
  def set_config(key: str, value: Union[bytes, str]) -> None:
63
+ assert _DB_PATH is not None
55
64
  with db_utils.safe_cursor(_DB_PATH) as cursor:
56
65
  cursor.execute(
57
66
  """\