skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -11,6 +11,7 @@ from sky.utils import common_utils
11
11
  from sky.utils import log_utils
12
12
  from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
+ from sky.utils import ux_utils
14
15
 
15
16
  if typing.TYPE_CHECKING:
16
17
  from sky.provision.kubernetes import utils as kubernetes_utils
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
105
106
 
106
107
  if query_clusters:
107
108
  cluster_names = {record['name'] for record in cluster_records}
108
- not_found_clusters = [
109
- repr(cluster)
110
- for cluster in query_clusters
111
- if cluster not in cluster_names
112
- ]
109
+ not_found_clusters = ux_utils.get_non_matched_query(
110
+ query_clusters, cluster_names)
111
+ not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
113
112
  if not_found_clusters:
114
113
  cluster_str = 'Cluster'
115
114
  if len(not_found_clusters) > 1:
@@ -283,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
283
282
  if resources_str_full is not None:
284
283
  resources_str = resources_str_full
285
284
  if resources_str is None:
286
- resources_str = resources_utils.get_readable_resources_repr(
287
- handle, simplify=truncate)
285
+ resources_str_simple, resources_str_full = (
286
+ resources_utils.get_readable_resources_repr(
287
+ handle, simplified_only=truncate))
288
+ if truncate:
289
+ resources_str = resources_str_simple
290
+ else:
291
+ assert resources_str_full is not None
292
+ resources_str = resources_str_full
288
293
 
289
294
  return resources_str
290
295
  return '-'
@@ -144,6 +144,9 @@ class SSHConfigHelper(object):
144
144
  username = docker_user
145
145
 
146
146
  key_path = cls.generate_local_key_file(cluster_name, auth_config)
147
+ # Keep the unexpanded path for SSH config (with ~)
148
+ key_path_for_config = key_path
149
+ # Expand the path for internal operations that need absolute path
147
150
  key_path = os.path.expanduser(key_path)
148
151
  sky_autogen_comment = ('# Added by sky (use `sky stop/down '
149
152
  f'{cluster_name}` to remove)')
@@ -190,11 +193,29 @@ class SSHConfigHelper(object):
190
193
  proxy_command = auth_config.get('ssh_proxy_command', None)
191
194
 
192
195
  docker_proxy_command_generator = None
196
+ proxy_command_for_nodes = proxy_command
193
197
  if docker_user is not None:
194
- docker_proxy_command_generator = lambda ip, port: ' '.join(
195
- ['ssh'] + command_runner.ssh_options_list(
196
- key_path, ssh_control_name=None, port=port) +
197
- ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
198
+
199
+ def _docker_proxy_cmd(ip: str, port: int) -> str:
200
+ inner_proxy = proxy_command
201
+ inner_port = port or 22
202
+ if inner_proxy is not None:
203
+ inner_proxy = inner_proxy.replace('%h', ip)
204
+ inner_proxy = inner_proxy.replace('%p', str(inner_port))
205
+ return ' '.join(['ssh'] + command_runner.ssh_options_list(
206
+ key_path,
207
+ ssh_control_name=None,
208
+ ssh_proxy_command=inner_proxy,
209
+ port=inner_port,
210
+ # ProxyCommand (ssh -W) is a forwarding tunnel, not an
211
+ # interactive session. ControlMaster would cache these
212
+ # processes, causing them to hang and block subsequent
213
+ # connections. Each ProxyCommand should be ephemeral.
214
+ disable_control_master=True
215
+ ) + ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
216
+
217
+ docker_proxy_command_generator = _docker_proxy_cmd
218
+ proxy_command_for_nodes = None
198
219
 
199
220
  codegen = ''
200
221
  # Add the nodes to the codegen
@@ -208,8 +229,9 @@ class SSHConfigHelper(object):
208
229
  node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
209
230
  # TODO(romilb): Update port number when k8s supports multinode
210
231
  codegen += cls._get_generated_config(
211
- sky_autogen_comment, node_name, ip, username, key_path,
212
- proxy_command, port, docker_proxy_command) + '\n'
232
+ sky_autogen_comment, node_name, ip, username,
233
+ key_path_for_config, proxy_command_for_nodes, port,
234
+ docker_proxy_command) + '\n'
213
235
 
214
236
  cluster_config_path = os.path.expanduser(
215
237
  cls.ssh_cluster_path.format(cluster_name))
@@ -3,6 +3,7 @@ import enum
3
3
  import hashlib
4
4
  import os
5
5
  import pathlib
6
+ import re
6
7
  import shlex
7
8
  import sys
8
9
  import time
@@ -13,6 +14,7 @@ from sky import exceptions
13
14
  from sky import sky_logging
14
15
  from sky.skylet import constants
15
16
  from sky.skylet import log_lib
17
+ from sky.utils import auth_utils
16
18
  from sky.utils import common_utils
17
19
  from sky.utils import context_utils
18
20
  from sky.utils import control_master_utils
@@ -22,6 +24,9 @@ from sky.utils import timeline
22
24
 
23
25
  logger = sky_logging.init_logger(__name__)
24
26
 
27
+ # Pattern to extract home directory from command output
28
+ _HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
29
+
25
30
  # Rsync options
26
31
  # TODO(zhwu): This will print a per-file progress bar (with -P),
27
32
  # shooting a lot of messages to the output. --info=progress2 is used
@@ -183,17 +188,25 @@ class CommandRunner:
183
188
  return '-'.join(str(x) for x in self.node)
184
189
 
185
190
  def _get_remote_home_dir(self) -> str:
186
- # Use `echo ~` to get the remote home directory, instead of pwd or
187
- # echo $HOME, because pwd can be `/` when the remote user is root
188
- # and $HOME is not always set.
189
- rc, remote_home_dir, stderr = self.run('echo ~',
190
- require_outputs=True,
191
- separate_stderr=True,
192
- stream_logs=False)
191
+ # Use pattern matching to extract home directory.
192
+ # Some container images print MOTD when login shells start, which can
193
+ # contaminate command output. We use a unique pattern to extract the
194
+ # actual home directory reliably.
195
+ rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
196
+ require_outputs=True,
197
+ separate_stderr=True,
198
+ stream_logs=False)
193
199
  if rc != 0:
194
200
  raise ValueError('Failed to get remote home directory: '
195
- f'{remote_home_dir + stderr}')
196
- remote_home_dir = remote_home_dir.strip()
201
+ f'{output + stderr}')
202
+
203
+ # Extract home directory using pattern matching
204
+ home_dir_match = _HOME_DIR_PATTERN.search(output)
205
+ if home_dir_match:
206
+ remote_home_dir = home_dir_match.group(1)
207
+ else:
208
+ raise ValueError('Failed to find remote home directory identifier: '
209
+ f'{output + stderr}')
197
210
  return remote_home_dir
198
211
 
199
212
  def _get_command_to_run(
@@ -414,7 +427,6 @@ class CommandRunner:
414
427
  SkyPilot but we still want to get rid of some warning messages,
415
428
  such as SSH warnings.
416
429
 
417
-
418
430
  Returns:
419
431
  returncode
420
432
  or
@@ -469,15 +481,19 @@ class CommandRunner:
469
481
  """Close the cached connection to the remote machine."""
470
482
  pass
471
483
 
472
- def port_forward_command(self,
473
- port_forward: List[Tuple[int, int]],
474
- connect_timeout: int = 1) -> List[str]:
484
+ def port_forward_command(
485
+ self,
486
+ port_forward: List[Tuple[int, int]],
487
+ connect_timeout: int = 1,
488
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
475
489
  """Command for forwarding ports from localhost to the remote machine.
476
490
 
477
491
  Args:
478
492
  port_forward: A list of ports to forward from the localhost to the
479
493
  remote host.
480
494
  connect_timeout: The timeout for the connection.
495
+ ssh_mode: The mode to use for ssh.
496
+ See SshMode for more details.
481
497
  """
482
498
  raise NotImplementedError
483
499
 
@@ -592,6 +608,7 @@ class SSHCommandRunner(CommandRunner):
592
608
  ssh_proxy_command: Optional[str] = None,
593
609
  docker_user: Optional[str] = None,
594
610
  disable_control_master: Optional[bool] = False,
611
+ port_forward_execute_remote_command: Optional[bool] = False,
595
612
  ):
596
613
  """Initialize SSHCommandRunner.
597
614
 
@@ -618,6 +635,10 @@ class SSHCommandRunner(CommandRunner):
618
635
  disable_control_master: bool; specifies whether or not the ssh
619
636
  command will utilize ControlMaster. We currently disable
620
637
  it for k8s instance.
638
+ port_forward_execute_remote_command: bool; specifies whether to
639
+ omit -N from the port forwarding command. This is useful if you
640
+ want to run a command on the remote machine to make sure the
641
+ SSH tunnel is established.
621
642
  """
622
643
  super().__init__(node)
623
644
  ip, port = node
@@ -629,39 +650,63 @@ class SSHCommandRunner(CommandRunner):
629
650
  self.disable_control_master = (
630
651
  disable_control_master or
631
652
  control_master_utils.should_disable_control_master())
653
+ # ensure the ssh key files are created from the database
654
+ auth_utils.create_ssh_key_files_from_db(ssh_private_key)
632
655
  if docker_user is not None:
633
656
  assert port is None or port == 22, (
634
657
  f'port must be None or 22 for docker_user, got {port}.')
635
- # Already checked in resources
636
- assert ssh_proxy_command is None, (
637
- 'ssh_proxy_command is not supported when using docker.')
658
+ # When connecting via docker, the outer SSH hop points to the
659
+ # container's sshd (localhost). Preserve the user proxy for the
660
+ # inner hop that reaches the host VM, and clear the outer proxy to
661
+ # avoid forwarding localhost through the jump host.
662
+ inner_proxy_command = ssh_proxy_command
663
+ inner_proxy_port = port or 22
664
+ self._ssh_proxy_command = None
638
665
  self.ip = 'localhost'
639
666
  self.ssh_user = docker_user
640
667
  self.port = constants.DEFAULT_DOCKER_PORT
668
+ if inner_proxy_command is not None:
669
+ # Replace %h/%p placeholders with actual host values, since the
670
+ # final destination from the perspective of the user proxy is
671
+ # the host VM (ip, inner_proxy_port).
672
+ inner_proxy_command = inner_proxy_command.replace('%h', ip)
673
+ inner_proxy_command = inner_proxy_command.replace(
674
+ '%p', str(inner_proxy_port))
641
675
  self._docker_ssh_proxy_command = lambda ssh: ' '.join(
642
- ssh + ssh_options_list(ssh_private_key, None
643
- ) + ['-W', '%h:%p', f'{ssh_user}@{ip}'])
676
+ ssh + ssh_options_list(ssh_private_key,
677
+ None,
678
+ ssh_proxy_command=inner_proxy_command,
679
+ port=inner_proxy_port,
680
+ disable_control_master=self.
681
+ disable_control_master) +
682
+ ['-W', '%h:%p', f'{ssh_user}@{ip}'])
644
683
  else:
645
684
  self.ip = ip
646
685
  self.ssh_user = ssh_user
647
686
  self.port = port
648
687
  self._docker_ssh_proxy_command = None
688
+ self.port_forward_execute_remote_command = (
689
+ port_forward_execute_remote_command)
649
690
 
650
- def port_forward_command(self,
651
- port_forward: List[Tuple[int, int]],
652
- connect_timeout: int = 1) -> List[str]:
691
+ def port_forward_command(
692
+ self,
693
+ port_forward: List[Tuple[int, int]],
694
+ connect_timeout: int = 1,
695
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
653
696
  """Command for forwarding ports from localhost to the remote machine.
654
697
 
655
698
  Args:
656
699
  port_forward: A list of ports to forward from the local port to the
657
700
  remote port.
658
701
  connect_timeout: The timeout for the ssh connection.
702
+ ssh_mode: The mode to use for ssh.
703
+ See SshMode for more details.
659
704
 
660
705
  Returns:
661
706
  The command for forwarding ports from localhost to the remote
662
707
  machine.
663
708
  """
664
- return self.ssh_base_command(ssh_mode=SshMode.INTERACTIVE,
709
+ return self.ssh_base_command(ssh_mode=ssh_mode,
665
710
  port_forward=port_forward,
666
711
  connect_timeout=connect_timeout)
667
712
 
@@ -680,7 +725,11 @@ class SSHCommandRunner(CommandRunner):
680
725
  for local, remote in port_forward:
681
726
  logger.debug(
682
727
  f'Forwarding local port {local} to remote port {remote}.')
683
- ssh += ['-NL', f'{local}:localhost:{remote}']
728
+ if self.port_forward_execute_remote_command:
729
+ ssh += ['-L']
730
+ else:
731
+ ssh += ['-NL']
732
+ ssh += [f'{local}:localhost:{remote}']
684
733
  if self._docker_ssh_proxy_command is not None:
685
734
  docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
686
735
  else:
@@ -894,9 +943,11 @@ class KubernetesCommandRunner(CommandRunner):
894
943
  else:
895
944
  return f'pod/{self.pod_name}'
896
945
 
897
- def port_forward_command(self,
898
- port_forward: List[Tuple[int, int]],
899
- connect_timeout: int = 1) -> List[str]:
946
+ def port_forward_command(
947
+ self,
948
+ port_forward: List[Tuple[int, int]],
949
+ connect_timeout: int = 1,
950
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
900
951
  """Command for forwarding ports from localhost to the remote machine.
901
952
 
902
953
  Args:
@@ -904,14 +955,25 @@ class KubernetesCommandRunner(CommandRunner):
904
955
  remote port. Currently, only one port is supported, i.e. the
905
956
  list should have only one element.
906
957
  connect_timeout: The timeout for the ssh connection.
958
+ ssh_mode: The mode to use for ssh.
959
+ See SshMode for more details.
907
960
  """
961
+ del ssh_mode # unused
908
962
  assert port_forward and len(port_forward) == 1, (
909
963
  'Only one port is supported for Kubernetes port-forward.')
910
964
  kubectl_args = [
911
965
  '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
912
966
  ]
967
+ # The same logic to either set `--context` to the k8s context where
968
+ # the sky cluster is hosted, or `--kubeconfig` to /dev/null for
969
+ # in-cluster k8s is used below in the `run()` method.
913
970
  if self.context:
914
971
  kubectl_args += ['--context', self.context]
972
+ # If context is None, the cluster is hosted on in-cluster k8s. In this
973
+ # case, pass `--kubeconfig /dev/null` so kubectl does not look for the
974
+ # cluster in whatever active context is set in the kubeconfig.
975
+ else:
976
+ kubectl_args += ['--kubeconfig', '/dev/null']
915
977
  local_port, remote_port = port_forward[0]
916
978
  local_port_str = f'{local_port}' if local_port is not None else ''
917
979
 
@@ -967,7 +1029,6 @@ class KubernetesCommandRunner(CommandRunner):
967
1029
  SkyPilot but we still want to get rid of some warning messages,
968
1030
  such as SSH warnings.
969
1031
 
970
-
971
1032
  Returns:
972
1033
  returncode
973
1034
  or
@@ -36,9 +36,9 @@ def ssh_options_list(
36
36
 
37
37
 
38
38
  class SshMode(enum.Enum):
39
- NON_INTERACTIVE: int
40
- INTERACTIVE: int
41
- LOGIN: int
39
+ NON_INTERACTIVE = ...
40
+ INTERACTIVE = ...
41
+ LOGIN = ...
42
42
 
43
43
 
44
44
  class CommandRunner:
@@ -106,6 +106,13 @@ class CommandRunner:
106
106
  max_retry: int = ...) -> None:
107
107
  ...
108
108
 
109
+ def port_forward_command(
110
+ self,
111
+ port_forward: List[Tuple[int, int]],
112
+ connect_timeout: int = 1,
113
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
114
+ ...
115
+
109
116
  @classmethod
110
117
  def make_runner_list(cls: typing.Type[CommandRunner],
111
118
  node_list: Iterable[Tuple[Any, ...]],
@@ -127,6 +134,7 @@ class SSHCommandRunner(CommandRunner):
127
134
  ssh_control_name: Optional[str]
128
135
  docker_user: str
129
136
  disable_control_master: Optional[bool]
137
+ port_forward_execute_remote_command: Optional[bool]
130
138
 
131
139
  def __init__(
132
140
  self,
@@ -134,8 +142,10 @@ class SSHCommandRunner(CommandRunner):
134
142
  ssh_user: str,
135
143
  ssh_private_key: str,
136
144
  ssh_control_name: Optional[str] = ...,
145
+ ssh_proxy_command: Optional[str] = ...,
137
146
  docker_user: Optional[str] = ...,
138
147
  disable_control_master: Optional[bool] = ...,
148
+ port_forward_execute_remote_command: Optional[bool] = ...,
139
149
  ) -> None:
140
150
  ...
141
151
 
@@ -190,6 +200,15 @@ class SSHCommandRunner(CommandRunner):
190
200
  **kwargs) -> Union[Tuple[int, str, str], int]:
191
201
  ...
192
202
 
203
+ def ssh_base_command(
204
+ self,
205
+ *,
206
+ ssh_mode: SshMode,
207
+ port_forward: Optional[List[Tuple[int, int]]],
208
+ connect_timeout: Optional[int],
209
+ ) -> List[str]:
210
+ ...
211
+
193
212
  def rsync(self,
194
213
  source: str,
195
214
  target: str,
@@ -200,6 +219,13 @@ class SSHCommandRunner(CommandRunner):
200
219
  max_retry: int = ...) -> None:
201
220
  ...
202
221
 
222
+ def port_forward_command(
223
+ self,
224
+ port_forward: List[Tuple[int, int]],
225
+ connect_timeout: int = 1,
226
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
227
+ ...
228
+
203
229
 
204
230
  class KubernetesCommandRunner(CommandRunner):
205
231
 
@@ -272,6 +298,13 @@ class KubernetesCommandRunner(CommandRunner):
272
298
  max_retry: int = ...) -> None:
273
299
  ...
274
300
 
301
+ def port_forward_command(
302
+ self,
303
+ port_forward: List[Tuple[int, int]],
304
+ connect_timeout: int = 1,
305
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
306
+ ...
307
+
275
308
 
276
309
  class LocalProcessCommandRunner(CommandRunner):
277
310
 
sky/utils/common.py CHANGED
@@ -31,7 +31,7 @@ JOB_CONTROLLER_NAME: str
31
31
  def refresh_server_id() -> None:
32
32
  """Refresh the server id.
33
33
 
34
- This function is used to ensure the server id is read from the authorative
34
+ This function is used to ensure the server id is read from the authoritative
35
35
  source.
36
36
  """
37
37
  global SERVER_ID
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
42
42
  JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
43
43
 
44
44
 
45
+ # TODO(kevin): Remove this side effect and have callers call
46
+ # refresh_server_id() explicitly as needed.
45
47
  refresh_server_id()
46
48
 
47
49
 
sky/utils/common_utils.py CHANGED
@@ -1,8 +1,10 @@
1
1
  """Utils shared between all of sky"""
2
2
 
3
+ import ctypes
3
4
  import difflib
4
5
  import enum
5
6
  import functools
7
+ import gc
6
8
  import getpass
7
9
  import hashlib
8
10
  import inspect
@@ -263,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
263
265
 
264
266
  class Backoff:
265
267
  """Exponential backoff with jittering."""
266
- MULTIPLIER = 1.6
267
268
  JITTER = 0.4
268
269
 
269
- def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
270
+ def __init__(self,
271
+ initial_backoff: float = 5,
272
+ max_backoff_factor: int = 5,
273
+ multiplier: float = 1.6):
270
274
  self._initial = True
271
275
  self._backoff = 0.0
272
276
  self._initial_backoff = initial_backoff
277
+ self._multiplier = multiplier
273
278
  self._max_backoff = max_backoff_factor * self._initial_backoff
274
279
 
275
280
  # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -281,7 +286,7 @@ class Backoff:
281
286
  self._initial = False
282
287
  self._backoff = min(self._initial_backoff, self._max_backoff)
283
288
  else:
284
- self._backoff = min(self._backoff * self.MULTIPLIER,
289
+ self._backoff = min(self._backoff * self._multiplier,
285
290
  self._max_backoff)
286
291
  self._backoff += random.uniform(-self.JITTER * self._backoff,
287
292
  self.JITTER * self._backoff)
@@ -994,7 +999,17 @@ def get_mem_size_gb() -> float:
994
999
  except ValueError as e:
995
1000
  with ux_utils.print_exception_no_traceback():
996
1001
  raise ValueError(
997
- f'Failed to parse the memory size from {mem_size}') from e
1002
+ f'Failed to parse the memory size from {mem_size} (GB)'
1003
+ ) from e
1004
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
1005
+ if mem_size is not None:
1006
+ try:
1007
+ return float(mem_size) / (1024**3)
1008
+ except ValueError as e:
1009
+ with ux_utils.print_exception_no_traceback():
1010
+ raise ValueError(
1011
+ f'Failed to parse the memory size from {mem_size} (bytes)'
1012
+ ) from e
998
1013
  return _mem_size_gb()
999
1014
 
1000
1015
 
@@ -1090,3 +1105,21 @@ def removeprefix(string: str, prefix: str) -> str:
1090
1105
  if string.startswith(prefix):
1091
1106
  return string[len(prefix):]
1092
1107
  return string
1108
+
1109
+
1110
+ def release_memory():
1111
+ """Release the process memory"""
1112
+ # Make a best effort to release the Python heap so that malloc_trim
1113
+ # is more effective.
1114
+ try:
1115
+ gc.collect()
1116
+ if sys.platform.startswith('linux'):
1117
+ # Will fail on musl (alpine), but at least it works on our
1118
+ # official docker images.
1119
+ libc = ctypes.CDLL('libc.so.6')
1120
+ return libc.malloc_trim(0)
1121
+ return 0
1122
+ except Exception as e: # pylint: disable=broad-except
1123
+ logger.error(f'Failed to release memory: '
1124
+ f'{format_exception(e)}')
1125
+ return 0
sky/utils/config_utils.py CHANGED
@@ -272,7 +272,7 @@ def get_cloud_config_value_from_dict(
272
272
  """
273
273
  input_config = Config(dict_config)
274
274
  region_key = None
275
- if cloud == 'kubernetes':
275
+ if cloud in ('kubernetes', 'ssh'):
276
276
  region_key = 'context_configs'
277
277
  elif cloud in _REGION_CONFIG_CLOUDS:
278
278
  region_key = 'region_configs'
@@ -283,19 +283,6 @@ def get_cloud_config_value_from_dict(
283
283
  keys=(cloud, region_key, region) + keys,
284
284
  default_value=None,
285
285
  override_configs=override_configs)
286
- if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
287
- # TODO (kyuds): Backward compatibility, remove after 0.11.0.
288
- per_context_config = input_config.get_nested(
289
- keys=(cloud, region) + keys,
290
- default_value=None,
291
- override_configs=override_configs)
292
- if per_context_config is not None:
293
- logger.info(
294
- f'{cloud} configuration is using the legacy format. \n'
295
- 'This format will be deprecated after 0.11.0, refer to '
296
- '`https://docs.skypilot.co/en/latest/reference/config.html` ' # pylint: disable=line-too-long
297
- 'for the new format. Please use `region_configs` to specify region specific configuration.'
298
- )
299
286
  # if no override found for specified region
300
287
  general_config = input_config.get_nested(keys=(cloud,) + keys,
301
288
  default_value=default_value,