skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,11 @@
1
1
  """Utility functions for deploying Kubernetes clusters."""
2
2
  import os
3
+ import random
3
4
  import shlex
4
5
  import subprocess
5
- import sys
6
6
  import tempfile
7
- from typing import List, Optional
7
+ import textwrap
8
+ from typing import List, Optional, Tuple
8
9
 
9
10
  import colorama
10
11
 
@@ -19,11 +20,16 @@ from sky.utils import log_utils
19
20
  from sky.utils import rich_utils
20
21
  from sky.utils import subprocess_utils
21
22
  from sky.utils import ux_utils
23
+ from sky.utils.kubernetes import deploy_ssh_node_pools
22
24
 
23
25
  logger = sky_logging.init_logger(__name__)
24
26
 
25
27
  # Default path for Kubernetes configuration file
26
28
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
29
+ DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
30
+ LOCAL_CLUSTER_PORT_RANGE = 100
31
+ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
32
+ LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
27
33
 
28
34
 
29
35
  def check_ssh_cluster_dependencies(
@@ -85,218 +91,178 @@ def deploy_ssh_cluster(cleanup: bool = False,
85
91
  """
86
92
  check_ssh_cluster_dependencies()
87
93
 
88
- # Prepare command to call deploy_remote_cluster.py script
89
- # TODO(romilb): We should move this to a native python method/class call
90
- # instead of invoking a script with subprocess.
91
- path_to_package = os.path.dirname(__file__)
92
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
93
- cwd = os.path.dirname(os.path.abspath(up_script_path))
94
+ action = 'Cleanup' if cleanup else 'Deployment'
95
+ msg_str = f'Initializing SSH Node Pools {action}...'
94
96
 
95
- deploy_command = [sys.executable, up_script_path]
97
+ with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
98
+ try:
99
+ deploy_ssh_node_pools.deploy_clusters(
100
+ infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
101
+ except Exception as e: # pylint: disable=broad-except
102
+ logger.error(str(e))
103
+ with ux_utils.print_exception_no_traceback():
104
+ raise RuntimeError(
105
+ 'Failed to deploy SkyPilot on some Node Pools.') from e
96
106
 
107
+ logger.info('')
97
108
  if cleanup:
98
- deploy_command.append('--cleanup')
99
-
100
- if infra:
101
- deploy_command.extend(['--infra', infra])
109
+ logger.info(
110
+ ux_utils.finishing_message(
111
+ '🎉 SSH Node Pools cleaned up successfully.'))
112
+ else:
113
+ logger.info(
114
+ ux_utils.finishing_message(
115
+ '🎉 SSH Node Pools set up successfully. ',
116
+ follow_up_message=(
117
+ f'Run `{colorama.Style.BRIGHT}'
118
+ f'sky check ssh'
119
+ f'{colorama.Style.RESET_ALL}` to verify access, '
120
+ f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
121
+ f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
102
122
 
103
- # Use the default kubeconfig path if none is provided
104
- kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
105
- deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
106
123
 
107
- # Setup logging paths
108
- run_timestamp = sky_logging.get_run_timestamp()
109
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
110
- 'ssh_up.log')
124
+ def generate_kind_config(port_start: int,
125
+ num_nodes: int = 1,
126
+ gpus: bool = False) -> str:
127
+ """Generate a kind cluster config with ports mapped from host to container
111
128
 
112
- if cleanup:
113
- msg_str = 'Cleaning up SSH Node Pools...'
114
- else:
115
- msg_str = 'Initializing deployment to SSH Node Pools...'
129
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
130
+ Internally, this will map to ports 30000 - 30099
116
131
 
117
- # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
118
- env = os.environ.copy()
119
- env['PYTHONUNBUFFERED'] = '1'
132
+ Args:
133
+ path: Path to generate the config file at
134
+ port_start: Port range start for mappings
135
+ num_nodes: Number of nodes in the cluster
136
+ gpus: If true, initialize kind cluster with GPU support
120
137
 
121
- with rich_utils.safe_status(
122
- ux_utils.spinner_message(msg_str, log_path=log_path,
123
- is_local=True)):
124
- returncode, _, stderr = log_lib.run_with_log(
125
- cmd=deploy_command,
126
- log_path=log_path,
127
- require_outputs=True,
128
- stream_logs=False,
129
- line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
130
- is_local=False),
131
- cwd=cwd,
132
- env=env)
138
+ Returns:
139
+ The kind cluster config
140
+ """
141
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
142
+ internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
143
+
144
+ config = textwrap.dedent(f"""
145
+ apiVersion: kind.x-k8s.io/v1alpha4
146
+ kind: Cluster
147
+ kubeadmConfigPatches:
148
+ - |
149
+ kind: ClusterConfiguration
150
+ apiServer:
151
+ extraArgs:
152
+ "service-node-port-range": {internal_start}-{internal_end}
153
+ nodes:
154
+ - role: control-plane
155
+ kubeadmConfigPatches:
156
+ - |
157
+ kind: InitConfiguration
158
+ nodeRegistration:
159
+ kubeletExtraArgs:
160
+ node-labels: "ingress-ready=true"
161
+ """)
162
+ if gpus:
163
+ config += textwrap.indent(
164
+ textwrap.dedent("""
165
+ extraMounts:
166
+ - hostPath: /dev/null
167
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
168
+ config += textwrap.indent(textwrap.dedent("""
169
+ extraPortMappings:"""), ' ' * 2)
170
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
171
+ config += textwrap.indent(
172
+ textwrap.dedent(f"""
173
+ - containerPort: {internal_start + offset}
174
+ hostPort: {port_start + offset}
175
+ listenAddress: "0.0.0.0"
176
+ protocol: tcp
177
+ """), ' ' * 2)
178
+ if num_nodes > 1:
179
+ config += '- role: worker\n' * (num_nodes - 1)
180
+ return config
181
+
182
+
183
+ def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
184
+ is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
185
+ if port_start is None:
186
+ if is_default:
187
+ port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
188
+ else:
189
+ port_start = random.randint(301, 399) * 100
190
+ port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
191
+
192
+ port_range = f'Current port range: {port_start}-{port_end}'
193
+ if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
194
+ raise ValueError('Default local cluster `skypilot` should have '
195
+ f'port range from 30000 to 30099. {port_range}.')
196
+ if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
197
+ raise ValueError('Port range 30000 to 30099 is reserved for '
198
+ f'default local cluster `skypilot`. {port_range}.')
199
+ if port_start % 100 != 0:
200
+ raise ValueError('Local cluster port start must be a multiple of 100. '
201
+ f'{port_range}.')
202
+
203
+ return port_start, port_end
204
+
205
+
206
+ def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
207
+ gpus: bool):
208
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
209
+ port_start, port_end = _get_port_range(name, port_start)
210
+ context_name = f'kind-{name}'
211
+ cluster_created = False
133
212
 
134
- if returncode == 0:
135
- success = True
136
- else:
137
- with ux_utils.print_exception_no_traceback():
138
- log_hint = ux_utils.log_path_hint(log_path, is_local=False)
139
- raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
140
- f'{log_hint}'
141
- f'\nError: {stderr}')
213
+ # Check if GPUs are available on the host
214
+ local_gpus_available = backend_utils.check_local_gpus()
215
+ gpus = gpus and local_gpus_available
142
216
 
143
- if success:
144
- # Add an empty line to separate the deployment logs from the final
145
- # message
146
- logger.info('')
147
- if cleanup:
148
- logger.info(
149
- ux_utils.finishing_message(
150
- '🎉 SSH Node Pools cleaned up successfully.',
151
- log_path=log_path,
152
- is_local=True))
153
- else:
217
+ # Check if ~/.kube/config exists:
218
+ if os.path.exists(os.path.expanduser('~/.kube/config')):
219
+ curr_context = kubernetes_utils.get_current_kube_config_context_name()
220
+ if curr_context is not None and curr_context != context_name:
154
221
  logger.info(
155
- ux_utils.finishing_message(
156
- '🎉 SSH Node Pools set up successfully. ',
157
- follow_up_message=(
158
- f'Run `{colorama.Style.BRIGHT}'
159
- f'sky check ssh'
160
- f'{colorama.Style.RESET_ALL}` to verify access, '
161
- f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
162
- f'{colorama.Style.RESET_ALL}` to launch a cluster. '),
163
- log_path=log_path,
164
- is_local=True))
165
-
166
-
167
- def deploy_remote_cluster(ip_list: List[str],
168
- ssh_user: str,
169
- ssh_key: str,
170
- cleanup: bool,
171
- context_name: Optional[str] = None,
172
- password: Optional[str] = None):
173
- success = False
174
- path_to_package = os.path.dirname(__file__)
175
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
176
- # Get directory of script and run it from there
177
- cwd = os.path.dirname(os.path.abspath(up_script_path))
178
-
179
- # Create temporary files for the IPs and SSH key
180
- with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
181
- tempfile.NamedTemporaryFile(mode='w') as key_file:
182
-
183
- # Write IPs and SSH key to temporary files
184
- ip_file.write('\n'.join(ip_list))
185
- ip_file.flush()
186
-
187
- key_file.write(ssh_key)
188
- key_file.flush()
189
- os.chmod(key_file.name, 0o600)
190
-
191
- # Use the legacy mode command line arguments for backward compatibility
192
- deploy_command = [
193
- sys.executable, up_script_path, '--ips-file', ip_file.name,
194
- '--user', ssh_user, '--ssh-key', key_file.name
195
- ]
196
-
197
- if context_name is not None:
198
- deploy_command.extend(['--context-name', context_name])
199
- if password is not None:
200
- deploy_command.extend(['--password', password])
201
- if cleanup:
202
- deploy_command.append('--cleanup')
222
+ f'Current context in kube config: {curr_context}'
223
+ f'\nWill automatically switch to {context_name} after the '
224
+ 'local cluster is created.')
225
+ message_str = 'Creating local cluster {}{}...'
226
+ message_str = message_str.format(
227
+ name,
228
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
229
+
230
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
231
+ delete=True) as f:
232
+ # Choose random port range to use on the host machine.
233
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
234
+ logger.debug(f'Using host port range {port_start}-{port_end}')
235
+ f.write(generate_kind_config(port_start, gpus=gpus))
236
+ f.flush()
237
+
238
+ path_to_package = os.path.dirname(__file__)
239
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
240
+
241
+ # Get directory of script and run it from there
242
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
243
+ run_command = f'{up_script_path} {name} {f.name}'
244
+ if gpus:
245
+ run_command += ' --gpus'
246
+ run_command = shlex.split(run_command)
203
247
 
204
248
  # Setup logging paths
205
249
  run_timestamp = sky_logging.get_run_timestamp()
206
250
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
207
251
  'local_up.log')
208
-
209
- if cleanup:
210
- msg_str = 'Cleaning up remote cluster...'
211
- else:
212
- msg_str = 'Deploying remote cluster...'
213
-
214
- # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
215
- env = os.environ.copy()
216
- env['PYTHONUNBUFFERED'] = '1'
252
+ logger.info(message_str)
217
253
 
218
254
  with rich_utils.safe_status(
219
- ux_utils.spinner_message(msg_str,
255
+ ux_utils.spinner_message(message_str,
220
256
  log_path=log_path,
221
257
  is_local=True)):
222
258
  returncode, _, stderr = log_lib.run_with_log(
223
- cmd=deploy_command,
259
+ cmd=run_command,
224
260
  log_path=log_path,
225
261
  require_outputs=True,
226
262
  stream_logs=False,
227
- line_processor=log_utils.SkyRemoteUpLineProcessor(
263
+ line_processor=log_utils.SkyLocalUpLineProcessor(
228
264
  log_path=log_path, is_local=True),
229
- cwd=cwd,
230
- env=env)
231
- if returncode == 0:
232
- success = True
233
- else:
234
- with ux_utils.print_exception_no_traceback():
235
- log_hint = ux_utils.log_path_hint(log_path, is_local=True)
236
- raise RuntimeError('Failed to deploy remote cluster. '
237
- f'Full log: {log_hint}'
238
- f'\nError: {stderr}')
239
-
240
- if success:
241
- if cleanup:
242
- logger.info(
243
- ux_utils.finishing_message(
244
- '🎉 Remote cluster cleaned up successfully.',
245
- log_path=log_path,
246
- is_local=True))
247
- else:
248
- logger.info(
249
- ux_utils.finishing_message(
250
- '🎉 Remote cluster deployed successfully.',
251
- log_path=log_path,
252
- is_local=True))
253
-
254
-
255
- def deploy_local_cluster(gpus: bool):
256
- cluster_created = False
257
-
258
- # Check if GPUs are available on the host
259
- local_gpus_available = backend_utils.check_local_gpus()
260
- gpus = gpus and local_gpus_available
261
-
262
- # Check if ~/.kube/config exists:
263
- if os.path.exists(os.path.expanduser('~/.kube/config')):
264
- curr_context = kubernetes_utils.get_current_kube_config_context_name()
265
- skypilot_context = 'kind-skypilot'
266
- if curr_context is not None and curr_context != skypilot_context:
267
- logger.info(
268
- f'Current context in kube config: {curr_context}'
269
- '\nWill automatically switch to kind-skypilot after the local '
270
- 'cluster is created.')
271
- message_str = 'Creating local cluster{}...'
272
- message_str = message_str.format((' with GPU support (this may take up '
273
- 'to 15 minutes)') if gpus else '')
274
- path_to_package = os.path.dirname(__file__)
275
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
276
-
277
- # Get directory of script and run it from there
278
- cwd = os.path.dirname(os.path.abspath(up_script_path))
279
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
280
- run_command = shlex.split(run_command)
281
-
282
- # Setup logging paths
283
- run_timestamp = sky_logging.get_run_timestamp()
284
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
285
- 'local_up.log')
286
- logger.info(message_str)
287
-
288
- with rich_utils.safe_status(
289
- ux_utils.spinner_message(message_str,
290
- log_path=log_path,
291
- is_local=True)):
292
- returncode, _, stderr = log_lib.run_with_log(
293
- cmd=run_command,
294
- log_path=log_path,
295
- require_outputs=True,
296
- stream_logs=False,
297
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
298
- is_local=True),
299
- cwd=cwd)
265
+ cwd=cwd)
300
266
 
301
267
  # Kind always writes to stderr even if it succeeds.
302
268
  # If the failure happens after the cluster is created, we need
@@ -309,11 +275,11 @@ def deploy_local_cluster(gpus: bool):
309
275
  elif returncode == 100:
310
276
  logger.info(
311
277
  ux_utils.finishing_message(
312
- 'Local cluster already exists.\n',
278
+ f'Local cluster {name} already exists.\n',
313
279
  log_path=log_path,
314
280
  is_local=True,
315
281
  follow_up_message=
316
- 'If you want to delete it instead, run: sky local down'))
282
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
317
283
  else:
318
284
  with ux_utils.print_exception_no_traceback():
319
285
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -339,7 +305,7 @@ def deploy_local_cluster(gpus: bool):
339
305
  if gpus:
340
306
  # Get GPU model by querying the node labels
341
307
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
342
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
308
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
343
309
  try:
344
310
  # Run the command and capture the output
345
311
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -375,8 +341,10 @@ def deploy_local_cluster(gpus: bool):
375
341
  'This may cause issues with running tasks.')
376
342
  logger.info(
377
343
  ux_utils.finishing_message(
378
- message=(f'Local Kubernetes cluster created successfully with '
379
- f'{num_cpus} CPUs{gpu_message}.'),
344
+ message=(
345
+ f'Local Kubernetes cluster {name} created successfully '
346
+ f'with {num_cpus} CPUs{gpu_message} on host port range '
347
+ f'{port_start}-{port_end}.'),
380
348
  log_path=log_path,
381
349
  is_local=True,
382
350
  follow_up_message=(
@@ -384,3 +352,54 @@ def deploy_local_cluster(gpus: bool):
384
352
  'Hint: To change the number of CPUs, change your docker '
385
353
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
386
354
  f'{gpu_hint}')))
355
+
356
+
357
+ def teardown_local_cluster(name: Optional[str] = None):
358
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
359
+ cluster_removed = False
360
+
361
+ path_to_package = os.path.dirname(__file__)
362
+ down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
363
+
364
+ cwd = os.path.dirname(os.path.abspath(down_script_path))
365
+ run_command = f'{down_script_path} {name}'
366
+ run_command = shlex.split(run_command)
367
+
368
+ # Setup logging paths
369
+ run_timestamp = sky_logging.get_run_timestamp()
370
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
371
+ 'local_down.log')
372
+
373
+ with rich_utils.safe_status(
374
+ ux_utils.spinner_message(f'Removing local cluster {name}',
375
+ log_path=log_path,
376
+ is_local=True)):
377
+
378
+ returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
379
+ log_path=log_path,
380
+ require_outputs=True,
381
+ stream_logs=False,
382
+ cwd=cwd)
383
+ stderr = stderr.replace('No kind clusters found.\n', '')
384
+
385
+ if returncode == 0:
386
+ cluster_removed = True
387
+ elif returncode == 100:
388
+ logger.info(
389
+ ux_utils.error_message(f'Local cluster {name} does not exist.'))
390
+ else:
391
+ with ux_utils.print_exception_no_traceback():
392
+ raise RuntimeError(f'Failed to down local cluster {name}. '
393
+ f'Stdout: {stdout}'
394
+ f'\nError: {stderr}')
395
+ if cluster_removed:
396
+ # Run sky check
397
+ with rich_utils.safe_status(
398
+ ux_utils.spinner_message('Running sky check...')):
399
+ sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
400
+ clouds=['kubernetes'],
401
+ quiet=True)
402
+ logger.info(
403
+ ux_utils.finishing_message(f'Local cluster {name} removed.',
404
+ log_path=log_path,
405
+ is_local=True))
@@ -48,8 +48,16 @@ fi
48
48
 
49
49
  if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
50
50
  # If context is none, it means we are using incluster auth. In this case,
51
- # use need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
52
- kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --kubeconfig=/dev/null -- "$@"
51
+ # we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
52
+ kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
53
53
  else
54
- kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --context="$context" -- "$@"
54
+ kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
55
55
  fi
56
+
57
+ # Execute command on remote pod, waiting for rsync to be available first.
58
+ # The waiting happens on the remote pod, not locally, which is more efficient
59
+ # and reliable than polling from the local machine.
60
+ # We wrap the command in a bash script that waits for rsync, then execs the original command.
61
+ # Timeout after MAX_WAIT_TIME_SECONDS seconds.
62
+ MAX_WAIT_TIME_SECONDS=300
63
+ eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$MAX_WAIT_TIME_SECONDS*2; until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
@@ -2,26 +2,13 @@
2
2
  import enum
3
3
 
4
4
 
5
+ # TODO(kevin): Remove this enum in v0.13.0.
5
6
  class KubernetesNetworkingMode(enum.Enum):
6
- """Enum for the different types of networking modes for accessing
7
- jump pods.
7
+ """Enum for the different types of networking modes for accessing pods.
8
8
  """
9
9
  NODEPORT = 'nodeport'
10
10
  PORTFORWARD = 'portforward'
11
11
 
12
- @classmethod
13
- def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
14
- """Returns the enum value for the given string."""
15
- if mode.lower() == cls.NODEPORT.value:
16
- return cls.NODEPORT
17
- elif mode.lower() == cls.PORTFORWARD.value:
18
- return cls.PORTFORWARD
19
- else:
20
- raise ValueError(f'Unsupported kubernetes networking mode: '
21
- f'{mode}. The mode must be either '
22
- f'\'{cls.PORTFORWARD.value}\' or '
23
- f'\'{cls.NODEPORT.value}\'. ')
24
-
25
12
 
26
13
  class KubernetesServiceType(enum.Enum):
27
14
  """Enum for the different types of services."""
@@ -44,3 +31,8 @@ class KubernetesAutoscalerType(enum.Enum):
44
31
  KARPENTER = 'karpenter'
45
32
  COREWEAVE = 'coreweave'
46
33
  GENERIC = 'generic'
34
+
35
+ def emits_autoscale_event(self) -> bool:
36
+ """Returns whether specific autoscaler emits the event reason
37
+ TriggeredScaleUp."""
38
+ return self not in {self.KARPENTER}
sky/utils/lock_events.py CHANGED
@@ -20,17 +20,17 @@ class DistributedLockEvent:
20
20
  f'[DistributedLock.hold]:{lock_id}')
21
21
 
22
22
  def acquire(self):
23
- was_locked = self._lock.is_locked
23
+ was_locked = self._lock.is_locked # type: ignore[truthy-function]
24
24
  with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
25
25
  self._lock.acquire()
26
- if not was_locked and self._lock.is_locked:
26
+ if not was_locked and self._lock.is_locked: # type: ignore[truthy-function] # pylint: disable=line-too-long
27
27
  # start holding the lock after initial acquiring
28
28
  self._hold_lock_event.begin()
29
29
 
30
30
  def release(self):
31
- was_locked = self._lock.is_locked
31
+ was_locked = self._lock.is_locked # type: ignore[truthy-function]
32
32
  self._lock.release()
33
- if was_locked and not self._lock.is_locked:
33
+ if was_locked and not self._lock.is_locked: # type: ignore[truthy-function] # pylint: disable=line-too-long
34
34
  # stop holding the lock after initial releasing
35
35
  self._hold_lock_event.end()
36
36