skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/aws/instance.py CHANGED
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
     return head_instance_id
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     ec2 = _default_ec2_resource(region)
     # NOTE: We set max_attempts=0 for fast failing when the resource is not
     # available (although the doc says it will only retry for network
@@ -629,9 +630,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     region = provider_config['region']
     ec2 = _default_ec2_resource(region)
@@ -743,6 +745,7 @@ def terminate_instances(
 
     # Make this multithreaded: modify all instances' SGs in parallel.
     def modify_instance_sg(instance):
+        assert default_sg is not None  # Type narrowing for mypy
         instance.modify_attribute(Groups=[default_sg.id])
         logger.debug(f'Instance {instance.id} modified to use default SG:'
                      f'{default_sg.id} for quick deletion.')
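Two interface changes recur across every provider module in this release: `run_instances` now takes the display `cluster_name` ahead of `cluster_name_on_cloud`, and `query_instances` grows a `retry_if_missing: bool = False` keyword. A minimal sketch of the revised `query_instances` contract, with the status type simplified to a string (the real signature returns `status_lib.ClusterStatus` values):

    from typing import Any, Dict, Optional, Tuple


    def query_instances(
        cluster_name: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None,
        non_terminated_only: bool = True,
        retry_if_missing: bool = False,  # new in this release
    ) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
        # Providers that cannot use the hint simply discard it, exactly as
        # the hunks in this diff do with `del cluster_name, retry_if_missing`.
        del cluster_name, retry_if_missing  # unused
        assert provider_config is not None, cluster_name_on_cloud
        return {}  # instance_id -> (status, reason)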
sky/provision/azure/instance.py CHANGED
@@ -214,7 +214,7 @@ def _create_network_interface(
         location=provider_config['location'],
         public_ip_allocation_method='Static',
         public_ip_address_version='IPv4',
-        sku=network.PublicIPAddressSku(name='Basic', tier='Regional'))
+        sku=network.PublicIPAddressSku(name='Standard', tier='Regional'))
     ip_poller = network_client.public_ip_addresses.begin_create_or_update(
         resource_group_name=provider_config['resource_group'],
         public_ip_address_name=f'{vm_name}-ip',
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
     return instances
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     # TODO(zhwu): This function is too long. We should refactor it.
     provider_config = config.provider_config
     resource_group = provider_config['resource_group']
@@ -956,9 +957,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, cluster_name_on_cloud
 
     subscription_id = provider_config['subscription_id']
sky/provision/common.py CHANGED
@@ -97,6 +97,8 @@ class InstanceInfo:
     external_ip: Optional[str]
     tags: Dict[str, str]
     ssh_port: int = 22
+    # The internal service address of the instance on Kubernetes.
+    internal_svc: Optional[str] = None
 
     def get_feasible_ip(self) -> str:
         """Get the most feasible IPs of the instance. This function returns
sky/provision/cudo/instance.py CHANGED
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['pend', 'init', 'prol', 'boot']
 
     while True:
@@ -195,9 +195,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/do/instance.py CHANGED
@@ -26,10 +26,10 @@ def _get_head_instance(
     return None
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['new']
     newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
                                                      pending_status + ['off'])
@@ -246,9 +246,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     # terminated instances are not retrieved by the
     # API making `non_terminated_only` argument moot.
     del non_terminated_only
sky/provision/docker_utils.py CHANGED
@@ -3,7 +3,7 @@
 import dataclasses
 import shlex
 import time
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from sky import sky_logging
 from sky.skylet import constants
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
 # Configure environment variables. A docker image can have environment variables
 # set in the Dockerfile with `ENV``. We need to export these variables to the
 # shell environment, so that our ssh session can access them.
+# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
+# Docker images with Ray 2.48.0+ set this for UV package manager support,
+# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
+# See: https://github.com/skypilot-org/skypilot/pull/7181
 SETUP_ENV_VARS_CMD = (
     'prefix_cmd() '
     '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
-    'export -p > ~/container_env_var.sh && '
+    'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
     '$(prefix_cmd) '
     'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
 
 # Docker daemon may not be ready when the machine is firstly started. The error
 # message starts with the following string. We should wait for a while and retry
 # the command.
-DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
-                                'the Docker daemon socket')
+DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
 
 DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
+DOCKER_SOCKET_NOT_READY_STR_2 = (
+    'check if the path is correct and if the daemon is running')
 
 _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
 
+# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
+# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
+# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
+INSTALL_AWS_CLI_CMD = (
+    'which aws || ((command -v unzip >/dev/null 2>&1 || '
+    '(sudo apt-get update && sudo apt-get install -y unzip)) && '
+    'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
+    '-o "/tmp/awscliv2.zip" && '
+    'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
+    '&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
+
+
+def _extract_region_from_ecr_server(server: str) -> str:
+    """Extract AWS region from ECR server URL.
+
+    ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
+    Returns the region part from the URL.
+    """
+    # Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
+    parts = server.split('.')
+    if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
+        return parts[3]
+    raise ValueError(f'Invalid ECR server format: {server}')
+
 
 @dataclasses.dataclass
 class DockerLoginConfig:
@@ -157,19 +186,23 @@ class DockerInitializer:
         self.docker_config = docker_config
         self.container_name = docker_config['container_name']
         self.runner = runner
-        self.home_dir = None
+        self.home_dir: Optional[str] = None
         self.initialized = False
         # podman is not fully tested yet.
         use_podman = docker_config.get('use_podman', False)
         self.docker_cmd = 'podman' if use_podman else 'docker'
         self.log_path = log_path
 
-    def _run(self,
-             cmd,
-             run_env='host',
-             wait_for_docker_daemon: bool = False,
-             separate_stderr: bool = False,
-             log_err_when_fail: bool = True) -> str:
+    def _run(
+        self,
+        cmd,
+        run_env='host',
+        wait_for_docker_daemon: bool = False,
+        separate_stderr: bool = False,
+        log_err_when_fail: bool = True,
+        flock_name: Optional[str] = None,
+        flock_args: Optional[str] = None,
+    ) -> str:
 
         if run_env == 'docker':
             cmd = self._docker_expand_user(cmd, any_char=True)
@@ -178,8 +211,13 @@ class DockerInitializer:
             # an error: `the input device is not a TTY`, and it works without
             # `-it` flag.
             # TODO(zhwu): ray use the `-it` flag, we need to check why.
-            cmd = (f'{self.docker_cmd} exec {self.container_name} /bin/bash -c'
-                   f' {shlex.quote(cmd)} ')
+            cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
                   f' /bin/bash -c {shlex.quote(cmd)} ')
+
+        if flock_name is not None:
+            flock_args = flock_args or ''
+            cmd = (f'flock {flock_args} /tmp/{flock_name} '
+                   f'-c {shlex.quote(cmd)}')
 
         logger.debug(f'+ {cmd}')
         start = time.time()
@@ -191,7 +229,8 @@ class DockerInitializer:
             separate_stderr=separate_stderr,
             log_path=self.log_path)
         if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
-                DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
+                DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
+                DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
             if wait_for_docker_daemon:
                 if time.time(
                 ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
@@ -231,14 +270,17 @@ class DockerInitializer:
         if self._check_container_exited():
             self.initialized = True
             self._run(f'{self.docker_cmd} start {self.container_name}')
-            self._run('sudo service ssh start', run_env='docker')
+            self._run('sudo service ssh start',
+                      run_env='docker',
+                      flock_name=f'{self.container_name}.sky.lifecycle.lock',
+                      flock_args='-s -w 1')
             return self._run('whoami', run_env='docker')
 
         # SkyPilot: Docker login if user specified a private docker registry.
         if 'docker_login_config' in self.docker_config:
-            # TODO(tian): Maybe support a command to get the login password?
             docker_login_config = DockerLoginConfig(
                 **self.docker_config['docker_login_config'])
+
             if docker_login_config.password:
                 # Password is allowed to be empty, in that case, we will not run
                 # the login command, and assume that the image pulling is
@@ -249,6 +291,25 @@ class DockerInitializer:
                     f'--password {shlex.quote(docker_login_config.password)} '
                     f'{shlex.quote(docker_login_config.server)}',
                     wait_for_docker_daemon=True)
+            elif (docker_login_config.server.endswith('.amazonaws.com') and
+                  '.dkr.ecr.' in docker_login_config.server):
+                # AWS ECR: Use aws ecr get-login-password for authentication
+                # ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
+                # This command uses the IAM credentials from the EC2 instance
+                # Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
+                region = _extract_region_from_ecr_server(
+                    docker_login_config.server)
+
+                # AWS CLI is not pre-installed on AWS instances, unlike gcloud
+                # on GCP instances, so we need to install it first
+                self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
+
+                self._run(
+                    f'aws ecr get-login-password --region {region} | '
+                    f'{self.docker_cmd} login --username AWS '
+                    f'--password-stdin '
+                    f'{shlex.quote(docker_login_config.server)}',
+                    wait_for_docker_daemon=True)
             elif docker_login_config.server.endswith('-docker.pkg.dev'):
                 # Docker image server is on GCR, we need to do additional setup
                 # to pull the image.
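For context on the ECR branch above: the region is recovered purely by splitting the registry hostname. The same logic can be checked standalone (the account id below is a made-up example):

    def extract_region_from_ecr_server(server: str) -> str:
        # Same parsing as _extract_region_from_ecr_server in the hunk above:
        # <account-id>.dkr.ecr.<region>.amazonaws.com
        parts = server.split('.')
        if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
            return parts[3]
        raise ValueError(f'Invalid ECR server format: {server}')


    assert (extract_region_from_ecr_server(
        '123456789012.dkr.ecr.us-east-1.amazonaws.com') == 'us-east-1')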
@@ -311,7 +372,9 @@ class DockerInitializer:
                 self._auto_configure_shm(user_docker_run_options)),
             self.docker_cmd,
         )
-        self._run(f'{remove_container_cmd}; {start_command}')
+        self._run(f'{remove_container_cmd} && {start_command}',
+                  flock_name=f'{self.container_name}.sky.lifecycle.lock',
+                  flock_args='-x -w 10')
 
         # SkyPilot: Setup Commands.
         # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -329,14 +392,18 @@ class DockerInitializer:
             'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
             run_env='docker')
         # Install dependencies.
-        self._run(
-            'sudo apt-get update; '
+        cmd = (
+            'bash -lc \''
+            'exec 200>/var/tmp/sky_apt.lock; '
+            'flock -x -w 120 200 || exit 1; '
+            'export DEBIAN_FRONTEND=noninteractive; '
+            'apt-get -yq update && '
             # Our mount script will install gcsfuse without fuse package.
             # We need to install fuse package first to enable storage mount.
             # The dpkg option is to suppress the prompt for fuse installation.
-            'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
-            'rsync curl wget patch openssh-server python3-pip fuse;',
-            run_env='docker')
+            'apt-get -o DPkg::Options::=--force-confnew install -y '
+            'rsync curl wget patch openssh-server python3-pip fuse\'')
+        self._run(cmd, run_env='docker')
 
         # Copy local authorized_keys to docker container.
         # Stop and disable jupyter service. This is to avoid port conflict on
@@ -367,7 +434,7 @@ class DockerInitializer:
         # pylint: disable=anomalous-backslash-in-string
         self._run(
             'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
-            f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
+            f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
             'mkdir -p ~/.ssh;'
             'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
             'sudo service ssh start;'
@@ -412,9 +479,13 @@ class DockerInitializer:
         user_pos = string.find('~')
         if user_pos > -1:
             if self.home_dir is None:
-                cmd = (f'{self.docker_cmd} exec {self.container_name} '
-                       'printenv HOME')
-                self.home_dir = self._run(cmd, separate_stderr=True)
+                cmd = (f'{self.docker_cmd} exec {self.container_name}'
+                       ' printenv HOME')
+                self.home_dir = self._run(
+                    cmd,
+                    separate_stderr=True,
+                    flock_name=f'{self.container_name}.sky.lifecycle.lock',
+                    flock_args='-s -w 1')
                 # Check for unexpected newline in home directory, which can be
                 # a common issue when the output is mixed with stderr.
                 assert '\n' not in self.home_dir, (
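The new `flock_name`/`flock_args` parameters shown in these hunks serialize concurrent container lifecycle operations by wrapping the final shell command in `flock(1)`. A sketch of how `_run` composes the wrapper, assuming the same `/tmp/<name>` lock path (`mycontainer` is an example value):

    import shlex


    def wrap_with_flock(cmd: str, flock_name: str, flock_args: str = '') -> str:
        # Mirrors the wrapping added to DockerInitializer._run: the command
        # only executes once the lock file under /tmp is acquired.
        return f'flock {flock_args} /tmp/{flock_name} -c {shlex.quote(cmd)}'


    print(wrap_with_flock('docker start mycontainer',
                          'mycontainer.sky.lifecycle.lock', '-x -w 10'))
    # flock -x -w 10 /tmp/mycontainer.sky.lifecycle.lock -c 'docker start mycontainer'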
sky/provision/fluidstack/instance.py CHANGED
@@ -3,11 +3,11 @@ import os
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
-from sky import authentication as auth
 from sky import exceptions
 from sky import sky_logging
 from sky.provision import common
 from sky.provision.fluidstack import fluidstack_utils as utils
+from sky.utils import auth_utils
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import status_lib
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
 def get_internal_ip(node_info: Dict[str, Any]) -> None:
     node_info['internal_ip'] = node_info['ip_address']
 
-    private_key_path, _ = auth.get_or_generate_keys()
+    private_key_path, _ = auth_utils.get_or_generate_keys()
     runner = command_runner.SSHCommandRunner(
         (node_info['ip_address'], 22),
         ssh_user='ubuntu',
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['pending', 'provisioning']
     while True:
         instances = _filter_instances(cluster_name_on_cloud, pending_status)
@@ -291,9 +291,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     instances = _filter_instances(cluster_name_on_cloud, None)
sky/provision/gcp/config.py CHANGED
@@ -5,6 +5,8 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple
 
+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role
 
 
+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
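The `AllowedList` TypedDict above tightens the typing of GCP firewall `allowed` entries, which previously degraded to `Dict[str, str]` even though `ports` holds a list. A quick illustration of the shape it describes (the values are examples, following GCP firewall-rule semantics):

    from typing import List

    from typing_extensions import TypedDict

    AllowedList = TypedDict('AllowedList',
                            {'IPProtocol': str, 'ports': List[str]})

    # One entry in a firewall rule's `allowed` list: a protocol plus port
    # numbers or ranges, all as strings.
    entry: AllowedList = {'IPProtocol': 'tcp', 'ports': ['22', '80-90']}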
sky/provision/gcp/instance.py CHANGED
@@ -62,9 +62,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     zone = provider_config['availability_zone']
     project_id = provider_config['project_id']
@@ -360,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
         created_instance_ids=created_instance_ids)
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     try:
         return _run_instances(region, cluster_name_on_cloud, config)
     except gcp.http_error_exception() as e:
sky/provision/hyperbolic/instance.py CHANGED
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return next(iter(instances.keys()))
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
+    del cluster_name  # unused
     logger.info(f'Starting run_instances with region={region}, '
                 f'cluster={cluster_name_on_cloud}')
     logger.debug(f'Config: {config}')
@@ -308,9 +309,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[dict] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Returns the status of the specified instances for Hyperbolic."""
-    del cluster_name, provider_config  # unused
+    del cluster_name, provider_config, retry_if_missing  # unused
     # Fetch all instances for this cluster
     instances = utils.list_instances(
         metadata={'skypilot': {
sky/provision/instance_setup.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from sky import exceptions
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky.provision import common
 from sky.provision import docker_utils
@@ -38,11 +39,13 @@ _RAY_PRLIMIT = (
     'which prlimit && for id in $(pgrep -f raylet/raylet); '
     'do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;')
 
-_DUMP_RAY_PORTS = (
-    f'{constants.SKY_PYTHON_CMD} -c \'import json, os; '
-    f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
-    f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", '
-    'encoding="utf-8"))\';')
+DUMP_RAY_PORTS = (f'{constants.SKY_PYTHON_CMD} -c \'import json, os; '
+                  f'runtime_dir = os.path.expanduser(os.environ.get('
+                  f'"{constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}", "~")); '
+                  f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
+                  f'open(os.path.join(runtime_dir, '
+                  f'"{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", '
+                  'encoding="utf-8"))\';')
 
 _RAY_PORT_COMMAND = (
     f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
@@ -84,7 +87,7 @@ def _set_usage_run_id_cmd() -> str:
     latest one when the function is called.
     """
     return (
-        f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
+        f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
         # The run id is retrieved locally for the current run, so that the
        # remote cluster will be set with the same run id as the initial
         # launch operation.
@@ -92,12 +95,6 @@ def _set_usage_run_id_cmd() -> str:
         f'{usage_constants.USAGE_RUN_ID_FILE}')
 
 
-def _set_skypilot_env_var_cmd() -> str:
-    """Sets the skypilot environment variables on the remote machine."""
-    env_vars = env_options.Options.all_options()
-    return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
-
-
 def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
     """Decorator that retries the function if it fails.
 
@@ -136,6 +133,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
     logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')
 
 
+class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
+    """ThreadPoolExecutor that kills children processes on exit."""
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # ssh command runner eventually calls
+        # log_lib.run_with_log, which will spawn
+        # subprocesses. If we are exiting the context
+        # we need to kill the children processes
+        # to avoid leakage.
+        subprocess_utils.kill_children_processes()
+        self.shutdown()
+        return False
+
+
 def _parallel_ssh_with_cache(func,
                              cluster_name: str,
                              stage_name: str,
@@ -148,7 +159,7 @@ def _parallel_ssh_with_cache(func,
     # as 32 is too large for some machines.
     max_workers = subprocess_utils.get_parallel_threads(
         cluster_info.provider_name)
-    with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
+    with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
         results = []
         runners = provision.get_command_runners(cluster_info.provider_name,
                                                 cluster_info, **ssh_credentials)
@@ -317,7 +328,7 @@ def ray_head_start_command(custom_resource: Optional[str],
         # the warning when the worker count is >12x CPUs.
         'RAY_worker_maximum_startup_concurrency=$(( 3 * $(nproc --all) )) '
         f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' +
-        _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
+        _RAY_PRLIMIT + DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
     return cmd
 
 
@@ -425,8 +436,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
     # use the external IP of the head node.
     use_external_ip = cluster_info.custom_ray_options.pop(
         'use_external_ip', False)
-    head_ip = (head_instance.internal_ip
-               if not use_external_ip else head_instance.external_ip)
+
+    if use_external_ip:
+        head_ip = head_instance.external_ip
+    else:
+        # For Kubernetes, use the internal service address of the head node.
+        # Keep this consistent with the logic in kubernetes-ray.yml.j2
+        if head_instance.internal_svc:
+            head_ip = head_instance.internal_svc
+        else:
+            head_ip = head_instance.internal_ip
 
     ray_cmd = ray_worker_start_command(custom_resource,
                                        cluster_info.custom_ray_options,
@@ -468,11 +487,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 @common.log_function_start_end
 @_auto_retry()
 @timeline.event
-def start_skylet_on_head_node(cluster_name: str,
-                              cluster_info: common.ClusterInfo,
-                              ssh_credentials: Dict[str, Any]) -> None:
+def start_skylet_on_head_node(
+        cluster_name: resources_utils.ClusterName,
+        cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
+        launched_resources: resources_lib.Resources) -> None:
     """Start skylet on the head node."""
-    del cluster_name
+    # Avoid circular import.
+    # pylint: disable=import-outside-toplevel
+    from sky.utils import controller_utils
+
+    def _set_skypilot_env_var_cmd() -> str:
+        """Sets the skypilot environment variables on the remote machine."""
+        env_vars = {
+            k: str(v) for (k, v) in env_options.Options.all_options().items()
+        }
+        is_controller = controller_utils.Controllers.from_name(
+            cluster_name.display_name) is not None
+        is_kubernetes = cluster_info.provider_name == 'kubernetes'
+        if is_controller and is_kubernetes:
+            # For jobs/serve controller, we pass in the CPU and memory limits
+            # when starting the skylet to handle cases where these env vars
+            # are not set on the cluster's pod spec. The skylet will read
+            # these env vars when starting (ManagedJobEvent.start()) and write
+            # it to disk.
+            resources = launched_resources.assert_launchable()
+            vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
+                resources.instance_type)
+            if vcpus is not None:
+                env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
+            if mem is not None:
+                env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
+        return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
+
     runners = provision.get_command_runners(cluster_info.provider_name,
                                             cluster_info, **ssh_credentials)
     head_runner = runners[0]
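The inlined `_set_skypilot_env_var_cmd` ultimately emits a single `export k=v; export k=v; ...` string prepended to the skylet start command; the controller branch above just adds `SKYPILOT_POD_CPU_CORE_LIMIT`/`SKYPILOT_POD_MEMORY_GB_LIMIT` to the same dict. A sketch of the join, with example values:

    def export_cmd(env_vars: dict) -> str:
        # Same '; '.join pattern as the hunk above.
        return '; '.join(f'export {k}={v}' for k, v in env_vars.items())


    print(export_cmd({'SKYPILOT_DEBUG': '1',
                      'SKYPILOT_POD_CPU_CORE_LIMIT': '4'}))
    # export SKYPILOT_DEBUG=1; export SKYPILOT_POD_CPU_CORE_LIMIT=4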
sky/provision/kubernetes/__init__.py CHANGED
@@ -13,4 +13,6 @@ from sky.provision.kubernetes.network import open_ports
 from sky.provision.kubernetes.network import query_ports
 from sky.provision.kubernetes.volume import apply_volume
 from sky.provision.kubernetes.volume import delete_volume
+from sky.provision.kubernetes.volume import get_all_volumes_usedby
 from sky.provision.kubernetes.volume import get_volume_usedby
+from sky.provision.kubernetes.volume import map_all_volumes_usedby