skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -33,14 +33,11 @@ provider:
33
33
  networking_mode: {{k8s_networking_mode}}
34
34
 
35
35
  # We use internal IPs since we set up a port-forward between the kubernetes
36
- # cluster and the local machine, or directly use NodePort to reach the
37
- # head node.
36
+ # cluster and the local machine.
38
37
  use_internal_ips: true
39
38
 
40
39
  timeout: {{timeout}}
41
40
 
42
- ssh_jump_image: {{k8s_ssh_jump_image}}
43
-
44
41
  # Namespace used to host SkyPilot system components, such as fuse device
45
42
  # manager.
46
43
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -49,6 +46,10 @@ provider:
49
46
  # Used to set up the necessary permissions and sidecars.
50
47
  fuse_device_required: {{k8s_fuse_device_required}}
51
48
 
49
+ {% if ephemeral_volume_mounts %}
50
+ ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
51
+ {% endif %}
52
+
52
53
  # ServiceAccount created by the autoscaler for the head node pod that it
53
54
  # runs in. If this field isn't provided, the head pod config below must
54
55
  # contain a user-created service account with the proper permissions.
@@ -212,7 +213,9 @@ provider:
212
213
  metadata:
213
214
  labels:
214
215
  parent: skypilot
216
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
215
217
  skypilot-cluster: {{cluster_name_on_cloud}}
218
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
216
219
  skypilot-user: {{ user }}
217
220
  name: {{cluster_name_on_cloud}}-head-ssh
218
221
  spec:
@@ -230,7 +233,9 @@ provider:
230
233
  metadata:
231
234
  labels:
232
235
  parent: skypilot
236
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
233
237
  skypilot-cluster: {{cluster_name_on_cloud}}
238
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
234
239
  skypilot-user: {{ user }}
235
240
  # NOTE: If you're running multiple Ray clusters with services
236
241
  # on one Kubernetes cluster, they must have unique service
@@ -250,7 +255,9 @@ provider:
250
255
  metadata:
251
256
  labels:
252
257
  parent: skypilot
258
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
253
259
  skypilot-cluster: {{cluster_name_on_cloud}}
260
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
254
261
  skypilot-user: {{ user }}
255
262
  name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
256
263
  spec:
@@ -275,9 +282,8 @@ available_node_types:
275
282
  labels:
276
283
  parent: skypilot
277
284
  # component will be set for the head node pod to be the same as the head node service selector above if a
285
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
278
286
  skypilot-cluster: {{cluster_name_on_cloud}}
279
- # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
280
- skypilot-ssh-jump: {{k8s_ssh_jump_name}}
281
287
  skypilot-user: {{ user }}
282
288
  # Custom tags for the pods
283
289
  {%- for label_key, label_value in labels.items() %}
@@ -444,9 +450,6 @@ available_node_types:
444
450
  # object store. If you do not provide this, Ray will fall back to
445
451
  # /tmp which cause slowdowns if is not a shared memory volume.
446
452
  volumes:
447
- - name: secret-volume
448
- secret:
449
- secretName: {{k8s_ssh_key_secret_name}}
450
453
  - name: dshm
451
454
  emptyDir:
452
455
  medium: Memory
@@ -510,6 +513,16 @@ available_node_types:
510
513
  valueFrom:
511
514
  fieldRef:
512
515
  fieldPath: metadata.labels['ray-node-type']
516
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
517
+ valueFrom:
518
+ resourceFieldRef:
519
+ containerName: ray-node
520
+ resource: requests.cpu
521
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
522
+ valueFrom:
523
+ resourceFieldRef:
524
+ containerName: ray-node
525
+ resource: requests.memory
513
526
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
514
527
  - name: {{ key }}
515
528
  value: {{ value }}
@@ -630,12 +643,17 @@ available_node_types:
630
643
  command: ["/bin/bash", "-c", "--"]
631
644
  args:
632
645
  - |
633
- # For backwards compatibility, we put a marker file in the pod
634
- # to indicate that the pod is running with the changes introduced
635
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
636
- # TODO: Remove this marker file and it's usage in setup_commands
637
- # after v0.10.0 release.
638
- touch /tmp/skypilot_is_nimbus
646
+ # Set -x to print the commands and their arguments as they are executed.
647
+ # Useful for debugging.
648
+ set -x
649
+
650
+ # Execute user-provided post-provision runcmd
651
+ # before any of the SkyPilot setup commands.
652
+ {%- if runcmd %}
653
+ {%- for cmd in runcmd %}
654
+ {{cmd}}
655
+ {%- endfor %}
656
+ {%- endif %}
639
657
 
640
658
  # Helper function to conditionally use sudo
641
659
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
@@ -647,15 +665,125 @@ available_node_types:
647
665
  # STEP 1: Run apt update, install missing packages, and set up ssh.
648
666
  (
649
667
  (
650
- # For backwards compatibility, we put a marker file in the pod
651
- # to indicate that the apt ssh setup step will write a completion
652
- # marker file (/tmp/apt_ssh_setup_complete) to the pod.
653
- # TODO: Remove this marker file and its usage in setup_commands
654
- # after v0.11.0 release.
655
- touch /tmp/apt_ssh_setup_started
656
-
657
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
658
- echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
668
+ # Helper: run apt-get update with retries
669
+ apt_update_with_retries() {
670
+ # do not fail the whole shell; we handle return codes
671
+ set +e
672
+ local log=/tmp/apt-update.log
673
+ local tries=3
674
+ local delay=1
675
+ local i
676
+ for i in $(seq 1 $tries); do
677
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
678
+ echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
679
+ sleep $delay
680
+ delay=$((delay * 2))
681
+ done
682
+ set -e
683
+ return 1
684
+ }
685
+ apt_install_with_retries() {
686
+ local packages="$@"
687
+ [ -z "$packages" ] && return 0
688
+ set +e
689
+ local log=/tmp/apt-update.log
690
+ local tries=3
691
+ local delay=1
692
+ local i
693
+ for i in $(seq 1 $tries); do
694
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
695
+ echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
696
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
697
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
698
+ sleep $delay
699
+ delay=$((delay * 2))
700
+ done
701
+ set -e
702
+ return 1
703
+ }
704
+ apt_update_install_with_retries() {
705
+ apt_update_with_retries
706
+ apt_install_with_retries "$@"
707
+ }
708
+ backup_dir=/etc/apt/sources.list.backup_skypilot
709
+ backup_source() {
710
+ $(prefix_cmd) mkdir -p "$backup_dir"
711
+ if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
712
+ $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
713
+ fi
714
+ }
715
+ restore_source() {
716
+ if [ -f "$backup_dir/sources.list" ]; then
717
+ $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
718
+ fi
719
+ }
720
+ update_apt_sources() {
721
+ local host=$1
722
+ local apt_file=$2
723
+ $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
724
+ }
725
+ # Helper: install packages across mirrors with retries
726
+ apt_install_with_mirrors() {
727
+ local required=$1; shift
728
+ local packages="$@"
729
+ [ -z "$packages" ] && return 0
730
+ set +e
731
+ # Install packages with default sources first
732
+ local log=/tmp/apt-update.log
733
+ echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
734
+ restore_source
735
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
736
+ echo "Install failed with default sources: $packages" >> "$log"
737
+ # Detect distro (ubuntu/debian)
738
+ local APT_OS="unknown"
739
+ if [ -f /etc/os-release ]; then
740
+ . /etc/os-release
741
+ case "$ID" in
742
+ debian) APT_OS="debian" ;;
743
+ ubuntu) APT_OS="ubuntu" ;;
744
+ *)
745
+ if [ -n "$ID_LIKE" ]; then
746
+ case " $ID $ID_LIKE " in
747
+ *ubuntu*) APT_OS="ubuntu" ;;
748
+ *debian*) APT_OS="debian" ;;
749
+ esac
750
+ fi
751
+ ;;
752
+ esac
753
+ fi
754
+ # Build mirror candidates
755
+ # deb.debian.org is a CDN endpoint, if one backend goes down,
756
+ # the CDN automatically fails over to another mirror,
757
+ # so we only retry for ubuntu here.
758
+ if [ "$APT_OS" = "ubuntu" ]; then
759
+ # Backup current sources once
760
+ backup_source
761
+ # Selected from https://launchpad.net/ubuntu/+archivemirrors
762
+ # and results from apt-select
763
+ local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
764
+ for host in $MIRROR_CANDIDATES; do
765
+ echo "Trying APT mirror ($APT_OS): $host" >> "$log"
766
+ if [ -f /etc/apt/sources.list ]; then
767
+ update_apt_sources $host /etc/apt/sources.list
768
+ else
769
+ echo "Error: /etc/apt/sources.list not found" >> "$log"
770
+ break
771
+ fi
772
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
773
+ echo "Install failed with mirror ($APT_OS): $host" >> "$log"
774
+ # Restore to default sources
775
+ restore_source
776
+ done
777
+ fi
778
+ set -e
779
+ if [ "$required" = "1" ]; then
780
+ echo "Error: required package install failed across all mirrors: $packages" >> "$log"
781
+ return 1
782
+ else
783
+ echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
784
+ return 0
785
+ fi
786
+ }
659
787
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
660
788
  # so that both fusemount and fusermount3 can be masked before enabling SSH access.
661
789
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
@@ -682,7 +810,7 @@ available_node_types:
682
810
  done;
683
811
  if [ ! -z "$INSTALL_FIRST" ]; then
684
812
  echo "Installing core packages: $INSTALL_FIRST";
685
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
813
+ apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
686
814
  fi;
687
815
  # SSH and other packages are not necessary, so we disable set -e
688
816
  set +e
@@ -706,7 +834,8 @@ available_node_types:
706
834
  fi
707
835
  $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
708
836
  $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
709
- FUSERMOUNT3_PATH=$(which fusermount3)
837
+ # "|| true" because fusermount3 is not always available
838
+ FUSERMOUNT3_PATH=$(which fusermount3) || true
710
839
  if [ -z "$FUSERMOUNT3_PATH" ]; then
711
840
  FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
712
841
  fi
@@ -748,18 +877,23 @@ available_node_types:
748
877
  $(prefix_cmd) mkdir -p ~/.ssh;
749
878
  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
750
879
  $(prefix_cmd) chmod 700 ~/.ssh;
751
- $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
880
+ $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
881
+ skypilot:ssh_public_key_content
882
+ SKYPILOT_SSH_KEY_EOF
752
883
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
753
884
  $(prefix_cmd) service ssh restart;
754
885
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
755
886
 
756
887
  touch /tmp/apt_ssh_setup_complete
757
888
  echo "=== SSH setup completed ==="
758
- ) > /tmp/${STEPS[0]}.log 2>&1 || {
759
- echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
889
+ ) > /tmp/${STEPS[0]}.log 2>&1
890
+ if [ "$?" -ne "0" ]; then
891
+ {
892
+ echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
760
893
  cat /tmp/${STEPS[0]}.log
761
894
  exit 1
762
- }
895
+ }
896
+ fi
763
897
  ) &
764
898
 
765
899
  # STEP 2: Install conda, ray and skypilot (for dependencies); start
@@ -777,15 +911,20 @@ available_node_types:
777
911
  {{ conda_installation_commands }}
778
912
  {{ ray_installation_commands }}
779
913
 
780
- VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
914
+ # Set UV_SYSTEM_PYTHON to false in case the user-provided docker image set it to true.
915
+ # Unset PYTHONPATH in case the user-provided docker image set it.
916
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
781
917
  # Wait for `patch` package to be installed before applying ray patches
782
918
  until dpkg -l | grep -q "^ii patch "; do
783
919
  sleep 0.1
784
920
  echo "Waiting for patch package to be installed..."
785
921
  done
786
922
  # Apply Ray patches for progress bar fix
787
- ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
788
- VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
923
+ # Set UV_SYSTEM_PYTHON to false in case the user-provided docker image set it to true.
924
+ # Unset PYTHONPATH in case the user-provided docker image set it.
925
+ # ~/.sky/python_path is seeded by conda_installation_commands
926
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
927
+ env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
789
928
  }
790
929
  touch /tmp/ray_skypilot_installation_complete
791
930
  echo "=== Ray and skypilot installation completed ==="
@@ -814,11 +953,14 @@ available_node_types:
814
953
  set +e
815
954
  {{ ray_worker_start_command }}
816
955
  fi
817
- ) > /tmp/${STEPS[1]}.log 2>&1 || {
818
- echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
956
+ ) > /tmp/${STEPS[1]}.log 2>&1
957
+ if [ "$?" -ne "0" ]; then
958
+ {
959
+ echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
819
960
  cat /tmp/${STEPS[1]}.log
820
961
  exit 1
821
- }
962
+ }
963
+ fi
822
964
  ) &
823
965
 
824
966
 
@@ -836,11 +978,14 @@ available_node_types:
836
978
  fi;
837
979
  fi;
838
980
  export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
839
- ) > /tmp/${STEPS[2]}.log 2>&1 || {
840
- echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
981
+ ) > /tmp/${STEPS[2]}.log 2>&1
982
+ if [ "$?" -ne "0" ]; then
983
+ {
984
+ echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
841
985
  cat /tmp/${STEPS[2]}.log
842
986
  exit 1
843
- }
987
+ }
988
+ fi
844
989
  ) &
845
990
 
846
991
  function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
@@ -927,7 +1072,7 @@ available_node_types:
927
1072
  # Also, skip the jobs that are waiting to be scheduled as those do not have a controller process running.
928
1073
  # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
929
1074
  # will delete the service from the database after it is terminated so everything in the database is running.
930
- ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1075
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
931
1076
  if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
932
1077
  read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
933
1078
  fi
@@ -957,6 +1102,8 @@ available_node_types:
957
1102
 
958
1103
  touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
959
1104
  {% endif %}
1105
+ # Set +x to stop printing the commands and their arguments as they are executed.
1106
+ set +x
960
1107
 
961
1108
  trap : TERM INT; log_tail || sleep infinity & wait
962
1109
 
@@ -970,9 +1117,6 @@ available_node_types:
970
1117
  # object store. If you do not provide this, Ray will fall back to
971
1118
  # /tmp, which causes slowdowns if it is not a shared memory volume.
972
1119
  volumeMounts:
973
- - name: secret-volume
974
- readOnly: true
975
- mountPath: "/etc/secret-volume"
976
1120
  - mountPath: /dev/shm
977
1121
  name: dshm
978
1122
  {% if k8s_enable_gpudirect_tcpx %}
@@ -1204,24 +1348,21 @@ setup_commands:
1204
1348
  start_epoch=$(date +%s);
1205
1349
 
1206
1350
  # Wait for SSH setup to complete before proceeding
1207
- if [ -f /tmp/apt_ssh_setup_started ]; then
1208
- echo "=== Logs for asynchronous SSH setup ===";
1209
- [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
1210
- { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1211
- [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1212
- fi
1351
+ echo "=== Logs for asynchronous SSH setup ===";
1352
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1353
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1354
+ [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1213
1355
 
1214
1356
  echo "=== Logs for asynchronous ray and skypilot installation ===";
1215
- if [ -f /tmp/skypilot_is_nimbus ]; then
1216
- echo "=== Logs for asynchronous ray and skypilot installation ===";
1217
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
1218
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1219
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1220
- fi
1357
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1358
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1359
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1360
+
1221
1361
  end_epoch=$(date +%s);
1222
1362
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
1223
1363
  start_epoch=$(date +%s);
1224
1364
  {{ skypilot_wheel_installation_commands }}
1365
+ {{ copy_skypilot_templates_commands }}
1225
1366
  end_epoch=$(date +%s);
1226
1367
  echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
1227
1368
  start_epoch=$(date +%s);
@@ -91,6 +91,7 @@ setup_commands:
91
91
  rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10;
92
92
  {{ conda_installation_commands }}
93
93
  {{ ray_skypilot_installation_commands }}
94
+ {{ copy_skypilot_templates_commands }}
94
95
  touch ~/.sudo_as_admin_successful;
95
96
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
96
97
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -10,6 +10,7 @@ provider:
10
10
  module: sky.provision.nebius
11
11
  region: "{{region}}"
12
12
  use_internal_ips: {{use_internal_ips}}
13
+ use_static_ip_address: {{ use_static_ip_address }}
13
14
 
14
15
  {%- if docker_image is not none %}
15
16
  docker:
@@ -150,11 +151,13 @@ setup_commands:
150
151
  mkdir -p ~/.ssh; touch ~/.ssh/config;
151
152
  {{ conda_installation_commands }}
152
153
  {{ ray_skypilot_installation_commands }}
154
+ {{ copy_skypilot_templates_commands }}
153
155
  {%- if env_vars is defined %}
154
156
  {%- for env_var, env_value in env_vars.items() %}
155
157
  echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
156
158
  {%- endfor %}
157
159
  {%- endif %}
160
+ IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
158
161
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
159
162
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
160
163
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -85,6 +85,7 @@ setup_commands:
85
85
  mkdir -p ~/.ssh; touch ~/.ssh/config;
86
86
  {{ conda_installation_commands }}
87
87
  {{ ray_skypilot_installation_commands }}
88
+ {{ copy_skypilot_templates_commands }}
88
89
  touch ~/.sudo_as_admin_successful;
89
90
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
90
91
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -87,6 +87,7 @@ setup_commands:
87
87
  mkdir -p ~/.ssh; touch ~/.ssh/config;
88
88
  {{ conda_installation_commands }}
89
89
  {{ ray_skypilot_installation_commands }}
90
+ {{ copy_skypilot_templates_commands }}
90
91
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
91
92
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
92
93
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -0,0 +1,72 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.primeintellect
11
+ region: "{{region}}"
12
+ zones: "{{zones}}"
13
+
14
+ auth:
15
+ ssh_user: skypilot:ssh_user
16
+ ssh_private_key: {{ssh_private_key}}
17
+
18
+ available_node_types:
19
+ ray_head_default:
20
+ resources: {}
21
+ node_config:
22
+ InstanceType: {{instance_type}}
23
+ DiskSize: {{disk_size}}
24
+ ImageId: {{image_id}}
25
+ PublicKey: |-
26
+ skypilot:ssh_public_key_content
27
+
28
+ head_node_type: ray_head_default
29
+
30
+ # Format: `REMOTE_PATH : LOCAL_PATH`
31
+ file_mounts: {
32
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
33
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
34
+ {%- for remote_path, local_path in credentials.items() %}
35
+ "{{remote_path}}": "{{local_path}}",
36
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
37
+ {%- endfor %}
38
+ }
39
+
40
+ rsync_exclude: []
41
+
42
+ initialization_commands: []
43
+
44
+ # List of shell commands to run to set up nodes.
45
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
46
+ # connection, which is expensive. Try your best to co-locate commands into fewer
47
+ # items!
48
+ #
49
+ # Increment the following to make performance bugs easier to catch:
50
+ # current num items (num SSH connections): 1
51
+ setup_commands:
52
+ # Disable unattended-upgrades and handle apt-get locks
53
+ # Install patch utility for Ray
54
+ # Install conda and Ray
55
+ # Set system limits for Ray performance (nofile and TasksMax)
56
+ - {%- for initial_setup_command in initial_setup_commands %}
57
+ {{ initial_setup_command }}
58
+ {%- endfor %}
59
+ sudo systemctl stop unattended-upgrades || true;
60
+ sudo systemctl disable unattended-upgrades || true;
61
+ sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
62
+ sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
63
+ sudo pkill -9 apt-get;
64
+ sudo pkill -9 dpkg;
65
+ sudo dpkg --configure -a;
66
+ which patch > /dev/null || sudo apt install -y patch;
67
+ {{ conda_installation_commands }}
68
+ {{ ray_skypilot_installation_commands }}
69
+ {{ copy_skypilot_templates_commands }}
70
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
71
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
72
+ {{ ssh_max_sessions_config }}
@@ -93,6 +93,7 @@ setup_commands:
93
93
  mkdir -p ~/.ssh; touch ~/.ssh/config;
94
94
  {{ conda_installation_commands }}
95
95
  {{ ray_skypilot_installation_commands }}
96
+ {{ copy_skypilot_templates_commands }}
96
97
  touch ~/.sudo_as_admin_successful;
97
98
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
98
99
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -56,6 +56,7 @@ setup_commands:
56
56
  - mkdir -p ~/.ssh; touch ~/.ssh/config;
57
57
  {{ conda_installation_commands }}
58
58
  {{ ray_skypilot_installation_commands }}
59
+ {{ copy_skypilot_templates_commands }}
59
60
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
60
61
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
61
62
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;