skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -33,14 +33,11 @@ provider:
33
33
  networking_mode: {{k8s_networking_mode}}
34
34
 
35
35
  # We use internal IPs since we set up a port-forward between the kubernetes
36
- # cluster and the local machine, or directly use NodePort to reach the
37
- # head node.
36
+ # cluster and the local machine.
38
37
  use_internal_ips: true
39
38
 
40
39
  timeout: {{timeout}}
41
40
 
42
- ssh_jump_image: {{k8s_ssh_jump_image}}
43
-
44
41
  # Namespace used to host SkyPilot system components, such as fuse device
45
42
  # manager.
46
43
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -49,6 +46,10 @@ provider:
49
46
  # Used to set up the necessary permissions and sidecars.
50
47
  fuse_device_required: {{k8s_fuse_device_required}}
51
48
 
49
+ {% if ephemeral_volume_mounts %}
50
+ ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
51
+ {% endif %}
52
+
52
53
  # ServiceAccount created by the autoscaler for the head node pod that it
53
54
  # runs in. If this field isn't provided, the head pod config below must
54
55
  # contain a user-created service account with the proper permissions.
@@ -212,7 +213,9 @@ provider:
212
213
  metadata:
213
214
  labels:
214
215
  parent: skypilot
216
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
215
217
  skypilot-cluster: {{cluster_name_on_cloud}}
218
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
216
219
  skypilot-user: {{ user }}
217
220
  name: {{cluster_name_on_cloud}}-head-ssh
218
221
  spec:
@@ -230,7 +233,9 @@ provider:
230
233
  metadata:
231
234
  labels:
232
235
  parent: skypilot
236
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
233
237
  skypilot-cluster: {{cluster_name_on_cloud}}
238
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
234
239
  skypilot-user: {{ user }}
235
240
  # NOTE: If you're running multiple Ray clusters with services
236
241
  # on one Kubernetes cluster, they must have unique service
@@ -250,7 +255,9 @@ provider:
250
255
  metadata:
251
256
  labels:
252
257
  parent: skypilot
258
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
253
259
  skypilot-cluster: {{cluster_name_on_cloud}}
260
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
254
261
  skypilot-user: {{ user }}
255
262
  name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
256
263
  spec:
@@ -275,9 +282,8 @@ available_node_types:
275
282
  labels:
276
283
  parent: skypilot
277
284
  # component will be set for the head node pod to be the same as the head node service selector above if a
285
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
278
286
  skypilot-cluster: {{cluster_name_on_cloud}}
279
- # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
280
- skypilot-ssh-jump: {{k8s_ssh_jump_name}}
281
287
  skypilot-user: {{ user }}
282
288
  # Custom tags for the pods
283
289
  {%- for label_key, label_value in labels.items() %}
@@ -444,9 +450,6 @@ available_node_types:
444
450
  # object store. If you do not provide this, Ray will fall back to
445
451
  # /tmp, which causes slowdowns if it is not a shared memory volume.
446
452
  volumes:
447
- - name: secret-volume
448
- secret:
449
- secretName: {{k8s_ssh_key_secret_name}}
450
453
  - name: dshm
451
454
  emptyDir:
452
455
  medium: Memory
@@ -510,6 +513,24 @@ available_node_types:
510
513
  valueFrom:
511
514
  fieldRef:
512
515
  fieldPath: metadata.labels['ray-node-type']
516
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
517
+ valueFrom:
518
+ resourceFieldRef:
519
+ containerName: ray-node
520
+ resource: requests.cpu
521
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
522
+ valueFrom:
523
+ resourceFieldRef:
524
+ containerName: ray-node
525
+ resource: requests.memory
526
+ # Disable Ray memory monitor to prevent Ray's memory manager
527
+ # from interfering with kubernetes resource manager.
528
+ # If ray memory monitor is enabled, the ray memory monitor kills
529
+ # the running job if the job uses more than 95% of allocated memory,
530
+ # even if the job is not misbehaving or using its full allocated memory.
531
+ # This behavior does not give a chance for k8s scheduler to evict the pod.
532
+ - name: RAY_memory_monitor_refresh_ms
533
+ value: "0"
513
534
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
514
535
  - name: {{ key }}
515
536
  value: {{ value }}
@@ -630,12 +651,17 @@ available_node_types:
630
651
  command: ["/bin/bash", "-c", "--"]
631
652
  args:
632
653
  - |
633
- # For backwards compatibility, we put a marker file in the pod
634
- # to indicate that the pod is running with the changes introduced
635
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
636
- # TODO: Remove this marker file and it's usage in setup_commands
637
- # after v0.10.0 release.
638
- touch /tmp/skypilot_is_nimbus
654
+ # Set -x to print the commands and their arguments as they are executed.
655
+ # Useful for debugging.
656
+ set -x
657
+
658
+ # Execute user-provided post-provision runcmd
659
+ # before any of the SkyPilot setup commands.
660
+ {%- if runcmd %}
661
+ {%- for cmd in runcmd %}
662
+ {{cmd}}
663
+ {%- endfor %}
664
+ {%- endif %}
639
665
 
640
666
  # Helper function to conditionally use sudo
641
667
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
@@ -647,15 +673,125 @@ available_node_types:
647
673
  # STEP 1: Run apt update, install missing packages, and set up ssh.
648
674
  (
649
675
  (
650
- # For backwards compatibility, we put a marker file in the pod
651
- # to indicate that the apt ssh setup step will write a completion
652
- # marker file (/tmp/apt_ssh_setup_complete) to the pod.
653
- # TODO: Remove this marker file and its usage in setup_commands
654
- # after v0.11.0 release.
655
- touch /tmp/apt_ssh_setup_started
656
-
657
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
658
- echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
676
+ # Helper: run apt-get update with retries
677
+ apt_update_with_retries() {
678
+ # do not fail the whole shell; we handle return codes
679
+ set +e
680
+ local log=/tmp/apt-update.log
681
+ local tries=3
682
+ local delay=1
683
+ local i
684
+ for i in $(seq 1 $tries); do
685
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
686
+ echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
687
+ sleep $delay
688
+ delay=$((delay * 2))
689
+ done
690
+ set -e
691
+ return 1
692
+ }
693
+ apt_install_with_retries() {
694
+ local packages="$@"
695
+ [ -z "$packages" ] && return 0
696
+ set +e
697
+ local log=/tmp/apt-update.log
698
+ local tries=3
699
+ local delay=1
700
+ local i
701
+ for i in $(seq 1 $tries); do
702
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
703
+ echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
704
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
705
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
706
+ sleep $delay
707
+ delay=$((delay * 2))
708
+ done
709
+ set -e
710
+ return 1
711
+ }
712
+ apt_update_install_with_retries() {
713
+ apt_update_with_retries
714
+ apt_install_with_retries "$@"
715
+ }
716
+ backup_dir=/etc/apt/sources.list.backup_skypilot
717
+ backup_source() {
718
+ $(prefix_cmd) mkdir -p "$backup_dir"
719
+ if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
720
+ $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
721
+ fi
722
+ }
723
+ restore_source() {
724
+ if [ -f "$backup_dir/sources.list" ]; then
725
+ $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
726
+ fi
727
+ }
728
+ update_apt_sources() {
729
+ local host=$1
730
+ local apt_file=$2
731
+ $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
732
+ }
733
+ # Helper: install packages across mirrors with retries
734
+ apt_install_with_mirrors() {
735
+ local required=$1; shift
736
+ local packages="$@"
737
+ [ -z "$packages" ] && return 0
738
+ set +e
739
+ # Install packages with default sources first
740
+ local log=/tmp/apt-update.log
741
+ echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
742
+ restore_source
743
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
744
+ echo "Install failed with default sources: $packages" >> "$log"
745
+ # Detect distro (ubuntu/debian)
746
+ local APT_OS="unknown"
747
+ if [ -f /etc/os-release ]; then
748
+ . /etc/os-release
749
+ case "$ID" in
750
+ debian) APT_OS="debian" ;;
751
+ ubuntu) APT_OS="ubuntu" ;;
752
+ *)
753
+ if [ -n "$ID_LIKE" ]; then
754
+ case " $ID $ID_LIKE " in
755
+ *ubuntu*) APT_OS="ubuntu" ;;
756
+ *debian*) APT_OS="debian" ;;
757
+ esac
758
+ fi
759
+ ;;
760
+ esac
761
+ fi
762
+ # Build mirror candidates
763
+ # deb.debian.org is a CDN endpoint, if one backend goes down,
764
+ # the CDN automatically fails over to another mirror,
765
+ # so we only retry for ubuntu here.
766
+ if [ "$APT_OS" = "ubuntu" ]; then
767
+ # Backup current sources once
768
+ backup_source
769
+ # Selected from https://launchpad.net/ubuntu/+archivemirrors
770
+ # and results from apt-select
771
+ local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
772
+ for host in $MIRROR_CANDIDATES; do
773
+ echo "Trying APT mirror ($APT_OS): $host" >> "$log"
774
+ if [ -f /etc/apt/sources.list ]; then
775
+ update_apt_sources $host /etc/apt/sources.list
776
+ else
777
+ echo "Error: /etc/apt/sources.list not found" >> "$log"
778
+ break
779
+ fi
780
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
781
+ echo "Install failed with mirror ($APT_OS): $host" >> "$log"
782
+ # Restore to default sources
783
+ restore_source
784
+ done
785
+ fi
786
+ set -e
787
+ if [ "$required" = "1" ]; then
788
+ echo "Error: required package install failed across all mirrors: $packages" >> "$log"
789
+ return 1
790
+ else
791
+ echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
792
+ return 0
793
+ fi
794
+ }
659
795
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
660
796
  # so that both fusermount and fusermount3 can be masked before enabling SSH access.
661
797
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
@@ -682,7 +818,7 @@ available_node_types:
682
818
  done;
683
819
  if [ ! -z "$INSTALL_FIRST" ]; then
684
820
  echo "Installing core packages: $INSTALL_FIRST";
685
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
821
+ apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
686
822
  fi;
687
823
  # SSH and other packages are not necessary, so we disable set -e
688
824
  set +e
@@ -706,7 +842,8 @@ available_node_types:
706
842
  fi
707
843
  $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
708
844
  $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
709
- FUSERMOUNT3_PATH=$(which fusermount3)
845
+ # "|| true" because fusermount3 is not always available
846
+ FUSERMOUNT3_PATH=$(which fusermount3) || true
710
847
  if [ -z "$FUSERMOUNT3_PATH" ]; then
711
848
  FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
712
849
  fi
@@ -748,18 +885,23 @@ available_node_types:
748
885
  $(prefix_cmd) mkdir -p ~/.ssh;
749
886
  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
750
887
  $(prefix_cmd) chmod 700 ~/.ssh;
751
- $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
888
+ $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
889
+ skypilot:ssh_public_key_content
890
+ SKYPILOT_SSH_KEY_EOF
752
891
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
753
892
  $(prefix_cmd) service ssh restart;
754
893
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
755
894
 
756
895
  touch /tmp/apt_ssh_setup_complete
757
896
  echo "=== SSH setup completed ==="
758
- ) > /tmp/${STEPS[0]}.log 2>&1 || {
759
- echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
897
+ ) > /tmp/${STEPS[0]}.log 2>&1
898
+ if [ "$?" -ne "0" ]; then
899
+ {
900
+ echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
760
901
  cat /tmp/${STEPS[0]}.log
761
902
  exit 1
762
- }
903
+ }
904
+ fi
763
905
  ) &
764
906
 
765
907
  # STEP 2: Install conda, ray and skypilot (for dependencies); start
@@ -777,15 +919,20 @@ available_node_types:
777
919
  {{ conda_installation_commands }}
778
920
  {{ ray_installation_commands }}
779
921
 
780
- VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
922
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
923
+ # unset PYTHONPATH in case the user provided docker image set it.
924
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
781
925
  # Wait for `patch` package to be installed before applying ray patches
782
926
  until dpkg -l | grep -q "^ii patch "; do
783
927
  sleep 0.1
784
928
  echo "Waiting for patch package to be installed..."
785
929
  done
786
930
  # Apply Ray patches for progress bar fix
787
- ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
788
- VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
931
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
932
+ # unset PYTHONPATH in case the user provided docker image set it.
933
+ # ~/.sky/python_path is seeded by conda_installation_commands
934
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
935
+ env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
789
936
  }
790
937
  touch /tmp/ray_skypilot_installation_complete
791
938
  echo "=== Ray and skypilot installation completed ==="
@@ -814,11 +961,14 @@ available_node_types:
814
961
  set +e
815
962
  {{ ray_worker_start_command }}
816
963
  fi
817
- ) > /tmp/${STEPS[1]}.log 2>&1 || {
818
- echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
964
+ ) > /tmp/${STEPS[1]}.log 2>&1
965
+ if [ "$?" -ne "0" ]; then
966
+ {
967
+ echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
819
968
  cat /tmp/${STEPS[1]}.log
820
969
  exit 1
821
- }
970
+ }
971
+ fi
822
972
  ) &
823
973
 
824
974
 
@@ -836,11 +986,14 @@ available_node_types:
836
986
  fi;
837
987
  fi;
838
988
  export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
839
- ) > /tmp/${STEPS[2]}.log 2>&1 || {
840
- echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
989
+ ) > /tmp/${STEPS[2]}.log 2>&1
990
+ if [ "$?" -ne "0" ]; then
991
+ {
992
+ echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
841
993
  cat /tmp/${STEPS[2]}.log
842
994
  exit 1
843
- }
995
+ }
996
+ fi
844
997
  ) &
845
998
 
846
999
  function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
@@ -927,7 +1080,7 @@ available_node_types:
927
1080
  # Also, skip the jobs that are waiting to be scheduled, as those do not have a controller process running.
928
1081
  # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
929
1082
  # will delete the service from the database after it is terminated so everything in the database is running.
930
- ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1083
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
931
1084
  if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
932
1085
  read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
933
1086
  fi
@@ -957,6 +1110,8 @@ available_node_types:
957
1110
 
958
1111
  touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
959
1112
  {% endif %}
1113
+ # Set +x to stop printing the commands and their arguments as they are executed.
1114
+ set +x
960
1115
 
961
1116
  trap : TERM INT; log_tail || sleep infinity & wait
962
1117
 
@@ -970,9 +1125,6 @@ available_node_types:
970
1125
  # object store. If you do not provide this, Ray will fall back to
971
1126
  # /tmp, which causes slowdowns if it is not a shared memory volume.
972
1127
  volumeMounts:
973
- - name: secret-volume
974
- readOnly: true
975
- mountPath: "/etc/secret-volume"
976
1128
  - mountPath: /dev/shm
977
1129
  name: dshm
978
1130
  {% if k8s_enable_gpudirect_tcpx %}
@@ -1204,24 +1356,21 @@ setup_commands:
1204
1356
  start_epoch=$(date +%s);
1205
1357
 
1206
1358
  # Wait for SSH setup to complete before proceeding
1207
- if [ -f /tmp/apt_ssh_setup_started ]; then
1208
- echo "=== Logs for asynchronous SSH setup ===";
1209
- [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
1210
- { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1211
- [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1212
- fi
1359
+ echo "=== Logs for asynchronous SSH setup ===";
1360
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1361
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1362
+ [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1213
1363
 
1214
1364
  echo "=== Logs for asynchronous ray and skypilot installation ===";
1215
- if [ -f /tmp/skypilot_is_nimbus ]; then
1216
- echo "=== Logs for asynchronous ray and skypilot installation ===";
1217
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
1218
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1219
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1220
- fi
1365
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1366
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1367
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1368
+
1221
1369
  end_epoch=$(date +%s);
1222
1370
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
1223
1371
  start_epoch=$(date +%s);
1224
1372
  {{ skypilot_wheel_installation_commands }}
1373
+ {{ copy_skypilot_templates_commands }}
1225
1374
  end_epoch=$(date +%s);
1226
1375
  echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
1227
1376
  start_epoch=$(date +%s);
@@ -91,6 +91,7 @@ setup_commands:
91
91
  rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10;
92
92
  {{ conda_installation_commands }}
93
93
  {{ ray_skypilot_installation_commands }}
94
+ {{ copy_skypilot_templates_commands }}
94
95
  touch ~/.sudo_as_admin_successful;
95
96
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
96
97
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -10,6 +10,7 @@ provider:
10
10
  module: sky.provision.nebius
11
11
  region: "{{region}}"
12
12
  use_internal_ips: {{use_internal_ips}}
13
+ use_static_ip_address: {{ use_static_ip_address }}
13
14
 
14
15
  {%- if docker_image is not none %}
15
16
  docker:
@@ -150,11 +151,13 @@ setup_commands:
150
151
  mkdir -p ~/.ssh; touch ~/.ssh/config;
151
152
  {{ conda_installation_commands }}
152
153
  {{ ray_skypilot_installation_commands }}
154
+ {{ copy_skypilot_templates_commands }}
153
155
  {%- if env_vars is defined %}
154
156
  {%- for env_var, env_value in env_vars.items() %}
155
157
  echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
156
158
  {%- endfor %}
157
159
  {%- endif %}
160
+ IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
158
161
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
159
162
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
160
163
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -85,6 +85,7 @@ setup_commands:
85
85
  mkdir -p ~/.ssh; touch ~/.ssh/config;
86
86
  {{ conda_installation_commands }}
87
87
  {{ ray_skypilot_installation_commands }}
88
+ {{ copy_skypilot_templates_commands }}
88
89
  touch ~/.sudo_as_admin_successful;
89
90
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
90
91
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -87,6 +87,7 @@ setup_commands:
87
87
  mkdir -p ~/.ssh; touch ~/.ssh/config;
88
88
  {{ conda_installation_commands }}
89
89
  {{ ray_skypilot_installation_commands }}
90
+ {{ copy_skypilot_templates_commands }}
90
91
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
91
92
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
92
93
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -0,0 +1,72 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.primeintellect
11
+ region: "{{region}}"
12
+ zones: "{{zones}}"
13
+
14
+ auth:
15
+ ssh_user: skypilot:ssh_user
16
+ ssh_private_key: {{ssh_private_key}}
17
+
18
+ available_node_types:
19
+ ray_head_default:
20
+ resources: {}
21
+ node_config:
22
+ InstanceType: {{instance_type}}
23
+ DiskSize: {{disk_size}}
24
+ ImageId: {{image_id}}
25
+ PublicKey: |-
26
+ skypilot:ssh_public_key_content
27
+
28
+ head_node_type: ray_head_default
29
+
30
+ # Format: `REMOTE_PATH : LOCAL_PATH`. The cluster SSH key is listed once, outside the credentials loop, so it is mounted even when `credentials` is empty.
31
+ file_mounts: {
32
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
33
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
34
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
35
+ {%- for remote_path, local_path in credentials.items() %}
36
+ "{{remote_path}}": "{{local_path}}",
37
+ {%- endfor %}
38
+ }
39
+
40
+ rsync_exclude: []
41
+
42
+ initialization_commands: []
43
+
44
+ # List of shell commands to run to set up nodes.
45
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
46
+ # connection, which is expensive. Try your best to co-locate commands into fewer
47
+ # items!
48
+ #
49
+ # Increment the following to make performance bugs easier to catch:
50
+ # current num items (num SSH connections): 1
51
+ setup_commands:
52
+ # Disable unattended-upgrades and handle apt-get locks
53
+ # Install patch utility for Ray
54
+ # Install conda and Ray
55
+ # Set system limits for Ray performance (nofile and TasksMax)
56
+ - {%- for initial_setup_command in initial_setup_commands %}
57
+ {{ initial_setup_command }}
58
+ {%- endfor %}
59
+ sudo systemctl stop unattended-upgrades || true;
60
+ sudo systemctl disable unattended-upgrades || true;
61
+ sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
62
+ sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
63
+ sudo pkill -9 apt-get;
64
+ sudo pkill -9 dpkg;
65
+ sudo dpkg --configure -a;
66
+ which patch > /dev/null || sudo apt install -y patch;
67
+ {{ conda_installation_commands }}
68
+ {{ ray_skypilot_installation_commands }}
69
+ {{ copy_skypilot_templates_commands }}
70
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
71
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
72
+ {{ ssh_max_sessions_config }}
@@ -93,6 +93,7 @@ setup_commands:
93
93
  mkdir -p ~/.ssh; touch ~/.ssh/config;
94
94
  {{ conda_installation_commands }}
95
95
  {{ ray_skypilot_installation_commands }}
96
+ {{ copy_skypilot_templates_commands }}
96
97
  touch ~/.sudo_as_admin_successful;
97
98
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
98
99
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -56,6 +56,7 @@ setup_commands:
56
56
  - mkdir -p ~/.ssh; touch ~/.ssh/config;
57
57
  {{ conda_installation_commands }}
58
58
  {{ ray_skypilot_installation_commands }}
59
+ {{ copy_skypilot_templates_commands }}
59
60
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
60
61
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
61
62
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;