skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py CHANGED
@@ -1,5 +1,4 @@
1
1
  """Constants for SkyPilot."""
2
- import os
3
2
  from typing import List, Tuple
4
3
 
5
4
  from packaging import version
@@ -7,8 +6,26 @@ from packaging import version
7
6
  import sky
8
7
  from sky.setup_files import dependencies
9
8
 
9
+ # The base directory for all SkyPilot runtime artifacts.
10
+ # Historically, we have always used $HOME, but we couldn't
11
+ # do that for Slurm, because $HOME typically points to a NFS
12
+ # mounted directory, which does not work well with SQLite.
13
+ # https://sqlite.org/faq.html#q5
14
+ # Additionally, having the skypilot-runtime python venv be
15
+ # on an NFS makes things very slow.
16
+ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
17
+ # Same as above but for use within python code instead of shell commands.
18
+ # Example usage:
19
+ # os.path.join(
20
+ # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
21
+ # '.sky/jobs.db')
22
+ SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
23
+ # We keep sky_logs and sky_workdir in $HOME, because
24
+ # these are artifacts that users can access, and having
25
+ # them be in $HOME makes it more convenient.
10
26
  SKY_LOGS_DIRECTORY = '~/sky_logs'
11
27
  SKY_REMOTE_WORKDIR = '~/sky_workdir'
28
+ SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
12
29
  SKY_IGNORE_FILE = '.skyignore'
13
30
  GIT_IGNORE_FILE = '.gitignore'
14
31
 
@@ -25,22 +42,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
25
42
  f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
26
43
  # The file contains the ports of the Ray cluster that SkyPilot launched,
27
44
  # i.e. the PORT_DICT_STR above.
28
- SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
45
+ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
29
46
  SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
30
47
  SKY_REMOTE_RAY_VERSION = '2.9.3'
31
48
 
49
+ SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
32
50
  # We store the absolute path of the python executable (/opt/conda/bin/python3)
33
51
  # in this file, so that any future internal commands that need to use python
34
52
  # can use this path. This is useful for the case where the user has a custom
35
53
  # conda environment as a default environment, which is not the same as the one
36
54
  # used for installing SkyPilot runtime (ray and skypilot).
37
- SKY_PYTHON_PATH_FILE = '~/.sky/python_path'
38
- SKY_RAY_PATH_FILE = '~/.sky/ray_path'
55
+ SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
56
+ SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
39
57
  SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
40
58
  f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
41
59
  'which python3')
42
60
  # Python executable, e.g., /opt/conda/bin/python3
43
- SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
61
+ SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
44
62
  # Prefer SKY_UV_PIP_CMD, which is faster.
45
63
  # TODO(cooperc): remove remaining usage (GCP TPU setup).
46
64
  SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -50,23 +68,44 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
50
68
  # #!/opt/conda/bin/python3
51
69
  SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
52
70
  f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
71
+
72
+ # Use $(which env) to find env, falling back to /usr/bin/env if which is
73
+ # unavailable. This works around a Slurm quirk where srun's execvp() doesn't
74
+ # check execute permissions, failing when $HOME/.local/bin/env (non-executable,
75
+ # from uv installation) shadows /usr/bin/env.
76
+ SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
77
+ '-u PYTHONPATH')
78
+ SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
79
+ f'$({SKY_GET_PYTHON_PATH_CMD})')
80
+
53
81
  # Separate env for SkyPilot runtime dependencies.
54
82
  SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
55
- SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
83
+ SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
56
84
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
85
+ # Place the conda root in the runtime directory, as installing to $HOME
86
+ # on an NFS takes too long (1-2m slower).
87
+ SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
57
88
  # uv is used for venv and pip, much faster than python implementations.
58
89
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
59
- SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
90
+ # set UV_SYSTEM_PYTHON to false in case the
91
+ # user provided docker image set it to true.
92
+ # unset PYTHONPATH in case the user provided docker image set it.
93
+ SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
94
+ f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
60
95
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
61
96
  SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
62
97
  'curl -LsSf https://astral.sh/uv/install.sh '
63
98
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
64
99
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
65
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
66
- # environment. `deactivate` command does not work when conda is used.
100
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
101
+ '--no-project --no-config')
102
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
103
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
104
+ # not work when conda is used.
67
105
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
68
106
  'export PATH='
69
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
107
+ f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
108
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
70
109
 
71
110
  # Prefix for SkyPilot environment variables
72
111
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -91,14 +130,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
91
130
  # cluster yaml is updated.
92
131
  #
93
132
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
94
- SKYLET_VERSION = '17'
133
+ SKYLET_VERSION = '27'
95
134
  # The version of the lib files that skylet/jobs use. Whenever there is an API
96
135
  # change for the job_lib or log_lib, we need to bump this version, so that the
97
136
  # user can be notified to update their SkyPilot version on the remote cluster.
98
137
  SKYLET_LIB_VERSION = 4
99
- SKYLET_VERSION_FILE = '~/.sky/skylet_version'
138
+ SKYLET_VERSION_FILE = '.sky/skylet_version'
139
+ SKYLET_LOG_FILE = '.sky/skylet.log'
140
+ SKYLET_PID_FILE = '.sky/skylet_pid'
141
+ SKYLET_PORT_FILE = '.sky/skylet_port'
100
142
  SKYLET_GRPC_PORT = 46590
101
- SKYLET_GRPC_TIMEOUT_SECONDS = 5
143
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
102
144
 
103
145
  # Docker default options
104
146
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -134,6 +176,10 @@ DISABLE_GPU_ECC_COMMAND = (
134
176
  '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
135
177
  '|| true; ')
136
178
 
179
+ SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
180
+ f'mkdir -p ~/.sky/sky_app && '
181
+ f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
182
+
137
183
  # Install conda on the remote cluster if it is not already installed.
138
184
  # We use conda with python 3.10 to be consistent across multiple clouds with
139
185
  # best effort.
@@ -150,8 +196,9 @@ CONDA_INSTALLATION_COMMANDS = (
150
196
  # because for some images, conda is already installed, but not initialized.
151
197
  # In this case, we need to initialize conda and set auto_activate_base to
152
198
  # true.
153
- '{ bash Miniconda3-Linux.sh -b; '
154
- 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
199
+ '{ '
200
+ f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
201
+ f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
155
202
  # Caller should replace {conda_auto_activate} with either true or false.
156
203
  'conda config --set auto_activate_base {conda_auto_activate} && '
157
204
  'conda activate base; }; '
@@ -172,7 +219,7 @@ CONDA_INSTALLATION_COMMANDS = (
172
219
  'fi;'
173
220
  # Install uv for venv management and pip installation.
174
221
  f'{SKY_UV_INSTALL_CMD};'
175
- # Create a separate conda environment for SkyPilot dependencies.
222
+ # Create a separate python environment for SkyPilot dependencies.
176
223
  f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
177
224
  # Do NOT use --system-site-packages here, because if users upgrade any
178
225
  # packages in the base env, they interfere with skypilot dependencies.
@@ -194,7 +241,7 @@ _sky_version = str(version.parse(sky.__version__))
194
241
  RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
195
242
  RAY_INSTALLATION_COMMANDS = (
196
243
  f'{SKY_UV_INSTALL_CMD};'
197
- 'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
244
+ f'{SETUP_SKY_DIRS_COMMANDS}'
198
245
  # Print the PATH in provision.log to help debug PATH issues.
199
246
  'echo PATH=$PATH; '
200
247
  # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -217,7 +264,9 @@ RAY_INSTALLATION_COMMANDS = (
217
264
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
218
265
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
219
266
  f'|| {RAY_STATUS} || '
220
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
267
+ # The pydantic-core==2.41.3 for arm seems corrupted
268
+ # so we need to avoid that specific version.
269
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
221
270
  # In some envs, e.g. pip does not have permission to write under /opt/conda
222
271
  # ray package will be installed under ~/.local/bin. If the user's PATH does
223
272
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -226,12 +275,32 @@ RAY_INSTALLATION_COMMANDS = (
226
275
  #
227
276
  # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
228
277
  # mentioned above are resolved.
229
- 'export PATH=$PATH:$HOME/.local/bin; '
278
+ f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
230
279
  # Writes ray path to file if it does not exist or the file is empty.
231
280
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
232
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
281
+ f'{{ {SKY_UV_RUN_CMD} '
233
282
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
234
283
 
284
+ # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
285
+ # This must run after the skypilot wheel is installed.
286
+ # Note: We remove ~/sky_templates first to avoid import conflicts where Python
287
+ # would import from ~/sky_templates instead of site-packages (because
288
+ # sky_templates itself is a package), leading to src == dst error when
289
+ # launching on an existing cluster.
290
+ COPY_SKYPILOT_TEMPLATES_COMMANDS = (
291
+ f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
292
+ f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
293
+ f'{SKY_PYTHON_CMD} -c \''
294
+ 'import sky_templates, shutil, os; '
295
+ 'src = os.path.dirname(sky_templates.__file__); '
296
+ f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
297
+ 'print(f\"Copying templates from {src} to {dst}...\"); '
298
+ 'shutil.copytree(src, dst); '
299
+ 'print(f\"Templates copied successfully\")\'; '
300
+ # Make scripts executable.
301
+ f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
302
+ '-exec chmod +x {} + ; ')
303
+
235
304
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
236
305
  f'{SKY_UV_INSTALL_CMD};'
237
306
  f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
@@ -322,6 +391,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
322
391
  # controller_utils.translate_local_file_mounts_to_two_hop().
323
392
  FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
324
393
 
394
+ # For passing in CPU and memory limits to the controller pod when running
395
+ # in k8s. Right now, we only use this for the jobs controller, but we may
396
+ # use this for the serve controller as well in the future.
397
+ # These files are written to disk by the skylet, which reads them from env vars
398
+ # passed by the backend when starting the skylet (start_skylet_on_head_node).
399
+ CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
400
+ CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
401
+
325
402
  # Used when managed jobs are created and
326
403
  # files are synced up to the cloud.
327
404
  FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
@@ -353,6 +430,8 @@ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
353
430
  # SkyPilot environment variables
354
431
  SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
355
432
  SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
433
+ SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
434
+ f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
356
435
  SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
357
436
  SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
358
437
 
@@ -371,7 +450,9 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
371
450
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
372
451
  ('docker', 'run_options'),
373
452
  ('nvidia_gpus', 'disable_ecc'),
453
+ ('ssh', 'custom_metadata'),
374
454
  ('ssh', 'pod_config'),
455
+ ('ssh', 'provision_timeout'),
375
456
  ('kubernetes', 'custom_metadata'),
376
457
  ('kubernetes', 'pod_config'),
377
458
  ('kubernetes', 'provision_timeout'),
@@ -381,13 +462,32 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
381
462
  ('gcp', 'enable_gvnic'),
382
463
  ('gcp', 'enable_gpu_direct'),
383
464
  ('gcp', 'placement_policy'),
465
+ ('vast', 'secure_only'),
466
+ ('active_workspace',),
384
467
  ]
385
468
  # When overriding the SkyPilot configs on the API server with the client one,
386
469
  # we skip the following keys because they are meant to be client-side configs.
387
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
388
- ('allowed_clouds',),
389
- ('workspaces',), ('db',),
390
- ('daemons',)]
470
+ # Also, we skip the consolidation mode config as those should be only set on
471
+ # the API server side.
472
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
473
+ ('api_server',),
474
+ ('allowed_clouds',),
475
+ ('workspaces',),
476
+ ('db',),
477
+ ('daemons',),
478
+ # TODO(kevin,tian): Override the whole controller config once our test
479
+ # infrastructure supports setting dynamic server side configs.
480
+ # Tests that are affected:
481
+ # - test_managed_jobs_ha_kill_starting
482
+ # - test_managed_jobs_ha_kill_running
483
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
484
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
485
+ # but the configs won't be applied)
486
+ ('jobs', 'controller', 'consolidation_mode'),
487
+ ('serve', 'controller', 'consolidation_mode'),
488
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
489
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
490
+ ]
391
491
 
392
492
  # Constants for Azure blob storage
393
493
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -421,6 +521,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
421
521
  # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
422
522
  # Environment variable that is set to 'true' if this is a skypilot server.
423
523
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
524
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
525
+ IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
526
+
527
+ SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
528
+ f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
424
529
 
425
530
  # Environment variable that is set to 'true' if metrics are enabled.
426
531
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
@@ -436,6 +541,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
436
541
  # authentication is enabled in the API server.
437
542
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
438
543
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
544
+ SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
439
545
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
440
546
 
441
547
  # Enable debug logging for requests.
@@ -447,11 +553,12 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
447
553
  # BEGIN constants used for service catalog.
448
554
  HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
449
555
  HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
450
- CATALOG_SCHEMA_VERSION = 'v7'
556
+ CATALOG_SCHEMA_VERSION = 'v8'
451
557
  CATALOG_DIR = '~/.sky/catalogs'
452
558
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
453
559
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
454
- 'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic')
560
+ 'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
561
+ 'hyperbolic', 'seeweb', 'shadeform')
455
562
  # END constants used for service catalog.
456
563
 
457
564
  # The user ID of the SkyPilot system.
@@ -503,8 +610,11 @@ DEFAULT_PRIORITY = 0
503
610
  GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
504
611
  COST_REPORT_DEFAULT_DAYS = 30
505
612
 
506
- # The directory for file locks.
507
- SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
508
-
509
613
  ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
510
614
  'DEBUG_LOOP_LAG_THRESHOLD_MS')
615
+
616
+ ARM64_ARCH = 'arm64'
617
+ X86_64_ARCH = 'x86_64'
618
+
619
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
620
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -11,7 +11,8 @@ import psutil
11
11
  from sky import clouds
12
12
  from sky import sky_logging
13
13
  from sky.backends import cloud_vm_ray_backend
14
- from sky.jobs import scheduler as managed_job_scheduler
14
+ from sky.jobs import constants as managed_job_constants
15
+ from sky.jobs import scheduler
15
16
  from sky.jobs import state as managed_job_state
16
17
  from sky.jobs import utils as managed_job_utils
17
18
  from sky.serve import serve_utils
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
21
22
  from sky.usage import usage_lib
22
23
  from sky.utils import cluster_utils
23
24
  from sky.utils import registry
25
+ from sky.utils import subprocess_utils
24
26
  from sky.utils import ux_utils
25
27
  from sky.utils import yaml_utils
26
28
 
@@ -45,6 +47,9 @@ class SkyletEvent:
45
47
  EVENT_CHECKING_INTERVAL_SECONDS))
46
48
  self._n = 0
47
49
 
50
+ def start(self):
51
+ pass
52
+
48
53
  def run(self):
49
54
  self._n = (self._n + 1) % self._event_interval
50
55
  if self._n % self._event_interval == 0:
@@ -73,18 +78,60 @@ class ManagedJobEvent(SkyletEvent):
73
78
  """Skylet event for updating and scheduling managed jobs."""
74
79
  EVENT_INTERVAL_SECONDS = 300
75
80
 
81
+ def start(self):
82
+ cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
83
+ if cpus_env_var is not None:
84
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
85
+ 'w',
86
+ encoding='utf-8') as f:
87
+ f.write(cpus_env_var)
88
+ memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
89
+ if memory_env_var is not None:
90
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
91
+ 'w',
92
+ encoding='utf-8') as f:
93
+ f.write(memory_env_var)
94
+
76
95
  def _run(self):
96
+ if not os.path.exists(
97
+ os.path.expanduser(
98
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
99
+ ) and not managed_job_utils.is_consolidation_mode():
100
+ # Note: since the skylet is started before the user setup (in
101
+ # jobs-controller.yaml.j2) runs, it's possible that we hit this
102
+ # before the indicator file is written. However, since we will wait
103
+ # EVENT_INTERVAL_SECONDS before the first run, this should be very
104
+ # unlikely.
105
+ logger.info('No jobs controller indicator file found.')
106
+ all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
107
+ if not all_job_ids:
108
+ logger.info('No jobs running. Stopping controllers.')
109
+ # TODO(cooperc): Move this to a shared function also called by
110
+ # sdk.api_stop(). (#7229)
111
+ try:
112
+ records = scheduler.get_controller_process_records()
113
+ if records is not None:
114
+ for record in records:
115
+ if managed_job_utils.controller_process_alive(
116
+ record, quiet=False):
117
+ subprocess_utils.kill_children_processes(
118
+ parent_pids=[record.pid], force=True)
119
+ os.remove(
120
+ os.path.expanduser(
121
+ scheduler.JOB_CONTROLLER_PID_PATH))
122
+ except Exception as e: # pylint: disable=broad-except
123
+ # in case we get perm issues or something is messed up, just
124
+ # ignore it and assume the process is dead
125
+ logger.error(
126
+ f'Error looking at job controller pid file: {e}')
127
+ pass
128
+ logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
129
+ 'indicator file hasn\'t been written yet.')
130
+ return
131
+
77
132
  logger.info('=== Updating managed job status ===')
78
133
  managed_job_utils.update_managed_jobs_statuses()
79
-
80
-
81
- class ManagedJobSchedulingEvent(SkyletEvent):
82
- """Skylet event for scheduling managed jobs."""
83
- EVENT_INTERVAL_SECONDS = 20
84
-
85
- def _run(self):
86
- logger.info('=== Scheduling next jobs ===')
87
- managed_job_scheduler.maybe_schedule_next_jobs()
134
+ scheduler.maybe_start_controllers()
88
135
 
89
136
 
90
137
  class ServiceUpdateEvent(SkyletEvent):
@@ -189,7 +236,7 @@ class AutostopEvent(SkyletEvent):
189
236
  RAY_PROVISIONER_SKYPILOT_TERMINATOR):
190
237
  logger.info('Using new provisioner to stop the cluster.')
191
238
  self._stop_cluster_with_new_provisioner(autostop_config, config,
192
- provider_name)
239
+ provider_name, cloud)
193
240
  return
194
241
  logger.info('Not using new provisioner to stop the cluster. '
195
242
  f'Cloud of this cluster: {provider_name}')
@@ -267,7 +314,8 @@ class AutostopEvent(SkyletEvent):
267
314
  raise NotImplementedError
268
315
 
269
316
  def _stop_cluster_with_new_provisioner(self, autostop_config,
270
- cluster_config, provider_name):
317
+ cluster_config, provider_name,
318
+ cloud):
271
319
  # pylint: disable=import-outside-toplevel
272
320
  from sky import provision as provision_lib
273
321
  autostop_lib.set_autostopping_started()
@@ -275,13 +323,25 @@ class AutostopEvent(SkyletEvent):
275
323
  cluster_name_on_cloud = cluster_config['cluster_name']
276
324
  is_cluster_multinode = cluster_config['max_workers'] > 0
277
325
 
326
+ # Clear AWS credentials from environment to force boto3 to use IAM
327
+ # role attached to the instance (lowest priority in credential chain).
328
+ # This allows the cluster to stop/terminate itself using its IAM role.
278
329
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
279
330
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
331
+ os.environ.pop('AWS_SESSION_TOKEN', None)
332
+ # Point boto3 to /dev/null to skip reading credentials from files.
333
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
334
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'
280
335
 
281
336
  # Stop the ray autoscaler to avoid scaling up, during
282
337
  # stopping/terminating of the cluster.
283
- logger.info('Stopping the ray cluster.')
284
- subprocess.run(f'{constants.SKY_RAY_CMD} stop', shell=True, check=True)
338
+ if not cloud.uses_ray():
339
+ logger.info('Skipping ray stop as cloud does not use Ray.')
340
+ else:
341
+ logger.info('Stopping the ray cluster.')
342
+ subprocess.run(f'{constants.SKY_RAY_CMD} stop',
343
+ shell=True,
344
+ check=True)
285
345
 
286
346
  operation_fn = provision_lib.stop_instances
287
347
  if autostop_config.down:
@@ -0,0 +1 @@
1
+ """Task Executors"""
@@ -0,0 +1,189 @@
1
+ """Slurm distributed task executor for SkyPilot.
2
+
3
+ This module is invoked on each Slurm compute node via:
4
+ srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
5
+ """
6
+ import argparse
7
+ import json
8
+ import os
9
+ import pathlib
10
+ import socket
11
+ import subprocess
12
+ import sys
13
+ import time
14
+
15
+ import colorama
16
+
17
+ from sky.skylet.log_lib import run_bash_command_with_log
18
+
19
+
20
+ def _get_ip_address() -> str:
21
+ """Get the IP address of the current node."""
22
+ ip_result = subprocess.run(['hostname', '-I'],
23
+ capture_output=True,
24
+ text=True,
25
+ check=False)
26
+ return ip_result.stdout.strip().split(
27
+ )[0] if ip_result.returncode == 0 else 'unknown'
28
+
29
+
30
+ def _get_job_node_ips() -> str:
31
+ """Get IPs of all nodes in the current Slurm job."""
32
+ nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
33
+ assert nodelist, 'SLURM_JOB_NODELIST is not set'
34
+
35
+ # Expand compressed nodelist (e.g., "node[1-3,5]"
36
+ # -> "node1\nnode2\nnode3\nnode5")
37
+ result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
38
+ capture_output=True,
39
+ text=True,
40
+ check=False)
41
+ if result.returncode != 0:
42
+ raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
43
+
44
+ hostnames = result.stdout.strip().split('\n')
45
+ ips = []
46
+ for hostname in hostnames:
47
+ try:
48
+ ip = socket.gethostbyname(hostname)
49
+ ips.append(ip)
50
+ except socket.gaierror as e:
51
+ raise RuntimeError('Failed to get IP for hostname: '
52
+ f'{hostname}') from e
53
+
54
+ return '\n'.join(ips)
55
+
56
+
57
def main():
    """Run the user script (or setup script) on this Slurm compute node.

    Parses the command line, derives this node's rank and its index within
    the cluster, picks a per-node log file, optionally coordinates with the
    setup phase through signal files, runs the script via
    `run_bash_command_with_log`, and exits with the script's return code.
    """
    cli = argparse.ArgumentParser(
        description='SkyPilot Slurm task runner for distributed execution')
    cli.add_argument('--script', help='User script (inline, shell-quoted)')
    cli.add_argument('--script-path',
                     help='Path to script file (if too long for inline)')
    cli.add_argument('--env-vars',
                     default='{}',
                     help='JSON-encoded environment variables')
    cli.add_argument('--log-dir',
                     required=True,
                     help='Directory for log files')
    cli.add_argument('--cluster-num-nodes',
                     type=int,
                     required=True,
                     help='Total number of nodes in the cluster')
    cli.add_argument('--cluster-ips',
                     required=True,
                     help='Comma-separated list of cluster node IPs')
    cli.add_argument('--task-name',
                     default=None,
                     help='Task name for single-node log prefix')
    cli.add_argument('--is-setup',
                     action='store_true',
                     help='Whether this is a setup command (affects logging '
                     'prefix and filename)')
    cli.add_argument('--alloc-signal-file',
                     help='Path to allocation signal file')
    cli.add_argument('--setup-done-signal-file',
                     help='Path to setup-done signal file')
    opts = cli.parse_args()

    assert opts.script is not None or opts.script_path is not None, (
        'Either --script or --script-path must be provided')

    # Task rank, different from index of the node in the cluster.
    rank = int(os.environ['SLURM_PROCID'])
    num_nodes = int(os.environ.get('SLURM_NNODES', 1))
    single_node = opts.cluster_num_nodes == 1

    # The node index is this node's position in the --cluster-ips list.
    known_ips = opts.cluster_ips.split(',')
    my_ip = _get_ip_address()
    try:
        node_idx = known_ips.index(my_ip)
    except ValueError as err:
        raise RuntimeError(f'IP address {my_ip} not found in '
                           f'cluster IPs: {known_ips}') from err
    on_head = node_idx == 0
    node_name = 'head' if on_head else f'worker{node_idx}'

    # Log files are written to a shared filesystem, so each node must use a
    # unique filename to avoid collisions.
    # TODO(kevin): The setup log name is inconsistent with other clouds,
    # where it is simply 'setup.log'. On Slurm the ~/sky_logs directory is
    # shared by all nodes, so a single 'setup.log' would be overwritten.
    # Perhaps this naming convention should be applied to other clouds.
    if opts.is_setup:
        log_filename = f'setup-{node_name}.log'
    elif single_node:
        log_filename = 'run.log'
    else:
        log_filename = f'{rank}-{node_name}.log'
    log_path = os.path.join(opts.log_dir, log_filename)

    # A script file takes precedence over the inline script.
    if opts.script_path:
        with open(opts.script_path, 'r', encoding='utf-8') as script_file:
            script = script_file.read()
    else:
        script = opts.script

    # Parse env vars and add SKYPILOT environment variables.
    env_vars = json.loads(opts.env_vars)
    if not opts.is_setup:
        # For setup, env vars are set in CloudVmRayBackend._setup.
        env_vars['SKYPILOT_NODE_RANK'] = str(rank)
        env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
        env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()

    # Signal file coordination for setup/run synchronization:
    # rank 0 touches the allocation signal to indicate resources acquired.
    if rank == 0 and opts.alloc_signal_file is not None:
        pathlib.Path(opts.alloc_signal_file).touch()

    # Wait for setup to complete (poll until the signal file appears).
    setup_done_file = opts.setup_done_signal_file
    if setup_done_file is not None:
        while not os.path.exists(setup_done_file):
            time.sleep(0.1)

    # Build the log prefix. The text inside the parentheses is:
    #   setup on head:          setup pid={pid}
    #   setup on workers:       setup pid={pid}, ip=1.2.3.4
    #   single-node cluster:    task_name, pid={pid}
    #   multi-node on head:     head, rank=0, pid={pid}
    #   multi-node on workers:  worker1, rank=1, pid={pid}, ip=1.2.3.4
    # The literal {pid} placeholder is left in place for the logging helper
    # to fill in.
    if opts.is_setup:
        label = 'setup pid={pid}'
        show_ip = not on_head
    elif single_node:
        name_str = opts.task_name if opts.task_name else 'task'
        label = f'{name_str}, pid={{pid}}'
        show_ip = False
    else:
        label = f'{node_name}, rank={rank}, pid={{pid}}'
        show_ip = not on_head
    if show_ip:
        label = f'{label}, ip={my_ip}'
    prefix = f'{colorama.Fore.CYAN}({label}){colorama.Style.RESET_ALL} '

    returncode = run_bash_command_with_log(script,
                                           log_path,
                                           env_vars=env_vars,
                                           stream_logs=True,
                                           streaming_prefix=prefix)
    sys.exit(returncode)
186
+
187
+
188
+ if __name__ == '__main__':
189
+ main()