skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py CHANGED
@@ -1,5 +1,4 @@
1
1
  """Constants for SkyPilot."""
2
- import os
3
2
  from typing import List, Tuple
4
3
 
5
4
  from packaging import version
@@ -7,6 +6,23 @@ from packaging import version
7
6
  import sky
8
7
  from sky.setup_files import dependencies
9
8
 
9
+ # The base directory for all SkyPilot runtime artifacts.
10
+ # Historically, we have always used $HOME, but we couldn't
11
+ # do that for Slurm, because $HOME typically points to a NFS
12
+ # mounted directory, which does not work well with SQLite.
13
+ # https://sqlite.org/faq.html#q5
14
+ # Additionally, having the skypilot-runtime python venv be
15
+ # on an NFS makes things very slow.
16
+ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
17
+ # Same as above but for use within python code instead of shell commands.
18
+ # Example usage:
19
+ # os.path.join(
20
+ # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
21
+ # '.sky/jobs.db')
22
+ SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
23
+ # We keep sky_logs and sky_workdir in $HOME, because
24
+ # these are artifacts that users can access, and having
25
+ # them be in $HOME makes it more convenient.
10
26
  SKY_LOGS_DIRECTORY = '~/sky_logs'
11
27
  SKY_REMOTE_WORKDIR = '~/sky_workdir'
12
28
  SKY_IGNORE_FILE = '.skyignore'
@@ -25,22 +41,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
25
41
  f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
26
42
  # The file contains the ports of the Ray cluster that SkyPilot launched,
27
43
  # i.e. the PORT_DICT_STR above.
28
- SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
44
+ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
29
45
  SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
30
46
  SKY_REMOTE_RAY_VERSION = '2.9.3'
31
47
 
48
+ SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
32
49
  # We store the absolute path of the python executable (/opt/conda/bin/python3)
33
50
  # in this file, so that any future internal commands that need to use python
34
51
  # can use this path. This is useful for the case where the user has a custom
35
52
  # conda environment as a default environment, which is not the same as the one
36
53
  # used for installing SkyPilot runtime (ray and skypilot).
37
- SKY_PYTHON_PATH_FILE = '~/.sky/python_path'
38
- SKY_RAY_PATH_FILE = '~/.sky/ray_path'
54
+ SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
55
+ SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
39
56
  SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
40
57
  f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
41
58
  'which python3')
42
59
  # Python executable, e.g., /opt/conda/bin/python3
43
- SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
60
+ SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
44
61
  # Prefer SKY_UV_PIP_CMD, which is faster.
45
62
  # TODO(cooperc): remove remaining usage (GCP TPU setup).
46
63
  SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -52,21 +69,29 @@ SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
52
69
  f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
53
70
  # Separate env for SkyPilot runtime dependencies.
54
71
  SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
55
- SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
72
+ SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
56
73
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
57
74
  # uv is used for venv and pip, much faster than python implementations.
58
75
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
59
- SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
76
+ # set UV_SYSTEM_PYTHON to false in case the
77
+ # user provided docker image set it to true.
78
+ # unset PYTHONPATH in case the user provided docker image set it.
79
+ SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
80
+ f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
60
81
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
61
82
  SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
62
83
  'curl -LsSf https://astral.sh/uv/install.sh '
63
84
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
64
85
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
65
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
66
- # environment. `deactivate` command does not work when conda is used.
86
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
87
+ '--no-project --no-config')
88
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
89
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
90
+ # not work when conda is used.
67
91
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
68
92
  'export PATH='
69
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
93
+ f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
94
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
70
95
 
71
96
  # Prefix for SkyPilot environment variables
72
97
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -91,14 +116,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
91
116
  # cluster yaml is updated.
92
117
  #
93
118
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
94
- SKYLET_VERSION = '17'
119
+ SKYLET_VERSION = '27'
95
120
  # The version of the lib files that skylet/jobs use. Whenever there is an API
96
121
  # change for the job_lib or log_lib, we need to bump this version, so that the
97
122
  # user can be notified to update their SkyPilot version on the remote cluster.
98
123
  SKYLET_LIB_VERSION = 4
99
- SKYLET_VERSION_FILE = '~/.sky/skylet_version'
124
+ SKYLET_VERSION_FILE = '.sky/skylet_version'
125
+ SKYLET_LOG_FILE = '.sky/skylet.log'
126
+ SKYLET_PID_FILE = '.sky/skylet_pid'
127
+ SKYLET_PORT_FILE = '.sky/skylet_port'
100
128
  SKYLET_GRPC_PORT = 46590
101
- SKYLET_GRPC_TIMEOUT_SECONDS = 5
129
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
102
130
 
103
131
  # Docker default options
104
132
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -150,7 +178,7 @@ CONDA_INSTALLATION_COMMANDS = (
150
178
  # because for some images, conda is already installed, but not initialized.
151
179
  # In this case, we need to initialize conda and set auto_activate_base to
152
180
  # true.
153
- '{ bash Miniconda3-Linux.sh -b; '
181
+ '{ bash Miniconda3-Linux.sh -b || true; '
154
182
  'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
155
183
  # Caller should replace {conda_auto_activate} with either true or false.
156
184
  'conda config --set auto_activate_base {conda_auto_activate} && '
@@ -172,7 +200,7 @@ CONDA_INSTALLATION_COMMANDS = (
172
200
  'fi;'
173
201
  # Install uv for venv management and pip installation.
174
202
  f'{SKY_UV_INSTALL_CMD};'
175
- # Create a separate conda environment for SkyPilot dependencies.
203
+ # Create a separate python environment for SkyPilot dependencies.
176
204
  f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
177
205
  # Do NOT use --system-site-packages here, because if users upgrade any
178
206
  # packages in the base env, they interfere with skypilot dependencies.
@@ -217,7 +245,9 @@ RAY_INSTALLATION_COMMANDS = (
217
245
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
218
246
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
219
247
  f'|| {RAY_STATUS} || '
220
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
248
+ # The pydantic-core==2.41.3 for arm seems corrupted
249
+ # so we need to avoid that specific version.
250
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
221
251
  # In some envs, e.g. pip does not have permission to write under /opt/conda
222
252
  # ray package will be installed under ~/.local/bin. If the user's PATH does
223
253
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -229,9 +259,24 @@ RAY_INSTALLATION_COMMANDS = (
229
259
  'export PATH=$PATH:$HOME/.local/bin; '
230
260
  # Writes ray path to file if it does not exist or the file is empty.
231
261
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
232
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
262
+ f'{{ {SKY_UV_RUN_CMD} '
233
263
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
234
264
 
265
+ # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
266
+ # This must run after the skypilot wheel is installed.
267
+ COPY_SKYPILOT_TEMPLATES_COMMANDS = (
268
+ f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
269
+ f'{SKY_PYTHON_CMD} -c \''
270
+ 'import sky_templates, shutil, os; '
271
+ 'src = os.path.dirname(sky_templates.__file__); '
272
+ 'dst = os.path.expanduser(\"~/sky_templates\"); '
273
+ 'print(f\"Copying templates from {src} to {dst}...\"); '
274
+ 'shutil.copytree(src, dst, dirs_exist_ok=True); '
275
+ 'print(f\"Templates copied successfully\")\'; '
276
+ # Make scripts executable.
277
+ 'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
278
+ '-exec chmod +x {} \\; ')
279
+
235
280
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
236
281
  f'{SKY_UV_INSTALL_CMD};'
237
282
  f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
@@ -322,6 +367,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
322
367
  # controller_utils.translate_local_file_mounts_to_two_hop().
323
368
  FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
324
369
 
370
+ # For passing in CPU and memory limits to the controller pod when running
371
+ # in k8s. Right now, we only use this for the jobs controller, but we may
372
+ # use this for the serve controller as well in the future.
373
+ # These files are written to disk by the skylet, who reads it from env vars
374
+ # passed by the backend when starting the skylet (start_skylet_on_head_node).
375
+ CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
376
+ CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
377
+
325
378
  # Used when an managed jobs are created and
326
379
  # files are synced up to the cloud.
327
380
  FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
@@ -353,6 +406,8 @@ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
353
406
  # SkyPilot environment variables
354
407
  SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
355
408
  SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
409
+ SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
410
+ f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
356
411
  SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
357
412
  SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
358
413
 
@@ -371,7 +426,9 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
371
426
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
372
427
  ('docker', 'run_options'),
373
428
  ('nvidia_gpus', 'disable_ecc'),
429
+ ('ssh', 'custom_metadata'),
374
430
  ('ssh', 'pod_config'),
431
+ ('ssh', 'provision_timeout'),
375
432
  ('kubernetes', 'custom_metadata'),
376
433
  ('kubernetes', 'pod_config'),
377
434
  ('kubernetes', 'provision_timeout'),
@@ -381,13 +438,31 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
381
438
  ('gcp', 'enable_gvnic'),
382
439
  ('gcp', 'enable_gpu_direct'),
383
440
  ('gcp', 'placement_policy'),
441
+ ('active_workspace',),
384
442
  ]
385
443
  # When overriding the SkyPilot configs on the API server with the client one,
386
444
  # we skip the following keys because they are meant to be client-side configs.
387
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
388
- ('allowed_clouds',),
389
- ('workspaces',), ('db',),
390
- ('daemons',)]
445
+ # Also, we skip the consolidation mode configs, as those should only be set on
446
+ # the API server side.
447
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
448
+ ('api_server',),
449
+ ('allowed_clouds',),
450
+ ('workspaces',),
451
+ ('db',),
452
+ ('daemons',),
453
+ # TODO(kevin,tian): Override the whole controller config once our test
454
+ # infrastructure supports setting dynamic server side configs.
455
+ # Tests that are affected:
456
+ # - test_managed_jobs_ha_kill_starting
457
+ # - test_managed_jobs_ha_kill_running
458
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
459
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
460
+ # but the configs won't be applied)
461
+ ('jobs', 'controller', 'consolidation_mode'),
462
+ ('serve', 'controller', 'consolidation_mode'),
463
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
464
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
465
+ ]
391
466
 
392
467
  # Constants for Azure blob storage
393
468
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -421,6 +496,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
421
496
  # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
422
497
  # Environment variable that is set to 'true' if this is a skypilot server.
423
498
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
499
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
500
+ IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
501
+
502
+ SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
503
+ f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
424
504
 
425
505
  # Environment variable that is set to 'true' if metrics are enabled.
426
506
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
@@ -436,6 +516,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
436
516
  # authentication is enabled in the API server.
437
517
  ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
438
518
  SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
519
+ SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
439
520
  ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
440
521
 
441
522
  # Enable debug logging for requests.
@@ -447,11 +528,12 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
447
528
  # BEGIN constants used for service catalog.
448
529
  HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
449
530
  HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
450
- CATALOG_SCHEMA_VERSION = 'v7'
531
+ CATALOG_SCHEMA_VERSION = 'v8'
451
532
  CATALOG_DIR = '~/.sky/catalogs'
452
533
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
453
534
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
454
- 'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic')
535
+ 'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
536
+ 'hyperbolic', 'seeweb', 'shadeform')
455
537
  # END constants used for service catalog.
456
538
 
457
539
  # The user ID of the SkyPilot system.
@@ -503,8 +585,11 @@ DEFAULT_PRIORITY = 0
503
585
  GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
504
586
  COST_REPORT_DEFAULT_DAYS = 30
505
587
 
506
- # The directory for file locks.
507
- SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
508
-
509
588
  ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
510
589
  'DEBUG_LOOP_LAG_THRESHOLD_MS')
590
+
591
+ ARM64_ARCH = 'arm64'
592
+ X86_64_ARCH = 'x86_64'
593
+
594
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
595
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -11,7 +11,8 @@ import psutil
11
11
  from sky import clouds
12
12
  from sky import sky_logging
13
13
  from sky.backends import cloud_vm_ray_backend
14
- from sky.jobs import scheduler as managed_job_scheduler
14
+ from sky.jobs import constants as managed_job_constants
15
+ from sky.jobs import scheduler
15
16
  from sky.jobs import state as managed_job_state
16
17
  from sky.jobs import utils as managed_job_utils
17
18
  from sky.serve import serve_utils
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
21
22
  from sky.usage import usage_lib
22
23
  from sky.utils import cluster_utils
23
24
  from sky.utils import registry
25
+ from sky.utils import subprocess_utils
24
26
  from sky.utils import ux_utils
25
27
  from sky.utils import yaml_utils
26
28
 
@@ -45,6 +47,9 @@ class SkyletEvent:
45
47
  EVENT_CHECKING_INTERVAL_SECONDS))
46
48
  self._n = 0
47
49
 
50
+ def start(self):
51
+ pass
52
+
48
53
  def run(self):
49
54
  self._n = (self._n + 1) % self._event_interval
50
55
  if self._n % self._event_interval == 0:
@@ -73,18 +78,60 @@ class ManagedJobEvent(SkyletEvent):
73
78
  """Skylet event for updating and scheduling managed jobs."""
74
79
  EVENT_INTERVAL_SECONDS = 300
75
80
 
81
+ def start(self):
82
+ cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
83
+ if cpus_env_var is not None:
84
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
85
+ 'w',
86
+ encoding='utf-8') as f:
87
+ f.write(cpus_env_var)
88
+ memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
89
+ if memory_env_var is not None:
90
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
91
+ 'w',
92
+ encoding='utf-8') as f:
93
+ f.write(memory_env_var)
94
+
76
95
  def _run(self):
96
+ if not os.path.exists(
97
+ os.path.expanduser(
98
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
99
+ ) and not managed_job_utils.is_consolidation_mode():
100
+ # Note: since the skylet is started before the user setup (in
101
+ # jobs-controller.yaml.j2) runs, it's possible that we hit this
102
+ # before the indicator file is written. However, since we will wait
103
+ # EVENT_INTERVAL_SECONDS before the first run, this should be very
104
+ # unlikely.
105
+ logger.info('No jobs controller indicator file found.')
106
+ all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
107
+ if not all_job_ids:
108
+ logger.info('No jobs running. Stopping controllers.')
109
+ # TODO(cooperc): Move this to a shared function also called by
110
+ # sdk.api_stop(). (#7229)
111
+ try:
112
+ records = scheduler.get_controller_process_records()
113
+ if records is not None:
114
+ for record in records:
115
+ if managed_job_utils.controller_process_alive(
116
+ record, quiet=False):
117
+ subprocess_utils.kill_children_processes(
118
+ parent_pids=[record.pid], force=True)
119
+ os.remove(
120
+ os.path.expanduser(
121
+ scheduler.JOB_CONTROLLER_PID_PATH))
122
+ except Exception as e: # pylint: disable=broad-except
123
+ # in case we get perm issues or something is messed up, just
124
+ # ignore it and assume the process is dead
125
+ logger.error(
126
+ f'Error looking at job controller pid file: {e}')
127
+ pass
128
+ logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
129
+ 'indicator file hasn\'t been written yet.')
130
+ return
131
+
77
132
  logger.info('=== Updating managed job status ===')
78
133
  managed_job_utils.update_managed_jobs_statuses()
79
-
80
-
81
- class ManagedJobSchedulingEvent(SkyletEvent):
82
- """Skylet event for scheduling managed jobs."""
83
- EVENT_INTERVAL_SECONDS = 20
84
-
85
- def _run(self):
86
- logger.info('=== Scheduling next jobs ===')
87
- managed_job_scheduler.maybe_schedule_next_jobs()
134
+ scheduler.maybe_start_controllers()
88
135
 
89
136
 
90
137
  class ServiceUpdateEvent(SkyletEvent):
@@ -275,8 +322,15 @@ class AutostopEvent(SkyletEvent):
275
322
  cluster_name_on_cloud = cluster_config['cluster_name']
276
323
  is_cluster_multinode = cluster_config['max_workers'] > 0
277
324
 
325
+ # Clear AWS credentials from environment to force boto3 to use IAM
326
+ # role attached to the instance (lowest priority in credential chain).
327
+ # This allows the cluster to stop/terminate itself using its IAM role.
278
328
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
279
329
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
330
+ os.environ.pop('AWS_SESSION_TOKEN', None)
331
+ # Point boto3 to /dev/null to skip reading credentials from files.
332
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
333
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'
280
334
 
281
335
  # Stop the ray autoscaler to avoid scaling up, during
282
336
  # stopping/terminating of the cluster.