skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/execution.py CHANGED
@@ -3,8 +3,9 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
+ import logging
6
7
  import typing
7
- from typing import List, Optional, Tuple, Union
8
+ from typing import Callable, List, Optional, Tuple, Union
8
9
 
9
10
  import colorama
10
11
 
@@ -14,7 +15,9 @@ from sky import clouds
14
15
  from sky import global_user_state
15
16
  from sky import optimizer
16
17
  from sky import sky_logging
18
+ from sky import task as task_lib
17
19
  from sky.backends import backend_utils
20
+ from sky.server.requests import request_names
18
21
  from sky.skylet import autostop_lib
19
22
  from sky.usage import usage_lib
20
23
  from sky.utils import admin_policy_utils
@@ -30,6 +33,7 @@ from sky.utils import ux_utils
30
33
 
31
34
  if typing.TYPE_CHECKING:
32
35
  import sky
36
+ from sky import resources as resources_lib
33
37
 
34
38
  logger = sky_logging.init_logger(__name__)
35
39
 
@@ -110,16 +114,18 @@ def _execute(
110
114
  stages: Optional[List[Stage]] = None,
111
115
  cluster_name: Optional[str] = None,
112
116
  detach_setup: bool = False,
113
- detach_run: bool = False,
114
117
  idle_minutes_to_autostop: Optional[int] = None,
115
118
  no_setup: bool = False,
116
119
  clone_disk_from: Optional[str] = None,
117
120
  skip_unnecessary_provisioning: bool = False,
121
+ *, #keyword only separator
118
122
  # Internal only:
119
123
  # pylint: disable=invalid-name
124
+ _request_name: request_names.AdminPolicyRequestName,
120
125
  _quiet_optimizer: bool = False,
121
126
  _is_launched_by_jobs_controller: bool = False,
122
127
  _is_launched_by_sky_serve_controller: bool = False,
128
+ job_logger: logging.Logger = logger,
123
129
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
124
130
  """Execute an entrypoint.
125
131
 
@@ -154,8 +160,6 @@ def _execute(
154
160
  job itself. You can safely ctrl-c to detach from logging, and it will
155
161
  not interrupt the setup process. To see the logs again after detaching,
156
162
  use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
157
- detach_run: If True, as soon as a job is submitted, return from this
158
- function and do not stream execution logs.
159
163
  idle_minutes_to_autostop: int; if provided, the cluster will be set to
160
164
  autostop after this many minutes of idleness.
161
165
  no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -172,6 +176,13 @@ def _execute(
172
176
  handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
173
177
  if dryrun.
174
178
  """
179
+ if _request_name == request_names.AdminPolicyRequestName.CLUSTER_LAUNCH:
180
+ if _is_launched_by_jobs_controller:
181
+ _request_name = (
182
+ request_names.AdminPolicyRequestName.JOBS_LAUNCH_CLUSTER)
183
+ elif _is_launched_by_sky_serve_controller:
184
+ _request_name = (
185
+ request_names.AdminPolicyRequestName.SERVE_LAUNCH_REPLICA)
175
186
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
176
187
  for task in dag.tasks:
177
188
  for resource in task.resources:
@@ -187,6 +198,7 @@ def _execute(
187
198
  idle_minutes_to_autostop = resource.autostop_config.idle_minutes
188
199
  with admin_policy_utils.apply_and_use_config_in_current_request(
189
200
  dag,
201
+ request_name=_request_name,
190
202
  request_options=admin_policy.RequestOptions(
191
203
  cluster_name=cluster_name,
192
204
  idle_minutes_to_autostop=idle_minutes_to_autostop,
@@ -214,14 +226,14 @@ def _execute(
214
226
  stages=stages,
215
227
  cluster_name=cluster_name,
216
228
  detach_setup=detach_setup,
217
- detach_run=detach_run,
218
229
  no_setup=no_setup,
219
230
  clone_disk_from=clone_disk_from,
220
231
  skip_unnecessary_provisioning=skip_unnecessary_provisioning,
221
232
  _quiet_optimizer=_quiet_optimizer,
222
233
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
223
234
  _is_launched_by_sky_serve_controller=
224
- _is_launched_by_sky_serve_controller)
235
+ _is_launched_by_sky_serve_controller,
236
+ job_logger=job_logger)
225
237
 
226
238
 
227
239
  def _execute_dag(
@@ -235,7 +247,6 @@ def _execute_dag(
235
247
  stages: Optional[List[Stage]],
236
248
  cluster_name: Optional[str],
237
249
  detach_setup: bool,
238
- detach_run: bool,
239
250
  no_setup: bool,
240
251
  clone_disk_from: Optional[str],
241
252
  skip_unnecessary_provisioning: bool,
@@ -243,6 +254,7 @@ def _execute_dag(
243
254
  _quiet_optimizer: bool,
244
255
  _is_launched_by_jobs_controller: bool,
245
256
  _is_launched_by_sky_serve_controller: bool,
257
+ job_logger: logging.Logger = logger,
246
258
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
247
259
  """Execute a DAG.
248
260
 
@@ -253,7 +265,7 @@ def _execute_dag(
253
265
  task = dag.tasks[0]
254
266
 
255
267
  if any(r.job_recovery is not None for r in task.resources):
256
- logger.warning(
268
+ job_logger.warning(
257
269
  f'{colorama.Style.DIM}The task has `job_recovery` specified, '
258
270
  'but is launched as an unmanaged job. It will be ignored.'
259
271
  'To enable job recovery, use managed jobs: sky jobs launch.'
@@ -261,8 +273,10 @@ def _execute_dag(
261
273
 
262
274
  cluster_exists = False
263
275
  if cluster_name is not None:
264
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
265
- cluster_exists = cluster_record is not None
276
+ # We use launched_at to check if the cluster exists, because this
277
+ # db query is faster than get_cluster_from_name.
278
+ cluster_exists = global_user_state.cluster_with_name_exists(
279
+ cluster_name)
266
280
  # TODO(woosuk): If the cluster exists, print a warning that
267
281
  # `cpus` and `memory` are not used as a job scheduling constraint,
268
282
  # unlike `gpus`.
@@ -334,10 +348,10 @@ def _execute_dag(
334
348
  # itself have no task running and start the auto{stop,down}
335
349
  # process, before the task is submitted in the EXEC stage.
336
350
  verb = 'torn down' if down else 'stopped'
337
- logger.info(f'{colorama.Style.DIM}The cluster will '
338
- f'be {verb} after 1 minutes of idleness '
339
- '(after all jobs finish).'
340
- f'{colorama.Style.RESET_ALL}')
351
+ job_logger.info(f'{colorama.Style.DIM}The cluster will '
352
+ f'be {verb} after 1 minutes of idleness '
353
+ '(after all jobs finish).'
354
+ f'{colorama.Style.RESET_ALL}')
341
355
  idle_minutes_to_autostop = 1
342
356
  if Stage.DOWN in stages:
343
357
  stages.remove(Stage.DOWN)
@@ -366,7 +380,7 @@ def _execute_dag(
366
380
  yellow = colorama.Fore.YELLOW
367
381
  bold = colorama.Style.BRIGHT
368
382
  reset = colorama.Style.RESET_ALL
369
- logger.info(
383
+ job_logger.info(
370
384
  f'{yellow}Launching a spot job that does not '
371
385
  f'automatically recover from preemptions. To '
372
386
  'get automatic recovery, use managed job instead: '
@@ -385,7 +399,7 @@ def _execute_dag(
385
399
  controller = controller_utils.Controllers.from_name(
386
400
  cluster_name)
387
401
  if controller is not None:
388
- logger.info(
402
+ job_logger.info(
389
403
  f'Choosing resources for {controller.value.name}...'
390
404
  )
391
405
  dag = optimizer.Optimizer.optimize(dag,
@@ -394,6 +408,26 @@ def _execute_dag(
394
408
  task = dag.tasks[0] # Keep: dag may have been deep-copied.
395
409
  assert task.best_resources is not None, task
396
410
 
411
+ # Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
412
+ # lock. After the backend acquires the lock and refreshes state, the
413
+ # original "do we need to optimize?" decision may be stale (e.g., the
414
+ # cluster just got terminated). To compensate without moving the optimizer
415
+ # into the backend, we inject a small planner the backend can call under
416
+ # the lock only when no reusable snapshot and no caller plan exist.
417
+ planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
418
+ if isinstance(backend,
419
+ backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
420
+
421
+ def _planner(_t: 'sky.Task'):
422
+ new_dag = optimizer.Optimizer.optimize(dag,
423
+ minimize=optimize_target,
424
+ quiet=_quiet_optimizer)
425
+ new_task = new_dag.tasks[0]
426
+ assert new_task.best_resources is not None, new_task
427
+ return new_task.best_resources.assert_launchable()
428
+
429
+ planner = _planner
430
+
397
431
  backend.register_info(
398
432
  dag=dag,
399
433
  optimize_target=optimize_target,
@@ -402,7 +436,8 @@ def _execute_dag(
402
436
  # after K8S pod recovers from a crash.
403
437
  # See `kubernetes-ray.yml.j2` for more details.
404
438
  dump_final_script=is_controller_high_availability_supported,
405
- is_managed=is_managed)
439
+ is_managed=is_managed,
440
+ planner=planner)
406
441
 
407
442
  if task.storage_mounts is not None:
408
443
  # Optimizer should eventually choose where to store bucket
@@ -427,7 +462,7 @@ def _execute_dag(
427
462
  if handle is None:
428
463
  assert dryrun, ('If not dryrun, handle must be set or '
429
464
  'Stage.PROVISION must be included in stages.')
430
- logger.info('Dryrun finished.')
465
+ job_logger.info('Dryrun finished.')
431
466
  return None, None
432
467
 
433
468
  do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
@@ -436,7 +471,7 @@ def _execute_dag(
436
471
  (task.file_mounts is not None or
437
472
  task.storage_mounts is not None))
438
473
  if do_workdir or do_file_mounts:
439
- logger.info(ux_utils.starting_message('Syncing files.'))
474
+ job_logger.info(ux_utils.starting_message('Syncing files.'))
440
475
 
441
476
  if do_workdir:
442
477
  if cluster_name is not None:
@@ -444,7 +479,9 @@ def _execute_dag(
444
479
  cluster_name, status_lib.ClusterStatus.INIT,
445
480
  'Syncing files to cluster',
446
481
  global_user_state.ClusterEventType.STATUS_CHANGE)
447
- backend.sync_workdir(handle, task.workdir, task.envs_and_secrets)
482
+ envs_and_secrets = task_lib.get_plaintext_envs_and_secrets(
483
+ task.envs_and_secrets)
484
+ backend.sync_workdir(handle, task.workdir, envs_and_secrets)
448
485
 
449
486
  if do_file_mounts:
450
487
  if cluster_name is not None:
@@ -456,11 +493,11 @@ def _execute_dag(
456
493
  task.storage_mounts)
457
494
 
458
495
  if no_setup:
459
- logger.info('Setup commands skipped.')
496
+ job_logger.info('Setup commands skipped.')
460
497
  elif Stage.SETUP in stages and not dryrun:
461
498
  if skip_unnecessary_provisioning and provisioning_skipped:
462
- logger.debug('Unnecessary provisioning was skipped, so '
463
- 'skipping setup as well.')
499
+ job_logger.debug('Unnecessary provisioning was skipped, so '
500
+ 'skipping setup as well.')
464
501
  else:
465
502
  if cluster_name is not None:
466
503
  global_user_state.add_cluster_event(
@@ -479,10 +516,7 @@ def _execute_dag(
479
516
  if Stage.EXEC in stages:
480
517
  try:
481
518
  global_user_state.update_last_use(handle.get_cluster_name())
482
- job_id = backend.execute(handle,
483
- task,
484
- detach_run,
485
- dryrun=dryrun)
519
+ job_id = backend.execute(handle, task, dryrun=dryrun)
486
520
  finally:
487
521
  # Enables post_execute() to be run after KeyboardInterrupt.
488
522
  backend.post_execute(handle, down)
@@ -515,12 +549,16 @@ def launch(
515
549
  no_setup: bool = False,
516
550
  clone_disk_from: Optional[str] = None,
517
551
  fast: bool = False,
552
+ *, #keyword only separator
518
553
  # Internal only:
519
554
  # pylint: disable=invalid-name
520
555
  _quiet_optimizer: bool = False,
521
556
  _is_launched_by_jobs_controller: bool = False,
522
557
  _is_launched_by_sky_serve_controller: bool = False,
523
558
  _disable_controller_check: bool = False,
559
+ _request_name: request_names.AdminPolicyRequestName = request_names.
560
+ AdminPolicyRequestName.CLUSTER_LAUNCH,
561
+ job_logger: logging.Logger = logger,
524
562
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
525
563
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
526
564
  """Launches a cluster or task.
@@ -666,7 +704,6 @@ def launch(
666
704
  # see the setup logs when inspecting the launch process to know
667
705
  # excatly what the job is waiting for.
668
706
  detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
669
-
670
707
  return _execute(
671
708
  entrypoint=entrypoint,
672
709
  dryrun=dryrun,
@@ -679,7 +716,6 @@ def launch(
679
716
  stages=stages,
680
717
  cluster_name=cluster_name,
681
718
  detach_setup=detach_setup,
682
- detach_run=True,
683
719
  idle_minutes_to_autostop=idle_minutes_to_autostop,
684
720
  no_setup=no_setup,
685
721
  clone_disk_from=clone_disk_from,
@@ -688,7 +724,12 @@ def launch(
688
724
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
689
725
  _is_launched_by_sky_serve_controller=
690
726
  _is_launched_by_sky_serve_controller,
691
- )
727
+ _request_name=_request_name,
728
+ job_logger=job_logger)
729
+
730
+
731
+ # needed for backward compatibility. Remove by v0.12.0
732
+ cluster_launch = launch
692
733
 
693
734
 
694
735
  @usage_lib.entrypoint
@@ -699,6 +740,7 @@ def exec( # pylint: disable=redefined-builtin
699
740
  down: bool = False,
700
741
  stream_logs: bool = True,
701
742
  backend: Optional[backends.Backend] = None,
743
+ job_logger: logging.Logger = logger,
702
744
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
703
745
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
704
746
  """Executes a task on an existing cluster.
@@ -773,5 +815,6 @@ def exec( # pylint: disable=redefined-builtin
773
815
  Stage.EXEC,
774
816
  ],
775
817
  cluster_name=cluster_name,
776
- detach_run=True,
818
+ job_logger=job_logger,
819
+ _request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
777
820
  )