skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,128 @@
1
+ """Utilities for managing managed job file content.
2
+
3
+ The helpers in this module fetch job file content (DAG YAML/env files) from the
4
+ database-first storage added for managed jobs, transparently falling back to
5
+ legacy on-disk paths when needed. Consumers should prefer the string-based
6
+ helpers so controllers never have to rely on local disk state.
7
+ """
8
+
9
+ import os
10
+ from typing import Optional
11
+
12
+ from sky import sky_logging
13
+ from sky import skypilot_config
14
+ from sky.jobs import state as managed_job_state
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def get_job_dag_content(job_id: int) -> Optional[str]:
    """Return the DAG YAML text of a managed job.

    The content stored in the database takes precedence. Only when the
    database holds no content is the recorded on-disk path consulted, which
    keeps jobs that predate database storage working.

    Args:
        job_id: The managed job ID.

    Returns:
        The DAG YAML as a string, or None if it could not be obtained from
        either the database or the disk.
    """
    record = managed_job_state.get_job_file_contents(job_id)

    stored_yaml = record['dag_yaml_content']
    if stored_yaml is not None:
        return stored_yaml

    # Backward compatibility: fall back to the path recorded for the job.
    legacy_path = record.get('dag_yaml_path')
    if legacy_path and os.path.exists(legacy_path):
        try:
            with open(legacy_path, 'r', encoding='utf-8') as dag_file:
                yaml_text = dag_file.read()
        except OSError as e:
            # The file may vanish or be unreadable between the existence
            # check and the read; treat that the same as "not found".
            logger.warning(
                f'Failed to read DAG YAML from disk {legacy_path}: {e}')
        else:
            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
                         legacy_path)
            return yaml_text

    logger.warning(f'DAG YAML content not found for job {job_id}')
    return None
49
+
50
+
51
def get_job_env_content(job_id: int) -> Optional[str]:
    """Return the environment file text of a managed job.

    The content stored in the database takes precedence. Only when the
    database holds no content is the recorded on-disk path consulted, which
    keeps jobs that predate database storage working.

    Args:
        job_id: The managed job ID.

    Returns:
        The environment file content as a string, or None if it could not
        be obtained from either the database or the disk.
    """
    record = managed_job_state.get_job_file_contents(job_id)

    stored_env = record['env_file_content']
    if stored_env is not None:
        return stored_env

    # Backward compatibility: fall back to the path recorded for the job.
    legacy_path = record.get('env_file_path')
    if legacy_path and os.path.exists(legacy_path):
        try:
            with open(legacy_path, 'r', encoding='utf-8') as env_file:
                env_text = env_file.read()
        except OSError as e:
            logger.warning(
                f'Failed to read environment file from disk {legacy_path}: '
                f'{e}')
        else:
            logger.debug('Loaded environment file from disk for job %s: %s',
                         job_id, legacy_path)
            return env_text

    # An environment file is optional, so its absence is not worth a warning.
    return None
82
+
83
+
84
def restore_job_config_file(job_id: int) -> None:
    """Restore the job's config file from the database if one is expected.

    When the SKYPILOT_CONFIG environment variable is set, the config content
    stored in the database for the job is written to that path, so the job
    does not depend on the file already being present on this controller.

    For backward compatibility with jobs submitted before config persistence
    was implemented, a file that already exists at that path is left
    untouched when the database has no content. If neither source has the
    config, a warning is logged and the function returns normally.

    Args:
        job_id: The managed job ID.

    Raises:
        OSError: If the directory or the config file cannot be written.
    """
    config_path = os.environ.get(skypilot_config.ENV_VAR_SKYPILOT_CONFIG)
    if not config_path:
        # No config file for this job, so skip the database lookup entirely.
        return

    file_info = managed_job_state.get_job_file_contents(job_id)
    config_content = file_info['config_file_content']

    # Expand ~ in config path
    config_path_expanded = os.path.expanduser(config_path)

    if config_content is not None:
        # Config content is in database - restore it.
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create the parent when there is one.
        config_dir = os.path.dirname(config_path_expanded)
        if config_dir:
            os.makedirs(config_dir, exist_ok=True)
        with open(config_path_expanded, 'w', encoding='utf-8') as f:
            f.write(config_content)
        logger.info(f'Restored config file for job {job_id} to '
                    f'{config_path_expanded} ({len(config_content)} bytes)')
    elif os.path.exists(config_path_expanded):
        # Backward compatibility: config not in DB but file exists on disk.
        # This can happen for jobs submitted before config persistence.
        logger.debug(f'Config file for job {job_id} not in database, but '
                     f'found on disk at {config_path_expanded}')
    else:
        # Config should exist but doesn't - warn about it
        logger.warning(
            f'SKYPILOT_CONFIG is set to {config_path} but config content not '
            f'found in database or on disk for job {job_id}. The job may fail '
            f'if it relies on custom config settings.')
sky/jobs/log_gc.py ADDED
@@ -0,0 +1,193 @@
1
+ """Log garbage collection for managed jobs."""
2
+
3
+ from datetime import datetime
4
+ import os
5
+ import pathlib
6
+ import shutil
7
+ import threading
8
+ import time
9
+
10
+ import filelock
11
+
12
+ from sky import sky_logging
13
+ from sky import skypilot_config
14
+ from sky.jobs import constants as managed_job_constants
15
+ from sky.jobs import state as managed_job_state
16
+ from sky.jobs import utils as managed_job_utils
17
+ from sky.utils import context
18
+
19
+ logger = sky_logging.init_logger(__name__)
20
+
21
+ # Filelock for garbage collector leader election.
22
+ _JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
23
+ '~/.sky/locks/job_controller_gc.lock')
24
+
25
+ _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
26
+ _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
27
+
28
+ _LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
29
+ _MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
30
+
31
+
32
def _next_gc_interval(retention_seconds: int) -> int:
    """Return how long to wait, in seconds, before the next GC round.

    The retention period is clamped into the range
    [_MOST_FREQUENT_GC_INTERVAL_SECONDS, _LEAST_FREQUENT_GC_INTERVAL_SECONDS]:
    a long retention still gets a round at least once per hour, and a very
    short (or negative) retention never triggers rounds more often than
    every 30 seconds.
    """
    capped = min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS)
    return max(capped, _MOST_FREQUENT_GC_INTERVAL_SECONDS)
39
+
40
+
41
def gc_controller_logs_for_job():
    """Periodically garbage collect controller logs of managed jobs.

    Runs forever. Each iteration reloads the config, reads the retention
    from `jobs.controller.controller_logs_gc_retention_hours` (falling back
    to the module default), cleans batch after batch until a round reports
    completion, and then sleeps for the interval derived from the retention.
    A negative retention disables the cleanup but the loop keeps polling the
    config.
    """
    retention_key = ('jobs', 'controller',
                     'controller_logs_gc_retention_hours')
    while True:
        skypilot_config.reload_config()
        retention_hours = skypilot_config.get_nested(
            retention_key, _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS)
        controller_logs_retention = retention_hours * 3600
        # Negative value disables the GC
        if controller_logs_retention >= 0:
            logger.info('GC controller logs for job: retention '
                        f'{controller_logs_retention} seconds')
            try:
                # Each call handles one batch and returns True once nothing
                # is left for this round.
                while not _clean_controller_logs_with_retention(
                        controller_logs_retention):
                    pass
            except Exception as e:  # pylint: disable=broad-except
                # Keep the GC loop alive; the next round retries.
                logger.error(f'Error GC controller logs for job: {e}',
                             exc_info=True)
        else:
            logger.info('Controller logs GC is disabled')

        interval = _next_gc_interval(controller_logs_retention)
        logger.info('Next controller logs GC is scheduled after '
                    f'{interval} seconds')
        time.sleep(interval)
67
+
68
+
69
def gc_task_logs_for_job():
    """Periodically garbage collect task logs of managed jobs.

    Runs forever. Each iteration reloads the config, reads the retention
    from `jobs.controller.task_logs_gc_retention_hours` (falling back to the
    module default), cleans batch after batch until a round reports
    completion, and then sleeps for the interval derived from the retention.
    A negative retention disables the cleanup but the loop keeps polling the
    config.
    """
    while True:
        skypilot_config.reload_config()
        task_logs_retention = skypilot_config.get_nested(
            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
        # Negative value disables the GC
        if task_logs_retention >= 0:
            logger.info('GC task logs for job: '
                        f'retention {task_logs_retention} seconds')
            try:
                finished = False
                while not finished:
                    finished = _clean_task_logs_with_retention(
                        task_logs_retention)
            except Exception as e:  # pylint: disable=broad-except
                # Keep the GC loop alive; the next round retries.
                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
        else:
            # This loop handles task logs, so say so (the message used to be
            # a copy of the controller-logs one).
            logger.info('Task logs GC is disabled')

        interval = _next_gc_interval(task_logs_retention)
        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
        # Sleep for exactly the interval that was just logged.
        time.sleep(interval)
93
+
94
+
95
def _clean_controller_logs_with_retention(retention_seconds: int,
                                          batch_size: int = 100):
    """Clean one batch of controller logs that exceeded the retention.

    Args:
        retention_seconds: Retention passed to the state query that selects
            the controller logs to clean.
        batch_size: Maximum number of jobs handled by this call.

    Returns:
        Whether the GC of this round has finished. False means the batch was
        full, so there might still be more controller logs to clean.
    """
    assert batch_size > 0, 'Batch size must be positive'
    jobs = managed_job_state.get_controller_logs_to_clean(
        retention_seconds, batch_size=batch_size)
    job_ids = [job['job_id'] for job in jobs]
    for job_id in job_ids:
        log_path = managed_job_utils.controller_log_file_for_job(job_id)
        cleaned_at = time.time()
        if not os.path.exists(log_path):
            continue
        timestamp = datetime.fromtimestamp(cleaned_at).strftime(
            '%Y-%m-%d %H:%M:%S')
        notice = f'Controller log has been cleaned at {timestamp}.'
        # Sync down logs will reference this file directly, so the file is
        # kept and only its content is replaced.
        # TODO(aylei): refactor sync down logs if the inode usage
        # becomes an issue.
        with open(log_path, 'w', encoding='utf-8') as log_file:
            log_file.write(notice + '\n')
    # The update is batched, so the recorded timestamp is only approximate.
    managed_job_state.set_controller_logs_cleaned(job_ids=job_ids,
                                                  logs_cleaned_at=time.time())
    complete = len(jobs) < batch_size
    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
                f'{retention_seconds} seconds, complete: {complete}')
    return complete
128
+
129
+
130
def _clean_task_logs_with_retention(retention_seconds: int,
                                    batch_size: int = 100):
    """Clean one batch of task logs that exceeded the retention.

    Args:
        retention_seconds: Retention passed to the state query that selects
            the task logs to clean.
        batch_size: Maximum number of tasks handled by this call.

    Returns:
        Whether the GC of this round has finished. False means the batch was
        full, so there might still be more task logs to clean.
    """
    assert batch_size > 0, 'Batch size must be positive'
    tasks = managed_job_state.get_task_logs_to_clean(retention_seconds,
                                                     batch_size=batch_size)
    cleaned = []
    for task in tasks:
        log_path = pathlib.Path(task['local_log_file'])
        # The log directory is assumed to have the following layout:
        # task-id/
        #   - run.log
        #   - tasks/
        #     - run.log
        # so the sibling tasks directory is removed together with the log.
        nested_dir = log_path.parent / 'tasks'
        log_path.unlink(missing_ok=True)
        shutil.rmtree(nested_dir, ignore_errors=True)
        # Files are removed before the state is updated, so a failure in
        # between leads to a repeated (at-least-once) cleanup.
        cleaned.append((task['job_id'], task['task_id']))
    managed_job_state.set_task_logs_cleaned(tasks=cleaned,
                                            logs_cleaned_at=time.time())
    complete = len(tasks) < batch_size
    logger.info(f'Cleaned {len(tasks)} task logs with retention '
                f'{retention_seconds} seconds, complete: {complete}')
    return complete
161
+
162
+
163
@context.contextual
def run_log_gc():
    """Run the log garbage collector.

    Starts a fresh `garbage_collector.log` under the jobs controller logs
    directory, hands that file to the current context via `redirect_log`,
    and then runs the controller-log and task-log GC loops in two daemon
    threads. Both loops run forever, so joining them blocks the caller.
    """
    logs_root = os.path.expanduser(
        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
    os.makedirs(logs_root, exist_ok=True)
    gc_log = pathlib.Path(os.path.join(logs_root, 'garbage_collector.log'))
    # Remove previous log file
    gc_log.unlink(missing_ok=True)

    ctx = context.get()
    assert ctx is not None, 'Context is not initialized'
    ctx.redirect_log(gc_log)

    workers = [
        threading.Thread(target=gc_loop, daemon=True)
        for gc_loop in (gc_controller_logs_for_job, gc_task_logs_for_job)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
182
+
183
+
184
def elect_for_log_gc():
    """Use a filelock to elect the log garbage collector.

    The log garbage collector runs inside the controller process to avoid
    the overhead of launching a separate process and managing its lifecycle.
    Callers that are not elected simply wait on the filelock, which adds
    only trivial overhead. The elected caller holds the lock for as long as
    the garbage collector keeps running.
    """
    gc_lock = filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH)
    with gc_lock:
        run_log_gc()