skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
181
181
 
182
182
 
183
183
  def format_resource(resource: 'resources_lib.Resources',
184
- simplify: bool = False) -> str:
184
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
185
  resource = resource.assert_launchable()
186
- vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
187
- resource.instance_type)
186
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
187
+ if resource.accelerators is None or is_k8s or not simplified_only:
188
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
+ resource.instance_type)
188
190
 
189
- components = []
191
+ elements_simple = []
192
+ elements_full = []
190
193
 
191
194
  if resource.accelerators is not None:
192
195
  acc, count = list(resource.accelerators.items())[0]
193
- components.append(f'gpus={acc}:{count}')
196
+ elements_simple.append(f'gpus={acc}:{count}')
197
+ elements_full.append(f'gpus={acc}:{count}')
194
198
 
195
- is_k8s = str(resource.cloud).lower() == 'kubernetes'
196
- if (resource.accelerators is None or is_k8s or not simplify):
199
+ if (resource.accelerators is None or is_k8s):
200
+ if vcpu is not None:
201
+ elements_simple.append(f'cpus={int(vcpu)}')
202
+ elements_full.append(f'cpus={int(vcpu)}')
203
+ if mem is not None:
204
+ elements_simple.append(f'mem={int(mem)}')
205
+ elements_full.append(f'mem={int(mem)}')
206
+ elif not simplified_only:
197
207
  if vcpu is not None:
198
- components.append(f'cpus={int(vcpu)}')
208
+ elements_full.append(f'cpus={int(vcpu)}')
199
209
  if mem is not None:
200
- components.append(f'mem={int(mem)}')
210
+ elements_full.append(f'mem={int(mem)}')
201
211
 
202
- instance_type = resource.instance_type
203
- if simplify:
204
- instance_type = common_utils.truncate_long_string(instance_type, 15)
205
212
  if not is_k8s:
206
- components.append(instance_type)
207
- if simplify:
208
- components.append('...')
209
- else:
213
+ instance_type_full = resource.instance_type
214
+ instance_type_simple = common_utils.truncate_long_string(
215
+ instance_type_full, 15)
216
+ elements_simple.append(instance_type_simple)
217
+ elements_full.append(instance_type_full)
218
+ elements_simple.append('...')
219
+ if not simplified_only:
210
220
  image_id = resource.image_id
211
221
  if image_id is not None:
212
222
  if None in image_id:
213
- components.append(f'image_id={image_id[None]}')
223
+ elements_full.append(f'image_id={image_id[None]}')
214
224
  else:
215
- components.append(f'image_id={image_id}')
216
- components.append(f'disk={resource.disk_size}')
225
+ elements_full.append(f'image_id={image_id}')
226
+ elements_full.append(f'disk={resource.disk_size}')
217
227
  disk_tier = resource.disk_tier
218
228
  if disk_tier is not None:
219
- components.append(f'disk_tier={disk_tier.value}')
229
+ elements_full.append(f'disk_tier={disk_tier.value}')
220
230
  ports = resource.ports
221
231
  if ports is not None:
222
- components.append(f'ports={ports}')
232
+ elements_full.append(f'ports={ports}')
223
233
 
224
234
  spot = '[spot]' if resource.use_spot else ''
225
- return f'{spot}({"" if not components else ", ".join(components)})'
226
-
227
-
228
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
229
- simplify: bool = False) -> str:
235
+ resources_str_simple = (
236
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
237
+ if simplified_only:
238
+ return resources_str_simple, None
239
+ else:
240
+ resources_str_full = (
241
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
242
+ return resources_str_simple, resources_str_full
243
+
244
+
245
+ def get_readable_resources_repr(
246
+ handle: 'backends.CloudVmRayResourceHandle',
247
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
248
+ resource_str_simple, resource_str_full = format_resource(
249
+ handle.launched_resources, simplified_only)
250
+ if not simplified_only:
251
+ assert resource_str_full is not None
230
252
  if (handle.launched_nodes is not None and
231
253
  handle.launched_resources is not None):
232
- return (f'{handle.launched_nodes}x'
233
- f'{format_resource(handle.launched_resources, simplify)}')
234
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
254
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
255
+ None if simplified_only else
256
+ f'{handle.launched_nodes}x{resource_str_full}')
257
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
258
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
235
259
 
236
260
 
237
261
  def make_ray_custom_resources_str(
sky/utils/rich_utils.py CHANGED
@@ -193,7 +193,8 @@ class _RevertibleStatus:
193
193
  self.get_status_fn().__exit__(exc_type, exc_val, exc_tb)
194
194
  self.set_status_fn(None)
195
195
  else:
196
- self.get_status_fn().update(self.previous_message)
196
+ if self.previous_message is not None:
197
+ self.get_status_fn().update(self.previous_message)
197
198
 
198
199
  def update(self, *args, **kwargs):
199
200
  self.get_status_fn().update(*args, **kwargs)
@@ -263,11 +264,12 @@ def safe_logger():
263
264
  client_status_obj = _get_client_status()
264
265
 
265
266
  client_status_live = (client_status_obj is not None and
267
+ hasattr(client_status_obj, '_live') and
266
268
  client_status_obj._live.is_started) # pylint: disable=protected-access
267
- if client_status_live:
269
+ if client_status_live and client_status_obj is not None:
268
270
  client_status_obj.stop()
269
271
  yield
270
- if client_status_live:
272
+ if client_status_live and client_status_obj is not None:
271
273
  client_status_obj.start()
272
274
 
273
275
 
@@ -421,7 +423,7 @@ async def decode_rich_status_async(
421
423
  undecoded_buffer = b''
422
424
 
423
425
  # Iterate over the response content in chunks
424
- async for chunk in response.content.iter_chunked(8192):
426
+ async for chunk, _ in response.content.iter_chunks():
425
427
  if chunk is None:
426
428
  return
427
429
 
@@ -481,6 +483,8 @@ async def decode_rich_status_async(
481
483
  line = line[:-2] + '\n'
482
484
  is_payload, line = message_utils.decode_payload(
483
485
  line, raise_for_mismatch=False)
486
+ if line is None:
487
+ continue
484
488
  control = None
485
489
  if is_payload:
486
490
  control, encoded_status = Control.decode(line)
sky/utils/schemas.py CHANGED
@@ -460,8 +460,8 @@ def get_volume_schema():
460
460
  'type': 'string',
461
461
  'pattern': constants.MEMORY_SIZE_PATTERN,
462
462
  },
463
- 'resource_name': {
464
- 'type': 'string',
463
+ 'use_existing': {
464
+ 'type': 'boolean',
465
465
  },
466
466
  'config': {
467
467
  'type': 'object',
@@ -574,6 +574,9 @@ def get_volume_mount_schema():
574
574
  'volume_name': {
575
575
  'type': 'string',
576
576
  },
577
+ 'is_ephemeral': {
578
+ 'type': 'boolean',
579
+ },
577
580
  'volume_config': {
578
581
  'type': 'object',
579
582
  'required': [],
@@ -791,23 +794,6 @@ def _filter_schema(schema: dict, keys_to_keep: List[Tuple[str, ...]]) -> dict:
791
794
  return new_schema
792
795
 
793
796
 
794
- def _experimental_task_schema() -> dict:
795
- # TODO: experimental.config_overrides has been deprecated in favor of the
796
- # top-level `config` field. Remove in v0.11.0.
797
- config_override_schema = _filter_schema(
798
- get_config_schema(), constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK)
799
- return {
800
- 'experimental': {
801
- 'type': 'object',
802
- 'required': [],
803
- 'additionalProperties': False,
804
- 'properties': {
805
- 'config_overrides': config_override_schema,
806
- }
807
- }
808
- }
809
-
810
-
811
797
  def get_task_schema():
812
798
  return {
813
799
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
@@ -918,7 +904,6 @@ def get_task_schema():
918
904
  '_metadata': {
919
905
  'type': 'object',
920
906
  },
921
- **_experimental_task_schema(),
922
907
  }
923
908
  }
924
909
 
@@ -1043,11 +1028,21 @@ class RemoteIdentityOptions(enum.Enum):
1043
1028
 
1044
1029
  def get_default_remote_identity(cloud: str) -> str:
1045
1030
  """Get the default remote identity for the specified cloud."""
1046
- if cloud == 'kubernetes':
1031
+ if cloud in ('kubernetes', 'ssh'):
1047
1032
  return RemoteIdentityOptions.SERVICE_ACCOUNT.value
1048
1033
  return RemoteIdentityOptions.LOCAL_CREDENTIALS.value
1049
1034
 
1050
1035
 
1036
+ _CAPABILITIES_SCHEMA = {
1037
+ 'capabilities': {
1038
+ 'type': 'array',
1039
+ 'items': {
1040
+ 'type': 'string',
1041
+ 'case_insensitive_enum': ['compute', 'storage']
1042
+ },
1043
+ }
1044
+ }
1045
+
1051
1046
  _REMOTE_IDENTITY_SCHEMA = {
1052
1047
  'remote_identity': {
1053
1048
  'type': 'string',
@@ -1070,25 +1065,16 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
1070
1065
  },
1071
1066
  }
1072
1067
 
1073
- _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1074
- 'networking': {
1075
- 'type': 'string',
1076
- 'case_insensitive_enum': [
1077
- type.value for type in kubernetes_enums.KubernetesNetworkingMode
1078
- ],
1079
- },
1080
- 'ports': {
1081
- 'type': 'string',
1082
- 'case_insensitive_enum': [
1083
- type.value for type in kubernetes_enums.KubernetesPortMode
1084
- ],
1085
- },
1068
+ _CONTEXT_CONFIG_SCHEMA_MINIMAL = {
1086
1069
  'pod_config': {
1087
1070
  'type': 'object',
1088
1071
  'required': [],
1089
1072
  # Allow arbitrary keys since validating pod spec is hard
1090
1073
  'additionalProperties': True,
1091
1074
  },
1075
+ 'provision_timeout': {
1076
+ 'type': 'integer',
1077
+ },
1092
1078
  'custom_metadata': {
1093
1079
  'type': 'object',
1094
1080
  'required': [],
@@ -1103,9 +1089,23 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1103
1089
  }]
1104
1090
  },
1105
1091
  },
1106
- 'provision_timeout': {
1107
- 'type': 'integer',
1092
+ }
1093
+
1094
+ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1095
+ # TODO(kevin): Remove 'networking' in v0.13.0.
1096
+ 'networking': {
1097
+ 'type': 'string',
1098
+ 'case_insensitive_enum': [
1099
+ type.value for type in kubernetes_enums.KubernetesNetworkingMode
1100
+ ],
1101
+ },
1102
+ 'ports': {
1103
+ 'type': 'string',
1104
+ 'case_insensitive_enum': [
1105
+ type.value for type in kubernetes_enums.KubernetesPortMode
1106
+ ],
1108
1107
  },
1108
+ **_CONTEXT_CONFIG_SCHEMA_MINIMAL,
1109
1109
  'autoscaler': {
1110
1110
  'type': 'string',
1111
1111
  'case_insensitive_enum': [
@@ -1153,6 +1153,12 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1153
1153
  },
1154
1154
  'remote_identity': {
1155
1155
  'type': 'string',
1156
+ },
1157
+ 'post_provision_runcmd': {
1158
+ 'type': 'array',
1159
+ 'items': {
1160
+ 'type': 'string'
1161
+ },
1156
1162
  }
1157
1163
  }
1158
1164
 
@@ -1189,7 +1195,13 @@ def get_config_schema():
1189
1195
  'consolidation_mode': {
1190
1196
  'type': 'boolean',
1191
1197
  'default': False,
1192
- }
1198
+ },
1199
+ 'controller_logs_gc_retention_hours': {
1200
+ 'type': 'integer',
1201
+ },
1202
+ 'task_logs_gc_retention_hours': {
1203
+ 'type': 'integer',
1204
+ },
1193
1205
  },
1194
1206
  },
1195
1207
  'bucket': {
@@ -1234,6 +1246,9 @@ def get_config_schema():
1234
1246
  'type': 'null',
1235
1247
  }],
1236
1248
  },
1249
+ 'use_ssm': {
1250
+ 'type': 'boolean',
1251
+ },
1237
1252
  'post_provision_runcmd': {
1238
1253
  'type': 'array',
1239
1254
  'items': {
@@ -1247,6 +1262,7 @@ def get_config_schema():
1247
1262
  }]
1248
1263
  },
1249
1264
  },
1265
+ **_CAPABILITIES_SCHEMA,
1250
1266
  **_LABELS_SCHEMA,
1251
1267
  **_NETWORK_CONFIG_SCHEMA,
1252
1268
  },
@@ -1304,6 +1320,7 @@ def get_config_schema():
1304
1320
  }
1305
1321
  ],
1306
1322
  },
1323
+ **_CAPABILITIES_SCHEMA,
1307
1324
  **_LABELS_SCHEMA,
1308
1325
  **_NETWORK_CONFIG_SCHEMA,
1309
1326
  },
@@ -1328,10 +1345,15 @@ def get_config_schema():
1328
1345
  'additionalProperties': False,
1329
1346
  'properties': {
1330
1347
  'allowed_contexts': {
1331
- 'type': 'array',
1332
- 'items': {
1348
+ 'oneOf': [{
1349
+ 'type': 'array',
1350
+ 'items': {
1351
+ 'type': 'string',
1352
+ },
1353
+ }, {
1333
1354
  'type': 'string',
1334
- },
1355
+ 'pattern': '^all$'
1356
+ }]
1335
1357
  },
1336
1358
  'context_configs': {
1337
1359
  'type': 'object',
@@ -1361,12 +1383,22 @@ def get_config_schema():
1361
1383
  'type': 'string',
1362
1384
  },
1363
1385
  },
1364
- 'pod_config': {
1386
+ 'context_configs': {
1365
1387
  'type': 'object',
1366
1388
  'required': [],
1367
- # Allow arbitrary keys since validating pod spec is hard
1368
- 'additionalProperties': True,
1389
+ 'properties': {},
1390
+ # Properties are ssh cluster names, which are the
1391
+ # kubernetes context names without `ssh-` prefix.
1392
+ 'additionalProperties': {
1393
+ 'type': 'object',
1394
+ 'required': [],
1395
+ 'additionalProperties': False,
1396
+ 'properties': {
1397
+ **_CONTEXT_CONFIG_SCHEMA_MINIMAL,
1398
+ },
1399
+ },
1369
1400
  },
1401
+ **_CONTEXT_CONFIG_SCHEMA_MINIMAL,
1370
1402
  }
1371
1403
  },
1372
1404
  'oci': {
@@ -1407,7 +1439,10 @@ def get_config_schema():
1407
1439
  'type': 'object',
1408
1440
  'required': [],
1409
1441
  'properties': {
1410
- **_NETWORK_CONFIG_SCHEMA, 'tenant_id': {
1442
+ **_NETWORK_CONFIG_SCHEMA, 'use_static_ip_address': {
1443
+ 'type': 'boolean',
1444
+ },
1445
+ 'tenant_id': {
1411
1446
  'type': 'string',
1412
1447
  },
1413
1448
  'domain': {
@@ -1520,7 +1555,7 @@ def get_config_schema():
1520
1555
  }
1521
1556
  }
1522
1557
 
1523
- daemon_schema = {
1558
+ daemon_schema: Dict[str, Any] = {
1524
1559
  'type': 'object',
1525
1560
  'required': [],
1526
1561
  'additionalProperties': False,
@@ -1580,10 +1615,10 @@ def get_config_schema():
1580
1615
 
1581
1616
  allowed_workspace_cloud_names = list(constants.ALL_CLOUDS) + ['cloudflare']
1582
1617
  # Create pattern for not supported clouds, i.e.
1583
- # all clouds except gcp, kubernetes, ssh
1618
+ # all clouds except aws, gcp, kubernetes, ssh, nebius
1584
1619
  not_supported_clouds = [
1585
1620
  cloud for cloud in allowed_workspace_cloud_names
1586
- if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
1621
+ if cloud.lower() not in ['aws', 'gcp', 'kubernetes', 'ssh', 'nebius']
1587
1622
  ]
1588
1623
  not_supported_cloud_regex = '|'.join(not_supported_clouds)
1589
1624
  workspaces_schema = {
@@ -1594,7 +1629,8 @@ def get_config_schema():
1594
1629
  'type': 'object',
1595
1630
  'additionalProperties': False,
1596
1631
  'patternProperties': {
1597
- # Pattern for non-GCP clouds - only allows 'disabled' property
1632
+ # Pattern for clouds with no workspace-specific config -
1633
+ # only allow 'disabled' property.
1598
1634
  f'^({not_supported_cloud_regex})$': {
1599
1635
  'type': 'object',
1600
1636
  'additionalProperties': False,
@@ -1625,7 +1661,21 @@ def get_config_schema():
1625
1661
  },
1626
1662
  'disabled': {
1627
1663
  'type': 'boolean'
1628
- }
1664
+ },
1665
+ **_CAPABILITIES_SCHEMA,
1666
+ },
1667
+ 'additionalProperties': False,
1668
+ },
1669
+ 'aws': {
1670
+ 'type': 'object',
1671
+ 'properties': {
1672
+ 'profile': {
1673
+ 'type': 'string'
1674
+ },
1675
+ 'disabled': {
1676
+ 'type': 'boolean'
1677
+ },
1678
+ **_CAPABILITIES_SCHEMA,
1629
1679
  },
1630
1680
  'additionalProperties': False,
1631
1681
  },
@@ -1650,10 +1700,15 @@ def get_config_schema():
1650
1700
  'required': [],
1651
1701
  'properties': {
1652
1702
  'allowed_contexts': {
1653
- 'type': 'array',
1654
- 'items': {
1703
+ 'oneOf': [{
1704
+ 'type': 'array',
1705
+ 'items': {
1706
+ 'type': 'string',
1707
+ },
1708
+ }, {
1655
1709
  'type': 'string',
1656
- },
1710
+ 'pattern': '^all$'
1711
+ }]
1657
1712
  },
1658
1713
  'disabled': {
1659
1714
  'type': 'boolean'
@@ -10,7 +10,8 @@ import sys
10
10
  import threading
11
11
  import time
12
12
  import typing
13
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
13
+ from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
14
+ Union)
14
15
 
15
16
  import colorama
16
17
 
@@ -18,6 +19,7 @@ from sky import exceptions
18
19
  from sky import sky_logging
19
20
  from sky.adaptors import common as adaptors_common
20
21
  from sky.skylet import log_lib
22
+ from sky.skylet import subprocess_daemon
21
23
  from sky.utils import common_utils
22
24
  from sky.utils import timeline
23
25
  from sky.utils import ux_utils
@@ -107,7 +109,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
107
109
 
108
110
 
109
111
  def run_in_parallel(func: Callable,
110
- args: List[Any],
112
+ args: Union[List[Any], Set[Any]],
111
113
  num_threads: Optional[int] = None) -> List[Any]:
112
114
  """Run a function in parallel on a list of arguments.
113
115
 
@@ -128,7 +130,7 @@ def run_in_parallel(func: Callable,
128
130
  if len(args) == 0:
129
131
  return []
130
132
  if len(args) == 1:
131
- return [func(args[0])]
133
+ return [func(list(args)[0])]
132
134
 
133
135
  processes = (num_threads
134
136
  if num_threads is not None else get_parallel_threads())
@@ -305,11 +307,17 @@ def run_with_retries(
305
307
  return returncode, stdout, stderr
306
308
 
307
309
 
308
- def kill_process_daemon(process_pid: int) -> None:
310
+ def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
309
311
  """Start a daemon as a safety net to kill the process.
310
312
 
311
313
  Args:
312
314
  process_pid: The PID of the process to kill.
315
+ use_kill_pg: Whether to use kill process group to kill the process. If
316
+ True, the process will use os.killpg() to kill the target process
317
+ group on UNIX system, which is more efficient than using the daemon
318
+ to refresh the process tree in the daemon. Note that both
319
+ implementations have corner cases where subprocesses might not be
320
+ killed. Refer to subprocess_daemon.py for more details.
313
321
  """
314
322
  # Get initial children list
315
323
  try:
@@ -336,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
336
344
  ','.join(map(str, initial_children)),
337
345
  ]
338
346
 
347
+ env = os.environ.copy()
348
+ if use_kill_pg:
349
+ env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
350
+
339
351
  # We do not need to set `start_new_session=True` here, as the
340
352
  # daemon script will detach itself from the parent process with
341
353
  # fork to avoid being killed by parent process. See the reason we
@@ -347,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
347
359
  stderr=subprocess.DEVNULL,
348
360
  # Disable input
349
361
  stdin=subprocess.DEVNULL,
362
+ env=env,
350
363
  )
351
364
 
352
365
 
@@ -0,0 +1,91 @@
1
+ """Utility functions for threads."""
2
+
3
+ import threading
4
+ from typing import Any, Dict, Generic, Optional, overload, TypeVar
5
+
6
+ from sky.utils import common_utils
7
+
8
+
9
class SafeThread(threading.Thread):
    """A thread that captures an exception raised while it runs.

    Instead of letting an exception escape `run()`, the thread stores it so
    that the code owning the thread can inspect it afterwards (typically
    after `join()`) through the `format_exc` property.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Exception raised inside run(); stays None when run() completes
        # without raising.
        self._exc: Optional[BaseException] = None

    def run(self):
        """Run the thread body, recording any exception instead of raising."""
        try:
            super().run()
        except BaseException as e:  # pylint: disable=broad-except
            # BaseException is caught on purpose, so everything raised by
            # the thread body is recorded rather than propagated.
            self._exc = e

    @property
    def format_exc(self) -> Optional[str]:
        """Formatted captured exception, or None if nothing was raised.

        Returns:
            None when no exception has been recorded; otherwise the result
            of `common_utils.format_exception` on the recorded exception.
        """
        if self._exc is None:
            return None
        return common_utils.format_exception(self._exc)
27
+
28
+
29
# pylint: disable=invalid-name
KeyType = TypeVar('KeyType')
ValueType = TypeVar('ValueType')


# Google style guide: Do not rely on the atomicity of built-in types.
# Our launch and down process pool will be used by multiple threads,
# therefore we need to use a thread-safe dict.
# see https://google.github.io/styleguide/pyguide.html#218-threading
class ThreadSafeDict(Generic[KeyType, ValueType]):
    """A dict wrapper that guards every operation with a lock.

    Each method acquires the internal lock for the duration of a single
    dict operation. Compound operations (e.g. check-then-set) are not
    atomic; callers needing that must do their own synchronization.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Build the underlying dict from the same arguments as `dict()`."""
        self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
        self._lock = threading.Lock()

    def __getitem__(self, key: KeyType) -> ValueType:
        with self._lock:
            return self._dict[key]

    def __setitem__(self, key: KeyType, value: ValueType) -> None:
        with self._lock:
            self._dict[key] = value

    def __delitem__(self, key: KeyType) -> None:
        with self._lock:
            del self._dict[key]

    def __len__(self) -> int:
        with self._lock:
            return len(self._dict)

    def __contains__(self, key: KeyType) -> bool:
        with self._lock:
            return key in self._dict

    def items(self):
        """Return a list snapshot of the (key, value) pairs.

        A live `dict.items()` view would be consumed after the lock is
        released, so a concurrent mutation could break the caller's
        iteration. The copy is taken while the lock is held.
        """
        with self._lock:
            return list(self._dict.items())

    def values(self):
        """Return a list snapshot of the values.

        Copied under the lock for the same reason as `items()`.
        """
        with self._lock:
            return list(self._dict.values())

    @overload
    def get(self, key: KeyType, default: ValueType) -> ValueType:
        ...

    @overload
    def get(self,
            key: KeyType,
            default: Optional[ValueType] = None) -> Optional[ValueType]:
        ...

    def get(self,
            key: KeyType,
            default: Optional[ValueType] = None) -> Optional[ValueType]:
        """Return the value for `key`, or `default` if the key is absent."""
        with self._lock:
            return self._dict.get(key, default)

    def pop(self, key: KeyType) -> Optional[ValueType]:
        """Remove `key` and return its value, or None if the key is absent."""
        with self._lock:
            return self._dict.pop(key, None)
sky/utils/timeline.py CHANGED
@@ -58,7 +58,8 @@ class Event:
58
58
  })
59
59
  event_begin['args'] = {'stack': '\n'.join(traceback.format_stack())}
60
60
  if self._message is not None:
61
- event_begin['args']['message'] = self._message
61
+ event_begin['args'][
62
+ 'message'] = self._message # type: ignore[index]
62
63
  _events.append(event_begin)
63
64
 
64
65
  def end(self):