skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/task.py CHANGED
@@ -1,43 +1,34 @@
1
1
  """Task: a coarse-grained stage in an application."""
2
2
  import collections
3
- import inspect
4
3
  import json
5
4
  import os
6
5
  import re
7
- import typing
8
6
  from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
9
7
  Union)
10
8
 
11
9
  import colorama
10
+ from pydantic import SecretStr
12
11
 
13
- import sky
14
12
  from sky import clouds
13
+ from sky import dag as dag_lib
15
14
  from sky import exceptions
15
+ from sky import resources as resources_lib
16
16
  from sky import sky_logging
17
- from sky.adaptors import common as adaptors_common
18
- import sky.dag
19
17
  from sky.data import data_utils
20
18
  from sky.data import storage as storage_lib
21
19
  from sky.provision import docker_utils
22
20
  from sky.serve import service_spec
23
21
  from sky.skylet import constants
24
22
  from sky.utils import common_utils
23
+ from sky.utils import git
24
+ from sky.utils import registry
25
25
  from sky.utils import schemas
26
26
  from sky.utils import ux_utils
27
-
28
- if typing.TYPE_CHECKING:
29
- import yaml
30
-
31
- from sky import resources as resources_lib
32
- else:
33
- yaml = adaptors_common.LazyImport('yaml')
27
+ from sky.utils import volume as volume_lib
28
+ from sky.utils import yaml_utils
34
29
 
35
30
  logger = sky_logging.init_logger(__name__)
36
31
 
37
- # A lambda generating commands (node rank_i, node addrs -> cmd_i).
38
- CommandGen = Callable[[int, List[str]], Optional[str]]
39
- CommandOrCommandGen = Union[str, CommandGen]
40
-
41
32
  _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
42
33
  _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
43
34
  ' uppercase letters, digits, underscores, periods,'
@@ -121,27 +112,61 @@ def _fill_in_env_vars(
121
112
  return json.loads(yaml_field_str)
122
113
 
123
114
 
124
- def _check_docker_login_config(task_envs: Dict[str, str]) -> bool:
125
- """Checks if there is a valid docker login config in task_envs.
115
+ def _check_docker_login_config(task_envs: Dict[str, str],
116
+ task_secrets: Dict[str, SecretStr]) -> bool:
117
+ """Validates a valid docker login config in task_envs and task_secrets.
126
118
 
127
- If any of the docker login env vars is set, all of them must be set.
119
+ Docker login variables must be specified together either in envs OR secrets,
120
+ not split across both. If any of the docker login env vars is set, all of
121
+ them must be set in the same location.
122
+
123
+ Args:
124
+ task_envs: Environment variables
125
+ task_secrets: Secret variables (optional, defaults to empty dict)
128
126
 
129
127
  Returns:
130
- True if there is a valid docker login config in task_envs.
128
+ True if there is a valid docker login config.
131
129
  False otherwise.
132
130
  Raises:
133
- ValueError: if any of the docker login env vars is set, but not all of
134
- them are set.
131
+ ValueError: if docker login configuration is invalid.
135
132
  """
133
+ if task_secrets is None:
134
+ task_secrets = {}
135
+
136
136
  all_keys = constants.DOCKER_LOGIN_ENV_VARS
137
- existing_keys = all_keys & set(task_envs.keys())
138
- if not existing_keys:
137
+ envs_keys = all_keys & set(task_envs.keys())
138
+ secrets_keys = all_keys & set(task_secrets.keys())
139
+
140
+ # Check if any docker variables exist
141
+ if not envs_keys and not secrets_keys:
139
142
  return False
140
- if len(existing_keys) != len(all_keys):
143
+
144
+ # Check if variables are split across envs and secrets
145
+ if envs_keys and secrets_keys:
141
146
  with ux_utils.print_exception_no_traceback():
142
147
  raise ValueError(
143
- f'If any of {", ".join(all_keys)} is set, all of them must '
144
- f'be set. Missing envs: {all_keys - existing_keys}')
148
+ 'Docker login variables must be specified together either '
149
+ 'in envs OR secrets, not split across both. '
150
+ f'Found in envs: {sorted(envs_keys)}, '
151
+ f'Found in secrets: {sorted(secrets_keys)}')
152
+
153
+ # Check if all variables are present in the chosen location
154
+ if envs_keys:
155
+ if len(envs_keys) != len(all_keys):
156
+ with ux_utils.print_exception_no_traceback():
157
+ raise ValueError(
158
+ 'Docker login variables must be specified together '
159
+ 'in envs. '
160
+ f'Missing from envs: {sorted(all_keys - envs_keys)}')
161
+
162
+ if secrets_keys:
163
+ if len(secrets_keys) != len(all_keys):
164
+ with ux_utils.print_exception_no_traceback():
165
+ raise ValueError(
166
+ 'Docker login variables must be specified together '
167
+ 'in secrets. '
168
+ f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
169
+
145
170
  return True
146
171
 
147
172
 
@@ -149,11 +174,14 @@ def _with_docker_login_config(
149
174
  resources: Union[Set['resources_lib.Resources'],
150
175
  List['resources_lib.Resources']],
151
176
  task_envs: Dict[str, str],
177
+ task_secrets: Dict[str, SecretStr],
152
178
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
153
- if not _check_docker_login_config(task_envs):
179
+ if not _check_docker_login_config(task_envs, task_secrets):
154
180
  return resources
155
- docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(
156
- task_envs)
181
+ envs = task_envs.copy()
182
+ for key, value in task_secrets.items():
183
+ envs[key] = value.get_secret_value()
184
+ docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
157
185
 
158
186
  def _add_docker_login_config(resources: 'resources_lib.Resources'):
159
187
  docker_image = resources.extract_docker_image()
@@ -165,7 +193,8 @@ def _with_docker_login_config(
165
193
  f'ignored.{colorama.Style.RESET_ALL}')
166
194
  return resources
167
195
  # Already checked in extract_docker_image
168
- assert len(resources.image_id) == 1, resources.image_id
196
+ assert resources.image_id is not None and len(
197
+ resources.image_id) == 1, resources.image_id
169
198
  region = list(resources.image_id.keys())[0]
170
199
  return resources.copy(image_id={region: 'docker:' + docker_image},
171
200
  _docker_login_config=docker_login_config)
@@ -180,8 +209,12 @@ def _with_docker_username_for_runpod(
180
209
  resources: Union[Set['resources_lib.Resources'],
181
210
  List['resources_lib.Resources']],
182
211
  task_envs: Dict[str, str],
212
+ task_secrets: Dict[str, SecretStr],
183
213
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
184
- docker_username_for_runpod = task_envs.get(
214
+ envs = task_envs.copy()
215
+ for key, value in task_secrets.items():
216
+ envs[key] = value.get_secret_value()
217
+ docker_username_for_runpod = envs.get(
185
218
  constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
186
219
 
187
220
  # We should not call r.copy() if docker_username_for_runpod is None,
@@ -193,6 +226,18 @@ def _with_docker_username_for_runpod(
193
226
  for r in resources))
194
227
 
195
228
 
229
+ def get_plaintext_envs_and_secrets(
230
+ envs_and_secrets: Dict[str, Union[str, SecretStr]],) -> Dict[str, str]:
231
+ return {
232
+ k: v.get_secret_value() if isinstance(v, SecretStr) else v
233
+ for k, v in envs_and_secrets.items()
234
+ }
235
+
236
+
237
+ def get_plaintext_secrets(secrets: Dict[str, SecretStr]) -> Dict[str, str]:
238
+ return {k: v.get_secret_value() for k, v in secrets.items()}
239
+
240
+
196
241
  class Task:
197
242
  """Task: a computation to be run on the cloud."""
198
243
 
@@ -200,17 +245,27 @@ class Task:
200
245
  self,
201
246
  name: Optional[str] = None,
202
247
  *,
203
- setup: Optional[str] = None,
204
- run: Optional[CommandOrCommandGen] = None,
248
+ setup: Optional[Union[str, List[str]]] = None,
249
+ run: Optional[Union[str, List[str]]] = None,
205
250
  envs: Optional[Dict[str, str]] = None,
206
- workdir: Optional[str] = None,
251
+ secrets: Optional[Dict[str, str]] = None,
252
+ workdir: Optional[Union[str, Dict[str, Any]]] = None,
207
253
  num_nodes: Optional[int] = None,
254
+ file_mounts: Optional[Dict[str, str]] = None,
255
+ storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
256
+ volumes: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None,
257
+ resources: Optional[Union['resources_lib.Resources',
258
+ List['resources_lib.Resources'],
259
+ Set['resources_lib.Resources']]] = None,
208
260
  # Advanced:
209
261
  docker_image: Optional[str] = None,
210
262
  event_callback: Optional[str] = None,
211
263
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
212
264
  # Internal use only.
213
- file_mounts_mapping: Optional[Dict[str, str]] = None,
265
+ _file_mounts_mapping: Optional[Dict[str, str]] = None,
266
+ _volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
267
+ _metadata: Optional[Dict[str, Any]] = None,
268
+ _user_specified_yaml: Optional[str] = None,
214
269
  ):
215
270
  """Initializes a Task.
216
271
 
@@ -242,38 +297,86 @@ class Task:
242
297
 
243
298
  Args:
244
299
  name: A string name for the Task for display purposes.
245
- setup: A setup command, which will be run before executing the run
300
+ setup: A setup command(s), which will be run before executing the run
246
301
  commands ``run``, and executed under ``workdir``.
247
302
  run: The actual command for the task. If not None, either a shell
248
- command (str) or a command generator (callable). If latter, it
249
- must take a node rank and a list of node addresses as input and
250
- return a shell command (str) (valid to return None for some nodes,
251
- in which case no commands are run on them). Run commands will be
252
- run under ``workdir``. Note the command generator should be a
253
- self-contained lambda.
303
+ command(s) (str, list(str)) or a command generator (callable). If
304
+ latter, it must take a node rank and a list of node addresses as
305
+ input and return a shell command (str) (valid to return None for
306
+ some nodes, in which case no commands are run on them). Run
307
+ commands will be run under ``workdir``. Note the command generator
308
+ should be a self-contained lambda.
254
309
  envs: A dictionary of environment variables to set before running the
255
310
  setup and run commands.
256
- workdir: The local working directory. This directory will be synced
311
+ secrets: A dictionary of secret environment variables to set before
312
+ running the setup and run commands. These will be redacted in logs
313
+ and YAML output.
314
+ workdir: The local working directory or a git repository.
315
+ For a local working directory, this directory will be synced
257
316
  to a location on the remote VM(s), and ``setup`` and ``run``
258
317
  commands will be run under that location (thus, they can rely on
259
318
  relative paths when invoking binaries).
319
+ If a git repository is provided, the repository will be cloned to
320
+ the working directory and the ``setup`` and ``run`` commands will
321
+ be run under the cloned repository.
260
322
  num_nodes: The number of nodes to provision for this Task. If None,
261
323
  treated as 1 node. If > 1, each node will execute its own
262
324
  setup/run command, where ``run`` can either be a str, meaning all
263
325
  nodes get the same command, or a lambda, with the semantics
264
326
  documented above.
327
+ file_mounts: An optional dict of ``{remote_path: (local_path|cloud
328
+ URI)}``, where remote means the VM(s) on which this Task will
329
+ eventually run on, and local means the node from which the task is
330
+ launched.
331
+ storage_mounts: an optional dict of ``{mount_path: sky.Storage
332
+ object}``, where mount_path is the path inside the remote VM(s)
333
+ where the Storage object will be mounted on.
334
+ volumes: A dict of volumes to be mounted for the task. The dict has
335
+ the form of ``{mount_path: volume_name}`` for external persistent
336
+ volumes, or ``{mount_path: volume_config}`` for ephemeral volumes
337
+ where volume_config is a dict with 'size', and optional type,
338
+ labels, and 'config' fields, etc.
339
+ resources: either a sky.Resources, a set of them, or a list of them.
340
+ A set or a list of resources asks the optimizer to "pick the
341
+ best of these resources" to run this task.
265
342
  docker_image: (EXPERIMENTAL: Only in effect when LocalDockerBackend
266
343
  is used.) The base docker image that this Task will be built on.
267
344
  Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
345
+ event_callback: A bash script that will be executed when the task
346
+ changes state.
268
347
  blocked_resources: A set of resources that this task cannot run on.
348
+ _file_mounts_mapping: (Internal use only) A dictionary of file mounts
349
+ mapping.
350
+ _volume_mounts: (Internal use only) A list of volume mounts.
351
+ _metadata: (Internal use only) A dictionary of metadata to be added to
352
+ the task.
353
+ _user_specified_yaml: (Internal use only) A string of user-specified
354
+ YAML config.
269
355
  """
270
356
  self.name = name
271
- self.run = run
272
357
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
273
358
  self.storage_plans: Dict[storage_lib.Storage,
274
359
  storage_lib.StoreType] = {}
275
- self.setup = setup
276
360
  self._envs = envs or {}
361
+ self._secrets = {}
362
+ if secrets is not None:
363
+ self._secrets = {k: SecretStr(v) for k, v in secrets.items()}
364
+ self._volumes = volumes or {}
365
+
366
+ # concatenate commands if given as list
367
+ def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
368
+ if isinstance(commands, list):
369
+ return '\n'.join(commands)
370
+ return commands
371
+
372
+ self.run = _concat(run)
373
+ self.setup = _concat(setup)
374
+
375
+ # Validate Docker login configuration early if both envs and secrets
376
+ # contain Docker variables
377
+ if self._envs or self._secrets:
378
+ _check_docker_login_config(self._envs, self._secrets)
379
+
277
380
  self.workdir = workdir
278
381
  self.docker_image = (docker_image if docker_image else
279
382
  'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -288,33 +391,50 @@ class Task:
288
391
  self.estimated_inputs_size_gigabytes: Optional[float] = None
289
392
  self.estimated_outputs_size_gigabytes: Optional[float] = None
290
393
  # Default to CPU VM
291
- self.resources: Union[List[sky.Resources],
292
- Set[sky.Resources]] = {sky.Resources()}
394
+ self.resources: Union[List['resources_lib.Resources'],
395
+ Set['resources_lib.Resources']] = {
396
+ resources_lib.Resources()
397
+ }
293
398
  self._service: Optional[service_spec.SkyServiceSpec] = None
399
+
294
400
  # Resources that this task cannot run on.
295
401
  self.blocked_resources = blocked_resources
296
402
 
297
- self.time_estimator_func: Optional[Callable[['sky.Resources'],
403
+ self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
298
404
  int]] = None
299
405
  self.file_mounts: Optional[Dict[str, str]] = None
300
406
 
301
407
  # Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
302
408
  # is the underlying managed job dag (sky.Dag object).
303
- self.managed_job_dag: Optional['sky.Dag'] = None
409
+ self.managed_job_dag: Optional['dag_lib.Dag'] = None
304
410
 
305
411
  # Only set when 'self' is a sky serve controller task.
306
412
  self.service_name: Optional[str] = None
307
413
 
308
414
  # Filled in by the optimizer. If None, this Task is not planned.
309
- self.best_resources: Optional[sky.Resources] = None
415
+ self.best_resources: Optional['resources_lib.Resources'] = None
310
416
 
311
417
  # For internal use only.
312
- self.file_mounts_mapping = file_mounts_mapping
418
+ self.file_mounts_mapping: Optional[Dict[str,
419
+ str]] = _file_mounts_mapping
420
+ self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
421
+ _volume_mounts)
313
422
 
314
- dag = sky.dag.get_current_dag()
423
+ self._metadata = _metadata if _metadata is not None else {}
424
+
425
+ if resources is not None:
426
+ self.set_resources(resources)
427
+ if storage_mounts is not None:
428
+ self.set_storage_mounts(storage_mounts)
429
+ if file_mounts is not None:
430
+ self.set_file_mounts(file_mounts)
431
+
432
+ dag = dag_lib.get_current_dag()
315
433
  if dag is not None:
316
434
  dag.add(self)
317
435
 
436
+ self._user_specified_yaml = _user_specified_yaml
437
+
318
438
  def validate(self,
319
439
  skip_file_mounts: bool = False,
320
440
  skip_workdir: bool = False):
@@ -342,42 +462,9 @@ class Task:
342
462
 
343
463
  def validate_run(self):
344
464
  """Validates if the run command is valid."""
345
- if callable(self.run):
346
- run_sig = inspect.signature(self.run)
347
- # Check that run is a function with 2 arguments.
348
- if len(run_sig.parameters) != 2:
349
- with ux_utils.print_exception_no_traceback():
350
- raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
351
-
352
- type_list = [int, List[str]]
353
- # Check annotations, if exists
354
- for i, param in enumerate(run_sig.parameters.values()):
355
- if param.annotation != inspect.Parameter.empty:
356
- if param.annotation != type_list[i]:
357
- with ux_utils.print_exception_no_traceback():
358
- raise ValueError(
359
- _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
360
-
361
- # Check self containedness.
362
- run_closure = inspect.getclosurevars(self.run)
363
- if run_closure.nonlocals:
364
- with ux_utils.print_exception_no_traceback():
365
- raise ValueError(
366
- 'run command generator must be self contained. '
367
- f'Found nonlocals: {run_closure.nonlocals}')
368
- if run_closure.globals:
369
- with ux_utils.print_exception_no_traceback():
370
- raise ValueError(
371
- 'run command generator must be self contained. '
372
- f'Found globals: {run_closure.globals}')
373
- if run_closure.unbound:
374
- # Do not raise an error here. Import statements, which are
375
- # allowed, will be considered as unbounded.
376
- pass
377
- elif self.run is not None and not isinstance(self.run, str):
465
+ if self.run is not None and not isinstance(self.run, str):
378
466
  with ux_utils.print_exception_no_traceback():
379
- raise ValueError('run must be either a shell script (str) or '
380
- f'a command generator ({CommandGen}). '
467
+ raise ValueError('run must be a shell script (str). '
381
468
  f'Got {type(self.run)}')
382
469
 
383
470
  def expand_and_validate_file_mounts(self):
@@ -390,12 +477,9 @@ class Task:
390
477
  if self.file_mounts is None:
391
478
  return
392
479
  for target, source in self.file_mounts.items():
393
- if target.endswith('/') or source.endswith('/'):
394
- with ux_utils.print_exception_no_traceback():
395
- raise ValueError(
396
- 'File mount paths cannot end with a slash '
397
- '(try "/mydir: /mydir" or "/myfile: /myfile"). '
398
- f'Found: target={target} source={source}')
480
+ location = f'file_mounts.{target}: {source}'
481
+ self._validate_mount_path(target, location)
482
+ self._validate_path(source, location)
399
483
  if data_utils.is_cloud_store_url(target):
400
484
  with ux_utils.print_exception_no_traceback():
401
485
  raise ValueError(
@@ -410,17 +494,25 @@ class Task:
410
494
  f'File mount source {source!r} does not exist '
411
495
  'locally. To fix: check if it exists, and correct '
412
496
  'the path.')
413
- # TODO(zhwu): /home/username/sky_workdir as the target path need
414
- # to be filtered out as well.
415
- if (target == constants.SKY_REMOTE_WORKDIR and
416
- self.workdir is not None):
417
- with ux_utils.print_exception_no_traceback():
418
- raise ValueError(
419
- f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
420
- 'destination path of a file mount, as it will be used '
421
- 'by the workdir. If uploading a file/folder to the '
422
- 'workdir is needed, please specify the full path to '
423
- 'the file/folder.')
497
+
498
+ def _validate_mount_path(self, path: str, location: str):
499
+ self._validate_path(path, location)
500
+ # TODO(zhwu): /home/username/sky_workdir as the target path need
501
+ # to be filtered out as well.
502
+ if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
503
+ with ux_utils.print_exception_no_traceback():
504
+ raise ValueError(
505
+ f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
506
+ 'destination path of a file mount, as it will be used '
507
+ 'by the workdir. If uploading a file/folder to the '
508
+ 'workdir is needed, please specify the full path to '
509
+ 'the file/folder.')
510
+
511
+ def _validate_path(self, path: str, location: str):
512
+ if path.endswith('/'):
513
+ with ux_utils.print_exception_no_traceback():
514
+ raise ValueError('Mount paths cannot end with a slash '
515
+ f'Found: {path} in {location}')
424
516
 
425
517
  def expand_and_validate_workdir(self):
426
518
  """Expand workdir to absolute path and validate it.
@@ -431,6 +523,12 @@ class Task:
431
523
  """
432
524
  if self.workdir is None:
433
525
  return
526
+ # Only expand the workdir if it is a string
527
+ if isinstance(self.workdir, dict):
528
+ git_ref = self.workdir.get('ref')
529
+ if git_ref is not None:
530
+ self._metadata['git_commit'] = git_ref
531
+ return
434
532
  user_workdir = self.workdir
435
533
  self.workdir = os.path.abspath(os.path.expanduser(user_workdir))
436
534
  if not os.path.isdir(self.workdir):
@@ -440,11 +538,16 @@ class Task:
440
538
  'Workdir must be a valid directory (or '
441
539
  f'a symlink to a directory). {user_workdir} not found.')
442
540
 
541
+ self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
542
+
443
543
  @staticmethod
444
544
  def from_yaml_config(
445
545
  config: Dict[str, Any],
446
546
  env_overrides: Optional[List[Tuple[str, str]]] = None,
547
+ secrets_overrides: Optional[List[Tuple[str, str]]] = None,
447
548
  ) -> 'Task':
549
+ user_specified_yaml = config.pop('_user_specified_yaml',
550
+ yaml_utils.dump_yaml_str(config))
448
551
  # More robust handling for 'envs': explicitly convert keys and values to
449
552
  # str, since users may pass '123' as keys/values which will get parsed
450
553
  # as int causing validate_schema() to fail.
@@ -457,6 +560,20 @@ class Task:
457
560
  else:
458
561
  new_envs[str(k)] = None
459
562
  config['envs'] = new_envs
563
+
564
+ # More robust handling for 'secrets': explicitly convert keys and values
565
+ # to str, since users may pass '123' as keys/values which will get
566
+ # parsed as int causing validate_schema() to fail.
567
+ secrets = config.get('secrets')
568
+ if secrets is not None and isinstance(secrets, dict):
569
+ new_secrets: Dict[str, Optional[str]] = {}
570
+ for k, v in secrets.items():
571
+ if v is not None:
572
+ new_secrets[str(k)] = str(v)
573
+ else:
574
+ new_secrets[str(k)] = None
575
+ config['secrets'] = new_secrets
576
+
460
577
  common_utils.validate_schema(config, schemas.get_task_schema(),
461
578
  'Invalid task YAML: ')
462
579
  if env_overrides is not None:
@@ -470,6 +587,12 @@ class Task:
470
587
  new_envs.update(env_overrides)
471
588
  config['envs'] = new_envs
472
589
 
590
+ if secrets_overrides is not None:
591
+ # Override secrets vars from CLI.
592
+ new_secrets = config.get('secrets', {})
593
+ new_secrets.update(secrets_overrides)
594
+ config['secrets'] = new_secrets
595
+
473
596
  for k, v in config.get('envs', {}).items():
474
597
  if v is None:
475
598
  with ux_utils.print_exception_no_traceback():
@@ -479,21 +602,38 @@ class Task:
479
602
  f'To set it to be empty, use an empty string ({k}: "" '
480
603
  f'in task YAML or --env {k}="" in CLI).')
481
604
 
605
+ for k, v in config.get('secrets', {}).items():
606
+ if v is None:
607
+ with ux_utils.print_exception_no_traceback():
608
+ raise ValueError(
609
+ f'Secret variable {k!r} is None. Please set a '
610
+ 'value for it in task YAML or with --secret flag. '
611
+ f'To set it to be empty, use an empty string ({k}: "" '
612
+ f'in task YAML or --secret {k}="" in CLI).')
613
+
482
614
  # Fill in any Task.envs into file_mounts (src/dst paths, storage
483
615
  # name/source).
616
+ env_vars = config.get('envs', {})
617
+ secrets = config.get('secrets', {})
618
+ env_and_secrets = env_vars.copy()
619
+ env_and_secrets.update(secrets)
484
620
  if config.get('file_mounts') is not None:
485
621
  config['file_mounts'] = _fill_in_env_vars(config['file_mounts'],
486
- config.get('envs', {}))
622
+ env_and_secrets)
487
623
 
488
624
  # Fill in any Task.envs into service (e.g. MODEL_NAME).
489
625
  if config.get('service') is not None:
490
626
  config['service'] = _fill_in_env_vars(config['service'],
491
- config.get('envs', {}))
627
+ env_and_secrets)
492
628
 
493
629
  # Fill in any Task.envs into workdir
494
630
  if config.get('workdir') is not None:
495
631
  config['workdir'] = _fill_in_env_vars(config['workdir'],
496
- config.get('envs', {}))
632
+ env_and_secrets)
633
+
634
+ if config.get('volumes') is not None:
635
+ config['volumes'] = _fill_in_env_vars(config['volumes'],
636
+ env_and_secrets)
497
637
 
498
638
  task = Task(
499
639
  config.pop('name', None),
@@ -502,8 +642,12 @@ class Task:
502
642
  setup=config.pop('setup', None),
503
643
  num_nodes=config.pop('num_nodes', None),
504
644
  envs=config.pop('envs', None),
645
+ secrets=config.pop('secrets', None),
646
+ volumes=config.pop('volumes', None),
505
647
  event_callback=config.pop('event_callback', None),
506
- file_mounts_mapping=config.pop('file_mounts_mapping', None),
648
+ _file_mounts_mapping=config.pop('file_mounts_mapping', None),
649
+ _metadata=config.pop('_metadata', None),
650
+ _user_specified_yaml=user_specified_yaml,
507
651
  )
508
652
 
509
653
  # Create lists to store storage objects inlined in file_mounts.
@@ -511,6 +655,7 @@ class Task:
511
655
  # storage objects with the storage/storage_mount objects.
512
656
  fm_storages = []
513
657
  file_mounts = config.pop('file_mounts', None)
658
+ volumes = []
514
659
  if file_mounts is not None:
515
660
  copy_mounts = {}
516
661
  for dst_path, src in file_mounts.items():
@@ -520,7 +665,27 @@ class Task:
520
665
  # If the src is not a str path, it is likely a dict. Try to
521
666
  # parse storage object.
522
667
  elif isinstance(src, dict):
523
- fm_storages.append((dst_path, src))
668
+ if (src.get('store') ==
669
+ storage_lib.StoreType.VOLUME.value.lower()):
670
+ # Build the volumes config for resources.
671
+ volume_config = {
672
+ 'path': dst_path,
673
+ }
674
+ if src.get('name'):
675
+ volume_config['name'] = src.get('name')
676
+ persistent = src.get('persistent', False)
677
+ volume_config['auto_delete'] = not persistent
678
+ volume_config_detail = src.get('config', {})
679
+ volume_config.update(volume_config_detail)
680
+ volumes.append(volume_config)
681
+ source_path = src.get('source')
682
+ if source_path:
683
+ # For volume, copy the source path to the
684
+ # data directory of the volume mount point.
685
+ copy_mounts[
686
+ f'{dst_path.rstrip("/")}/data'] = source_path
687
+ else:
688
+ fm_storages.append((dst_path, src))
524
689
  else:
525
690
  with ux_utils.print_exception_no_traceback():
526
691
  raise ValueError(f'Unable to parse file_mount '
@@ -559,34 +724,9 @@ class Task:
559
724
  task.set_outputs(outputs=outputs,
560
725
  estimated_size_gigabytes=estimated_size_gigabytes)
561
726
 
562
- # Experimental configs.
563
- experimental_configs = config.pop('experimental', None)
564
-
565
727
  # Handle the top-level config field
566
728
  config_override = config.pop('config', None)
567
729
 
568
- # Handle backward compatibility with experimental.config_overrides
569
- # TODO: Remove experimental.config_overrides in 0.11.0.
570
- if experimental_configs is not None:
571
- exp_config_override = experimental_configs.pop(
572
- 'config_overrides', None)
573
- if exp_config_override is not None:
574
- logger.warning(
575
- f'{colorama.Fore.YELLOW}`experimental.config_overrides` '
576
- 'field is deprecated in the task YAML. Use the `config` '
577
- f'field to set config overrides.{colorama.Style.RESET_ALL}')
578
- if config_override is not None:
579
- logger.warning(
580
- f'{colorama.Fore.YELLOW}Both top-level `config` and '
581
- f'`experimental.config_overrides` are specified. '
582
- f'Using top-level `config`.{colorama.Style.RESET_ALL}')
583
- else:
584
- config_override = exp_config_override
585
- logger.debug('Overriding skypilot config with task-level config: '
586
- f'{config_override}')
587
- assert not experimental_configs, ('Invalid task args: '
588
- f'{experimental_configs.keys()}')
589
-
590
730
  # Store the final config override for use in resource setup
591
731
  cluster_config_override = config_override
592
732
 
@@ -598,12 +738,35 @@ class Task:
598
738
  'experimental.config_overrides')
599
739
  resources_config[
600
740
  '_cluster_config_overrides'] = cluster_config_override
601
- task.set_resources(sky.Resources.from_yaml_config(resources_config))
741
+ if volumes:
742
+ resources_config['volumes'] = volumes
743
+ task.set_resources(
744
+ resources_lib.Resources.from_yaml_config(resources_config))
602
745
 
603
746
  service = config.pop('service', None)
747
+ pool = config.pop('pool', None)
748
+ if service is not None and pool is not None:
749
+ with ux_utils.print_exception_no_traceback():
750
+ raise ValueError(
751
+ 'Cannot set both service and pool in the same task.')
752
+
604
753
  if service is not None:
605
754
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
606
- task.set_service(service)
755
+ task.set_service(service)
756
+ elif pool is not None:
757
+ pool['pool'] = True
758
+ pool = service_spec.SkyServiceSpec.from_yaml_config(pool)
759
+ task.set_service(pool)
760
+
761
+ volume_mounts = config.pop('volume_mounts', None)
762
+ if volume_mounts is not None:
763
+ task.volume_mounts = []
764
+ for vol in volume_mounts:
765
+ common_utils.validate_schema(vol,
766
+ schemas.get_volume_mount_schema(),
767
+ 'Invalid volume mount config: ')
768
+ volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
769
+ task.volume_mounts.append(volume_mount)
607
770
 
608
771
  assert not config, f'Invalid task args: {config.keys()}'
609
772
  return task
@@ -628,17 +791,140 @@ class Task:
628
791
  # TODO(zongheng): use
629
792
  # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
630
793
  # to raise errors on duplicate keys.
631
- config = yaml.safe_load(f)
794
+ user_specified_yaml = f.read()
795
+ return Task.from_yaml_str(user_specified_yaml)
796
+
797
+ @staticmethod
798
+ def from_yaml_str(yaml_str: str) -> 'Task':
799
+ """Initializes a task from a task YAML string.
800
+
801
+ Example:
802
+ .. code-block:: python
803
+
804
+ task = sky.Task.from_yaml_str('yaml_str')
805
+ """
806
+ config = yaml_utils.safe_load(yaml_str)
632
807
 
633
808
  if isinstance(config, str):
634
809
  with ux_utils.print_exception_no_traceback():
635
810
  raise ValueError('YAML loaded as str, not as dict. '
636
- f'Is it correct? Path: {yaml_path}')
811
+ f'Is it correct? content:\n{yaml_str}')
637
812
 
638
813
  if config is None:
639
814
  config = {}
815
+ config['_user_specified_yaml'] = yaml_str
640
816
  return Task.from_yaml_config(config)
641
817
 
818
+ def resolve_and_validate_volumes(self) -> None:
819
+ """Resolve volumes config to volume mounts and validate them.
820
+
821
+ Raises:
822
+ exceptions.VolumeNotFoundError: if any volume is not found.
823
+ exceptions.VolumeTopologyConflictError: if there is conflict in the
824
+ volumes and compute topology.
825
+ """
826
+ # Volumes has been resolved, a typical case is that the API server
827
+ # has resolved the volumes and the dag was then submitted to
828
+ # controllers.
829
+ if self.volume_mounts is not None:
830
+ return None
831
+ if not self._volumes:
832
+ return None
833
+ volume_mounts: List[volume_lib.VolumeMount] = []
834
+ for dst_path, vol in self._volumes.items():
835
+ self._validate_mount_path(dst_path, location='volumes')
836
+ # Shortcut for `dst_path: volume_name` (external persistent volume)
837
+ if isinstance(vol, str):
838
+ volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
839
+ elif isinstance(vol, dict):
840
+ # Check if this is an ephemeral volume config or external volume
841
+ # with 'size' field
842
+ if 'size' in vol:
843
+ # This is an ephemeral volume config
844
+ volume_mount = (
845
+ volume_lib.VolumeMount.resolve_ephemeral_config(
846
+ dst_path, vol))
847
+ elif 'name' in vol:
848
+ # External volume with 'name' field
849
+ volume_mount = volume_lib.VolumeMount.resolve(
850
+ dst_path, vol['name'])
851
+ else:
852
+ raise ValueError(
853
+ f'Invalid volume config: {dst_path}: {vol}. '
854
+ 'Either "size" (for ephemeral volume) or "name" '
855
+ '(for external volume) must be set.')
856
+ else:
857
+ raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
858
+ volume_mounts.append(volume_mount)
859
+ # Disable certain access modes
860
+ disabled_modes = {}
861
+ if self.num_nodes > 1:
862
+ disabled_modes[
863
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
864
+ 'access mode ReadWriteOnce is not supported for '
865
+ 'multi-node tasks.')
866
+ disabled_modes[
867
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
868
+ 'access mode ReadWriteOncePod is not supported for '
869
+ 'multi-node tasks.')
870
+ # TODO(aylei): generalize access mode to all volume types
871
+ # Record the required topology and the volume that requires it, e.g.
872
+ # {'cloud': ('volume_name', 'aws')}
873
+ topology: Dict[str, Tuple[str, Optional[str]]] = {
874
+ 'cloud': ('', None),
875
+ 'region': ('', None),
876
+ 'zone': ('', None),
877
+ }
878
+ for vol in volume_mounts:
879
+ # Check access mode
880
+ access_mode = vol.volume_config.config.get('access_mode', '')
881
+ if access_mode in disabled_modes:
882
+ raise ValueError(f'Volume {vol.volume_name} with '
883
+ f'{disabled_modes[access_mode]}')
884
+ # Skip ephemeral volumes for topology check
885
+ if vol.is_ephemeral:
886
+ continue
887
+ # Check topology
888
+ for key, (vol_name, previous_req) in topology.items():
889
+ req = getattr(vol.volume_config, key)
890
+ if req is not None:
891
+ if previous_req is not None and req != previous_req:
892
+ raise exceptions.VolumeTopologyConflictError(
893
+ f'Volume {vol.volume_name} can only be attached on '
894
+ f'{key}:{req}, which conflicts with another volume '
895
+ f'{vol_name} that requires {key}:{previous_req}.'
896
+ f'Please use different volumes and retry.')
897
+ topology[key] = (vol_name, req)
898
+ # Now we have the topology requirements from the intersection of all
899
+ # volumes. Check if there is topology conflict with the resources.
900
+ # Volume must have no conflict with ALL resources even if user
901
+ # specifies 'any_of' resources to ensure no resources will conflict
902
+ # with the volumes during failover.
903
+
904
+ for res in self.resources:
905
+ for key, (vol_name, vol_req) in topology.items():
906
+ req = getattr(res, key)
907
+ if (req is not None and vol_req is not None and
908
+ str(req) != vol_req):
909
+ raise exceptions.VolumeTopologyConflictError(
910
+ f'The task requires {key}:{req}, which conflicts with '
911
+ f'the volume constraint {key}:{vol_req}. Please '
912
+ f'use different volumes and retry.')
913
+ # No topology conflict, we safely override the topology of resources to
914
+ # satisfy the volume constraints.
915
+ override_params = {}
916
+ for key, (vol_name, vol_req) in topology.items():
917
+ if vol_req is not None:
918
+ if key == 'cloud':
919
+ override_params[key] = registry.CLOUD_REGISTRY.from_str(
920
+ vol_req)
921
+ else:
922
+ override_params[key] = vol_req
923
+ logger.debug(
924
+ f'Override resources with volume constraints: {override_params}')
925
+ self.set_resources_override(override_params)
926
+ self.volume_mounts = volume_mounts
927
+
642
928
  @property
643
929
  def num_nodes(self) -> int:
644
930
  return self._num_nodes
@@ -653,10 +939,42 @@ class Task:
653
939
  f'num_nodes should be a positive int. Got: {num_nodes}')
654
940
  self._num_nodes = num_nodes
655
941
 
942
@property
def metadata(self) -> Dict[str, Any]:
    """Returns the metadata dict held by this task.

    The stored dict itself is returned (not a copy), so mutations made by
    the caller are visible on the task.
    """
    task_metadata = self._metadata
    return task_metadata
945
+
946
@property
def metadata_json(self) -> str:
    """Returns the task metadata serialized as a JSON string."""
    serialized = json.dumps(self._metadata)
    return serialized
949
+
656
950
@property
def envs(self) -> Dict[str, str]:
    """Returns the environment-variable dict held by this task.

    The stored dict itself is returned (not a copy).
    """
    env_vars = self._envs
    return env_vars
659
953
 
954
@property
def secrets(self) -> Dict[str, SecretStr]:
    """Returns the secrets dict held by this task.

    The stored dict itself is returned (not a copy).
    """
    secret_vars = self._secrets
    return secret_vars
957
+
958
@property
def volumes(self) -> Dict[str, Union[str, Dict[str, Any]]]:
    """Returns the volumes mapping held by this task.

    The stored dict itself is returned (not a copy).
    """
    volume_spec = self._volumes
    return volume_spec
961
+
962
def set_volumes(
        self, volumes: Dict[str, Union[str, Dict[str, Any]]]) -> None:
    """Replaces the volumes mapping of this task.

    Any previously stored mapping is discarded; the given dict is stored
    as-is (no copy is made).

    Args:
        volumes: mapping keyed by mount path. A value is either a volume
            name (``{mount_path: volume_name}``, external persistent
            volume) or a volume config dict
            (``{mount_path: volume_config}``, ephemeral volume).
    """
    self._volumes = volumes
972
+
973
def update_volumes(
        self, volumes: Dict[str, Union[str, Dict[str, Any]]]) -> None:
    """Merges ``volumes`` into the existing volumes mapping.

    Uses ``dict.update()``: entries for mount paths already present are
    overwritten, other existing entries are kept.

    Args:
        volumes: mapping of mount path to volume name or volume config.
    """
    existing = self._volumes
    existing.update(volumes)
977
+
660
978
  def update_envs(
661
979
  self, envs: Union[None, List[Tuple[str, str]],
662
980
  Dict[str, str]]) -> 'Task':
@@ -697,17 +1015,71 @@ class Task:
697
1015
  # If the update_envs() is called after set_resources(), we need to
698
1016
  # manually update docker login config in task resources, in case the
699
1017
  # docker login envs are newly added.
700
- if _check_docker_login_config(self._envs):
1018
+ if _check_docker_login_config(self._envs, self._secrets):
1019
+ self.resources = _with_docker_login_config(self.resources,
1020
+ self._envs,
1021
+ self._secrets)
1022
+ self.resources = _with_docker_username_for_runpod(
1023
+ self.resources, self._envs, self._secrets)
1024
+ return self
1025
+
1026
def update_secrets(
        self, secrets: Union[None, List[Tuple[str, str]],
                             Dict[str, str]]) -> 'Task':
    """Adds or overwrites secret env vars used by the setup/run commands.

    Each value is wrapped in ``SecretStr`` before being stored in the
    task's secrets dict.

    Args:
        secrets: ``None`` (treated as empty), a list/tuple of
            ``(secret_name, value)`` pairs, or a dict
            ``{secret_name: value}``.

    Returns:
        self: The current task, with secrets updated.

    Raises:
        ValueError: if a list contains duplicate names, if a name is not a
            string or is not a valid env var name, or if ``secrets`` is of
            an unsupported type.
    """
    if secrets is None:
        secrets = {}

    # Normalize the pair-list form into a dict, rejecting repeated names.
    if isinstance(secrets, (list, tuple)):
        unique_names = {pair[0] for pair in secrets}
        if len(unique_names) != len(secrets):
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Duplicate secret keys provided.')
        secrets = dict(secrets)

    if not isinstance(secrets, dict):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
                f'{secrets}')

    for name in secrets:
        if not isinstance(name, str):
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Secret keys must be strings.')
        if not common_utils.is_valid_env_var(name):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Invalid secret key: {name}')

    for name, raw_value in secrets.items():
        self._secrets[name] = SecretStr(raw_value)

    # Validate Docker login configuration if needed
    if _check_docker_login_config(self._envs, self._secrets):
        self.resources = _with_docker_login_config(self.resources,
                                                   self._envs,
                                                   self._secrets)
    # NOTE(review): assumed to run unconditionally (outside the `if`
    # above), mirroring set_resources(); confirm against the original
    # indentation.
    self.resources = _with_docker_username_for_runpod(
        self.resources, self._envs, self._secrets)
    return self
706
1072
 
707
1073
@property
def use_spot(self) -> bool:
    """Returns True if at least one of the task's resources uses spot."""
    for candidate in self.resources:
        if candidate.use_spot:
            return True
    return False
710
1076
 
1077
@property
def envs_and_secrets(self) -> Dict[str, Union[str, SecretStr]]:
    """Returns a new dict combining the task's envs and secrets.

    Secrets are applied after envs, so on a name collision the secret
    value wins. Neither underlying dict is modified.
    """
    combined = self.envs.copy()
    for name, secret_value in self.secrets.items():
        combined[name] = secret_value
    return combined
1082
+
711
1083
  def set_inputs(self, inputs: str,
712
1084
  estimated_size_gigabytes: float) -> 'Task':
713
1085
  # E.g., 's3://bucket', 'gs://bucket', or None.
@@ -749,7 +1121,7 @@ class Task:
749
1121
  def set_resources(
750
1122
  self, resources: Union['resources_lib.Resources',
751
1123
  List['resources_lib.Resources'],
752
- Set['resources_lib.Resources']]
1124
+ Set['resources_lib.Resources'], Dict[str, Any]]
753
1125
  ) -> 'Task':
754
1126
  """Sets the required resources to execute this task.
755
1127
 
@@ -763,19 +1135,22 @@ class Task:
763
1135
  Returns:
764
1136
  self: The current task, with resources set.
765
1137
  """
766
- if isinstance(resources, sky.Resources):
1138
+ if isinstance(resources, dict):
1139
+ resources = resources_lib.Resources.from_yaml_config(resources)
1140
+ elif isinstance(resources, resources_lib.Resources):
767
1141
  resources = {resources}
768
1142
  # TODO(woosuk): Check if the resources are None.
769
- self.resources = _with_docker_login_config(resources, self.envs)
1143
+ self.resources = _with_docker_login_config(resources, self.envs,
1144
+ self.secrets)
770
1145
  # Only have effect on RunPod.
771
1146
  self.resources = _with_docker_username_for_runpod(
772
- self.resources, self.envs)
1147
+ self.resources, self.envs, self.secrets)
773
1148
 
774
1149
  # Evaluate if the task requires FUSE and set the requires_fuse flag
775
1150
  for _, storage_obj in self.storage_mounts.items():
776
1151
  if storage_obj.mode in storage_lib.MOUNTABLE_STORAGE_MODES:
777
1152
  for r in self.resources:
778
- r.requires_fuse = True
1153
+ r.set_requires_fuse(True)
779
1154
  break
780
1155
 
781
1156
  return self
@@ -790,6 +1165,10 @@ class Task:
790
1165
  self.set_resources(type(self.resources)(new_resources_list))
791
1166
  return self
792
1167
 
1168
def get_resource_config(self) -> Dict[str, Any]:
    """Returns the task's resources as a config dict.

    Delegates to ``_resources_to_config`` with
    ``factor_out_common_fields=True``.
    """
    task_resources = self.resources
    return _resources_to_config(task_resources,
                                factor_out_common_fields=True)
1171
+
793
1172
@property
def service(self) -> Optional[service_spec.SkyServiceSpec]:
    """Returns the service spec stored on this task (may be None)."""
    spec = self._service
    return spec
@@ -807,8 +1186,8 @@ class Task:
807
1186
  self._service = service
808
1187
  return self
809
1188
 
810
- def set_time_estimator(self, func: Callable[['sky.Resources'],
811
- int]) -> 'Task':
1189
+ def set_time_estimator(
1190
+ self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
812
1191
  """Sets a func mapping resources to estimated time (secs).
813
1192
 
814
1193
  This is EXPERIMENTAL.
@@ -864,7 +1243,7 @@ class Task:
864
1243
 
865
1244
  Different from set_file_mounts(), this function updates into the
866
1245
  existing file_mounts (calls ``dict.update()``), rather than
867
- overwritting it.
1246
+ overwriting it.
868
1247
 
869
1248
  This should be called before provisioning in order to take effect.
870
1249
 
@@ -931,7 +1310,7 @@ class Task:
931
1310
  self.storage_mounts = {}
932
1311
  # Clear the requires_fuse flag if no storage mounts are set.
933
1312
  for r in self.resources:
934
- r.requires_fuse = False
1313
+ r.set_requires_fuse(False)
935
1314
  return self
936
1315
  for target, storage_obj in storage_mounts.items():
937
1316
  # TODO(zhwu): /home/username/sky_workdir as the target path need
@@ -956,7 +1335,7 @@ class Task:
956
1335
  # If any storage is using MOUNT mode, we need to enable FUSE in
957
1336
  # the resources.
958
1337
  for r in self.resources:
959
- r.requires_fuse = True
1338
+ r.set_requires_fuse(True)
960
1339
  # Storage source validation is done in Storage object
961
1340
  self.storage_mounts = storage_mounts
962
1341
  return self
@@ -1170,6 +1549,16 @@ class Task:
1170
1549
  self.update_file_mounts({
1171
1550
  mnt_path: blob_path,
1172
1551
  })
1552
+ elif store_type is storage_lib.StoreType.COREWEAVE:
1553
+ if storage.source is not None and not isinstance(
1554
+ storage.source,
1555
+ list) and storage.source.startswith('cw://'):
1556
+ blob_path = storage.source
1557
+ else:
1558
+ blob_path = 'cw://' + storage.name
1559
+ self.update_file_mounts({
1560
+ mnt_path: blob_path,
1561
+ })
1173
1562
  else:
1174
1563
  with ux_utils.print_exception_no_traceback():
1175
1564
  raise ValueError(f'Storage Type {store_type} '
@@ -1219,11 +1608,85 @@ class Task:
1219
1608
  d[k] = v
1220
1609
  return d
1221
1610
 
1222
- def to_yaml_config(self) -> Dict[str, Any]:
1611
def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
                   git_ref: Optional[str]) -> 'Task':
    """Updates the task workdir.

    Behavior depends on the current ``self.workdir``:

    * Unset or a string: ``workdir`` (if given) replaces it; otherwise, if
      ``git_url`` is given, the workdir becomes a dict with ``url`` and,
      when ``git_ref`` is given, ``ref``. With neither, nothing changes.
    * Anything else (a git-style dict): ``url`` / ``ref`` are overwritten
      in place from ``git_url`` / ``git_ref`` when given; ``workdir`` is
      ignored.

    Args:
        workdir: The workdir to update.
        git_url: The git url to update.
        git_ref: The git ref to update.

    Returns:
        self: The current task.
    """
    plain_or_unset = (self.workdir is None or
                      isinstance(self.workdir, str))
    if not plain_or_unset:
        # Existing structured workdir: patch only the provided fields.
        if git_url is not None:
            self.workdir['url'] = git_url
        if git_ref is not None:
            self.workdir['ref'] = git_ref
        return self

    if workdir is not None:
        self.workdir = workdir
    elif git_url is not None:
        self.workdir = {}
        self.workdir['url'] = git_url
        if git_ref is not None:
            self.workdir['ref'] = git_ref
    return self
1636
+
1637
def update_envs_and_secrets_from_workdir(self) -> 'Task':
    """Updates the task envs and secrets from the workdir.

    Does nothing unless ``self.workdir`` is a dict. Otherwise a
    ``git.GitRepo`` is built from the workdir's ``url`` and optional
    ``ref`` plus the token / SSH key path read from the process
    environment. The resulting clone URL and (when a ref is given) the
    commit/branch/tag env var are written into ``self.envs``; a token
    and/or SSH key from the clone info are written into ``self.secrets``
    wrapped in ``SecretStr``.

    Returns:
        self: The current task.

    Raises:
        ValueError: if ``exceptions.GitError`` is raised while resolving
            the repository; the message is the GitError's message.
    """
    # A None workdir is not a dict either, so one guard covers both cases.
    if not isinstance(self.workdir, dict):
        return self

    repo_url = self.workdir['url']
    ref = self.workdir.get('ref', '')
    token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
    ssh_key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
    try:
        repo = git.GitRepo(repo_url, ref, token, ssh_key_path)
        clone_info = repo.get_repo_clone_info()
        if clone_info is None:
            return self
        self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
        if ref:
            # Exactly one env var is set, chosen by the kind of ref.
            ref_type = repo.get_ref_type()
            if ref_type == git.GitRefType.COMMIT:
                self.envs[git.GIT_COMMIT_HASH_ENV_VAR] = ref
            elif ref_type == git.GitRefType.BRANCH:
                self.envs[git.GIT_BRANCH_ENV_VAR] = ref
            elif ref_type == git.GitRefType.TAG:
                self.envs[git.GIT_TAG_ENV_VAR] = ref
        resolved_token = clone_info.token
        if resolved_token is not None:
            self.secrets[git.GIT_TOKEN_ENV_VAR] = SecretStr(resolved_token)
        resolved_ssh_key = clone_info.ssh_key
        if resolved_ssh_key is not None:
            self.secrets[git.GIT_SSH_KEY_ENV_VAR] = SecretStr(
                resolved_ssh_key)
    except exceptions.GitError as git_error:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(str(git_error)) from None
    return self
1673
+
1674
def to_yaml_config(self,
                   use_user_specified_yaml: bool = False) -> Dict[str, Any]:
    """Returns a yaml-style dict representation of the task.

    INTERNAL: this method is internal-facing.

    Args:
        use_user_specified_yaml: when False (default), the config is
            generated by ``_to_yaml_config()`` with secrets left
            unredacted. When True, the stored user-specified YAML is
            parsed and returned with every ``secrets`` value replaced by
            ``'<redacted>'``; if no user YAML is stored, the generated
            config with redacted secrets is returned instead.
    """
    if not use_user_specified_yaml:
        return self._to_yaml_config()

    user_yaml = self._user_specified_yaml
    if user_yaml is None:
        return self._to_yaml_config(redact_secrets=True)

    parsed = yaml_utils.safe_load(user_yaml)
    if parsed.get('secrets') is not None:
        parsed['secrets'] = {
            secret_name: '<redacted>' for secret_name in parsed['secrets']
        }
    return parsed
1688
+
1689
+ def _to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
1227
1690
  config = {}
1228
1691
 
1229
1692
  def add_if_not_none(key, value, no_empty: bool = False):
@@ -1234,15 +1697,7 @@ class Task:
1234
1697
 
1235
1698
  add_if_not_none('name', self.name)
1236
1699
 
1237
- tmp_resource_config = {}
1238
- if len(self.resources) > 1:
1239
- resource_list = []
1240
- for r in self.resources:
1241
- resource_list.append(r.to_yaml_config())
1242
- key = 'ordered' if isinstance(self.resources, list) else 'any_of'
1243
- tmp_resource_config[key] = resource_list
1244
- else:
1245
- tmp_resource_config = list(self.resources)[0].to_yaml_config()
1700
+ tmp_resource_config = _resources_to_config(self.resources)
1246
1701
 
1247
1702
  add_if_not_none('resources', tmp_resource_config)
1248
1703
 
@@ -1263,8 +1718,17 @@ class Task:
1263
1718
  add_if_not_none('workdir', self.workdir)
1264
1719
  add_if_not_none('event_callback', self.event_callback)
1265
1720
  add_if_not_none('run', self.run)
1721
+
1722
+ # Add envs without redaction
1266
1723
  add_if_not_none('envs', self.envs, no_empty=True)
1267
1724
 
1725
+ secrets = self.secrets
1726
+ if secrets and not redact_secrets:
1727
+ secrets = {k: v.get_secret_value() for k, v in secrets.items()}
1728
+ elif secrets and redact_secrets:
1729
+ secrets = {k: '<redacted>' for k, v in secrets.items()}
1730
+ add_if_not_none('secrets', secrets, no_empty=True)
1731
+
1268
1732
  add_if_not_none('file_mounts', {})
1269
1733
 
1270
1734
  if self.file_mounts is not None:
@@ -1277,6 +1741,15 @@ class Task:
1277
1741
  })
1278
1742
 
1279
1743
  add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
1744
+ add_if_not_none('volumes', self.volumes)
1745
+ if self.volume_mounts is not None:
1746
+ config['volume_mounts'] = [
1747
+ volume_mount.to_yaml_config()
1748
+ for volume_mount in self.volume_mounts
1749
+ ]
1750
+ # we manually check if its empty to not clog up the generated yaml
1751
+ add_if_not_none('_metadata', self._metadata if self._metadata else None)
1752
+ add_if_not_none('_user_specified_yaml', self._user_specified_yaml)
1280
1753
  return config
1281
1754
 
1282
1755
  def get_required_cloud_features(
@@ -1304,7 +1777,12 @@ class Task:
1304
1777
  return required_features
1305
1778
 
1306
1779
def __rshift__(self, b):
    """Implements ``self >> b``: adds an edge from this task to ``b``.

    The edge is added to the DAG returned by
    ``dag_lib.get_current_dag()``.

    Raises:
        RuntimeError: if there is no current DAG.
    """
    current_dag = dag_lib.get_current_dag()
    if current_dag is not None:
        current_dag.add_edge(self, b)
        return
    raise RuntimeError(
        'Cannot use >> operator outside of a DAG context. '
        'Please use "with sky.Dag() as dag:" to create a DAG context.')
1308
1786
 
1309
1787
  def __repr__(self):
1310
1788
  if isinstance(self.run, str):
@@ -1339,3 +1817,47 @@ class Task:
1339
1817
  else:
1340
1818
  s += '\n resources: default instances'
1341
1819
  return s
1820
+
1821
+
1822
+ def _resources_to_config(
1823
+ resources: Union[List['resources_lib.Resources'],
1824
+ Set['resources_lib.Resources']],
1825
+ factor_out_common_fields: bool = False) -> Dict[str, Any]:
1826
+ if len(resources) > 1:
1827
+ resource_list: List[Dict[str, Union[str, int]]] = []
1828
+ for r in resources:
1829
+ resource_list.append(r.to_yaml_config())
1830
+ group_key = 'ordered' if isinstance(resources, list) else 'any_of'
1831
+ if factor_out_common_fields:
1832
+ return _factor_out_common_resource_fields(resource_list, group_key)
1833
+ return {group_key: resource_list}
1834
+ else:
1835
+ return list(resources)[0].to_yaml_config()
1836
+
1837
+
1838
+ def _factor_out_common_resource_fields(configs: List[Dict[str, Union[str,
1839
+ int]]],
1840
+ group_key: str) -> Dict[str, Any]:
1841
+ """Factors out the fields that are common to all resources."""
1842
+ return_config: Dict[str, Any] = configs[0].copy()
1843
+ if len(configs) > 1:
1844
+ for config in configs[1:]:
1845
+ for key, value in config.items():
1846
+ if key in return_config and return_config[key] != value:
1847
+ del return_config[key]
1848
+ num_empty_configs = 0
1849
+ for config in configs:
1850
+ keys_to_delete = []
1851
+ for key, value in config.items():
1852
+ if key in return_config:
1853
+ keys_to_delete.append(key)
1854
+ for key in keys_to_delete:
1855
+ del config[key]
1856
+ if not config:
1857
+ num_empty_configs += 1
1858
+
1859
+ if num_empty_configs == len(configs):
1860
+ return return_config
1861
+ if len(configs) > 0:
1862
+ return_config[group_key] = configs
1863
+ return return_config