skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/task.py CHANGED
@@ -1,43 +1,33 @@
1
1
  """Task: a coarse-grained stage in an application."""
2
2
  import collections
3
- import inspect
4
3
  import json
5
4
  import os
6
5
  import re
7
- import typing
8
6
  from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
9
7
  Union)
10
8
 
11
9
  import colorama
12
10
 
13
- import sky
14
11
  from sky import clouds
12
+ from sky import dag as dag_lib
15
13
  from sky import exceptions
14
+ from sky import resources as resources_lib
16
15
  from sky import sky_logging
17
- from sky.adaptors import common as adaptors_common
18
- import sky.dag
19
16
  from sky.data import data_utils
20
17
  from sky.data import storage as storage_lib
21
18
  from sky.provision import docker_utils
22
19
  from sky.serve import service_spec
23
20
  from sky.skylet import constants
24
21
  from sky.utils import common_utils
22
+ from sky.utils import git
23
+ from sky.utils import registry
25
24
  from sky.utils import schemas
26
25
  from sky.utils import ux_utils
27
-
28
- if typing.TYPE_CHECKING:
29
- import yaml
30
-
31
- from sky import resources as resources_lib
32
- else:
33
- yaml = adaptors_common.LazyImport('yaml')
26
+ from sky.utils import volume as volume_lib
27
+ from sky.utils import yaml_utils
34
28
 
35
29
  logger = sky_logging.init_logger(__name__)
36
30
 
37
- # A lambda generating commands (node rank_i, node addrs -> cmd_i).
38
- CommandGen = Callable[[int, List[str]], Optional[str]]
39
- CommandOrCommandGen = Union[str, CommandGen]
40
-
41
31
  _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
42
32
  _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
43
33
  ' uppercase letters, digits, underscores, periods,'
@@ -121,27 +111,61 @@ def _fill_in_env_vars(
121
111
  return json.loads(yaml_field_str)
122
112
 
123
113
 
124
- def _check_docker_login_config(task_envs: Dict[str, str]) -> bool:
125
- """Checks if there is a valid docker login config in task_envs.
114
+ def _check_docker_login_config(task_envs: Dict[str, str],
115
+ task_secrets: Dict[str, str]) -> bool:
116
+ """Validates a valid docker login config in task_envs and task_secrets.
126
117
 
127
- If any of the docker login env vars is set, all of them must be set.
118
+ Docker login variables must be specified together either in envs OR secrets,
119
+ not split across both. If any of the docker login env vars is set, all of
120
+ them must be set in the same location.
121
+
122
+ Args:
123
+ task_envs: Environment variables
124
+ task_secrets: Secret variables (optional, defaults to empty dict)
128
125
 
129
126
  Returns:
130
- True if there is a valid docker login config in task_envs.
127
+ True if there is a valid docker login config.
131
128
  False otherwise.
132
129
  Raises:
133
- ValueError: if any of the docker login env vars is set, but not all of
134
- them are set.
130
+ ValueError: if docker login configuration is invalid.
135
131
  """
132
+ if task_secrets is None:
133
+ task_secrets = {}
134
+
136
135
  all_keys = constants.DOCKER_LOGIN_ENV_VARS
137
- existing_keys = all_keys & set(task_envs.keys())
138
- if not existing_keys:
136
+ envs_keys = all_keys & set(task_envs.keys())
137
+ secrets_keys = all_keys & set(task_secrets.keys())
138
+
139
+ # Check if any docker variables exist
140
+ if not envs_keys and not secrets_keys:
139
141
  return False
140
- if len(existing_keys) != len(all_keys):
142
+
143
+ # Check if variables are split across envs and secrets
144
+ if envs_keys and secrets_keys:
141
145
  with ux_utils.print_exception_no_traceback():
142
146
  raise ValueError(
143
- f'If any of {", ".join(all_keys)} is set, all of them must '
144
- f'be set. Missing envs: {all_keys - existing_keys}')
147
+ 'Docker login variables must be specified together either '
148
+ 'in envs OR secrets, not split across both. '
149
+ f'Found in envs: {sorted(envs_keys)}, '
150
+ f'Found in secrets: {sorted(secrets_keys)}')
151
+
152
+ # Check if all variables are present in the chosen location
153
+ if envs_keys:
154
+ if len(envs_keys) != len(all_keys):
155
+ with ux_utils.print_exception_no_traceback():
156
+ raise ValueError(
157
+ 'Docker login variables must be specified together '
158
+ 'in envs. '
159
+ f'Missing from envs: {sorted(all_keys - envs_keys)}')
160
+
161
+ if secrets_keys:
162
+ if len(secrets_keys) != len(all_keys):
163
+ with ux_utils.print_exception_no_traceback():
164
+ raise ValueError(
165
+ 'Docker login variables must be specified together '
166
+ 'in secrets. '
167
+ f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
168
+
145
169
  return True
146
170
 
147
171
 
@@ -149,11 +173,13 @@ def _with_docker_login_config(
149
173
  resources: Union[Set['resources_lib.Resources'],
150
174
  List['resources_lib.Resources']],
151
175
  task_envs: Dict[str, str],
176
+ task_secrets: Dict[str, str],
152
177
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
153
- if not _check_docker_login_config(task_envs):
178
+ if not _check_docker_login_config(task_envs, task_secrets):
154
179
  return resources
155
- docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(
156
- task_envs)
180
+ envs = task_envs.copy()
181
+ envs.update(task_secrets)
182
+ docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
157
183
 
158
184
  def _add_docker_login_config(resources: 'resources_lib.Resources'):
159
185
  docker_image = resources.extract_docker_image()
@@ -165,7 +191,8 @@ def _with_docker_login_config(
165
191
  f'ignored.{colorama.Style.RESET_ALL}')
166
192
  return resources
167
193
  # Already checked in extract_docker_image
168
- assert len(resources.image_id) == 1, resources.image_id
194
+ assert resources.image_id is not None and len(
195
+ resources.image_id) == 1, resources.image_id
169
196
  region = list(resources.image_id.keys())[0]
170
197
  return resources.copy(image_id={region: 'docker:' + docker_image},
171
198
  _docker_login_config=docker_login_config)
@@ -180,8 +207,11 @@ def _with_docker_username_for_runpod(
180
207
  resources: Union[Set['resources_lib.Resources'],
181
208
  List['resources_lib.Resources']],
182
209
  task_envs: Dict[str, str],
210
+ task_secrets: Dict[str, str],
183
211
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
184
- docker_username_for_runpod = task_envs.get(
212
+ envs = task_envs.copy()
213
+ envs.update(task_secrets)
214
+ docker_username_for_runpod = envs.get(
185
215
  constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
186
216
 
187
217
  # We should not call r.copy() if docker_username_for_runpod is None,
@@ -200,17 +230,27 @@ class Task:
200
230
  self,
201
231
  name: Optional[str] = None,
202
232
  *,
203
- setup: Optional[str] = None,
204
- run: Optional[CommandOrCommandGen] = None,
233
+ setup: Optional[Union[str, List[str]]] = None,
234
+ run: Optional[Union[str, List[str]]] = None,
205
235
  envs: Optional[Dict[str, str]] = None,
206
- workdir: Optional[str] = None,
236
+ secrets: Optional[Dict[str, str]] = None,
237
+ workdir: Optional[Union[str, Dict[str, Any]]] = None,
207
238
  num_nodes: Optional[int] = None,
239
+ file_mounts: Optional[Dict[str, str]] = None,
240
+ storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
241
+ volumes: Optional[Dict[str, str]] = None,
242
+ resources: Optional[Union['resources_lib.Resources',
243
+ List['resources_lib.Resources'],
244
+ Set['resources_lib.Resources']]] = None,
208
245
  # Advanced:
209
246
  docker_image: Optional[str] = None,
210
247
  event_callback: Optional[str] = None,
211
248
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
212
249
  # Internal use only.
213
- file_mounts_mapping: Optional[Dict[str, str]] = None,
250
+ _file_mounts_mapping: Optional[Dict[str, str]] = None,
251
+ _volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
252
+ _metadata: Optional[Dict[str, Any]] = None,
253
+ _user_specified_yaml: Optional[str] = None,
214
254
  ):
215
255
  """Initializes a Task.
216
256
 
@@ -242,38 +282,81 @@ class Task:
242
282
 
243
283
  Args:
244
284
  name: A string name for the Task for display purposes.
245
- setup: A setup command, which will be run before executing the run
285
+ setup: A setup command(s), which will be run before executing the run
246
286
  commands ``run``, and executed under ``workdir``.
247
287
  run: The actual command for the task. If not None, either a shell
248
- command (str) or a command generator (callable). If latter, it
249
- must take a node rank and a list of node addresses as input and
250
- return a shell command (str) (valid to return None for some nodes,
251
- in which case no commands are run on them). Run commands will be
252
- run under ``workdir``. Note the command generator should be a
253
- self-contained lambda.
288
+ command(s) (str, list(str)) or a command generator (callable). If
289
+ latter, it must take a node rank and a list of node addresses as
290
+ input and return a shell command (str) (valid to return None for
291
+ some nodes, in which case no commands are run on them). Run
292
+ commands will be run under ``workdir``. Note the command generator
293
+ should be a self-contained lambda.
254
294
  envs: A dictionary of environment variables to set before running the
255
295
  setup and run commands.
256
- workdir: The local working directory. This directory will be synced
296
+ secrets: A dictionary of secret environment variables to set before
297
+ running the setup and run commands. These will be redacted in logs
298
+ and YAML output.
299
+ workdir: The local working directory or a git repository.
300
+ For a local working directory, this directory will be synced
257
301
  to a location on the remote VM(s), and ``setup`` and ``run``
258
302
  commands will be run under that location (thus, they can rely on
259
303
  relative paths when invoking binaries).
304
+ If a git repository is provided, the repository will be cloned to
305
+ the working directory and the ``setup`` and ``run`` commands will
306
+ be run under the cloned repository.
260
307
  num_nodes: The number of nodes to provision for this Task. If None,
261
308
  treated as 1 node. If > 1, each node will execute its own
262
309
  setup/run command, where ``run`` can either be a str, meaning all
263
310
  nodes get the same command, or a lambda, with the semantics
264
311
  documented above.
312
+ file_mounts: An optional dict of ``{remote_path: (local_path|cloud
313
+ URI)}``, where remote means the VM(s) on which this Task will
314
+ eventually run on, and local means the node from which the task is
315
+ launched.
316
+ storage_mounts: an optional dict of ``{mount_path: sky.Storage
317
+ object}``, where mount_path is the path inside the remote VM(s)
318
+ where the Storage object will be mounted on.
319
+ volumes: A dict of volumes to be mounted for the task. The dict has
320
+ the form of ``{mount_path: volume_name}``.
321
+ resources: either a sky.Resources, a set of them, or a list of them.
322
+ A set or a list of resources asks the optimizer to "pick the
323
+ best of these resources" to run this task.
265
324
  docker_image: (EXPERIMENTAL: Only in effect when LocalDockerBackend
266
325
  is used.) The base docker image that this Task will be built on.
267
326
  Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
327
+ event_callback: A bash script that will be executed when the task
328
+ changes state.
268
329
  blocked_resources: A set of resources that this task cannot run on.
330
+ _file_mounts_mapping: (Internal use only) A dictionary of file mounts
331
+ mapping.
332
+ _volume_mounts: (Internal use only) A list of volume mounts.
333
+ _metadata: (Internal use only) A dictionary of metadata to be added to
334
+ the task.
335
+ _user_specified_yaml: (Internal use only) A string of user-specified
336
+ YAML config.
269
337
  """
270
338
  self.name = name
271
- self.run = run
272
339
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
273
340
  self.storage_plans: Dict[storage_lib.Storage,
274
341
  storage_lib.StoreType] = {}
275
- self.setup = setup
276
342
  self._envs = envs or {}
343
+ self._secrets = secrets or {}
344
+ self._volumes = volumes or {}
345
+
346
+ # concatenate commands if given as list
347
+ def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
348
+ if isinstance(commands, list):
349
+ return '\n'.join(commands)
350
+ return commands
351
+
352
+ self.run = _concat(run)
353
+ self.setup = _concat(setup)
354
+
355
+ # Validate Docker login configuration early if both envs and secrets
356
+ # contain Docker variables
357
+ if self._envs or self._secrets:
358
+ _check_docker_login_config(self._envs, self._secrets)
359
+
277
360
  self.workdir = workdir
278
361
  self.docker_image = (docker_image if docker_image else
279
362
  'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -288,33 +371,50 @@ class Task:
288
371
  self.estimated_inputs_size_gigabytes: Optional[float] = None
289
372
  self.estimated_outputs_size_gigabytes: Optional[float] = None
290
373
  # Default to CPU VM
291
- self.resources: Union[List[sky.Resources],
292
- Set[sky.Resources]] = {sky.Resources()}
374
+ self.resources: Union[List['resources_lib.Resources'],
375
+ Set['resources_lib.Resources']] = {
376
+ resources_lib.Resources()
377
+ }
293
378
  self._service: Optional[service_spec.SkyServiceSpec] = None
379
+
294
380
  # Resources that this task cannot run on.
295
381
  self.blocked_resources = blocked_resources
296
382
 
297
- self.time_estimator_func: Optional[Callable[['sky.Resources'],
383
+ self.time_estimator_func: Optional[Callable[['resources_lib.Resources'],
298
384
  int]] = None
299
385
  self.file_mounts: Optional[Dict[str, str]] = None
300
386
 
301
387
  # Only set when 'self' is a jobs controller task: 'self.managed_job_dag'
302
388
  # is the underlying managed job dag (sky.Dag object).
303
- self.managed_job_dag: Optional['sky.Dag'] = None
389
+ self.managed_job_dag: Optional['dag_lib.Dag'] = None
304
390
 
305
391
  # Only set when 'self' is a sky serve controller task.
306
392
  self.service_name: Optional[str] = None
307
393
 
308
394
  # Filled in by the optimizer. If None, this Task is not planned.
309
- self.best_resources: Optional[sky.Resources] = None
395
+ self.best_resources: Optional['resources_lib.Resources'] = None
310
396
 
311
397
  # For internal use only.
312
- self.file_mounts_mapping = file_mounts_mapping
398
+ self.file_mounts_mapping: Optional[Dict[str,
399
+ str]] = _file_mounts_mapping
400
+ self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
401
+ _volume_mounts)
402
+
403
+ self._metadata = _metadata if _metadata is not None else {}
313
404
 
314
- dag = sky.dag.get_current_dag()
405
+ if resources is not None:
406
+ self.set_resources(resources)
407
+ if storage_mounts is not None:
408
+ self.set_storage_mounts(storage_mounts)
409
+ if file_mounts is not None:
410
+ self.set_file_mounts(file_mounts)
411
+
412
+ dag = dag_lib.get_current_dag()
315
413
  if dag is not None:
316
414
  dag.add(self)
317
415
 
416
+ self._user_specified_yaml = _user_specified_yaml
417
+
318
418
  def validate(self,
319
419
  skip_file_mounts: bool = False,
320
420
  skip_workdir: bool = False):
@@ -342,42 +442,9 @@ class Task:
342
442
 
343
443
def validate_run(self):
    """Check that ``self.run`` is either unset or a shell script string.

    Raises:
        ValueError: if ``self.run`` is neither None nor a str.
    """
    run_cmd = self.run
    # Nothing to validate when no command is set or it is already a string.
    if run_cmd is None or isinstance(run_cmd, str):
        return
    with ux_utils.print_exception_no_traceback():
        raise ValueError('run must be a shell script (str). '
                         f'Got {type(run_cmd)}')
382
449
 
383
450
  def expand_and_validate_file_mounts(self):
@@ -390,12 +457,9 @@ class Task:
390
457
  if self.file_mounts is None:
391
458
  return
392
459
  for target, source in self.file_mounts.items():
393
- if target.endswith('/') or source.endswith('/'):
394
- with ux_utils.print_exception_no_traceback():
395
- raise ValueError(
396
- 'File mount paths cannot end with a slash '
397
- '(try "/mydir: /mydir" or "/myfile: /myfile"). '
398
- f'Found: target={target} source={source}')
460
+ location = f'file_mounts.{target}: {source}'
461
+ self._validate_mount_path(target, location)
462
+ self._validate_path(source, location)
399
463
  if data_utils.is_cloud_store_url(target):
400
464
  with ux_utils.print_exception_no_traceback():
401
465
  raise ValueError(
@@ -410,17 +474,25 @@ class Task:
410
474
  f'File mount source {source!r} does not exist '
411
475
  'locally. To fix: check if it exists, and correct '
412
476
  'the path.')
413
- # TODO(zhwu): /home/username/sky_workdir as the target path need
414
- # to be filtered out as well.
415
- if (target == constants.SKY_REMOTE_WORKDIR and
416
- self.workdir is not None):
417
- with ux_utils.print_exception_no_traceback():
418
- raise ValueError(
419
- f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
420
- 'destination path of a file mount, as it will be used '
421
- 'by the workdir. If uploading a file/folder to the '
422
- 'workdir is needed, please specify the full path to '
423
- 'the file/folder.')
477
+
478
def _validate_mount_path(self, path: str, location: str):
    """Validate a remote mount destination path.

    Runs the generic ``_validate_path`` check first, then rejects a
    destination equal to ``constants.SKY_REMOTE_WORKDIR`` when this task
    has a workdir set.

    Args:
        path: The destination path to validate.
        location: Description of where the path was specified; forwarded
            to ``_validate_path``.

    Raises:
        ValueError: if ``path`` equals ``constants.SKY_REMOTE_WORKDIR``
            while ``self.workdir`` is not None.
    """
    self._validate_path(path, location)
    # TODO(zhwu): /home/username/sky_workdir as the target path need
    # to be filtered out as well.
    if path != constants.SKY_REMOTE_WORKDIR or self.workdir is None:
        return
    with ux_utils.print_exception_no_traceback():
        raise ValueError(
            f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
            'destination path of a file mount, as it will be used '
            'by the workdir. If uploading a file/folder to the '
            'workdir is needed, please specify the full path to '
            'the file/folder.')
490
+
491
def _validate_path(self, path: str, location: str):
    """Reject a mount path that ends with a slash.

    Args:
        path: The path string to check.
        location: Description of where the path was specified; only used
            in the error message.

    Raises:
        ValueError: if ``path`` ends with ``'/'``.
    """
    if path.endswith('/'):
        with ux_utils.print_exception_no_traceback():
            # The two literals are concatenated, so the first one must end
            # the sentence explicitly; otherwise the message reads
            # "... end with a slash Found: ...".
            raise ValueError('Mount paths cannot end with a slash. '
                             f'Found: {path} in {location}')
424
496
 
425
497
  def expand_and_validate_workdir(self):
426
498
  """Expand workdir to absolute path and validate it.
@@ -431,6 +503,12 @@ class Task:
431
503
  """
432
504
  if self.workdir is None:
433
505
  return
506
+ # Only expand the workdir if it is a string
507
+ if isinstance(self.workdir, dict):
508
+ git_ref = self.workdir.get('ref')
509
+ if git_ref is not None:
510
+ self._metadata['git_commit'] = git_ref
511
+ return
434
512
  user_workdir = self.workdir
435
513
  self.workdir = os.path.abspath(os.path.expanduser(user_workdir))
436
514
  if not os.path.isdir(self.workdir):
@@ -440,11 +518,16 @@ class Task:
440
518
  'Workdir must be a valid directory (or '
441
519
  f'a symlink to a directory). {user_workdir} not found.')
442
520
 
521
+ self._metadata['git_commit'] = common_utils.get_git_commit(self.workdir)
522
+
443
523
  @staticmethod
444
524
  def from_yaml_config(
445
525
  config: Dict[str, Any],
446
526
  env_overrides: Optional[List[Tuple[str, str]]] = None,
527
+ secrets_overrides: Optional[List[Tuple[str, str]]] = None,
447
528
  ) -> 'Task':
529
+ user_specified_yaml = config.pop('_user_specified_yaml',
530
+ yaml_utils.dump_yaml_str(config))
448
531
  # More robust handling for 'envs': explicitly convert keys and values to
449
532
  # str, since users may pass '123' as keys/values which will get parsed
450
533
  # as int causing validate_schema() to fail.
@@ -457,6 +540,20 @@ class Task:
457
540
  else:
458
541
  new_envs[str(k)] = None
459
542
  config['envs'] = new_envs
543
+
544
+ # More robust handling for 'secrets': explicitly convert keys and values
545
+ # to str, since users may pass '123' as keys/values which will get
546
+ # parsed as int causing validate_schema() to fail.
547
+ secrets = config.get('secrets')
548
+ if secrets is not None and isinstance(secrets, dict):
549
+ new_secrets: Dict[str, Optional[str]] = {}
550
+ for k, v in secrets.items():
551
+ if v is not None:
552
+ new_secrets[str(k)] = str(v)
553
+ else:
554
+ new_secrets[str(k)] = None
555
+ config['secrets'] = new_secrets
556
+
460
557
  common_utils.validate_schema(config, schemas.get_task_schema(),
461
558
  'Invalid task YAML: ')
462
559
  if env_overrides is not None:
@@ -470,6 +567,12 @@ class Task:
470
567
  new_envs.update(env_overrides)
471
568
  config['envs'] = new_envs
472
569
 
570
+ if secrets_overrides is not None:
571
+ # Override secrets vars from CLI.
572
+ new_secrets = config.get('secrets', {})
573
+ new_secrets.update(secrets_overrides)
574
+ config['secrets'] = new_secrets
575
+
473
576
  for k, v in config.get('envs', {}).items():
474
577
  if v is None:
475
578
  with ux_utils.print_exception_no_traceback():
@@ -479,21 +582,38 @@ class Task:
479
582
  f'To set it to be empty, use an empty string ({k}: "" '
480
583
  f'in task YAML or --env {k}="" in CLI).')
481
584
 
585
+ for k, v in config.get('secrets', {}).items():
586
+ if v is None:
587
+ with ux_utils.print_exception_no_traceback():
588
+ raise ValueError(
589
+ f'Secret variable {k!r} is None. Please set a '
590
+ 'value for it in task YAML or with --secret flag. '
591
+ f'To set it to be empty, use an empty string ({k}: "" '
592
+ f'in task YAML or --secret {k}="" in CLI).')
593
+
482
594
  # Fill in any Task.envs into file_mounts (src/dst paths, storage
483
595
  # name/source).
596
+ env_vars = config.get('envs', {})
597
+ secrets = config.get('secrets', {})
598
+ env_and_secrets = env_vars.copy()
599
+ env_and_secrets.update(secrets)
484
600
  if config.get('file_mounts') is not None:
485
601
  config['file_mounts'] = _fill_in_env_vars(config['file_mounts'],
486
- config.get('envs', {}))
602
+ env_and_secrets)
487
603
 
488
604
  # Fill in any Task.envs into service (e.g. MODEL_NAME).
489
605
  if config.get('service') is not None:
490
606
  config['service'] = _fill_in_env_vars(config['service'],
491
- config.get('envs', {}))
607
+ env_and_secrets)
492
608
 
493
609
  # Fill in any Task.envs into workdir
494
610
  if config.get('workdir') is not None:
495
611
  config['workdir'] = _fill_in_env_vars(config['workdir'],
496
- config.get('envs', {}))
612
+ env_and_secrets)
613
+
614
+ if config.get('volumes') is not None:
615
+ config['volumes'] = _fill_in_env_vars(config['volumes'],
616
+ env_and_secrets)
497
617
 
498
618
  task = Task(
499
619
  config.pop('name', None),
@@ -502,8 +622,12 @@ class Task:
502
622
  setup=config.pop('setup', None),
503
623
  num_nodes=config.pop('num_nodes', None),
504
624
  envs=config.pop('envs', None),
625
+ secrets=config.pop('secrets', None),
626
+ volumes=config.pop('volumes', None),
505
627
  event_callback=config.pop('event_callback', None),
506
- file_mounts_mapping=config.pop('file_mounts_mapping', None),
628
+ _file_mounts_mapping=config.pop('file_mounts_mapping', None),
629
+ _metadata=config.pop('_metadata', None),
630
+ _user_specified_yaml=user_specified_yaml,
507
631
  )
508
632
 
509
633
  # Create lists to store storage objects inlined in file_mounts.
@@ -511,6 +635,7 @@ class Task:
511
635
  # storage objects with the storage/storage_mount objects.
512
636
  fm_storages = []
513
637
  file_mounts = config.pop('file_mounts', None)
638
+ volumes = []
514
639
  if file_mounts is not None:
515
640
  copy_mounts = {}
516
641
  for dst_path, src in file_mounts.items():
@@ -520,7 +645,27 @@ class Task:
520
645
  # If the src is not a str path, it is likely a dict. Try to
521
646
  # parse storage object.
522
647
  elif isinstance(src, dict):
523
- fm_storages.append((dst_path, src))
648
+ if (src.get('store') ==
649
+ storage_lib.StoreType.VOLUME.value.lower()):
650
+ # Build the volumes config for resources.
651
+ volume_config = {
652
+ 'path': dst_path,
653
+ }
654
+ if src.get('name'):
655
+ volume_config['name'] = src.get('name')
656
+ persistent = src.get('persistent', False)
657
+ volume_config['auto_delete'] = not persistent
658
+ volume_config_detail = src.get('config', {})
659
+ volume_config.update(volume_config_detail)
660
+ volumes.append(volume_config)
661
+ source_path = src.get('source')
662
+ if source_path:
663
+ # For volume, copy the source path to the
664
+ # data directory of the volume mount point.
665
+ copy_mounts[
666
+ f'{dst_path.rstrip("/")}/data'] = source_path
667
+ else:
668
+ fm_storages.append((dst_path, src))
524
669
  else:
525
670
  with ux_utils.print_exception_no_traceback():
526
671
  raise ValueError(f'Unable to parse file_mount '
@@ -598,12 +743,35 @@ class Task:
598
743
  'experimental.config_overrides')
599
744
  resources_config[
600
745
  '_cluster_config_overrides'] = cluster_config_override
601
- task.set_resources(sky.Resources.from_yaml_config(resources_config))
746
+ if volumes:
747
+ resources_config['volumes'] = volumes
748
+ task.set_resources(
749
+ resources_lib.Resources.from_yaml_config(resources_config))
602
750
 
603
751
  service = config.pop('service', None)
752
+ pool = config.pop('pool', None)
753
+ if service is not None and pool is not None:
754
+ with ux_utils.print_exception_no_traceback():
755
+ raise ValueError(
756
+ 'Cannot set both service and pool in the same task.')
757
+
604
758
  if service is not None:
605
759
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
606
- task.set_service(service)
760
+ task.set_service(service)
761
+ elif pool is not None:
762
+ pool['pool'] = True
763
+ pool = service_spec.SkyServiceSpec.from_yaml_config(pool)
764
+ task.set_service(pool)
765
+
766
+ volume_mounts = config.pop('volume_mounts', None)
767
+ if volume_mounts is not None:
768
+ task.volume_mounts = []
769
+ for vol in volume_mounts:
770
+ common_utils.validate_schema(vol,
771
+ schemas.get_volume_mount_schema(),
772
+ 'Invalid volume mount config: ')
773
+ volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
774
+ task.volume_mounts.append(volume_mount)
607
775
 
608
776
  assert not config, f'Invalid task args: {config.keys()}'
609
777
  return task
@@ -628,7 +796,8 @@ class Task:
628
796
  # TODO(zongheng): use
629
797
  # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
630
798
  # to raise errors on duplicate keys.
631
- config = yaml.safe_load(f)
799
+ user_specified_yaml = f.read()
800
+ config = yaml_utils.safe_load(user_specified_yaml)
632
801
 
633
802
  if isinstance(config, str):
634
803
  with ux_utils.print_exception_no_traceback():
@@ -637,8 +806,101 @@ class Task:
637
806
 
638
807
  if config is None:
639
808
  config = {}
809
+ config['_user_specified_yaml'] = user_specified_yaml
640
810
  return Task.from_yaml_config(config)
641
811
 
812
def resolve_and_validate_volumes(self) -> None:
    """Resolve volumes config to volume mounts and validate them.

    Each ``{mount_path: volume}`` entry of ``self._volumes`` is resolved
    with ``volume_lib.VolumeMount.resolve``. The resolved mounts are then
    checked against each other and against every entry of
    ``self.resources`` for cloud/region/zone conflicts. On success the
    volume topology is applied through ``self.set_resources_override`` and
    the mounts are stored in ``self.volume_mounts``.

    Does nothing when ``self.volume_mounts`` is already set or when
    ``self._volumes`` is empty.

    Raises:
        ValueError: if a volume entry is neither a str nor a dict, or if a
            volume uses an access mode that is disabled for multi-node
            tasks.
        exceptions.VolumeNotFoundError: if any volume is not found
            (not raised directly here; presumably by
            ``VolumeMount.resolve`` -- TODO confirm).
        exceptions.VolumeTopologyConflictError: if there is conflict in the
            volumes and compute topology.
    """
    # Volumes have been resolved, a typical case is that the API server
    # has resolved the volumes and the dag was then submitted to
    # controllers.
    if self.volume_mounts is not None:
        return None
    if not self._volumes:
        return None
    volume_mounts: List[volume_lib.VolumeMount] = []
    for dst_path, vol in self._volumes.items():
        self._validate_mount_path(dst_path, location='volumes')
        # Shortcut for `dst_path: volume_name`
        if isinstance(vol, str):
            volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
        elif isinstance(vol, dict):
            assert 'name' in vol, 'Volume name must be set.'
            volume_mount = volume_lib.VolumeMount.resolve(
                dst_path, vol['name'])
        else:
            raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
        volume_mounts.append(volume_mount)
    # Disable certain access modes
    disabled_modes = {}
    if self.num_nodes > 1:
        disabled_modes[
            volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
                'access mode ReadWriteOnce is not supported for '
                'multi-node tasks.')
        disabled_modes[
            volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
                'access mode ReadWriteOncePod is not supported for '
                'multi-node tasks.')
    # TODO(aylei): generalize access mode to all volume types
    # Record the required topology and the volume that requires it, e.g.
    # {'cloud': ('volume_name', 'aws')}
    topology: Dict[str, Tuple[str, Optional[str]]] = {
        'cloud': ('', None),
        'region': ('', None),
        'zone': ('', None),
    }
    for vol in volume_mounts:
        # Check access mode
        access_mode = vol.volume_config.config.get('access_mode', '')
        if access_mode in disabled_modes:
            raise ValueError(f'Volume {vol.volume_name} with '
                             f'{disabled_modes[access_mode]}')
        # Check topology. Iterate over a snapshot because entries are
        # reassigned inside the loop.
        for key, (vol_name, previous_req) in list(topology.items()):
            req = getattr(vol.volume_config, key)
            if req is not None:
                if previous_req is not None and req != previous_req:
                    raise exceptions.VolumeTopologyConflictError(
                        f'Volume {vol.volume_name} can only be attached on '
                        f'{key}:{req}, which conflicts with another volume '
                        f'{vol_name} that requires {key}:{previous_req}. '
                        'Please use different volumes and retry.')
                # Record the volume that imposes this requirement, so that a
                # later conflict message names it (rather than the previous
                # owner, which starts out as the empty string).
                topology[key] = (vol.volume_name, req)
    # Now we have the topology requirements from the intersection of all
    # volumes. Check if there is topology conflict with the resources.
    # Volume must have no conflict with ALL resources even if user
    # specifies 'any_of' resources to ensure no resources will conflict
    # with the volumes during failover.

    for res in self.resources:
        for key, (_, vol_req) in topology.items():
            req = getattr(res, key)
            if (req is not None and vol_req is not None and
                    str(req) != vol_req):
                raise exceptions.VolumeTopologyConflictError(
                    f'The task requires {key}:{req}, which conflicts with '
                    f'the volume constraint {key}:{vol_req}. Please '
                    f'use different volumes and retry.')
    # No topology conflict, we safely override the topology of resources to
    # satisfy the volume constraints.
    override_params = {}
    for key, (_, vol_req) in topology.items():
        if vol_req is not None:
            if key == 'cloud':
                override_params[key] = registry.CLOUD_REGISTRY.from_str(
                    vol_req)
            else:
                override_params[key] = vol_req
    self.set_resources_override(override_params)
    self.volume_mounts = volume_mounts
903
+
642
904
@property
def num_nodes(self) -> int:
    """Return the value stored in ``self._num_nodes``."""
    return self._num_nodes
@@ -653,10 +915,38 @@ class Task:
653
915
  f'num_nodes should be a positive int. Got: {num_nodes}')
654
916
  self._num_nodes = num_nodes
655
917
 
918
@property
def metadata(self) -> Dict[str, Any]:
    """Return the task metadata dict (the stored object, not a copy)."""
    return self._metadata
921
+
922
@property
def metadata_json(self) -> str:
    """Return the task metadata dict serialized as a JSON string."""
    serialized = json.dumps(self._metadata)
    return serialized
925
+
656
926
@property
def envs(self) -> Dict[str, str]:
    """Return the task's environment variable dict (not a copy)."""
    return self._envs
659
929
 
930
@property
def secrets(self) -> Dict[str, str]:
    """Return the task's secret variable dict (not a copy)."""
    return self._secrets
933
+
934
@property
def volumes(self) -> Dict[str, str]:
    """Return the task's volumes dict (the stored object, not a copy)."""
    return self._volumes
937
+
938
def set_volumes(self, volumes: Dict[str, str]) -> None:
    """Sets the volumes for this task.

    The given dict replaces any previously stored volumes; it is stored
    as-is, without copying.

    Args:
        volumes: a dict of ``{mount_path: volume_name}``.
    """
    self._volumes = volumes
945
+
946
def update_volumes(self, volumes: Dict[str, str]) -> None:
    """Updates the volumes for this task.

    Entries in ``volumes`` are merged into the stored dict in place;
    existing mount paths are overwritten.

    Args:
        volumes: a dict of volume entries to merge in.
    """
    self._volumes.update(volumes)
949
+
660
950
  def update_envs(
661
951
  self, envs: Union[None, List[Tuple[str, str]],
662
952
  Dict[str, str]]) -> 'Task':
@@ -697,17 +987,70 @@ class Task:
697
987
  # If the update_envs() is called after set_resources(), we need to
698
988
  # manually update docker login config in task resources, in case the
699
989
  # docker login envs are newly added.
700
- if _check_docker_login_config(self._envs):
990
+ if _check_docker_login_config(self._envs, self._secrets):
991
+ self.resources = _with_docker_login_config(self.resources,
992
+ self._envs,
993
+ self._secrets)
994
+ self.resources = _with_docker_username_for_runpod(
995
+ self.resources, self._envs, self._secrets)
996
+ return self
997
+
998
def update_secrets(
        self, secrets: Union[None, List[Tuple[str, str]],
                             Dict[str, str]]) -> 'Task':
    """Merge secret env vars into this task for use in setup/run commands.

    After merging, the docker login config and the RunPod docker username
    on ``self.resources`` are refreshed from the combined envs and secrets.

    Args:
        secrets: (optional) either a list of ``(secret_name, value)`` or a
            dict ``{secret_name: value}``. None is treated as empty.

    Returns:
        self: The current task, with secrets updated.

    Raises:
        ValueError: if a list contains duplicate keys, a key is not a str
            or not a valid env var name, or ``secrets`` is of an
            unsupported type.
    """
    incoming = {} if secrets is None else secrets
    if isinstance(incoming, (list, tuple)):
        unique_names = {pair[0] for pair in incoming}
        if len(unique_names) != len(incoming):
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Duplicate secret keys provided.')
        incoming = dict(incoming)
    if not isinstance(incoming, dict):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
                f'{incoming}')
    for name in incoming:
        if not isinstance(name, str):
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Secret keys must be strings.')
        if not common_utils.is_valid_env_var(name):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Invalid secret key: {name}')
    self._secrets.update(incoming)
    # Validate Docker login configuration if needed
    has_docker_login = _check_docker_login_config(self._envs, self._secrets)
    if has_docker_login:
        self.resources = _with_docker_login_config(self.resources,
                                                   self._envs,
                                                   self._secrets)
    self.resources = _with_docker_username_for_runpod(
        self.resources, self._envs, self._secrets)
    return self
706
1043
 
707
1044
@property
def use_spot(self) -> bool:
    """Return True if any entry of ``self.resources`` has ``use_spot`` set."""
    for candidate in self.resources:
        if candidate.use_spot:
            return True
    return False
710
1047
 
1048
@property
def envs_and_secrets(self) -> Dict[str, str]:
    """Return a new dict combining envs and secrets.

    Secrets take precedence when a key appears in both. Neither
    ``self.envs`` nor ``self.secrets`` is modified.
    """
    return {**self.envs, **self.secrets}
1053
+
711
1054
  def set_inputs(self, inputs: str,
712
1055
  estimated_size_gigabytes: float) -> 'Task':
713
1056
  # E.g., 's3://bucket', 'gs://bucket', or None.
@@ -749,7 +1092,7 @@ class Task:
749
1092
  def set_resources(
750
1093
  self, resources: Union['resources_lib.Resources',
751
1094
  List['resources_lib.Resources'],
752
- Set['resources_lib.Resources']]
1095
+ Set['resources_lib.Resources'], Dict[str, Any]]
753
1096
  ) -> 'Task':
754
1097
  """Sets the required resources to execute this task.
755
1098
 
@@ -763,19 +1106,22 @@ class Task:
763
1106
  Returns:
764
1107
  self: The current task, with resources set.
765
1108
  """
766
- if isinstance(resources, sky.Resources):
1109
+ if isinstance(resources, dict):
1110
+ resources = resources_lib.Resources.from_yaml_config(resources)
1111
+ elif isinstance(resources, resources_lib.Resources):
767
1112
  resources = {resources}
768
1113
  # TODO(woosuk): Check if the resources are None.
769
- self.resources = _with_docker_login_config(resources, self.envs)
1114
+ self.resources = _with_docker_login_config(resources, self.envs,
1115
+ self.secrets)
770
1116
  # Only have effect on RunPod.
771
1117
  self.resources = _with_docker_username_for_runpod(
772
- self.resources, self.envs)
1118
+ self.resources, self.envs, self.secrets)
773
1119
 
774
1120
  # Evaluate if the task requires FUSE and set the requires_fuse flag
775
1121
  for _, storage_obj in self.storage_mounts.items():
776
1122
  if storage_obj.mode in storage_lib.MOUNTABLE_STORAGE_MODES:
777
1123
  for r in self.resources:
778
- r.requires_fuse = True
1124
+ r.set_requires_fuse(True)
779
1125
  break
780
1126
 
781
1127
  return self
@@ -790,6 +1136,10 @@ class Task:
790
1136
  self.set_resources(type(self.resources)(new_resources_list))
791
1137
  return self
792
1138
 
1139
+ def get_resource_config(self) -> Dict[str, Any]:
1140
+ return _resources_to_config(self.resources,
1141
+ factor_out_common_fields=True)
1142
+
793
1143
  @property
794
1144
  def service(self) -> Optional[service_spec.SkyServiceSpec]:
795
1145
  return self._service
@@ -807,8 +1157,8 @@ class Task:
807
1157
  self._service = service
808
1158
  return self
809
1159
 
810
- def set_time_estimator(self, func: Callable[['sky.Resources'],
811
- int]) -> 'Task':
1160
+ def set_time_estimator(
1161
+ self, func: Callable[['resources_lib.Resources'], int]) -> 'Task':
812
1162
  """Sets a func mapping resources to estimated time (secs).
813
1163
 
814
1164
  This is EXPERIMENTAL.
@@ -864,7 +1214,7 @@ class Task:
864
1214
 
865
1215
  Different from set_file_mounts(), this function updates into the
866
1216
  existing file_mounts (calls ``dict.update()``), rather than
867
- overwritting it.
1217
+ overwriting it.
868
1218
 
869
1219
  This should be called before provisioning in order to take effect.
870
1220
 
@@ -931,7 +1281,7 @@ class Task:
931
1281
  self.storage_mounts = {}
932
1282
  # Clear the requires_fuse flag if no storage mounts are set.
933
1283
  for r in self.resources:
934
- r.requires_fuse = False
1284
+ r.set_requires_fuse(False)
935
1285
  return self
936
1286
  for target, storage_obj in storage_mounts.items():
937
1287
  # TODO(zhwu): /home/username/sky_workdir as the target path need
@@ -956,7 +1306,7 @@ class Task:
956
1306
  # If any storage is using MOUNT mode, we need to enable FUSE in
957
1307
  # the resources.
958
1308
  for r in self.resources:
959
- r.requires_fuse = True
1309
+ r.set_requires_fuse(True)
960
1310
  # Storage source validation is done in Storage object
961
1311
  self.storage_mounts = storage_mounts
962
1312
  return self
@@ -1170,6 +1520,16 @@ class Task:
1170
1520
  self.update_file_mounts({
1171
1521
  mnt_path: blob_path,
1172
1522
  })
1523
+ elif store_type is storage_lib.StoreType.COREWEAVE:
1524
+ if storage.source is not None and not isinstance(
1525
+ storage.source,
1526
+ list) and storage.source.startswith('cw://'):
1527
+ blob_path = storage.source
1528
+ else:
1529
+ blob_path = 'cw://' + storage.name
1530
+ self.update_file_mounts({
1531
+ mnt_path: blob_path,
1532
+ })
1173
1533
  else:
1174
1534
  with ux_utils.print_exception_no_traceback():
1175
1535
  raise ValueError(f'Storage Type {store_type} '
@@ -1219,11 +1579,83 @@ class Task:
1219
1579
  d[k] = v
1220
1580
  return d
1221
1581
 
1222
- def to_yaml_config(self) -> Dict[str, Any]:
1582
+ def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
1583
+ git_ref: Optional[str]) -> 'Task':
1584
+ """Updates the task workdir.
1585
+
1586
+ Args:
1587
+ workdir: The workdir to update.
1588
+ git_url: The git url to update.
1589
+ git_ref: The git ref to update.
1590
+ """
1591
+ if self.workdir is None or isinstance(self.workdir, str):
1592
+ if workdir is not None:
1593
+ self.workdir = workdir
1594
+ return self
1595
+ if git_url is not None:
1596
+ self.workdir = {}
1597
+ self.workdir['url'] = git_url
1598
+ if git_ref is not None:
1599
+ self.workdir['ref'] = git_ref
1600
+ return self
1601
+ return self
1602
+ if git_url is not None:
1603
+ self.workdir['url'] = git_url
1604
+ if git_ref is not None:
1605
+ self.workdir['ref'] = git_ref
1606
+ return self
1607
+
1608
+ def update_envs_and_secrets_from_workdir(self) -> 'Task':
1609
+ """Updates the task envs and secrets from the workdir."""
1610
+ if self.workdir is None:
1611
+ return self
1612
+ if not isinstance(self.workdir, dict):
1613
+ return self
1614
+ url = self.workdir['url']
1615
+ ref = self.workdir.get('ref', '')
1616
+ token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
1617
+ ssh_key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
1618
+ try:
1619
+ git_repo = git.GitRepo(url, ref, token, ssh_key_path)
1620
+ clone_info = git_repo.get_repo_clone_info()
1621
+ if clone_info is None:
1622
+ return self
1623
+ self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
1624
+ if ref:
1625
+ ref_type = git_repo.get_ref_type()
1626
+ if ref_type == git.GitRefType.COMMIT:
1627
+ self.envs[git.GIT_COMMIT_HASH_ENV_VAR] = ref
1628
+ elif ref_type == git.GitRefType.BRANCH:
1629
+ self.envs[git.GIT_BRANCH_ENV_VAR] = ref
1630
+ elif ref_type == git.GitRefType.TAG:
1631
+ self.envs[git.GIT_TAG_ENV_VAR] = ref
1632
+ if clone_info.token is None and clone_info.ssh_key is None:
1633
+ return self
1634
+ if clone_info.token is not None:
1635
+ self.secrets[git.GIT_TOKEN_ENV_VAR] = clone_info.token
1636
+ if clone_info.ssh_key is not None:
1637
+ self.secrets[git.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
1638
+ except exceptions.GitError as e:
1639
+ with ux_utils.print_exception_no_traceback():
1640
+ raise ValueError(f'{str(e)}') from None
1641
+ return self
1642
+
1643
+ def to_yaml_config(self,
1644
+ use_user_specified_yaml: bool = False) -> Dict[str, Any]:
1223
1645
  """Returns a yaml-style dict representation of the task.
1224
1646
 
1225
1647
  INTERNAL: this method is internal-facing.
1226
1648
  """
1649
+ if use_user_specified_yaml:
1650
+ if self._user_specified_yaml is None:
1651
+ return self._to_yaml_config(redact_secrets=True)
1652
+ config = yaml_utils.safe_load(self._user_specified_yaml)
1653
+ if config.get('secrets') is not None:
1654
+ config['secrets'] = {k: '<redacted>' for k in config['secrets']}
1655
+ return config
1656
+ return self._to_yaml_config()
1657
+
1658
+ def _to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
1227
1659
  config = {}
1228
1660
 
1229
1661
  def add_if_not_none(key, value, no_empty: bool = False):
@@ -1234,15 +1666,7 @@ class Task:
1234
1666
 
1235
1667
  add_if_not_none('name', self.name)
1236
1668
 
1237
- tmp_resource_config = {}
1238
- if len(self.resources) > 1:
1239
- resource_list = []
1240
- for r in self.resources:
1241
- resource_list.append(r.to_yaml_config())
1242
- key = 'ordered' if isinstance(self.resources, list) else 'any_of'
1243
- tmp_resource_config[key] = resource_list
1244
- else:
1245
- tmp_resource_config = list(self.resources)[0].to_yaml_config()
1669
+ tmp_resource_config = _resources_to_config(self.resources)
1246
1670
 
1247
1671
  add_if_not_none('resources', tmp_resource_config)
1248
1672
 
@@ -1263,8 +1687,15 @@ class Task:
1263
1687
  add_if_not_none('workdir', self.workdir)
1264
1688
  add_if_not_none('event_callback', self.event_callback)
1265
1689
  add_if_not_none('run', self.run)
1690
+
1691
+ # Add envs without redaction
1266
1692
  add_if_not_none('envs', self.envs, no_empty=True)
1267
1693
 
1694
+ secrets = self.secrets
1695
+ if secrets and redact_secrets:
1696
+ secrets = {k: '<redacted>' for k in secrets}
1697
+ add_if_not_none('secrets', secrets, no_empty=True)
1698
+
1268
1699
  add_if_not_none('file_mounts', {})
1269
1700
 
1270
1701
  if self.file_mounts is not None:
@@ -1277,6 +1708,15 @@ class Task:
1277
1708
  })
1278
1709
 
1279
1710
  add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
1711
+ add_if_not_none('volumes', self.volumes)
1712
+ if self.volume_mounts is not None:
1713
+ config['volume_mounts'] = [
1714
+ volume_mount.to_yaml_config()
1715
+ for volume_mount in self.volume_mounts
1716
+ ]
1717
+ # we manually check if its empty to not clog up the generated yaml
1718
+ add_if_not_none('_metadata', self._metadata if self._metadata else None)
1719
+ add_if_not_none('_user_specified_yaml', self._user_specified_yaml)
1280
1720
  return config
1281
1721
 
1282
1722
  def get_required_cloud_features(
@@ -1304,7 +1744,7 @@ class Task:
1304
1744
  return required_features
1305
1745
 
1306
1746
  def __rshift__(self, b):
1307
- sky.dag.get_current_dag().add_edge(self, b)
1747
+ dag_lib.get_current_dag().add_edge(self, b)
1308
1748
 
1309
1749
  def __repr__(self):
1310
1750
  if isinstance(self.run, str):
@@ -1339,3 +1779,47 @@ class Task:
1339
1779
  else:
1340
1780
  s += '\n resources: default instances'
1341
1781
  return s
1782
+
1783
+
1784
+ def _resources_to_config(
1785
+ resources: Union[List['resources_lib.Resources'],
1786
+ Set['resources_lib.Resources']],
1787
+ factor_out_common_fields: bool = False) -> Dict[str, Any]:
1788
+ if len(resources) > 1:
1789
+ resource_list: List[Dict[str, Union[str, int]]] = []
1790
+ for r in resources:
1791
+ resource_list.append(r.to_yaml_config())
1792
+ group_key = 'ordered' if isinstance(resources, list) else 'any_of'
1793
+ if factor_out_common_fields:
1794
+ return _factor_out_common_resource_fields(resource_list, group_key)
1795
+ return {group_key: resource_list}
1796
+ else:
1797
+ return list(resources)[0].to_yaml_config()
1798
+
1799
+
1800
+ def _factor_out_common_resource_fields(configs: List[Dict[str, Union[str,
1801
+ int]]],
1802
+ group_key: str) -> Dict[str, Any]:
1803
+ """Factors out the fields that are common to all resources."""
1804
+ return_config: Dict[str, Any] = configs[0].copy()
1805
+ if len(configs) > 1:
1806
+ for config in configs[1:]:
1807
+ for key, value in config.items():
1808
+ if key in return_config and return_config[key] != value:
1809
+ del return_config[key]
1810
+ num_empty_configs = 0
1811
+ for config in configs:
1812
+ keys_to_delete = []
1813
+ for key, value in config.items():
1814
+ if key in return_config:
1815
+ keys_to_delete.append(key)
1816
+ for key in keys_to_delete:
1817
+ del config[key]
1818
+ if not config:
1819
+ num_empty_configs += 1
1820
+
1821
+ if num_empty_configs == len(configs):
1822
+ return return_config
1823
+ if len(configs) > 0:
1824
+ return_config[group_key] = configs
1825
+ return return_config