zenml-nightly 0.82.1.dev20250527__py3-none-any.whl → 0.83.0.dev20250529__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. zenml/VERSION +1 -1
  2. zenml/cli/base.py +6 -1
  3. zenml/cli/model.py +16 -36
  4. zenml/cli/server.py +8 -3
  5. zenml/client.py +20 -4
  6. zenml/client_lazy_loader.py +2 -0
  7. zenml/config/docker_settings.py +15 -2
  8. zenml/enums.py +3 -0
  9. zenml/event_hub/event_hub.py +1 -1
  10. zenml/integrations/bitbucket/plugins/event_sources/bitbucket_webhook_event_source.py +1 -1
  11. zenml/integrations/github/plugins/event_sources/github_webhook_event_source.py +1 -1
  12. zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +3 -0
  13. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +37 -26
  14. zenml/integrations/skypilot/flavors/skypilot_orchestrator_base_vm_config.py +45 -4
  15. zenml/integrations/skypilot/orchestrators/skypilot_base_vm_orchestrator.py +92 -84
  16. zenml/integrations/skypilot/orchestrators/skypilot_orchestrator_entrypoint.py +207 -179
  17. zenml/integrations/skypilot/utils.py +273 -0
  18. zenml/integrations/skypilot_aws/__init__.py +1 -2
  19. zenml/integrations/skypilot_azure/__init__.py +1 -2
  20. zenml/integrations/skypilot_gcp/__init__.py +9 -1
  21. zenml/integrations/skypilot_kubernetes/__init__.py +2 -3
  22. zenml/integrations/skypilot_lambda/__init__.py +1 -2
  23. zenml/login/web_login.py +1 -1
  24. zenml/model/model.py +13 -23
  25. zenml/models/__init__.py +39 -2
  26. zenml/models/v2/base/scoped.py +34 -15
  27. zenml/models/v2/core/artifact.py +10 -9
  28. zenml/models/v2/core/artifact_version.py +16 -16
  29. zenml/models/v2/core/logs.py +8 -50
  30. zenml/models/v2/core/model.py +10 -10
  31. zenml/models/v2/core/model_version.py +155 -88
  32. zenml/models/v2/core/pipeline.py +10 -11
  33. zenml/models/v2/core/pipeline_deployment.py +1 -9
  34. zenml/models/v2/core/pipeline_run.py +10 -17
  35. zenml/models/v2/core/run_template.py +10 -10
  36. zenml/models/v2/core/step_run.py +100 -16
  37. zenml/models/v2/core/tag.py +5 -4
  38. zenml/models/v2/misc/pipeline_run_dag.py +46 -0
  39. zenml/orchestrators/base_orchestrator.py +8 -19
  40. zenml/orchestrators/cache_utils.py +48 -1
  41. zenml/orchestrators/input_utils.py +35 -39
  42. zenml/orchestrators/step_launcher.py +1 -1
  43. zenml/orchestrators/step_run_utils.py +26 -10
  44. zenml/pipelines/pipeline_definition.py +3 -3
  45. zenml/pipelines/run_utils.py +2 -3
  46. zenml/service_connectors/service_connector.py +5 -1
  47. zenml/stack/stack_component.py +1 -4
  48. zenml/steps/step_context.py +1 -1
  49. zenml/utils/dashboard_utils.py +3 -3
  50. zenml/zen_server/auth.py +6 -3
  51. zenml/zen_server/dashboard/assets/{404-_AtuLtaX.js → 404-DmJUgorp.js} +1 -1
  52. zenml/zen_server/dashboard/assets/{@radix-C7hRs6Kx.js → @radix-Cdvw4jJ8.js} +1 -1
  53. zenml/zen_server/dashboard/assets/{@react-router-CNP6g_RL.js → @react-router-DeDfXbUF.js} +5 -5
  54. zenml/zen_server/dashboard/assets/{@reactflow-CQi1Z1Wq.js → @reactflow-8OCk19Fi.js} +1 -1
  55. zenml/zen_server/dashboard/assets/{@tanstack-CSxjHCME.js → @tanstack-5gTMR7G2.js} +4 -4
  56. zenml/zen_server/dashboard/assets/AlertDialogDropdownItem-CZW4QyWn.js +1 -0
  57. zenml/zen_server/dashboard/assets/ButtonGroup-DFWWFGUE.js +1 -0
  58. zenml/zen_server/dashboard/assets/{CodeSnippet-CvI6D0wx.js → CodeSnippet-D2HkkAGr.js} +1 -1
  59. zenml/zen_server/dashboard/assets/CollapsibleCard-CnS09ljw.js +1 -0
  60. zenml/zen_server/dashboard/assets/{ComponentBadge-DKw7Gndh.js → ComponentBadge-CDgdd0Ks.js} +1 -1
  61. zenml/zen_server/dashboard/assets/ComponentIcon-CbbOc7lb.js +1 -0
  62. zenml/zen_server/dashboard/assets/{DeleteAlertDialog-DVvXt-S6.js → DeleteAlertDialog-VIOMDLmx.js} +1 -1
  63. zenml/zen_server/dashboard/assets/DialogItem-ClFCqxEp.js +1 -0
  64. zenml/zen_server/dashboard/assets/{DisplayDate-CYVBBSgr.js → DisplayDate-8RESqe5H.js} +1 -1
  65. zenml/zen_server/dashboard/assets/{EmptyState-M1jafpg6.js → EmptyState-CjrgDtVk.js} +1 -1
  66. zenml/zen_server/dashboard/assets/{Error-BWaXP0VK.js → Error-CQzjbDcN.js} +1 -1
  67. zenml/zen_server/dashboard/assets/ExecutionStatus-CWreILP0.js +1 -0
  68. zenml/zen_server/dashboard/assets/{Helpbox-iE1xLmiZ.js → Helpbox-CiKxG5_X.js} +1 -1
  69. zenml/zen_server/dashboard/assets/Infobox-CGxFvqzi.js +1 -0
  70. zenml/zen_server/dashboard/assets/LeftSideMenu-DCsKdIjC.js +1 -0
  71. zenml/zen_server/dashboard/assets/{Lock-DW-0_M0o.js → Lock-CrIAdQo6.js} +1 -1
  72. zenml/zen_server/dashboard/assets/NestedCollapsible-3M4llYtH.js +1 -0
  73. zenml/zen_server/dashboard/assets/NumberBox-C0mQktmV.js +1 -0
  74. zenml/zen_server/dashboard/assets/Partials-DSjkttlz.js +1 -0
  75. zenml/zen_server/dashboard/assets/{PasswordChecker-lYTOtNom.js → PasswordChecker-B88WjuCe.js} +1 -1
  76. zenml/zen_server/dashboard/assets/ProCta-Dm5cWKpS.js +1 -0
  77. zenml/zen_server/dashboard/assets/{ProviderIcon-DLo7t1lo.js → ProviderIcon-DPwMR6nF.js} +1 -1
  78. zenml/zen_server/dashboard/assets/ProviderRadio-DEDNRgAb.js +1 -0
  79. zenml/zen_server/dashboard/assets/RunsBody-BRBn1e2O.js +1 -0
  80. zenml/zen_server/dashboard/assets/SearchField-DY6-UbRT.js +1 -0
  81. zenml/zen_server/dashboard/assets/SecretTooltip-CZTRnaCV.js +1 -0
  82. zenml/zen_server/dashboard/assets/{SetPassword-DR-EiLI5.js → SetPassword-BjNGDC5e.js} +1 -1
  83. zenml/zen_server/dashboard/assets/SheetHeader-CASpN2Lz.js +1 -0
  84. zenml/zen_server/dashboard/assets/StackComponentList-Be1pQt9m.js +1 -0
  85. zenml/zen_server/dashboard/assets/StackList-BdiR5DvR.js +1 -0
  86. zenml/zen_server/dashboard/assets/StackName-ojLC6xdl.js +1 -0
  87. zenml/zen_server/dashboard/assets/Tabs-DNSKblCM.js +1 -0
  88. zenml/zen_server/dashboard/assets/Tick-BPrWnNlN.js +1 -0
  89. zenml/zen_server/dashboard/assets/{UpdatePasswordSchemas-DbFEaezI.js → UpdatePasswordSchemas-CNfKDo2Q.js} +1 -1
  90. zenml/zen_server/dashboard/assets/UsageReason-Cb-mpV8M.js +1 -0
  91. zenml/zen_server/dashboard/assets/{Wizard-CMI6Ksgz.js → Wizard-Dg8Pmn5A.js} +1 -1
  92. zenml/zen_server/dashboard/assets/WizardFooter-BcNDIvlQ.js +1 -0
  93. zenml/zen_server/dashboard/assets/{all-pipeline-runs-query-BGASHYtF.js → all-pipeline-runs-query-DCdax7I5.js} +1 -1
  94. zenml/zen_server/dashboard/assets/{arrow-left-CwgF2MEM.js → arrow-left-MRXv5pAH.js} +1 -1
  95. zenml/zen_server/dashboard/assets/bulk-delete-C_kpIB9A.js +3 -0
  96. zenml/zen_server/dashboard/assets/{check-DK77doTf.js → check-B9QMTa3f.js} +1 -1
  97. zenml/zen_server/dashboard/assets/{check-circle-mvyzYvIW.js → check-circle-C4tYvbtw.js} +1 -1
  98. zenml/zen_server/dashboard/assets/{chevron-down-A-rmltmI.js → chevron-down-jbbQh82s.js} +1 -1
  99. zenml/zen_server/dashboard/assets/{chevron-right-double-uNWbJT-C.js → chevron-right-double-Dgp_gEsp.js} +1 -1
  100. zenml/zen_server/dashboard/assets/{clock-CPA5cYxq.js → clock-B_mTG8PH.js} +1 -1
  101. zenml/zen_server/dashboard/assets/{code-browser-j2EpcxIA.js → code-browser-CiD8qkBx.js} +1 -1
  102. zenml/zen_server/dashboard/assets/configuration-form-B2hmKGnF.js +1 -0
  103. zenml/zen_server/dashboard/assets/connectivity-4UKGMYnr.webp +0 -0
  104. zenml/zen_server/dashboard/assets/constants-1EZZxtay.js +1 -0
  105. zenml/zen_server/dashboard/assets/create-stack-TKmMtrkQ.js +1 -0
  106. zenml/zen_server/dashboard/assets/dates-Buh6SMo7.js +1 -0
  107. zenml/zen_server/dashboard/assets/delete-run-CCR9md_s.js +1 -0
  108. zenml/zen_server/dashboard/assets/eye-CbVlAYty.js +1 -0
  109. zenml/zen_server/dashboard/assets/{file-text-BdxZdjP_.js → file-text-Cd8wVfq5.js} +1 -1
  110. zenml/zen_server/dashboard/assets/form-DFJkaFDX.js +1 -0
  111. zenml/zen_server/dashboard/assets/form-schemas-CrznJVzA.js +1 -0
  112. zenml/zen_server/dashboard/assets/{gcp-CHNvgEss.js → gcp-B1I3Qvcx.js} +1 -1
  113. zenml/zen_server/dashboard/assets/{help-DyMolRxD.js → help-Co6aedki.js} +1 -1
  114. zenml/zen_server/dashboard/assets/index-BFqbGSck.js +308 -0
  115. zenml/zen_server/dashboard/assets/{index-CrhdX_qG.js → index-BjUu1mP4.js} +1 -1
  116. zenml/zen_server/dashboard/assets/{index-DR30v9MZ.js → index-DWpiv-Ft.js} +1 -1
  117. zenml/zen_server/dashboard/assets/index-DuhuqTCI.css +1 -0
  118. zenml/zen_server/dashboard/assets/index-U992soPJ.js +1 -0
  119. zenml/zen_server/dashboard/assets/index.es-C1gfATPn.js +14 -0
  120. zenml/zen_server/dashboard/assets/{index.esm-D7jFlf5N.js → index.esm-DhJo3mA6.js} +1 -1
  121. zenml/zen_server/dashboard/assets/info-QkbQz4QU.js +1 -0
  122. zenml/zen_server/dashboard/assets/{key-icon-DO4DPJHZ.js → key-icon-C07HKw8z.js} +1 -1
  123. zenml/zen_server/dashboard/assets/{layout-h3cbx8WZ.js → layout-DBbfEFBe.js} +1 -1
  124. zenml/zen_server/dashboard/assets/layout-Do9YI4QX.js +1 -0
  125. zenml/zen_server/dashboard/assets/login-mutation-D3tFP6Wm.js +1 -0
  126. zenml/zen_server/dashboard/assets/{logs-B5n0U7tB.js → logs-CQKlJjo0.js} +1 -1
  127. zenml/zen_server/dashboard/assets/{package-D1Mhqeh8.js → package-miExReQl.js} +1 -1
  128. zenml/zen_server/dashboard/assets/page-9RjCitFH.js +1 -0
  129. zenml/zen_server/dashboard/assets/page-B0PsXWiT.js +1 -0
  130. zenml/zen_server/dashboard/assets/page-BCrKmYIZ.js +1 -0
  131. zenml/zen_server/dashboard/assets/page-BcRI3-aR.js +29 -0
  132. zenml/zen_server/dashboard/assets/page-Be3R2uYn.js +1 -0
  133. zenml/zen_server/dashboard/assets/page-BgknnddT.js +1 -0
  134. zenml/zen_server/dashboard/assets/page-BrT0_zSJ.js +40 -0
  135. zenml/zen_server/dashboard/assets/page-Bs3W2FDi.js +1 -0
  136. zenml/zen_server/dashboard/assets/page-C210HcBA.js +1 -0
  137. zenml/zen_server/dashboard/assets/page-C6KaiZ_W.js +1 -0
  138. zenml/zen_server/dashboard/assets/page-CAJ8B0vb.js +1 -0
  139. zenml/zen_server/dashboard/assets/page-CAUYrfui.js +1 -0
  140. zenml/zen_server/dashboard/assets/page-CHxVhF3x.js +1 -0
  141. zenml/zen_server/dashboard/assets/{page-CSwZxZMQ.js → page-CN7lkvXr.js} +1 -1
  142. zenml/zen_server/dashboard/assets/page-CUaMMoPG.js +1 -0
  143. zenml/zen_server/dashboard/assets/page-Cal6XQ4U.js +1 -0
  144. zenml/zen_server/dashboard/assets/page-CdZCmszX.js +1 -0
  145. zenml/zen_server/dashboard/assets/page-ChGcZI_6.js +1 -0
  146. zenml/zen_server/dashboard/assets/page-CktmtZ8Z.js +1 -0
  147. zenml/zen_server/dashboard/assets/page-ClvmVesa.js +1 -0
  148. zenml/zen_server/dashboard/assets/page-CnbIYE80.js +1 -0
  149. zenml/zen_server/dashboard/assets/page-CoXzjeEY.js +1 -0
  150. zenml/zen_server/dashboard/assets/page-CtiuMP_r.js +1 -0
  151. zenml/zen_server/dashboard/assets/page-D9Hfx6GV.js +1 -0
  152. zenml/zen_server/dashboard/assets/page-D9iuB88h.js +1 -0
  153. zenml/zen_server/dashboard/assets/page-DCcuPZ8P.js +1 -0
  154. zenml/zen_server/dashboard/assets/page-DEohTSz6.js +1 -0
  155. zenml/zen_server/dashboard/assets/page-DJIGaUQ9.js +1 -0
  156. zenml/zen_server/dashboard/assets/page-DKK6ulgy.js +1 -0
  157. zenml/zen_server/dashboard/assets/page-DNjKHjnH.js +1 -0
  158. zenml/zen_server/dashboard/assets/page-DUK0Nd_1.js +1 -0
  159. zenml/zen_server/dashboard/assets/page-DUKbOhaD.js +1 -0
  160. zenml/zen_server/dashboard/assets/page-DYOucPtA.js +1 -0
  161. zenml/zen_server/dashboard/assets/page-DpqRelAy.js +1 -0
  162. zenml/zen_server/dashboard/assets/{page-ZfTtFicG.js → page-DwVPpCFg.js} +2 -2
  163. zenml/zen_server/dashboard/assets/page-XURWnYZP.js +1 -0
  164. zenml/zen_server/dashboard/assets/page-abw-2oeW.js +1 -0
  165. zenml/zen_server/dashboard/assets/page-akLcPcKw.js +1 -0
  166. zenml/zen_server/dashboard/assets/page-n9ejQ2V3.js +2 -0
  167. zenml/zen_server/dashboard/assets/page-sJjNT9xA.js +6 -0
  168. zenml/zen_server/dashboard/assets/{persist-UUym702q.js → persist-DWMWVP-y.js} +1 -1
  169. zenml/zen_server/dashboard/assets/{persist-D87V82eO.js → persist-Dec_w7aB.js} +1 -1
  170. zenml/zen_server/dashboard/assets/pipeline-CSUlkd50.js +1 -0
  171. zenml/zen_server/dashboard/assets/{plus-COjQg3AG.js → plus-Cl0_rCVF.js} +1 -1
  172. zenml/zen_server/dashboard/assets/{react-error-boundary.esm-fyoUBS25.js → react-error-boundary.esm-7_MuhCay.js} +1 -1
  173. zenml/zen_server/dashboard/assets/{refresh-CM5T3QeU.js → refresh-BcTM09NW.js} +1 -1
  174. zenml/zen_server/dashboard/assets/resource-tyes-list-o2LXiMay.js +1 -0
  175. zenml/zen_server/dashboard/assets/resource-type-tooltip-DwHrJstL.js +1 -0
  176. zenml/zen_server/dashboard/assets/service-connectors-DSEMwJ5A.js +1 -0
  177. zenml/zen_server/dashboard/assets/{service-BQ9KIhls.js → service-jxtvgks0.js} +2 -2
  178. zenml/zen_server/dashboard/assets/sharedSchema-BXzg0EZz.js +1 -0
  179. zenml/zen_server/dashboard/assets/stack-detail-query-Cm0fsgo-.js +1 -0
  180. zenml/zen_server/dashboard/assets/{terminal-square-DMtel8mb.js → terminal-XFL_4QN-.js} +1 -1
  181. zenml/zen_server/dashboard/assets/terminal-square-XFL_4QN-.js +1 -0
  182. zenml/zen_server/dashboard/assets/transform-CeZdrxDZ.js +1 -0
  183. zenml/zen_server/dashboard/assets/{trash-BWSZ7NRK.js → trash-DP6Tpp_E.js} +1 -1
  184. zenml/zen_server/dashboard/assets/type-guards-CNgPYg8l.js +1 -0
  185. zenml/zen_server/dashboard/assets/update-current-user-mutation-D5MjcQ6F.js +1 -0
  186. zenml/zen_server/dashboard/assets/update-server-settings-mutation-CmnxdxiK.js +1 -0
  187. zenml/zen_server/dashboard/assets/{zod-C0xYeTvL.js → zod-XdS2h1ws.js} +1 -1
  188. zenml/zen_server/dashboard/index.html +7 -7
  189. zenml/zen_server/rbac/utils.py +2 -2
  190. zenml/zen_server/routers/auth_endpoints.py +2 -2
  191. zenml/zen_server/routers/devices_endpoints.py +8 -5
  192. zenml/zen_server/routers/pipeline_deployments_endpoints.py +1 -1
  193. zenml/zen_server/routers/pipelines_endpoints.py +1 -1
  194. zenml/zen_server/routers/run_templates_endpoints.py +3 -3
  195. zenml/zen_server/routers/runs_endpoints.py +35 -0
  196. zenml/zen_server/template_execution/utils.py +6 -6
  197. zenml/zen_stores/dag_generator.py +171 -0
  198. zenml/zen_stores/migrations/versions/0.83.0_release.py +23 -0
  199. zenml/zen_stores/rest_zen_store.py +17 -3
  200. zenml/zen_stores/schemas/action_schemas.py +40 -4
  201. zenml/zen_stores/schemas/api_key_schemas.py +29 -1
  202. zenml/zen_stores/schemas/artifact_schemas.py +168 -48
  203. zenml/zen_stores/schemas/base_schemas.py +26 -1
  204. zenml/zen_stores/schemas/code_repository_schemas.py +46 -5
  205. zenml/zen_stores/schemas/component_schemas.py +44 -3
  206. zenml/zen_stores/schemas/device_schemas.py +43 -2
  207. zenml/zen_stores/schemas/event_source_schemas.py +41 -5
  208. zenml/zen_stores/schemas/flavor_schemas.py +42 -2
  209. zenml/zen_stores/schemas/model_schemas.py +113 -77
  210. zenml/zen_stores/schemas/pipeline_build_schemas.py +53 -4
  211. zenml/zen_stores/schemas/pipeline_deployment_schemas.py +53 -4
  212. zenml/zen_stores/schemas/pipeline_run_schemas.py +111 -47
  213. zenml/zen_stores/schemas/pipeline_schemas.py +41 -9
  214. zenml/zen_stores/schemas/run_template_schemas.py +75 -11
  215. zenml/zen_stores/schemas/schedule_schema.py +50 -5
  216. zenml/zen_stores/schemas/secret_schemas.py +39 -2
  217. zenml/zen_stores/schemas/service_connector_schemas.py +39 -2
  218. zenml/zen_stores/schemas/service_schemas.py +39 -4
  219. zenml/zen_stores/schemas/stack_schemas.py +47 -2
  220. zenml/zen_stores/schemas/step_run_schemas.py +89 -26
  221. zenml/zen_stores/schemas/tag_schemas.py +69 -5
  222. zenml/zen_stores/schemas/trigger_schemas.py +44 -5
  223. zenml/zen_stores/schemas/utils.py +25 -4
  224. zenml/zen_stores/sql_zen_store.py +471 -28
  225. zenml/zen_stores/zen_store_interface.py +9 -1
  226. {zenml_nightly-0.82.1.dev20250527.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/METADATA +2 -2
  227. {zenml_nightly-0.82.1.dev20250527.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/RECORD +230 -222
  228. zenml/zen_server/dashboard/assets/AlertDialogDropdownItem-BG7-Ki1L.js +0 -1
  229. zenml/zen_server/dashboard/assets/CollapsibleCard-D20FtrzC.js +0 -1
  230. zenml/zen_server/dashboard/assets/Commands-DGbAvMDk.js +0 -1
  231. zenml/zen_server/dashboard/assets/ComponentIcon-ils7uNAk.js +0 -1
  232. zenml/zen_server/dashboard/assets/CsvVizualization-DVN541XF.js +0 -15
  233. zenml/zen_server/dashboard/assets/DialogItem-BHWf3sIB.js +0 -1
  234. zenml/zen_server/dashboard/assets/ExecutionStatus-XrvT2r65.js +0 -1
  235. zenml/zen_server/dashboard/assets/Infobox-D9k5TFH4.js +0 -1
  236. zenml/zen_server/dashboard/assets/InlineAvatar-Cfz4WSLK.js +0 -1
  237. zenml/zen_server/dashboard/assets/MarkdownVisualization-URCyUPcZ.js +0 -14
  238. zenml/zen_server/dashboard/assets/NestedCollapsible-Dor-bi98.js +0 -1
  239. zenml/zen_server/dashboard/assets/NumberBox-D2A7ENHb.js +0 -1
  240. zenml/zen_server/dashboard/assets/Partials-DQJFw1yW.js +0 -1
  241. zenml/zen_server/dashboard/assets/ProBadge-Cp4hb1YT.js +0 -1
  242. zenml/zen_server/dashboard/assets/ProCta-EYoV9CvK.js +0 -1
  243. zenml/zen_server/dashboard/assets/ProviderRadio-BVDA-fAr.js +0 -1
  244. zenml/zen_server/dashboard/assets/RunSelector-BLuBYNJt.js +0 -1
  245. zenml/zen_server/dashboard/assets/RunsBody-W4WHf-sq.js +0 -1
  246. zenml/zen_server/dashboard/assets/SearchField-D-h6jXyg.js +0 -1
  247. zenml/zen_server/dashboard/assets/SecretTooltip-CePCL8kd.js +0 -1
  248. zenml/zen_server/dashboard/assets/StackList-CgmN5H-i.js +0 -1
  249. zenml/zen_server/dashboard/assets/Tabs-DxQ8PDOD.js +0 -1
  250. zenml/zen_server/dashboard/assets/Tick-CEsT3HPR.js +0 -1
  251. zenml/zen_server/dashboard/assets/UsageReason-DjI5qMje.js +0 -1
  252. zenml/zen_server/dashboard/assets/WizardFooter-CFBHFZas.js +0 -1
  253. zenml/zen_server/dashboard/assets/cloud-squares-DeRLMopf.svg +0 -43
  254. zenml/zen_server/dashboard/assets/configuration-form-BtI2Y4eX.js +0 -1
  255. zenml/zen_server/dashboard/assets/connectors-video-C9qY4syJ.svg +0 -21
  256. zenml/zen_server/dashboard/assets/constants-DP3ZEnXH.js +0 -1
  257. zenml/zen_server/dashboard/assets/create-stack-BJ6x5rzj.js +0 -1
  258. zenml/zen_server/dashboard/assets/dates-3pMLCNrD.js +0 -1
  259. zenml/zen_server/dashboard/assets/delete-run-DlSLEl5T.js +0 -1
  260. zenml/zen_server/dashboard/assets/docker-BuDBFEDL.js +0 -1
  261. zenml/zen_server/dashboard/assets/dots-horizontal-BGRJCPCs.js +0 -1
  262. zenml/zen_server/dashboard/assets/flavor-select-BnPxvQDN.js +0 -1
  263. zenml/zen_server/dashboard/assets/form-schemas-CbvoEUHr.js +0 -1
  264. zenml/zen_server/dashboard/assets/index-CFESYpe4.js +0 -1
  265. zenml/zen_server/dashboard/assets/index-CmLcvK2z.js +0 -1
  266. zenml/zen_server/dashboard/assets/index-CzX3ZYlI.css +0 -1
  267. zenml/zen_server/dashboard/assets/index-D2iSHVZq.js +0 -64
  268. zenml/zen_server/dashboard/assets/kubernetes-D6OUjwSK.js +0 -1
  269. zenml/zen_server/dashboard/assets/link-external-DUhCSKNm.js +0 -1
  270. zenml/zen_server/dashboard/assets/login-command-CkqxPtV3.js +0 -1
  271. zenml/zen_server/dashboard/assets/login-mutation-CXc-Klim.js +0 -1
  272. zenml/zen_server/dashboard/assets/not-found-olRU3fnu.js +0 -1
  273. zenml/zen_server/dashboard/assets/page-7keIM1V3.js +0 -1
  274. zenml/zen_server/dashboard/assets/page-B31neFwG.js +0 -1
  275. zenml/zen_server/dashboard/assets/page-B3zo4KYS.js +0 -1
  276. zenml/zen_server/dashboard/assets/page-BN3MHq1a.js +0 -1
  277. zenml/zen_server/dashboard/assets/page-BNgVExjN.js +0 -1
  278. zenml/zen_server/dashboard/assets/page-BPtvu74G.js +0 -1
  279. zenml/zen_server/dashboard/assets/page-BTIuG0ki.js +0 -2
  280. zenml/zen_server/dashboard/assets/page-BcQzleH6.js +0 -1
  281. zenml/zen_server/dashboard/assets/page-C05Jw4M2.js +0 -1
  282. zenml/zen_server/dashboard/assets/page-C28a7K8h.js +0 -1
  283. zenml/zen_server/dashboard/assets/page-C9WLk0X-.js +0 -1
  284. zenml/zen_server/dashboard/assets/page-CINMx64X.js +0 -1
  285. zenml/zen_server/dashboard/assets/page-CYrJbk7P.js +0 -1
  286. zenml/zen_server/dashboard/assets/page-Ce0cqLo3.js +0 -1
  287. zenml/zen_server/dashboard/assets/page-CgNsEkw-.js +0 -1
  288. zenml/zen_server/dashboard/assets/page-Ct2FUYuR.js +0 -1
  289. zenml/zen_server/dashboard/assets/page-D8G2B3Bu.js +0 -1
  290. zenml/zen_server/dashboard/assets/page-DL8a4_lg.js +0 -3
  291. zenml/zen_server/dashboard/assets/page-DMhYn1cF.js +0 -1
  292. zenml/zen_server/dashboard/assets/page-Dd_Yq-Uf.js +0 -6
  293. zenml/zen_server/dashboard/assets/page-DfSvqT8g.js +0 -1
  294. zenml/zen_server/dashboard/assets/page-Dt6ANUTx.js +0 -1
  295. zenml/zen_server/dashboard/assets/page-DtvTleaT.js +0 -1
  296. zenml/zen_server/dashboard/assets/page-DwfGTiVs.js +0 -1
  297. zenml/zen_server/dashboard/assets/page-JgomSTDc.js +0 -1
  298. zenml/zen_server/dashboard/assets/page-L84ig6HB.js +0 -1
  299. zenml/zen_server/dashboard/assets/page-Mabsn4QJ.js +0 -1
  300. zenml/zen_server/dashboard/assets/page-P04L5cm9.js +0 -1
  301. zenml/zen_server/dashboard/assets/page-PfhAnvq4.js +0 -1
  302. zenml/zen_server/dashboard/assets/page-WdRrlNt_.js +0 -1
  303. zenml/zen_server/dashboard/assets/page-cqJDDDeK.js +0 -1
  304. zenml/zen_server/dashboard/assets/page-k-Wxh9L_.js +0 -1
  305. zenml/zen_server/dashboard/assets/page-y-zV4n0c.js +0 -1
  306. zenml/zen_server/dashboard/assets/rocket-Cf-B-XOR.js +0 -1
  307. zenml/zen_server/dashboard/assets/settings_preview-0JLrRgHP.webp +0 -0
  308. zenml/zen_server/dashboard/assets/sharedSchema-Bse2agAf.js +0 -14
  309. zenml/zen_server/dashboard/assets/stack-detail-query-BAcZJrN3.js +0 -1
  310. zenml/zen_server/dashboard/assets/tick-circle-m94Aa6Zt.js +0 -1
  311. zenml/zen_server/dashboard/assets/tour-cover-BYfeen6M.webp +0 -0
  312. zenml/zen_server/dashboard/assets/type-guards-CaeD8wHO.js +0 -1
  313. zenml/zen_server/dashboard/assets/update-server-settings-mutation-DwMM1LJz.js +0 -1
  314. {zenml_nightly-0.82.1.dev20250527.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/LICENSE +0 -0
  315. {zenml_nightly-0.82.1.dev20250527.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/WHEEL +0 -0
  316. {zenml_nightly-0.82.1.dev20250527.dist-info → zenml_nightly-0.83.0.dev20250529.dist-info}/entry_points.txt +0 -0
@@ -14,12 +14,12 @@
14
14
  """Implementation of the Skypilot base VM orchestrator."""
15
15
 
16
16
  import os
17
- import re
18
17
  from abc import abstractmethod
19
18
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, cast
20
19
  from uuid import uuid4
21
20
 
22
21
  import sky
22
+ from sky import StatusRefreshMode
23
23
 
24
24
  from zenml.entrypoints import PipelineEntrypointConfiguration
25
25
  from zenml.enums import StackComponentType
@@ -31,6 +31,15 @@ from zenml.integrations.skypilot.flavors.skypilot_orchestrator_base_vm_config im
31
31
  from zenml.integrations.skypilot.orchestrators.skypilot_orchestrator_entrypoint_configuration import (
32
32
  SkypilotOrchestratorEntrypointConfiguration,
33
33
  )
34
+ from zenml.integrations.skypilot.utils import (
35
+ create_docker_run_command,
36
+ prepare_docker_setup,
37
+ prepare_launch_kwargs,
38
+ prepare_resources_kwargs,
39
+ prepare_task_kwargs,
40
+ sanitize_cluster_name,
41
+ sky_job_get,
42
+ )
34
43
  from zenml.logger import get_logger
35
44
  from zenml.orchestrators import (
36
45
  ContainerizedOrchestrator,
@@ -252,32 +261,21 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
252
261
  entrypoint_str = " ".join(command)
253
262
  arguments_str = " ".join(args)
254
263
 
255
- task_envs = environment
256
- docker_environment_str = " ".join(
257
- f"-e {k}={v}" for k, v in environment.items()
258
- )
259
- custom_run_args = " ".join(settings.docker_run_args)
260
- if custom_run_args:
261
- custom_run_args += " "
262
-
263
- instance_type = settings.instance_type or self.DEFAULT_INSTANCE_TYPE
264
+ task_envs = environment.copy()
264
265
 
265
266
  # Set up credentials
266
267
  self.setup_credentials()
267
268
 
268
- # Guaranteed by stack validation
269
- assert stack is not None and stack.container_registry is not None
269
+ # Prepare Docker setup
270
+ setup, docker_creds_envs = prepare_docker_setup(
271
+ container_registry_uri=stack.container_registry.config.uri,
272
+ credentials=stack.container_registry.credentials,
273
+ use_sudo=True, # Base orchestrator uses sudo
274
+ )
270
275
 
271
- if docker_creds := stack.container_registry.credentials:
272
- docker_username, docker_password = docker_creds
273
- setup = (
274
- f"sudo docker login --username $DOCKER_USERNAME --password "
275
- f"$DOCKER_PASSWORD {stack.container_registry.config.uri}"
276
- )
277
- task_envs["DOCKER_USERNAME"] = docker_username
278
- task_envs["DOCKER_PASSWORD"] = docker_password
279
- else:
280
- setup = None
276
+ # Update task_envs with Docker credentials
277
+ if docker_creds_envs:
278
+ task_envs.update(docker_creds_envs)
281
279
 
282
280
  # Run the entire pipeline
283
281
 
@@ -291,45 +289,49 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
291
289
  down = False
292
290
  idle_minutes_to_autostop = None
293
291
  else:
294
- run_command = f"sudo docker run --rm {custom_run_args}{docker_environment_str} {image} {entrypoint_str} {arguments_str}"
292
+ run_command = create_docker_run_command(
293
+ image=image,
294
+ entrypoint_str=entrypoint_str,
295
+ arguments_str=arguments_str,
296
+ environment=task_envs,
297
+ docker_run_args=settings.docker_run_args,
298
+ use_sudo=True, # Base orchestrator uses sudo
299
+ )
295
300
  down = settings.down
296
301
  idle_minutes_to_autostop = settings.idle_minutes_to_autostop
297
- task = sky.Task(
298
- run=run_command,
302
+
303
+ # Create the Task with all parameters and task settings
304
+ task_kwargs = prepare_task_kwargs(
305
+ settings=settings,
306
+ run_command=run_command,
299
307
  setup=setup,
300
- envs=task_envs,
308
+ task_envs=task_envs,
309
+ task_name=f"{orchestrator_run_name}",
301
310
  )
311
+
312
+ task = sky.Task(**task_kwargs)
302
313
  logger.debug(f"Running run: {run_command}")
303
314
 
304
- task = task.set_resources(
305
- sky.Resources(
306
- cloud=self.cloud,
307
- instance_type=instance_type,
308
- cpus=settings.cpus,
309
- memory=settings.memory,
310
- accelerators=settings.accelerators,
311
- accelerator_args=settings.accelerator_args,
312
- use_spot=settings.use_spot,
313
- job_recovery=settings.job_recovery,
314
- region=settings.region,
315
- zone=settings.zone,
316
- image_id=image
317
- if isinstance(self.cloud, sky.clouds.Kubernetes)
318
- else settings.image_id,
319
- disk_size=settings.disk_size,
320
- disk_tier=settings.disk_tier,
321
- )
315
+ # Set resources with all parameters and resource settings
316
+ resources_kwargs = prepare_resources_kwargs(
317
+ cloud=self.cloud,
318
+ settings=settings,
319
+ default_instance_type=self.DEFAULT_INSTANCE_TYPE,
320
+ kubernetes_image=image
321
+ if isinstance(self.cloud, sky.clouds.Kubernetes)
322
+ else None,
322
323
  )
323
- # Do not detach run if logs are being streamed
324
- # Otherwise, the logs will not be streamed after the task is submitted
325
- # Could also be a parameter in the settings to control this behavior
326
- detach_run = not settings.stream_logs
324
+
325
+ task = task.set_resources(sky.Resources(**resources_kwargs))
327
326
 
328
327
  launch_new_cluster = True
329
328
  if settings.cluster_name:
330
- cluster_info = sky.status(
331
- refresh=True, cluster_names=settings.cluster_name
329
+ status_request_id = sky.status(
330
+ refresh=StatusRefreshMode.AUTO,
331
+ cluster_names=[settings.cluster_name],
332
332
  )
333
+ cluster_info = sky.stream_and_get(status_request_id)
334
+
333
335
  if cluster_info:
334
336
  logger.info(
335
337
  f"Found existing cluster {settings.cluster_name}. Reusing..."
@@ -342,7 +344,7 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
342
344
  )
343
345
  cluster_name = settings.cluster_name
344
346
  else:
345
- cluster_name = self.sanitize_cluster_name(
347
+ cluster_name = sanitize_cluster_name(
346
348
  f"{orchestrator_run_name}"
347
349
  )
348
350
  logger.info(
@@ -350,33 +352,55 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
350
352
  )
351
353
 
352
354
  if launch_new_cluster:
353
- sky.launch(
355
+ # Prepare launch parameters with additional launch settings
356
+ launch_kwargs = prepare_launch_kwargs(
357
+ settings=settings,
358
+ down=down,
359
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
360
+ )
361
+ logger.info(
362
+ f"Launching the task on a new cluster: {cluster_name}"
363
+ )
364
+ launch_job_id = sky.launch(
354
365
  task,
355
366
  cluster_name,
356
- retry_until_up=settings.retry_until_up,
357
- idle_minutes_to_autostop=idle_minutes_to_autostop,
358
- down=down,
359
- stream_logs=settings.stream_logs,
360
- backend=None,
361
- detach_setup=True,
362
- detach_run=detach_run,
367
+ **launch_kwargs,
363
368
  )
369
+ sky_job_get(launch_job_id, settings.stream_logs, cluster_name)
370
+
364
371
  else:
365
- # Make sure the cluster is up -
366
- # If the cluster is already up, this will not do anything
367
- sky.start(
372
+ # Prepare exec parameters with additional launch settings
373
+ exec_kwargs = {
374
+ "down": down,
375
+ "backend": None,
376
+ **settings.launch_settings, # Can reuse same settings for exec
377
+ }
378
+
379
+ # Remove None values to avoid overriding SkyPilot defaults
380
+ exec_kwargs = {
381
+ k: v for k, v in exec_kwargs.items() if v is not None
382
+ }
383
+
384
+ # Make sure the cluster is up
385
+ start_request_id = sky.start(
368
386
  settings.cluster_name,
369
387
  down=down,
370
388
  idle_minutes_to_autostop=idle_minutes_to_autostop,
371
389
  retry_until_up=settings.retry_until_up,
372
390
  )
373
- sky.exec(
391
+ sky.stream_and_get(start_request_id)
392
+
393
+ logger.info(
394
+ f"Executing the task on the cluster: {settings.cluster_name}"
395
+ )
396
+ exec_job_id = sky.exec(
374
397
  task,
375
- settings.cluster_name,
376
- down=down,
377
- stream_logs=settings.stream_logs,
378
- backend=None,
379
- detach_run=detach_run,
398
+ cluster_name=settings.cluster_name,
399
+ **exec_kwargs,
400
+ )
401
+ assert settings.cluster_name is not None
402
+ sky_job_get(
403
+ exec_job_id, settings.stream_logs, settings.cluster_name
380
404
  )
381
405
 
382
406
  except Exception as e:
@@ -386,19 +410,3 @@ class SkypilotBaseOrchestrator(ContainerizedOrchestrator):
386
410
  finally:
387
411
  # Unset the service connector AWS profile ENV variable
388
412
  self.prepare_environment_variable(set=False)
389
-
390
- def sanitize_cluster_name(self, name: str) -> str:
391
- """Sanitize the value to be used in a cluster name.
392
-
393
- Args:
394
- name: Arbitrary input cluster name.
395
-
396
- Returns:
397
- Sanitized cluster name.
398
- """
399
- name = re.sub(
400
- r"[^a-z0-9-]", "-", name.lower()
401
- ) # replaces any character that is not a lowercase letter, digit, or hyphen with a hyphen
402
- name = re.sub(r"^[-]+", "", name) # trim leading hyphens
403
- name = re.sub(r"[-]+$", "", name) # trim trailing hyphens
404
- return name
@@ -32,8 +32,20 @@ from zenml.integrations.skypilot.orchestrators.skypilot_base_vm_orchestrator imp
32
32
  ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID,
33
33
  SkypilotBaseOrchestrator,
34
34
  )
35
+ from zenml.integrations.skypilot.utils import (
36
+ create_docker_run_command,
37
+ prepare_docker_setup,
38
+ prepare_launch_kwargs,
39
+ prepare_resources_kwargs,
40
+ prepare_task_kwargs,
41
+ sanitize_cluster_name,
42
+ sky_job_get,
43
+ )
35
44
  from zenml.logger import get_logger
36
- from zenml.orchestrators.dag_runner import ThreadedDagRunner
45
+ from zenml.orchestrators.dag_runner import NodeStatus, ThreadedDagRunner
46
+ from zenml.orchestrators.publish_utils import (
47
+ publish_failed_pipeline_run,
48
+ )
37
49
  from zenml.orchestrators.utils import get_config_environment_vars
38
50
 
39
51
  logger = get_logger(__name__)
@@ -65,212 +77,228 @@ def main() -> None:
65
77
  TypeError: If the active stack's orchestrator is not an instance of
66
78
  SkypilotBaseOrchestrator.
67
79
  ValueError: If the active stack's container registry is None.
80
+ Exception: If the orchestration or one of the steps fails.
68
81
  """
69
82
  # Log to the container's stdout so it can be streamed by the client.
70
83
  logger.info("Skypilot orchestrator VM started.")
71
84
 
72
85
  # Parse / extract args.
73
86
  args = parse_args()
74
-
75
87
  orchestrator_run_id = socket.gethostname()
76
88
 
77
- deployment = Client().get_deployment(args.deployment_id)
78
-
79
- pipeline_dag = {
80
- step_name: step.spec.upstream_steps
81
- for step_name, step in deployment.step_configurations.items()
82
- }
83
- step_command = StepEntrypointConfiguration.get_entrypoint_command()
84
- entrypoint_str = " ".join(step_command)
85
-
86
- active_stack = Client().active_stack
87
-
88
- orchestrator = active_stack.orchestrator
89
- if not isinstance(orchestrator, SkypilotBaseOrchestrator):
90
- raise TypeError(
91
- "The active stack's orchestrator is not an instance of SkypilotBaseOrchestrator."
92
- )
93
-
94
- # Set up credentials
95
- orchestrator.setup_credentials()
96
-
97
- # Set the service connector AWS profile ENV variable
98
- orchestrator.prepare_environment_variable(set=True)
89
+ run = None
99
90
 
100
- # get active container registry
101
- container_registry = active_stack.container_registry
102
- if container_registry is None:
103
- raise ValueError("Container registry cannot be None.")
91
+ try:
92
+ deployment = Client().get_deployment(args.deployment_id)
104
93
 
105
- if docker_creds := container_registry.credentials:
106
- docker_username, docker_password = docker_creds
107
- setup = (
108
- f"docker login --username $DOCKER_USERNAME --password "
109
- f"$DOCKER_PASSWORD {container_registry.config.uri}"
110
- )
111
- task_envs = {
112
- "DOCKER_USERNAME": docker_username,
113
- "DOCKER_PASSWORD": docker_password,
94
+ pipeline_dag = {
95
+ step_name: step.spec.upstream_steps
96
+ for step_name, step in deployment.step_configurations.items()
114
97
  }
115
- else:
116
- setup = None
117
- task_envs = None
118
-
119
- unique_resource_configs: Dict[str, str] = {}
120
- for step_name, step in deployment.step_configurations.items():
121
- settings = cast(
122
- SkypilotBaseOrchestratorSettings,
123
- orchestrator.get_settings(step),
124
- )
125
- # Handle both str and Dict[str, int] types for accelerators
126
- if isinstance(settings.accelerators, dict):
127
- accelerators_hashable = frozenset(settings.accelerators.items())
128
- elif isinstance(settings.accelerators, str):
129
- accelerators_hashable = frozenset({(settings.accelerators, 1)})
130
- else:
131
- accelerators_hashable = None
132
- resource_config = (
133
- settings.instance_type,
134
- settings.cpus,
135
- settings.memory,
136
- settings.disk_size, # Assuming disk_size is part of the settings
137
- settings.disk_tier, # Assuming disk_tier is part of the settings
138
- settings.use_spot,
139
- settings.job_recovery,
140
- settings.region,
141
- settings.zone,
142
- accelerators_hashable,
143
- )
144
- cluster_name_parts = [
145
- orchestrator.sanitize_cluster_name(str(part))
146
- for part in resource_config
147
- if part is not None
148
- ]
149
- cluster_name = f"cluster-{orchestrator_run_id}" + "-".join(
150
- cluster_name_parts
151
- )
152
- unique_resource_configs[step_name] = cluster_name
153
-
154
- run = Client().list_pipeline_runs(
155
- sort_by="asc:created",
156
- size=1,
157
- deployment_id=args.deployment_id,
158
- status=ExecutionStatus.INITIALIZING,
159
- )[0]
98
+ step_command = StepEntrypointConfiguration.get_entrypoint_command()
99
+ entrypoint_str = " ".join(step_command)
160
100
 
161
- logger.info("Fetching pipeline run: %s", run.id)
101
+ active_stack = Client().active_stack
162
102
 
163
- def run_step_on_skypilot_vm(step_name: str) -> None:
164
- """Run a pipeline step in a separate Skypilot VM.
103
+ orchestrator = active_stack.orchestrator
104
+ if not isinstance(orchestrator, SkypilotBaseOrchestrator):
105
+ raise TypeError(
106
+ "The active stack's orchestrator is not an instance of SkypilotBaseOrchestrator."
107
+ )
165
108
 
166
- Args:
167
- step_name: Name of the step.
168
- """
169
- cluster_name = unique_resource_configs[step_name]
109
+ # Set up credentials
110
+ orchestrator.setup_credentials()
170
111
 
171
- image = SkypilotBaseOrchestrator.get_image(
172
- deployment=deployment, step_name=step_name
173
- )
112
+ # Set the service connector AWS profile ENV variable
113
+ orchestrator.prepare_environment_variable(set=True)
174
114
 
175
- step_args = StepEntrypointConfiguration.get_entrypoint_arguments(
176
- step_name=step_name, deployment_id=deployment.id
177
- )
178
- arguments_str = " ".join(step_args)
115
+ # get active container registry
116
+ container_registry = active_stack.container_registry
117
+ if container_registry is None:
118
+ raise ValueError("Container registry cannot be None.")
179
119
 
180
- step = deployment.step_configurations[step_name]
181
- settings = cast(
182
- SkypilotBaseOrchestratorSettings,
183
- orchestrator.get_settings(step),
120
+ # Prepare Docker setup
121
+ setup, task_envs = prepare_docker_setup(
122
+ container_registry_uri=container_registry.config.uri,
123
+ credentials=container_registry.credentials,
124
+ use_sudo=False, # Entrypoint doesn't use sudo
184
125
  )
185
- env = get_config_environment_vars()
186
- env[ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID] = orchestrator_run_id
187
126
 
188
- docker_environment_str = " ".join(
189
- f"-e {k}={v}" for k, v in env.items()
190
- )
191
- custom_run_args = " ".join(settings.docker_run_args)
192
- if custom_run_args:
193
- custom_run_args += " "
194
-
195
- # Set up the task
196
- run_command = f"docker run --rm {custom_run_args}{docker_environment_str} {image} {entrypoint_str} {arguments_str}"
197
- task_name = f"{deployment.id}-{step_name}-{time.time()}"
198
- task = sky.Task(
199
- run=run_command,
200
- setup=setup,
201
- envs=task_envs,
202
- name=task_name,
203
- )
204
- task = task.set_resources(
205
- sky.Resources(
206
- cloud=orchestrator.cloud,
207
- instance_type=settings.instance_type
208
- or orchestrator.DEFAULT_INSTANCE_TYPE,
209
- cpus=settings.cpus,
210
- memory=settings.memory,
211
- disk_size=settings.disk_size,
212
- disk_tier=settings.disk_tier,
213
- accelerators=settings.accelerators,
214
- accelerator_args=settings.accelerator_args,
215
- use_spot=settings.use_spot,
216
- job_recovery=settings.job_recovery,
217
- region=settings.region,
218
- zone=settings.zone,
219
- image_id=settings.image_id,
127
+ unique_resource_configs: Dict[str, str] = {}
128
+ for step_name, step in deployment.step_configurations.items():
129
+ settings = cast(
130
+ SkypilotBaseOrchestratorSettings,
131
+ orchestrator.get_settings(step),
220
132
  )
221
- )
133
+ # Handle both str and Dict[str, int] types for accelerators
134
+ if isinstance(settings.accelerators, dict):
135
+ accelerators_hashable = frozenset(
136
+ settings.accelerators.items()
137
+ )
138
+ elif isinstance(settings.accelerators, str):
139
+ accelerators_hashable = frozenset({(settings.accelerators, 1)})
140
+ else:
141
+ accelerators_hashable = None
142
+ resource_config = (
143
+ settings.instance_type,
144
+ settings.cpus,
145
+ settings.memory,
146
+ settings.disk_size, # Assuming disk_size is part of the settings
147
+ settings.disk_tier, # Assuming disk_tier is part of the settings
148
+ settings.use_spot,
149
+ settings.job_recovery,
150
+ settings.region,
151
+ settings.zone,
152
+ accelerators_hashable,
153
+ )
154
+ cluster_name_parts = [
155
+ sanitize_cluster_name(str(part))
156
+ for part in resource_config
157
+ if part is not None
158
+ ]
159
+ cluster_name = f"cluster-{orchestrator_run_id}" + "-".join(
160
+ cluster_name_parts
161
+ )
162
+ unique_resource_configs[step_name] = cluster_name
222
163
 
223
- sky.launch(
224
- task,
225
- cluster_name,
226
- retry_until_up=settings.retry_until_up,
227
- idle_minutes_to_autostop=settings.idle_minutes_to_autostop,
228
- down=settings.down,
229
- stream_logs=settings.stream_logs,
230
- detach_setup=True,
231
- detach_run=True,
232
- )
164
+ run = Client().list_pipeline_runs(
165
+ sort_by="asc:created",
166
+ size=1,
167
+ deployment_id=args.deployment_id,
168
+ status=ExecutionStatus.INITIALIZING,
169
+ )[0]
233
170
 
234
- # Wait for pod to finish.
235
- logger.info(f"Waiting for pod of step `{step_name}` to start...")
171
+ logger.info("Fetching pipeline run: %s", run.id)
236
172
 
237
- current_run = Client().get_pipeline_run(run.id)
173
+ def run_step_on_skypilot_vm(step_name: str) -> None:
174
+ """Run a pipeline step in a separate Skypilot VM.
238
175
 
239
- step_is_finished = False
240
- while not step_is_finished:
241
- time.sleep(10)
242
- current_run = Client().get_pipeline_run(run.id)
176
+ Args:
177
+ step_name: Name of the step.
178
+
179
+ Raises:
180
+ Exception: If the step execution fails.
181
+ """
182
+ logger.info(f"Running step `{step_name}` on a VM...")
243
183
  try:
244
- step_is_finished = current_run.steps[
245
- step_name
246
- ].status.is_finished
247
- except KeyError:
248
- # Step is not yet in the run, so we wait for it to appear
249
- continue
250
-
251
- # Pop the resource configuration for this step
252
- unique_resource_configs.pop(step_name)
253
-
254
- if cluster_name in unique_resource_configs.values():
255
- # If there are more steps using this configuration, skip deprovisioning the cluster
256
- logger.info(
257
- f"Resource configuration for cluster '{cluster_name}' "
258
- "is used by subsequent steps. Skipping the deprovisioning of "
259
- "the cluster."
260
- )
261
- else:
262
- # If there are no more steps using this configuration, down the cluster
263
- logger.info(
264
- f"Resource configuration for cluster '{cluster_name}' "
265
- "is not used by subsequent steps. deprovisioning the cluster."
266
- )
267
- sky.down(cluster_name)
184
+ cluster_name = unique_resource_configs[step_name]
185
+
186
+ image = SkypilotBaseOrchestrator.get_image(
187
+ deployment=deployment, step_name=step_name
188
+ )
189
+
190
+ step_args = (
191
+ StepEntrypointConfiguration.get_entrypoint_arguments(
192
+ step_name=step_name, deployment_id=deployment.id
193
+ )
194
+ )
195
+ arguments_str = " ".join(step_args)
196
+
197
+ step = deployment.step_configurations[step_name]
198
+ settings = cast(
199
+ SkypilotBaseOrchestratorSettings,
200
+ orchestrator.get_settings(step),
201
+ )
202
+ env = get_config_environment_vars()
203
+ env[ENV_ZENML_SKYPILOT_ORCHESTRATOR_RUN_ID] = (
204
+ orchestrator_run_id
205
+ )
206
+
207
+ # Create the Docker run command
208
+ run_command = create_docker_run_command(
209
+ image=image,
210
+ entrypoint_str=entrypoint_str,
211
+ arguments_str=arguments_str,
212
+ environment=env,
213
+ docker_run_args=settings.docker_run_args,
214
+ use_sudo=False, # Entrypoint doesn't use sudo
215
+ )
216
+
217
+ task_name = f"{deployment.id}-{step_name}-{time.time()}"
218
+
219
+ # Create task kwargs
220
+ task_kwargs = prepare_task_kwargs(
221
+ settings=settings,
222
+ run_command=run_command,
223
+ setup=setup,
224
+ task_envs=task_envs,
225
+ task_name=task_name,
226
+ )
227
+
228
+ task = sky.Task(**task_kwargs)
229
+
230
+ # Set resources
231
+ resources_kwargs = prepare_resources_kwargs(
232
+ cloud=orchestrator.cloud,
233
+ settings=settings,
234
+ default_instance_type=orchestrator.DEFAULT_INSTANCE_TYPE,
235
+ )
236
+
237
+ task = task.set_resources(sky.Resources(**resources_kwargs))
238
+
239
+ # Prepare launch parameters
240
+ launch_kwargs = prepare_launch_kwargs(
241
+ settings=settings,
242
+ )
243
+
244
+ # sky.launch now returns a request ID (async). Capture it so we can
245
+ # optionally stream logs and block until completion when desired.
246
+ launch_request_id = sky.launch(
247
+ task,
248
+ cluster_name,
249
+ **launch_kwargs,
250
+ )
251
+ sky_job_get(launch_request_id, True, cluster_name)
252
+
253
+ # Pop the resource configuration for this step
254
+ unique_resource_configs.pop(step_name)
255
+
256
+ if cluster_name in unique_resource_configs.values():
257
+ # If there are more steps using this configuration, skip deprovisioning the cluster
258
+ logger.info(
259
+ f"Resource configuration for cluster '{cluster_name}' "
260
+ "is used by subsequent steps. Skipping the deprovisioning of "
261
+ "the cluster."
262
+ )
263
+ else:
264
+ # If there are no more steps using this configuration, down the cluster
265
+ logger.info(
266
+ f"Resource configuration for cluster '{cluster_name}' "
267
+ "is not used by subsequent steps. deprovisioning the cluster."
268
+ )
269
+ down_request_id = sky.down(cluster_name)
270
+ # Wait for the cluster to be terminated
271
+ sky.stream_and_get(down_request_id)
272
+
273
+ logger.info(
274
+ f"Running step `{step_name}` on a VM is completed."
275
+ )
276
+
277
+ except Exception as e:
278
+ logger.error(f"Failed while launching step `{step_name}`: {e}")
279
+ raise
280
+
281
+ dag_runner = ThreadedDagRunner(
282
+ dag=pipeline_dag, run_fn=run_step_on_skypilot_vm
283
+ )
284
+ dag_runner.run()
285
+
286
+ failed_nodes = []
287
+ for node in dag_runner.nodes:
288
+ if dag_runner.node_states[node] == NodeStatus.FAILED:
289
+ failed_nodes.append(node)
268
290
 
269
- logger.info(f"Running step `{step_name}` on a VM is completed.")
291
+ if failed_nodes:
292
+ raise Exception(f"One or more steps failed: {failed_nodes}")
270
293
 
271
- ThreadedDagRunner(dag=pipeline_dag, run_fn=run_step_on_skypilot_vm).run()
294
+ except Exception as e:
295
+ logger.error(f"Orchestrator failed: {e}")
272
296
 
273
- logger.info("Orchestration VM provisioned.")
297
+ # Try to mark the pipeline run as failed
298
+ if run:
299
+ publish_failed_pipeline_run(run.id)
300
+ logger.info("Marked pipeline run as failed in ZenML.")
301
+ raise
274
302
 
275
303
 
276
304
  if __name__ == "__main__":