ob-metaflow-stubs 6.0.7.1__py2.py3-none-any.whl → 6.0.7.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-stubs might be problematic. Click here for more details.

Files changed (262) hide show
  1. metaflow-stubs/__init__.pyi +1112 -1112
  2. metaflow-stubs/cards.pyi +1 -1
  3. metaflow-stubs/cli.pyi +1 -1
  4. metaflow-stubs/cli_components/__init__.pyi +1 -1
  5. metaflow-stubs/cli_components/utils.pyi +1 -1
  6. metaflow-stubs/client/__init__.pyi +1 -1
  7. metaflow-stubs/client/core.pyi +4 -4
  8. metaflow-stubs/client/filecache.pyi +2 -2
  9. metaflow-stubs/events.pyi +2 -2
  10. metaflow-stubs/exception.pyi +1 -1
  11. metaflow-stubs/flowspec.pyi +3 -3
  12. metaflow-stubs/generated_for.txt +1 -1
  13. metaflow-stubs/includefile.pyi +2 -2
  14. metaflow-stubs/meta_files.pyi +1 -1
  15. metaflow-stubs/metadata_provider/__init__.pyi +1 -1
  16. metaflow-stubs/metadata_provider/heartbeat.pyi +1 -1
  17. metaflow-stubs/metadata_provider/metadata.pyi +1 -1
  18. metaflow-stubs/metadata_provider/util.pyi +1 -1
  19. metaflow-stubs/metaflow_config.pyi +1 -1
  20. metaflow-stubs/metaflow_current.pyi +63 -63
  21. metaflow-stubs/metaflow_git.pyi +1 -1
  22. metaflow-stubs/mf_extensions/__init__.pyi +1 -1
  23. metaflow-stubs/mf_extensions/obcheckpoint/__init__.pyi +1 -1
  24. metaflow-stubs/mf_extensions/obcheckpoint/plugins/__init__.pyi +1 -1
  25. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/__init__.pyi +1 -1
  26. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/card_utils/__init__.pyi +1 -1
  27. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/card_utils/async_cards.pyi +1 -1
  28. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/card_utils/deco_injection_mixin.pyi +1 -1
  29. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/card_utils/extra_components.pyi +2 -2
  30. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/__init__.pyi +1 -1
  31. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/cards/__init__.pyi +1 -1
  32. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/cards/checkpoint_lister.pyi +3 -3
  33. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/cards/lineage_card.pyi +1 -1
  34. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/checkpoint_storage.pyi +3 -3
  35. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/constructors.pyi +1 -1
  36. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/core.pyi +3 -3
  37. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/decorator.pyi +3 -3
  38. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/exceptions.pyi +1 -1
  39. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/final_api.pyi +1 -1
  40. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/checkpoints/lineage.pyi +1 -1
  41. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/__init__.pyi +1 -1
  42. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/context.pyi +3 -3
  43. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/core.pyi +1 -1
  44. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/decorator.pyi +1 -1
  45. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/exceptions.pyi +1 -1
  46. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/task_utils.pyi +2 -2
  47. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastore/utils.pyi +1 -1
  48. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/datastructures.pyi +1 -1
  49. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/exceptions.pyi +1 -1
  50. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/hf_hub/__init__.pyi +1 -1
  51. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/hf_hub/decorator.pyi +2 -2
  52. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/modeling_utils/__init__.pyi +1 -1
  53. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/modeling_utils/core.pyi +2 -2
  54. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/modeling_utils/exceptions.pyi +1 -1
  55. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/modeling_utils/model_storage.pyi +1 -1
  56. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/__init__.pyi +1 -1
  57. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/flowspec_utils.pyi +1 -1
  58. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/general.pyi +1 -1
  59. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/identity_utils.pyi +2 -2
  60. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/serialization_handler/__init__.pyi +1 -1
  61. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/serialization_handler/base.pyi +1 -1
  62. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/serialization_handler/tar.pyi +1 -1
  63. metaflow-stubs/mf_extensions/obcheckpoint/plugins/machine_learning_utilities/utils/tar_utils.pyi +1 -1
  64. metaflow-stubs/mf_extensions/outerbounds/__init__.pyi +1 -1
  65. metaflow-stubs/mf_extensions/outerbounds/plugins/__init__.pyi +1 -1
  66. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/__init__.pyi +1 -1
  67. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/__init__.pyi +1 -1
  68. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/_state_machine.pyi +1 -1
  69. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/_vendor/__init__.pyi +1 -1
  70. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.pyi +1 -1
  71. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.pyi +1 -1
  72. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/app_cli.pyi +1 -1
  73. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/app_config.pyi +1 -1
  74. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/capsule.pyi +4 -2
  75. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/click_importer.pyi +1 -1
  76. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/code_package/__init__.pyi +1 -1
  77. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/code_package/code_packager.pyi +2 -2
  78. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/__init__.pyi +1 -1
  79. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/cli_generator.pyi +1 -1
  80. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/config_utils.pyi +2 -2
  81. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/schema_export.pyi +1 -1
  82. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/typed_configs.pyi +1 -1
  83. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/config/unified_config.pyi +1 -1
  84. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/dependencies.pyi +2 -2
  85. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/deployer.pyi +3 -3
  86. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/experimental/__init__.pyi +1 -1
  87. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/perimeters.pyi +2 -2
  88. metaflow-stubs/mf_extensions/outerbounds/plugins/apps/core/utils.pyi +2 -2
  89. metaflow-stubs/mf_extensions/outerbounds/plugins/aws/__init__.pyi +1 -1
  90. metaflow-stubs/mf_extensions/outerbounds/plugins/aws/assume_role_decorator.pyi +2 -2
  91. metaflow-stubs/mf_extensions/outerbounds/plugins/card_utilities/__init__.pyi +1 -1
  92. metaflow-stubs/mf_extensions/outerbounds/plugins/card_utilities/async_cards.pyi +2 -2
  93. metaflow-stubs/mf_extensions/outerbounds/plugins/card_utilities/injector.pyi +1 -1
  94. metaflow-stubs/mf_extensions/outerbounds/plugins/checkpoint_datastores/__init__.pyi +1 -1
  95. metaflow-stubs/mf_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.pyi +3 -3
  96. metaflow-stubs/mf_extensions/outerbounds/plugins/checkpoint_datastores/nebius.pyi +3 -3
  97. metaflow-stubs/mf_extensions/outerbounds/plugins/fast_bakery/__init__.pyi +1 -1
  98. metaflow-stubs/mf_extensions/outerbounds/plugins/fast_bakery/baker.pyi +3 -3
  99. metaflow-stubs/mf_extensions/outerbounds/plugins/fast_bakery/docker_environment.pyi +2 -2
  100. metaflow-stubs/mf_extensions/outerbounds/plugins/fast_bakery/fast_bakery.pyi +1 -1
  101. metaflow-stubs/mf_extensions/outerbounds/plugins/kubernetes/__init__.pyi +1 -1
  102. metaflow-stubs/mf_extensions/outerbounds/plugins/kubernetes/pod_killer.pyi +1 -1
  103. metaflow-stubs/mf_extensions/outerbounds/plugins/ollama/__init__.pyi +1 -1
  104. metaflow-stubs/mf_extensions/outerbounds/plugins/ollama/constants.pyi +1 -1
  105. metaflow-stubs/mf_extensions/outerbounds/plugins/ollama/exceptions.pyi +1 -1
  106. metaflow-stubs/mf_extensions/outerbounds/plugins/ollama/ollama.pyi +1 -1
  107. metaflow-stubs/mf_extensions/outerbounds/plugins/ollama/status_card.pyi +1 -1
  108. metaflow-stubs/mf_extensions/outerbounds/plugins/snowflake/__init__.pyi +1 -1
  109. metaflow-stubs/mf_extensions/outerbounds/plugins/snowflake/snowflake.pyi +1 -1
  110. metaflow-stubs/mf_extensions/outerbounds/profilers/__init__.pyi +1 -1
  111. metaflow-stubs/mf_extensions/outerbounds/profilers/gpu.pyi +1 -1
  112. metaflow-stubs/mf_extensions/outerbounds/remote_config.pyi +2 -2
  113. metaflow-stubs/mf_extensions/outerbounds/toplevel/__init__.pyi +1 -1
  114. metaflow-stubs/mf_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.pyi +1 -1
  115. metaflow-stubs/mf_extensions/outerbounds/toplevel/s3_proxy.pyi +1 -1
  116. metaflow-stubs/multicore_utils.pyi +1 -1
  117. metaflow-stubs/ob_internal.pyi +1 -1
  118. metaflow-stubs/packaging_sys/__init__.pyi +5 -5
  119. metaflow-stubs/packaging_sys/backend.pyi +2 -2
  120. metaflow-stubs/packaging_sys/distribution_support.pyi +3 -3
  121. metaflow-stubs/packaging_sys/tar_backend.pyi +2 -2
  122. metaflow-stubs/packaging_sys/utils.pyi +1 -1
  123. metaflow-stubs/packaging_sys/v1.pyi +1 -1
  124. metaflow-stubs/parameters.pyi +2 -2
  125. metaflow-stubs/plugins/__init__.pyi +14 -13
  126. metaflow-stubs/plugins/airflow/__init__.pyi +1 -1
  127. metaflow-stubs/plugins/airflow/airflow_utils.pyi +1 -1
  128. metaflow-stubs/plugins/airflow/exception.pyi +1 -1
  129. metaflow-stubs/plugins/airflow/sensors/__init__.pyi +1 -1
  130. metaflow-stubs/plugins/airflow/sensors/base_sensor.pyi +1 -1
  131. metaflow-stubs/plugins/airflow/sensors/external_task_sensor.pyi +1 -1
  132. metaflow-stubs/plugins/airflow/sensors/s3_sensor.pyi +1 -1
  133. metaflow-stubs/plugins/argo/__init__.pyi +1 -1
  134. metaflow-stubs/plugins/argo/argo_client.pyi +1 -1
  135. metaflow-stubs/plugins/argo/argo_events.pyi +1 -1
  136. metaflow-stubs/plugins/argo/argo_workflows.pyi +2 -2
  137. metaflow-stubs/plugins/argo/argo_workflows_decorator.pyi +2 -2
  138. metaflow-stubs/plugins/argo/argo_workflows_deployer.pyi +3 -3
  139. metaflow-stubs/plugins/argo/argo_workflows_deployer_objects.pyi +1 -1
  140. metaflow-stubs/plugins/argo/exit_hooks.pyi +1 -1
  141. metaflow-stubs/plugins/aws/__init__.pyi +1 -1
  142. metaflow-stubs/plugins/aws/aws_client.pyi +1 -1
  143. metaflow-stubs/plugins/aws/aws_utils.pyi +1 -1
  144. metaflow-stubs/plugins/aws/batch/__init__.pyi +1 -1
  145. metaflow-stubs/plugins/aws/batch/batch.pyi +1 -1
  146. metaflow-stubs/plugins/aws/batch/batch_client.pyi +1 -1
  147. metaflow-stubs/plugins/aws/batch/batch_decorator.pyi +1 -1
  148. metaflow-stubs/plugins/aws/secrets_manager/__init__.pyi +1 -1
  149. metaflow-stubs/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.pyi +3 -3
  150. metaflow-stubs/plugins/aws/step_functions/__init__.pyi +1 -1
  151. metaflow-stubs/plugins/aws/step_functions/event_bridge_client.pyi +1 -1
  152. metaflow-stubs/plugins/aws/step_functions/schedule_decorator.pyi +1 -1
  153. metaflow-stubs/plugins/aws/step_functions/step_functions.pyi +1 -1
  154. metaflow-stubs/plugins/aws/step_functions/step_functions_client.pyi +1 -1
  155. metaflow-stubs/plugins/aws/step_functions/step_functions_deployer.pyi +3 -3
  156. metaflow-stubs/plugins/aws/step_functions/step_functions_deployer_objects.pyi +1 -1
  157. metaflow-stubs/plugins/azure/__init__.pyi +1 -1
  158. metaflow-stubs/plugins/azure/azure_credential.pyi +1 -1
  159. metaflow-stubs/plugins/azure/azure_exceptions.pyi +1 -1
  160. metaflow-stubs/plugins/azure/azure_secret_manager_secrets_provider.pyi +3 -3
  161. metaflow-stubs/plugins/azure/azure_utils.pyi +1 -1
  162. metaflow-stubs/plugins/azure/blob_service_client_factory.pyi +1 -1
  163. metaflow-stubs/plugins/azure/includefile_support.pyi +1 -1
  164. metaflow-stubs/plugins/cards/__init__.pyi +1 -1
  165. metaflow-stubs/plugins/cards/card_client.pyi +1 -1
  166. metaflow-stubs/plugins/cards/card_creator.pyi +1 -1
  167. metaflow-stubs/plugins/cards/card_datastore.pyi +1 -1
  168. metaflow-stubs/plugins/cards/card_decorator.pyi +1 -1
  169. metaflow-stubs/plugins/cards/card_modules/__init__.pyi +1 -1
  170. metaflow-stubs/plugins/cards/card_modules/basic.pyi +2 -2
  171. metaflow-stubs/plugins/cards/card_modules/card.pyi +1 -1
  172. metaflow-stubs/plugins/cards/card_modules/components.pyi +2 -2
  173. metaflow-stubs/plugins/cards/card_modules/convert_to_native_type.pyi +1 -1
  174. metaflow-stubs/plugins/cards/card_modules/renderer_tools.pyi +1 -1
  175. metaflow-stubs/plugins/cards/card_modules/test_cards.pyi +1 -1
  176. metaflow-stubs/plugins/cards/card_resolver.pyi +1 -1
  177. metaflow-stubs/plugins/cards/component_serializer.pyi +1 -1
  178. metaflow-stubs/plugins/cards/exception.pyi +1 -1
  179. metaflow-stubs/plugins/catch_decorator.pyi +2 -2
  180. metaflow-stubs/plugins/datatools/__init__.pyi +1 -1
  181. metaflow-stubs/plugins/datatools/local.pyi +1 -1
  182. metaflow-stubs/plugins/datatools/s3/__init__.pyi +1 -1
  183. metaflow-stubs/plugins/datatools/s3/s3.pyi +3 -3
  184. metaflow-stubs/plugins/datatools/s3/s3tail.pyi +1 -1
  185. metaflow-stubs/plugins/datatools/s3/s3util.pyi +1 -1
  186. metaflow-stubs/plugins/debug_logger.pyi +1 -1
  187. metaflow-stubs/plugins/debug_monitor.pyi +1 -1
  188. metaflow-stubs/plugins/environment_decorator.pyi +1 -1
  189. metaflow-stubs/plugins/events_decorator.pyi +1 -1
  190. metaflow-stubs/plugins/exit_hook/__init__.pyi +1 -1
  191. metaflow-stubs/plugins/exit_hook/exit_hook_decorator.pyi +1 -1
  192. metaflow-stubs/plugins/frameworks/__init__.pyi +1 -1
  193. metaflow-stubs/plugins/frameworks/pytorch.pyi +1 -1
  194. metaflow-stubs/plugins/gcp/__init__.pyi +1 -1
  195. metaflow-stubs/plugins/gcp/gcp_secret_manager_secrets_provider.pyi +3 -3
  196. metaflow-stubs/plugins/gcp/gs_exceptions.pyi +1 -1
  197. metaflow-stubs/plugins/gcp/gs_storage_client_factory.pyi +1 -1
  198. metaflow-stubs/plugins/gcp/gs_utils.pyi +1 -1
  199. metaflow-stubs/plugins/gcp/includefile_support.pyi +1 -1
  200. metaflow-stubs/plugins/kubernetes/__init__.pyi +1 -1
  201. metaflow-stubs/plugins/kubernetes/kube_utils.pyi +1 -1
  202. metaflow-stubs/plugins/kubernetes/kubernetes.pyi +1 -1
  203. metaflow-stubs/plugins/kubernetes/kubernetes_client.pyi +1 -1
  204. metaflow-stubs/plugins/kubernetes/kubernetes_decorator.pyi +1 -1
  205. metaflow-stubs/plugins/kubernetes/kubernetes_jobsets.pyi +1 -1
  206. metaflow-stubs/plugins/kubernetes/spot_monitor_sidecar.pyi +1 -1
  207. metaflow-stubs/plugins/ollama/__init__.pyi +2 -2
  208. metaflow-stubs/plugins/optuna/__init__.pyi +24 -0
  209. metaflow-stubs/plugins/parallel_decorator.pyi +1 -1
  210. metaflow-stubs/plugins/perimeters.pyi +1 -1
  211. metaflow-stubs/plugins/project_decorator.pyi +1 -1
  212. metaflow-stubs/plugins/pypi/__init__.pyi +1 -1
  213. metaflow-stubs/plugins/pypi/conda_decorator.pyi +1 -1
  214. metaflow-stubs/plugins/pypi/conda_environment.pyi +6 -6
  215. metaflow-stubs/plugins/pypi/parsers.pyi +1 -1
  216. metaflow-stubs/plugins/pypi/pypi_decorator.pyi +1 -1
  217. metaflow-stubs/plugins/pypi/pypi_environment.pyi +1 -1
  218. metaflow-stubs/plugins/pypi/utils.pyi +1 -1
  219. metaflow-stubs/plugins/resources_decorator.pyi +1 -1
  220. metaflow-stubs/plugins/retry_decorator.pyi +1 -1
  221. metaflow-stubs/plugins/secrets/__init__.pyi +1 -1
  222. metaflow-stubs/plugins/secrets/inline_secrets_provider.pyi +2 -2
  223. metaflow-stubs/plugins/secrets/secrets_decorator.pyi +1 -1
  224. metaflow-stubs/plugins/secrets/secrets_func.pyi +1 -1
  225. metaflow-stubs/plugins/secrets/secrets_spec.pyi +1 -1
  226. metaflow-stubs/plugins/secrets/utils.pyi +1 -1
  227. metaflow-stubs/plugins/snowflake/__init__.pyi +1 -1
  228. metaflow-stubs/plugins/storage_executor.pyi +1 -1
  229. metaflow-stubs/plugins/test_unbounded_foreach_decorator.pyi +2 -2
  230. metaflow-stubs/plugins/timeout_decorator.pyi +2 -2
  231. metaflow-stubs/plugins/torchtune/__init__.pyi +1 -1
  232. metaflow-stubs/plugins/uv/__init__.pyi +1 -1
  233. metaflow-stubs/plugins/uv/uv_environment.pyi +2 -2
  234. metaflow-stubs/profilers/__init__.pyi +1 -1
  235. metaflow-stubs/pylint_wrapper.pyi +1 -1
  236. metaflow-stubs/runner/__init__.pyi +1 -1
  237. metaflow-stubs/runner/deployer.pyi +31 -31
  238. metaflow-stubs/runner/deployer_impl.pyi +2 -2
  239. metaflow-stubs/runner/metaflow_runner.pyi +1 -1
  240. metaflow-stubs/runner/nbdeploy.pyi +1 -1
  241. metaflow-stubs/runner/nbrun.pyi +1 -1
  242. metaflow-stubs/runner/subprocess_manager.pyi +1 -1
  243. metaflow-stubs/runner/utils.pyi +2 -2
  244. metaflow-stubs/system/__init__.pyi +1 -1
  245. metaflow-stubs/system/system_logger.pyi +2 -2
  246. metaflow-stubs/system/system_monitor.pyi +1 -1
  247. metaflow-stubs/tagging_util.pyi +1 -1
  248. metaflow-stubs/tuple_util.pyi +1 -1
  249. metaflow-stubs/user_configs/__init__.pyi +1 -1
  250. metaflow-stubs/user_configs/config_options.pyi +2 -2
  251. metaflow-stubs/user_configs/config_parameters.pyi +3 -3
  252. metaflow-stubs/user_decorators/__init__.pyi +1 -1
  253. metaflow-stubs/user_decorators/common.pyi +1 -1
  254. metaflow-stubs/user_decorators/mutable_flow.pyi +3 -3
  255. metaflow-stubs/user_decorators/mutable_step.pyi +2 -2
  256. metaflow-stubs/user_decorators/user_flow_decorator.pyi +2 -2
  257. metaflow-stubs/user_decorators/user_step_decorator.pyi +4 -4
  258. {ob_metaflow_stubs-6.0.7.1.dist-info → ob_metaflow_stubs-6.0.7.2.dist-info}/METADATA +1 -1
  259. ob_metaflow_stubs-6.0.7.2.dist-info/RECORD +262 -0
  260. ob_metaflow_stubs-6.0.7.1.dist-info/RECORD +0 -261
  261. {ob_metaflow_stubs-6.0.7.1.dist-info → ob_metaflow_stubs-6.0.7.2.dist-info}/WHEEL +0 -0
  262. {ob_metaflow_stubs-6.0.7.1.dist-info → ob_metaflow_stubs-6.0.7.2.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,15 @@
1
1
  ######################################################################################################
2
2
  # Auto-generated Metaflow stub file #
3
3
  # MF version: 2.17.1.0+obcheckpoint(0.2.4);ob(v1) #
4
- # Generated on 2025-08-19T19:04:22.043902 #
4
+ # Generated on 2025-08-19T23:54:56.174978 #
5
5
  ######################################################################################################
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  import typing
10
10
  if typing.TYPE_CHECKING:
11
- import typing
12
11
  import datetime
12
+ import typing
13
13
  FlowSpecDerived = typing.TypeVar("FlowSpecDerived", bound="FlowSpec", contravariant=False, covariant=False)
14
14
  StepFlag = typing.NewType("StepFlag", bool)
15
15
 
@@ -49,8 +49,8 @@ from .mf_extensions.outerbounds.toplevel.global_aliases_for_metaflow_package imp
49
49
  from . import includefile as includefile
50
50
  from .includefile import IncludeFile as IncludeFile
51
51
  from .plugins.pypi.parsers import pyproject_toml_parser as pyproject_toml_parser
52
- from .plugins.pypi.parsers import requirements_txt_parser as requirements_txt_parser
53
52
  from .plugins.pypi.parsers import conda_environment_yml_parser as conda_environment_yml_parser
53
+ from .plugins.pypi.parsers import requirements_txt_parser as requirements_txt_parser
54
54
  from . import client as client
55
55
  from .client.core import namespace as namespace
56
56
  from .client.core import get_namespace as get_namespace
@@ -167,435 +167,530 @@ def step(f: typing.Union[typing.Callable[[FlowSpecDerived], None], typing.Callab
167
167
  """
168
168
  ...
169
169
 
170
- def nvidia(*, gpu: int, gpu_type: str, queue_timeout: int) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
170
+ def ollama(*, models: list, backend: str, force_pull: bool, cache_update_policy: str, force_cache_update: bool, debug: bool, circuit_breaker_config: dict, timeout_config: dict) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
171
171
  """
172
- Specifies that this step should execute on DGX cloud.
172
+ This decorator is used to run Ollama APIs as Metaflow task sidecars.
173
+
174
+ User code call
175
+ --------------
176
+ @ollama(
177
+ models=[...],
178
+ ...
179
+ )
180
+
181
+ Valid backend options
182
+ ---------------------
183
+ - 'local': Run as a separate process on the local task machine.
184
+ - (TODO) 'managed': Outerbounds hosts and selects compute provider.
185
+ - (TODO) 'remote': Spin up separate instance to serve Ollama models.
186
+
187
+ Valid model options
188
+ -------------------
189
+ Any model here https://ollama.com/search, e.g. 'llama3.2', 'llama3.3'
173
190
 
174
191
 
175
192
  Parameters
176
193
  ----------
177
- gpu : int
178
- Number of GPUs to use.
179
- gpu_type : str
180
- Type of Nvidia GPU to use.
181
- queue_timeout : int
182
- Time to keep the job in NVCF's queue.
194
+ models: list[str]
195
+ List of Ollama containers running models in sidecars.
196
+ backend: str
197
+ Determines where and how to run the Ollama process.
198
+ force_pull: bool
199
+ Whether to run `ollama pull` no matter what, or first check the remote cache in Metaflow datastore for this model key.
200
+ cache_update_policy: str
201
+ Cache update policy: "auto", "force", or "never".
202
+ force_cache_update: bool
203
+ Simple override for "force" cache update policy.
204
+ debug: bool
205
+ Whether to turn on verbose debugging logs.
206
+ circuit_breaker_config: dict
207
+ Configuration for circuit breaker protection. Keys: failure_threshold, recovery_timeout, reset_timeout.
208
+ timeout_config: dict
209
+ Configuration for various operation timeouts. Keys: pull, stop, health_check, install, server_startup.
183
210
  """
184
211
  ...
185
212
 
186
213
  @typing.overload
187
- def catch(*, var: typing.Optional[str] = None, print_exception: bool = True) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
214
+ def pypi(*, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
188
215
  """
189
- Specifies that the step will succeed under all circumstances.
216
+ Specifies the PyPI packages for the step.
190
217
 
191
- The decorator will create an optional artifact, specified by `var`, which
192
- contains the exception raised. You can use it to detect the presence
193
- of errors, indicating that all happy-path artifacts produced by the step
194
- are missing.
218
+ Information in this decorator will augment any
219
+ attributes set in the `@pypi_base` flow-level decorator. Hence,
220
+ you can use `@pypi_base` to set packages required by all
221
+ steps and use `@pypi` to specify step-specific overrides.
195
222
 
196
223
 
197
224
  Parameters
198
225
  ----------
199
- var : str, optional, default None
200
- Name of the artifact in which to store the caught exception.
201
- If not specified, the exception is not stored.
202
- print_exception : bool, default True
203
- Determines whether or not the exception is printed to
204
- stdout when caught.
226
+ packages : Dict[str, str], default: {}
227
+ Packages to use for this step. The key is the name of the package
228
+ and the value is the version to use.
229
+ python : str, optional, default: None
230
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
231
+ that the version used will correspond to the version of the Python interpreter used to start the run.
205
232
  """
206
233
  ...
207
234
 
208
235
  @typing.overload
209
- def catch(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
236
+ def pypi(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
210
237
  ...
211
238
 
212
239
  @typing.overload
213
- def catch(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
240
+ def pypi(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
214
241
  ...
215
242
 
216
- def catch(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, var: typing.Optional[str] = None, print_exception: bool = True):
243
+ def pypi(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None):
217
244
  """
218
- Specifies that the step will succeed under all circumstances.
245
+ Specifies the PyPI packages for the step.
219
246
 
220
- The decorator will create an optional artifact, specified by `var`, which
221
- contains the exception raised. You can use it to detect the presence
222
- of errors, indicating that all happy-path artifacts produced by the step
223
- are missing.
247
+ Information in this decorator will augment any
248
+ attributes set in the `@pypi_base` flow-level decorator. Hence,
249
+ you can use `@pypi_base` to set packages required by all
250
+ steps and use `@pypi` to specify step-specific overrides.
224
251
 
225
252
 
226
253
  Parameters
227
254
  ----------
228
- var : str, optional, default None
229
- Name of the artifact in which to store the caught exception.
230
- If not specified, the exception is not stored.
231
- print_exception : bool, default True
232
- Determines whether or not the exception is printed to
233
- stdout when caught.
255
+ packages : Dict[str, str], default: {}
256
+ Packages to use for this step. The key is the name of the package
257
+ and the value is the version to use.
258
+ python : str, optional, default: None
259
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
260
+ that the version used will correspond to the version of the Python interpreter used to start the run.
234
261
  """
235
262
  ...
236
263
 
237
264
  @typing.overload
238
- def parallel(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
265
+ def checkpoint(*, load_policy: str = 'fresh', temp_dir_root: str = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
239
266
  """
240
- Decorator prototype for all step decorators. This function gets specialized
241
- and imported for all decorators types by _import_plugin_decorators().
267
+ Enables checkpointing for a step.
268
+
269
+ > Examples
270
+
271
+ - Saving Checkpoints
272
+
273
+ ```python
274
+ @checkpoint
275
+ @step
276
+ def train(self):
277
+ model = create_model(self.parameters, checkpoint_path = None)
278
+ for i in range(self.epochs):
279
+ # some training logic
280
+ loss = model.train(self.dataset)
281
+ if i % 10 == 0:
282
+ model.save(
283
+ current.checkpoint.directory,
284
+ )
285
+ # saves the contents of the `current.checkpoint.directory` as a checkpoint
286
+ # and returns a reference dictionary to the checkpoint saved in the datastore
287
+ self.latest_checkpoint = current.checkpoint.save(
288
+ name="epoch_checkpoint",
289
+ metadata={
290
+ "epoch": i,
291
+ "loss": loss,
292
+ }
293
+ )
294
+ ```
295
+
296
+ - Using Loaded Checkpoints
297
+
298
+ ```python
299
+ @retry(times=3)
300
+ @checkpoint
301
+ @step
302
+ def train(self):
303
+ # Assume that the task has restarted and the previous attempt of the task
304
+ # saved a checkpoint
305
+ checkpoint_path = None
306
+ if current.checkpoint.is_loaded: # Check if a checkpoint is loaded
307
+ print("Loaded checkpoint from the previous attempt")
308
+ checkpoint_path = current.checkpoint.directory
309
+
310
+ model = create_model(self.parameters, checkpoint_path = checkpoint_path)
311
+ for i in range(self.epochs):
312
+ ...
313
+ ```
314
+
315
+
316
+ Parameters
317
+ ----------
318
+ load_policy : str, default: "fresh"
319
+ The policy for loading the checkpoint. The following policies are supported:
320
+ - "eager": Loads the latest available checkpoint within the namespace.
321
+ With this mode, the latest checkpoint written by any previous task (can be even a different run) of the step
322
+ will be loaded at the start of the task.
323
+ - "none": Do not load any checkpoint
324
+ - "fresh": Loads the latest checkpoint created within the running Task.
325
+ This mode helps loading checkpoints across various retry attempts of the same task.
326
+ With this mode, no checkpoint will be loaded at the start of a task but any checkpoints
327
+ created within the task will be loaded when the task retries execution on failure.
328
+
329
+ temp_dir_root : str, default: None
330
+ The root directory under which `current.checkpoint.directory` will be created.
242
331
  """
243
332
  ...
244
333
 
245
334
  @typing.overload
246
- def parallel(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
335
+ def checkpoint(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
247
336
  ...
248
337
 
249
- def parallel(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
338
+ @typing.overload
339
+ def checkpoint(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
340
+ ...
341
+
342
+ def checkpoint(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, load_policy: str = 'fresh', temp_dir_root: str = None):
250
343
  """
251
- Decorator prototype for all step decorators. This function gets specialized
252
- and imported for all decorators types by _import_plugin_decorators().
344
+ Enables checkpointing for a step.
345
+
346
+ > Examples
347
+
348
+ - Saving Checkpoints
349
+
350
+ ```python
351
+ @checkpoint
352
+ @step
353
+ def train(self):
354
+ model = create_model(self.parameters, checkpoint_path = None)
355
+ for i in range(self.epochs):
356
+ # some training logic
357
+ loss = model.train(self.dataset)
358
+ if i % 10 == 0:
359
+ model.save(
360
+ current.checkpoint.directory,
361
+ )
362
+ # saves the contents of the `current.checkpoint.directory` as a checkpoint
363
+ # and returns a reference dictionary to the checkpoint saved in the datastore
364
+ self.latest_checkpoint = current.checkpoint.save(
365
+ name="epoch_checkpoint",
366
+ metadata={
367
+ "epoch": i,
368
+ "loss": loss,
369
+ }
370
+ )
371
+ ```
372
+
373
+ - Using Loaded Checkpoints
374
+
375
+ ```python
376
+ @retry(times=3)
377
+ @checkpoint
378
+ @step
379
+ def train(self):
380
+ # Assume that the task has restarted and the previous attempt of the task
381
+ # saved a checkpoint
382
+ checkpoint_path = None
383
+ if current.checkpoint.is_loaded: # Check if a checkpoint is loaded
384
+ print("Loaded checkpoint from the previous attempt")
385
+ checkpoint_path = current.checkpoint.directory
386
+
387
+ model = create_model(self.parameters, checkpoint_path = checkpoint_path)
388
+ for i in range(self.epochs):
389
+ ...
390
+ ```
391
+
392
+
393
+ Parameters
394
+ ----------
395
+ load_policy : str, default: "fresh"
396
+ The policy for loading the checkpoint. The following policies are supported:
397
+ - "eager": Loads the latest available checkpoint within the namespace.
398
+ With this mode, the latest checkpoint written by any previous task (can be even a different run) of the step
399
+ will be loaded at the start of the task.
400
+ - "none": Do not load any checkpoint
401
+ - "fresh": Loads the latest checkpoint created within the running Task.
402
+ This mode helps loading checkpoints across various retry attempts of the same task.
403
+ With this mode, no checkpoint will be loaded at the start of a task but any checkpoints
404
+ created within the task will be loaded when the task retries execution on failure.
405
+
406
+ temp_dir_root : str, default: None
407
+ The root directory under which `current.checkpoint.directory` will be created.
253
408
  """
254
409
  ...
255
410
 
256
411
  @typing.overload
257
- def test_append_card(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
412
+ def coreweave_s3_proxy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
258
413
  """
259
- A simple decorator that demonstrates using CardDecoratorInjector
260
- to inject a card and render simple markdown content.
414
+ CoreWeave-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
415
+ It exists to make it easier for users to know that this decorator should only be used with
416
+ a Neo Cloud like CoreWeave.
261
417
  """
262
418
  ...
263
419
 
264
420
  @typing.overload
265
- def test_append_card(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
421
+ def coreweave_s3_proxy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
266
422
  ...
267
423
 
268
- def test_append_card(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
269
- """
270
- A simple decorator that demonstrates using CardDecoratorInjector
271
- to inject a card and render simple markdown content.
424
+ def coreweave_s3_proxy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
272
425
  """
273
- ...
426
+ CoreWeave-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
427
+ It exists to make it easier for users to know that this decorator should only be used with
428
+ a Neo Cloud like CoreWeave.
429
+ """
430
+ ...
274
431
 
275
- def kubernetes(*, cpu: int = 1, memory: int = 4096, disk: int = 10240, image: typing.Optional[str] = None, image_pull_policy: str = 'KUBERNETES_IMAGE_PULL_POLICY', image_pull_secrets: typing.List[str] = [], service_account: str = 'METAFLOW_KUBERNETES_SERVICE_ACCOUNT', secrets: typing.Optional[typing.List[str]] = None, node_selector: typing.Union[typing.Dict[str, str], str, None] = None, namespace: str = 'METAFLOW_KUBERNETES_NAMESPACE', gpu: typing.Optional[int] = None, gpu_vendor: str = 'KUBERNETES_GPU_VENDOR', tolerations: typing.List[typing.Dict[str, str]] = [], labels: typing.Dict[str, str] = 'METAFLOW_KUBERNETES_LABELS', annotations: typing.Dict[str, str] = 'METAFLOW_KUBERNETES_ANNOTATIONS', use_tmpfs: bool = False, tmpfs_tempdir: bool = True, tmpfs_size: typing.Optional[int] = None, tmpfs_path: typing.Optional[str] = '/metaflow_temp', persistent_volume_claims: typing.Optional[typing.Dict[str, str]] = None, shared_memory: typing.Optional[int] = None, port: typing.Optional[int] = None, compute_pool: typing.Optional[str] = None, hostname_resolution_timeout: int = 600, qos: str = 'Burstable', security_context: typing.Optional[typing.Dict[str, typing.Any]] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
432
+ @typing.overload
433
+ def model(*, load: typing.Union[typing.List[str], str, typing.List[typing.Tuple[str, typing.Optional[str]]]] = None, temp_dir_root: str = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
276
434
  """
277
- Specifies that this step should execute on Kubernetes.
435
+ Enables loading / saving of models within a step.
436
+
437
+ > Examples
438
+ - Saving Models
439
+ ```python
440
+ @model
441
+ @step
442
+ def train(self):
443
+ # current.model.save returns a dictionary reference to the model saved
444
+ self.my_model = current.model.save(
445
+ path_to_my_model,
446
+ label="my_model",
447
+ metadata={
448
+ "epochs": 10,
449
+ "batch-size": 32,
450
+ "learning-rate": 0.001,
451
+ }
452
+ )
453
+ self.next(self.test)
454
+
455
+ @model(load="my_model")
456
+ @step
457
+ def test(self):
458
+ # `current.model.loaded` returns a dictionary of the loaded models
459
+ # where the key is the name of the artifact and the value is the path to the model
460
+ print(os.listdir(current.model.loaded["my_model"]))
461
+ self.next(self.end)
462
+ ```
463
+
464
+ - Loading models
465
+ ```python
466
+ @step
467
+ def train(self):
468
+ # current.model.load returns the path to the model loaded
469
+ checkpoint_path = current.model.load(
470
+ self.checkpoint_key,
471
+ )
472
+ model_path = current.model.load(
473
+ self.model,
474
+ )
475
+ self.next(self.test)
476
+ ```
278
477
 
279
478
 
280
479
  Parameters
281
480
  ----------
282
- cpu : int, default 1
283
- Number of CPUs required for this step. If `@resources` is
284
- also present, the maximum value from all decorators is used.
285
- memory : int, default 4096
286
- Memory size (in MB) required for this step. If
287
- `@resources` is also present, the maximum value from all decorators is
288
- used.
289
- disk : int, default 10240
290
- Disk size (in MB) required for this step. If
291
- `@resources` is also present, the maximum value from all decorators is
292
- used.
293
- image : str, optional, default None
294
- Docker image to use when launching on Kubernetes. If not specified, and
295
- METAFLOW_KUBERNETES_CONTAINER_IMAGE is specified, that image is used. If
296
- not, a default Docker image mapping to the current version of Python is used.
297
- image_pull_policy: str, default KUBERNETES_IMAGE_PULL_POLICY
298
- If given, the imagePullPolicy to be applied to the Docker image of the step.
299
- image_pull_secrets: List[str], default []
300
- The default is extracted from METAFLOW_KUBERNETES_IMAGE_PULL_SECRETS.
301
- Kubernetes image pull secrets to use when pulling container images
302
- in Kubernetes.
303
- service_account : str, default METAFLOW_KUBERNETES_SERVICE_ACCOUNT
304
- Kubernetes service account to use when launching pod in Kubernetes.
305
- secrets : List[str], optional, default None
306
- Kubernetes secrets to use when launching pod in Kubernetes. These
307
- secrets are in addition to the ones defined in `METAFLOW_KUBERNETES_SECRETS`
308
- in Metaflow configuration.
309
- node_selector: Union[Dict[str,str], str], optional, default None
310
- Kubernetes node selector(s) to apply to the pod running the task.
311
- Can be passed in as a comma separated string of values e.g.
312
- 'kubernetes.io/os=linux,kubernetes.io/arch=amd64' or as a dictionary
313
- {'kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64'}
314
- namespace : str, default METAFLOW_KUBERNETES_NAMESPACE
315
- Kubernetes namespace to use when launching pod in Kubernetes.
316
- gpu : int, optional, default None
317
- Number of GPUs required for this step. A value of zero implies that
318
- the scheduled node should not have GPUs.
319
- gpu_vendor : str, default KUBERNETES_GPU_VENDOR
320
- The vendor of the GPUs to be used for this step.
321
- tolerations : List[Dict[str,str]], default []
322
- The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
323
- Kubernetes tolerations to use when launching pod in Kubernetes.
324
- labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
325
- Kubernetes labels to use when launching pod in Kubernetes.
326
- annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
327
- Kubernetes annotations to use when launching pod in Kubernetes.
328
- use_tmpfs : bool, default False
329
- This enables an explicit tmpfs mount for this step.
330
- tmpfs_tempdir : bool, default True
331
- sets METAFLOW_TEMPDIR to tmpfs_path if set for this step.
332
- tmpfs_size : int, optional, default: None
333
- The value for the size (in MiB) of the tmpfs mount for this step.
334
- This parameter maps to the `--tmpfs` option in Docker. Defaults to 50% of the
335
- memory allocated for this step.
336
- tmpfs_path : str, optional, default /metaflow_temp
337
- Path to tmpfs mount for this step.
338
- persistent_volume_claims : Dict[str, str], optional, default None
339
- A map (dictionary) of persistent volumes to be mounted to the pod for this step. The map is from persistent
340
- volumes to the path to which the volume is to be mounted, e.g., `{'pvc-name': '/path/to/mount/on'}`.
341
- shared_memory: int, optional
342
- Shared memory size (in MiB) required for this step
343
- port: int, optional
344
- Port number to specify in the Kubernetes job object
345
- compute_pool : str, optional, default None
346
- Compute pool to be used for for this step.
347
- If not specified, any accessible compute pool within the perimeter is used.
348
- hostname_resolution_timeout: int, default 10 * 60
349
- Timeout in seconds for the workers tasks in the gang scheduled cluster to resolve the hostname of control task.
350
- Only applicable when @parallel is used.
351
- qos: str, default: Burstable
352
- Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable, BestEffort
481
+ load : Union[List[str],str,List[Tuple[str,Union[str,None]]]], default: None
482
+ Artifact name/s referencing the models/checkpoints to load. Artifact names refer to the names of the instance variables set to `self`.
483
+ The artifact names given to `load` can be reference objects or reference `key` strings from objects created by `current.checkpoint` / `current.model` / `current.huggingface_hub`.
484
+ If a list of tuples is provided, the first element is the artifact name and the second element is the path where the artifact needs to be unpacked on
485
+ the local filesystem. If the second element is None, the artifact will be unpacked in the current working directory.
486
+ If a string is provided, then the artifact corresponding to that name will be loaded in the current working directory.
353
487
 
354
- security_context: Dict[str, Any], optional, default None
355
- Container security context. Applies to the task container. Allows the following keys:
356
- - privileged: bool, optional, default None
357
- - allow_privilege_escalation: bool, optional, default None
358
- - run_as_user: int, optional, default None
359
- - run_as_group: int, optional, default None
360
- - run_as_non_root: bool, optional, default None
488
+ temp_dir_root : str, default: None
489
+ The root directory under which `current.model.loaded` will store loaded models
361
490
  """
362
491
  ...
363
492
 
364
- def vllm(*, model: str, backend: str, openai_api_server: bool, debug: bool, card_refresh_interval: int, max_retries: int, retry_alert_frequency: int, engine_args: dict) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
493
+ @typing.overload
494
+ def model(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
495
+ ...
496
+
497
+ @typing.overload
498
+ def model(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
499
+ ...
500
+
501
+ def model(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, load: typing.Union[typing.List[str], str, typing.List[typing.Tuple[str, typing.Optional[str]]]] = None, temp_dir_root: str = None):
365
502
  """
366
- This decorator is used to run vllm APIs as Metaflow task sidecars.
367
-
368
- User code call
369
- --------------
370
- @vllm(
371
- model="...",
372
- ...
373
- )
503
+ Enables loading / saving of models within a step.
374
504
 
375
- Valid backend options
376
- ---------------------
377
- - 'local': Run as a separate process on the local task machine.
505
+ > Examples
506
+ - Saving Models
507
+ ```python
508
+ @model
509
+ @step
510
+ def train(self):
511
+ # current.model.save returns a dictionary reference to the model saved
512
+ self.my_model = current.model.save(
513
+ path_to_my_model,
514
+ label="my_model",
515
+ metadata={
516
+ "epochs": 10,
517
+ "batch-size": 32,
518
+ "learning-rate": 0.001,
519
+ }
520
+ )
521
+ self.next(self.test)
378
522
 
379
- Valid model options
380
- -------------------
381
- Any HuggingFace model identifier, e.g. 'meta-llama/Llama-3.2-1B'
523
+ @model(load="my_model")
524
+ @step
525
+ def test(self):
526
+ # `current.model.loaded` returns a dictionary of the loaded models
527
+ # where the key is the name of the artifact and the value is the path to the model
528
+ print(os.listdir(current.model.loaded["my_model"]))
529
+ self.next(self.end)
530
+ ```
382
531
 
383
- NOTE: vLLM's OpenAI-compatible server serves ONE model per server instance.
384
- If you need multiple models, you must create multiple @vllm decorators.
532
+ - Loading models
533
+ ```python
534
+ @step
535
+ def train(self):
536
+ # current.model.load returns the path to the model loaded
537
+ checkpoint_path = current.model.load(
538
+ self.checkpoint_key,
539
+ )
540
+ model_path = current.model.load(
541
+ self.model,
542
+ )
543
+ self.next(self.test)
544
+ ```
385
545
 
386
546
 
387
547
  Parameters
388
548
  ----------
389
- model: str
390
- HuggingFace model identifier to be served by vLLM.
391
- backend: str
392
- Determines where and how to run the vLLM process.
393
- openai_api_server: bool
394
- Whether to use OpenAI-compatible API server mode (subprocess) instead of native engine.
395
- Default is False (uses native engine).
396
- Set to True for backward compatibility with existing code.
397
- debug: bool
398
- Whether to turn on verbose debugging logs.
399
- card_refresh_interval: int
400
- Interval in seconds for refreshing the vLLM status card.
401
- Only used when openai_api_server=True.
402
- max_retries: int
403
- Maximum number of retries checking for vLLM server startup.
404
- Only used when openai_api_server=True.
405
- retry_alert_frequency: int
406
- Frequency of alert logs for vLLM server startup retries.
407
- Only used when openai_api_server=True.
408
- engine_args : dict
409
- Additional keyword arguments to pass to the vLLM engine.
410
- For example, `tensor_parallel_size=2`.
549
+ load : Union[List[str],str,List[Tuple[str,Union[str,None]]]], default: None
550
+ Artifact name/s referencing the models/checkpoints to load. Artifact names refer to the names of the instance variables set to `self`.
551
+ The artifact names given to `load` can be reference objects or reference `key` strings from objects created by `current.checkpoint` / `current.model` / `current.huggingface_hub`.
552
+ If a list of tuples is provided, the first element is the artifact name and the second element is the path where the artifact needs to be unpacked on
553
+ the local filesystem. If the second element is None, the artifact will be unpacked in the current working directory.
554
+ If a string is provided, then the artifact corresponding to that name will be loaded in the current working directory.
555
+
556
+ temp_dir_root : str, default: None
557
+ The root directory under which `current.model.loaded` will store loaded models
411
558
  """
412
559
  ...
413
560
 
414
561
  @typing.overload
415
- def app_deploy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
562
+ def nebius_s3_proxy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
416
563
  """
417
- Decorator prototype for all step decorators. This function gets specialized
418
- and imported for all decorators types by _import_plugin_decorators().
564
+ Nebius-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
565
+ It exists to make it easier for users to know that this decorator should only be used with
566
+ a Neo Cloud like Nebius.
419
567
  """
420
568
  ...
421
569
 
422
570
  @typing.overload
423
- def app_deploy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
571
+ def nebius_s3_proxy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
424
572
  ...
425
573
 
426
- def app_deploy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
574
+ def nebius_s3_proxy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
427
575
  """
428
- Decorator prototype for all step decorators. This function gets specialized
429
- and imported for all decorators types by _import_plugin_decorators().
576
+ Nebius-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
577
+ It exists to make it easier for users to know that this decorator should only be used with
578
+ a Neo Cloud like Nebius.
430
579
  """
431
580
  ...
432
581
 
433
- @typing.overload
434
- def secrets(*, sources: typing.List[typing.Union[str, typing.Dict[str, typing.Any]]] = [], role: typing.Optional[str] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
582
+ def nvidia(*, gpu: int, gpu_type: str, queue_timeout: int) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
435
583
  """
436
- Specifies secrets to be retrieved and injected as environment variables prior to
437
- the execution of a step.
584
+ Specifies that this step should execute on DGX cloud.
438
585
 
439
586
 
440
587
  Parameters
441
588
  ----------
442
- sources : List[Union[str, Dict[str, Any]]], default: []
443
- List of secret specs, defining how the secrets are to be retrieved
444
- role : str, optional, default: None
445
- Role to use for fetching secrets
446
- """
447
- ...
448
-
449
- @typing.overload
450
- def secrets(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
451
- ...
452
-
453
- @typing.overload
454
- def secrets(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
455
- ...
456
-
457
- def secrets(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, sources: typing.List[typing.Union[str, typing.Dict[str, typing.Any]]] = [], role: typing.Optional[str] = None):
458
- """
459
- Specifies secrets to be retrieved and injected as environment variables prior to
460
- the execution of a step.
461
-
462
-
463
- Parameters
464
- ----------
465
- sources : List[Union[str, Dict[str, Any]]], default: []
466
- List of secret specs, defining how the secrets are to be retrieved
467
- role : str, optional, default: None
468
- Role to use for fetching secrets
589
+ gpu : int
590
+ Number of GPUs to use.
591
+ gpu_type : str
592
+ Type of Nvidia GPU to use.
593
+ queue_timeout : int
594
+ Time to keep the job in NVCF's queue.
469
595
  """
470
596
  ...
471
597
 
472
598
  @typing.overload
473
- def pypi(*, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
599
+ def test_append_card(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
474
600
  """
475
- Specifies the PyPI packages for the step.
476
-
477
- Information in this decorator will augment any
478
- attributes set in the `@pyi_base` flow-level decorator. Hence,
479
- you can use `@pypi_base` to set packages required by all
480
- steps and use `@pypi` to specify step-specific overrides.
481
-
482
-
483
- Parameters
484
- ----------
485
- packages : Dict[str, str], default: {}
486
- Packages to use for this step. The key is the name of the package
487
- and the value is the version to use.
488
- python : str, optional, default: None
489
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
490
- that the version used will correspond to the version of the Python interpreter used to start the run.
601
+ A simple decorator that demonstrates using CardDecoratorInjector
602
+ to inject a card and render simple markdown content.
491
603
  """
492
604
  ...
493
605
 
494
606
  @typing.overload
495
- def pypi(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
496
- ...
497
-
498
- @typing.overload
499
- def pypi(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
607
+ def test_append_card(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
500
608
  ...
501
609
 
502
- def pypi(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None):
610
+ def test_append_card(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
503
611
  """
504
- Specifies the PyPI packages for the step.
505
-
506
- Information in this decorator will augment any
507
- attributes set in the `@pyi_base` flow-level decorator. Hence,
508
- you can use `@pypi_base` to set packages required by all
509
- steps and use `@pypi` to specify step-specific overrides.
510
-
511
-
512
- Parameters
513
- ----------
514
- packages : Dict[str, str], default: {}
515
- Packages to use for this step. The key is the name of the package
516
- and the value is the version to use.
517
- python : str, optional, default: None
518
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
519
- that the version used will correspond to the version of the Python interpreter used to start the run.
612
+ A simple decorator that demonstrates using CardDecoratorInjector
613
+ to inject a card and render simple markdown content.
520
614
  """
521
615
  ...
522
616
 
523
- @typing.overload
524
- def resources(*, cpu: int = 1, gpu: typing.Optional[int] = None, disk: typing.Optional[int] = None, memory: int = 4096, shared_memory: typing.Optional[int] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
617
+ def huggingface_hub(*, temp_dir_root: typing.Optional[str] = None, load: typing.Union[typing.List[str], typing.List[typing.Tuple[typing.Dict, str]], typing.List[typing.Tuple[str, str]], typing.List[typing.Dict], None]) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
525
618
  """
526
- Specifies the resources needed when executing this step.
619
+ Decorator that helps cache, version and store models/datasets from huggingface hub.
527
620
 
528
- Use `@resources` to specify the resource requirements
529
- independently of the specific compute layer (`@batch`, `@kubernetes`).
621
+ > Examples
530
622
 
531
- You can choose the compute layer on the command line by executing e.g.
623
+ **Usage: creating references of models from huggingface that may be loaded in downstream steps**
624
+ ```python
625
+ @huggingface_hub
626
+ @step
627
+ def pull_model_from_huggingface(self):
628
+ # `current.huggingface_hub.snapshot_download` downloads the model from the Hugging Face Hub
629
+ # and saves it in the backend storage based on the model's `repo_id`. If there exists a model
630
+ # with the same `repo_id` in the backend storage, it will not download the model again. The return
631
+ # value of the function is a reference to the model in the backend storage.
632
+ # This reference can be used to load the model in the subsequent steps via `@model(load=["llama_model"])`
633
+
634
+ self.model_id = "mistralai/Mistral-7B-Instruct-v0.1"
635
+ self.llama_model = current.huggingface_hub.snapshot_download(
636
+ repo_id=self.model_id,
637
+ allow_patterns=["*.safetensors", "*.json", "tokenizer.*"],
638
+ )
639
+ self.next(self.train)
532
640
  ```
533
- python myflow.py run --with batch
641
+
642
+ **Usage: loading models directly from huggingface hub or from cache (from metaflow's datastore)**
643
+ ```python
644
+ @huggingface_hub(load=["mistralai/Mistral-7B-Instruct-v0.1"])
645
+ @step
646
+ def pull_model_from_huggingface(self):
647
+ path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
534
648
  ```
535
- or
649
+
650
+ ```python
651
+ @huggingface_hub(load=[("mistralai/Mistral-7B-Instruct-v0.1", "/my-directory"), ("myorg/mistral-lora", "/my-lora-directory")])
652
+ @step
653
+ def finetune_model(self):
654
+ path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
655
+ # path_to_model will be /my-directory
536
656
  ```
537
- python myflow.py run --with kubernetes
657
+
658
+ ```python
659
+ # Takes all the arguments passed to `snapshot_download`
660
+ # except for `local_dir`
661
+ @huggingface_hub(load=[
662
+ {
663
+ "repo_id": "mistralai/Mistral-7B-Instruct-v0.1",
664
+ },
665
+ {
666
+ "repo_id": "myorg/mistral-lora",
667
+ "repo_type": "model",
668
+ },
669
+ ])
670
+ @step
671
+ def finetune_model(self):
672
+ path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
673
+ # path_to_model will be /my-directory
538
674
  ```
539
- which executes the flow on the desired system using the
540
- requirements specified in `@resources`.
541
675
 
542
676
 
543
677
  Parameters
544
678
  ----------
545
- cpu : int, default 1
546
- Number of CPUs required for this step.
547
- gpu : int, optional, default None
548
- Number of GPUs required for this step.
549
- disk : int, optional, default None
550
- Disk size (in MB) required for this step. Only applies on Kubernetes.
551
- memory : int, default 4096
552
- Memory size (in MB) required for this step.
553
- shared_memory : int, optional, default None
554
- The value for the size (in MiB) of the /dev/shm volume for this step.
555
- This parameter maps to the `--shm-size` option in Docker.
556
- """
557
- ...
558
-
559
- @typing.overload
560
- def resources(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
561
- ...
562
-
563
- @typing.overload
564
- def resources(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
565
- ...
566
-
567
- def resources(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, cpu: int = 1, gpu: typing.Optional[int] = None, disk: typing.Optional[int] = None, memory: int = 4096, shared_memory: typing.Optional[int] = None):
568
- """
569
- Specifies the resources needed when executing this step.
679
+ temp_dir_root : str, optional
680
+ The root directory that will hold the temporary directory where objects will be downloaded.
570
681
 
571
- Use `@resources` to specify the resource requirements
572
- independently of the specific compute layer (`@batch`, `@kubernetes`).
682
+ load: Union[List[str], List[Tuple[Dict, str]], List[Tuple[str, str]], List[Dict], None]
683
+ The list of repos (models/datasets) to load.
573
684
 
574
- You can choose the compute layer on the command line by executing e.g.
575
- ```
576
- python myflow.py run --with batch
577
- ```
578
- or
579
- ```
580
- python myflow.py run --with kubernetes
581
- ```
582
- which executes the flow on the desired system using the
583
- requirements specified in `@resources`.
685
+ Loaded repos can be accessed via `current.huggingface_hub.loaded`. If load is set, then the following happens:
584
686
 
687
+ - If repo (model/dataset) is not found in the datastore:
688
+ - Downloads the repo from Hugging Face Hub to a temporary directory (or uses specified path) for local access
689
+ - Stores it in Metaflow's datastore (s3/gcs/azure etc.) with a unique name based on repo_type/repo_id
690
+ - All HF models loaded for a `@step` will be cached separately under flow/step/namespace.
585
691
 
586
- Parameters
587
- ----------
588
- cpu : int, default 1
589
- Number of CPUs required for this step.
590
- gpu : int, optional, default None
591
- Number of GPUs required for this step.
592
- disk : int, optional, default None
593
- Disk size (in MB) required for this step. Only applies on Kubernetes.
594
- memory : int, default 4096
595
- Memory size (in MB) required for this step.
596
- shared_memory : int, optional, default None
597
- The value for the size (in MiB) of the /dev/shm volume for this step.
598
- This parameter maps to the `--shm-size` option in Docker.
692
+ - If repo is found in the datastore:
693
+ - Loads it directly from datastore to local path (can be temporary directory or specified path)
599
694
  """
600
695
  ...
601
696
 
@@ -659,154 +754,193 @@ def timeout(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None],
659
754
  ...
660
755
 
661
756
  @typing.overload
662
- def fast_bakery_internal(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
757
+ def card(*, type: str = 'default', id: typing.Optional[str] = None, options: typing.Dict[str, typing.Any] = {}, timeout: int = 45) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
663
758
  """
664
- Internal decorator to support Fast bakery
759
+ Creates a human-readable report, a Metaflow Card, after this step completes.
760
+
761
+ Note that you may add multiple `@card` decorators in a step with different parameters.
762
+
763
+
764
+ Parameters
765
+ ----------
766
+ type : str, default 'default'
767
+ Card type.
768
+ id : str, optional, default None
769
+ If multiple cards are present, use this id to identify this card.
770
+ options : Dict[str, Any], default {}
771
+ Options passed to the card. The contents depend on the card type.
772
+ timeout : int, default 45
773
+ Interrupt reporting if it takes more than this many seconds.
665
774
  """
666
775
  ...
667
776
 
668
777
  @typing.overload
669
- def fast_bakery_internal(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
778
+ def card(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
670
779
  ...
671
780
 
672
- def fast_bakery_internal(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
673
- """
674
- Internal decorator to support Fast bakery
675
- """
781
+ @typing.overload
782
+ def card(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
676
783
  ...
677
784
 
678
- @typing.overload
679
- def retry(*, times: int = 3, minutes_between_retries: int = 2) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
785
+ def card(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, type: str = 'default', id: typing.Optional[str] = None, options: typing.Dict[str, typing.Any] = {}, timeout: int = 45):
680
786
  """
681
- Specifies the number of times the task corresponding
682
- to a step needs to be retried.
683
-
684
- This decorator is useful for handling transient errors, such as networking issues.
685
- If your task contains operations that can't be retried safely, e.g. database updates,
686
- it is advisable to annotate it with `@retry(times=0)`.
787
+ Creates a human-readable report, a Metaflow Card, after this step completes.
687
788
 
688
- This can be used in conjunction with the `@catch` decorator. The `@catch`
689
- decorator will execute a no-op task after all retries have been exhausted,
690
- ensuring that the flow execution can continue.
789
+ Note that you may add multiple `@card` decorators in a step with different parameters.
691
790
 
692
791
 
693
792
  Parameters
694
793
  ----------
695
- times : int, default 3
696
- Number of times to retry this task.
697
- minutes_between_retries : int, default 2
698
- Number of minutes between retries.
794
+ type : str, default 'default'
795
+ Card type.
796
+ id : str, optional, default None
797
+ If multiple cards are present, use this id to identify this card.
798
+ options : Dict[str, Any], default {}
799
+ Options passed to the card. The contents depend on the card type.
800
+ timeout : int, default 45
801
+ Interrupt reporting if it takes more than this many seconds.
699
802
  """
700
803
  ...
701
804
 
702
805
  @typing.overload
703
- def retry(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
704
- ...
705
-
806
+ def app_deploy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
807
+ """
808
+ Decorator prototype for all step decorators. This function gets specialized
809
+ and imported for all decorators types by _import_plugin_decorators().
810
+ """
811
+ ...
812
+
706
813
  @typing.overload
707
- def retry(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
814
+ def app_deploy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
708
815
  ...
709
816
 
710
- def retry(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, times: int = 3, minutes_between_retries: int = 2):
817
+ def app_deploy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
711
818
  """
712
- Specifies the number of times the task corresponding
713
- to a step needs to be retried.
714
-
715
- This decorator is useful for handling transient errors, such as networking issues.
716
- If your task contains operations that can't be retried safely, e.g. database updates,
717
- it is advisable to annotate it with `@retry(times=0)`.
718
-
719
- This can be used in conjunction with the `@catch` decorator. The `@catch`
720
- decorator will execute a no-op task after all retries have been exhausted,
721
- ensuring that the flow execution can continue.
819
+ Decorator prototype for all step decorators. This function gets specialized
820
+ and imported for all decorators types by _import_plugin_decorators().
821
+ """
822
+ ...
823
+
824
+ @typing.overload
825
+ def secrets(*, sources: typing.List[typing.Union[str, typing.Dict[str, typing.Any]]] = [], role: typing.Optional[str] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
826
+ """
827
+ Specifies secrets to be retrieved and injected as environment variables prior to
828
+ the execution of a step.
722
829
 
723
830
 
724
831
  Parameters
725
832
  ----------
726
- times : int, default 3
727
- Number of times to retry this task.
728
- minutes_between_retries : int, default 2
729
- Number of minutes between retries.
833
+ sources : List[Union[str, Dict[str, Any]]], default: []
834
+ List of secret specs, defining how the secrets are to be retrieved
835
+ role : str, optional, default: None
836
+ Role to use for fetching secrets
730
837
  """
731
838
  ...
732
839
 
733
- def huggingface_hub(*, temp_dir_root: typing.Optional[str] = None, load: typing.Union[typing.List[str], typing.List[typing.Tuple[typing.Dict, str]], typing.List[typing.Tuple[str, str]], typing.List[typing.Dict], None]) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
840
+ @typing.overload
841
+ def secrets(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
842
+ ...
843
+
844
+ @typing.overload
845
+ def secrets(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
846
+ ...
847
+
848
+ def secrets(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, sources: typing.List[typing.Union[str, typing.Dict[str, typing.Any]]] = [], role: typing.Optional[str] = None):
734
849
  """
735
- Decorator that helps cache, version and store models/datasets from huggingface hub.
850
+ Specifies secrets to be retrieved and injected as environment variables prior to
851
+ the execution of a step.
736
852
 
737
- > Examples
738
853
 
739
- **Usage: creating references of models from huggingface that may be loaded in downstream steps**
740
- ```python
741
- @huggingface_hub
742
- @step
743
- def pull_model_from_huggingface(self):
744
- # `current.huggingface_hub.snapshot_download` downloads the model from the Hugging Face Hub
745
- # and saves it in the backend storage based on the model's `repo_id`. If there exists a model
746
- # with the same `repo_id` in the backend storage, it will not download the model again. The return
747
- # value of the function is a reference to the model in the backend storage.
748
- # This reference can be used to load the model in the subsequent steps via `@model(load=["llama_model"])`
854
+ Parameters
855
+ ----------
856
+ sources : List[Union[str, Dict[str, Any]]], default: []
857
+ List of secret specs, defining how the secrets are to be retrieved
858
+ role : str, optional, default: None
859
+ Role to use for fetching secrets
860
+ """
861
+ ...
862
+
863
+ @typing.overload
864
+ def catch(*, var: typing.Optional[str] = None, print_exception: bool = True) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
865
+ """
866
+ Specifies that the step will succeed under all circumstances.
749
867
 
750
- self.model_id = "mistralai/Mistral-7B-Instruct-v0.1"
751
- self.llama_model = current.huggingface_hub.snapshot_download(
752
- repo_id=self.model_id,
753
- allow_patterns=["*.safetensors", "*.json", "tokenizer.*"],
754
- )
755
- self.next(self.train)
756
- ```
868
+ The decorator will create an optional artifact, specified by `var`, which
869
+ contains the exception raised. You can use it to detect the presence
870
+ of errors, indicating that all happy-path artifacts produced by the step
871
+ are missing.
757
872
 
758
- **Usage: loading models directly from huggingface hub or from cache (from metaflow's datastore)**
759
- ```python
760
- @huggingface_hub(load=["mistralai/Mistral-7B-Instruct-v0.1"])
761
- @step
762
- def pull_model_from_huggingface(self):
763
- path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
764
- ```
765
873
 
766
- ```python
767
- @huggingface_hub(load=[("mistralai/Mistral-7B-Instruct-v0.1", "/my-directory"), ("myorg/mistral-lora", "/my-lora-directory")])
768
- @step
769
- def finetune_model(self):
770
- path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
771
- # path_to_model will be /my-directory
772
- ```
874
+ Parameters
875
+ ----------
876
+ var : str, optional, default None
877
+ Name of the artifact in which to store the caught exception.
878
+ If not specified, the exception is not stored.
879
+ print_exception : bool, default True
880
+ Determines whether or not the exception is printed to
881
+ stdout when caught.
882
+ """
883
+ ...
884
+
885
+ @typing.overload
886
+ def catch(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
887
+ ...
888
+
889
+ @typing.overload
890
+ def catch(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
891
+ ...
892
+
893
+ def catch(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, var: typing.Optional[str] = None, print_exception: bool = True):
894
+ """
895
+ Specifies that the step will succeed under all circumstances.
773
896
 
774
- ```python
775
- # Takes all the arguments passed to `snapshot_download`
776
- # except for `local_dir`
777
- @huggingface_hub(load=[
778
- {
779
- "repo_id": "mistralai/Mistral-7B-Instruct-v0.1",
780
- },
781
- {
782
- "repo_id": "myorg/mistral-lora",
783
- "repo_type": "model",
784
- },
785
- ])
786
- @step
787
- def finetune_model(self):
788
- path_to_model = current.huggingface_hub.loaded["mistralai/Mistral-7B-Instruct-v0.1"]
789
- # path_to_model will be /my-directory
790
- ```
897
+ The decorator will create an optional artifact, specified by `var`, which
898
+ contains the exception raised. You can use it to detect the presence
899
+ of errors, indicating that all happy-path artifacts produced by the step
900
+ are missing.
791
901
 
792
902
 
793
903
  Parameters
794
904
  ----------
795
- temp_dir_root : str, optional
796
- The root directory that will hold the temporary directory where objects will be downloaded.
905
+ var : str, optional, default None
906
+ Name of the artifact in which to store the caught exception.
907
+ If not specified, the exception is not stored.
908
+ print_exception : bool, default True
909
+ Determines whether or not the exception is printed to
910
+ stdout when caught.
911
+ """
912
+ ...
913
+
914
+ @typing.overload
915
+ def environment(*, vars: typing.Dict[str, str] = {}) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
916
+ """
917
+ Specifies environment variables to be set prior to the execution of a step.
797
918
 
798
- load: Union[List[str], List[Tuple[Dict, str]], List[Tuple[str, str]], List[Dict], None]
799
- The list of repos (models/datasets) to load.
800
919
 
801
- Loaded repos can be accessed via `current.huggingface_hub.loaded`. If load is set, then the following happens:
920
+ Parameters
921
+ ----------
922
+ vars : Dict[str, str], default {}
923
+ Dictionary of environment variables to set.
924
+ """
925
+ ...
926
+
927
+ @typing.overload
928
+ def environment(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
929
+ ...
930
+
931
+ @typing.overload
932
+ def environment(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
933
+ ...
934
+
935
+ def environment(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, vars: typing.Dict[str, str] = {}):
936
+ """
937
+ Specifies environment variables to be set prior to the execution of a step.
802
938
 
803
- - If repo (model/dataset) is not found in the datastore:
804
- - Downloads the repo from Hugging Face Hub to a temporary directory (or uses specified path) for local access
805
- - Stores it in Metaflow's datastore (s3/gcs/azure etc.) with a unique name based on repo_type/repo_id
806
- - All HF models loaded for a `@step` will be cached separately under flow/step/namespace.
807
939
 
808
- - If repo is found in the datastore:
809
- - Loads it directly from datastore to local path (can be temporary directory or specified path)
940
+ Parameters
941
+ ----------
942
+ vars : Dict[str, str], default {}
943
+ Dictionary of environment variables to set.
810
944
  """
811
945
  ...
812
946
 
@@ -869,237 +1003,89 @@ def conda(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], ty
869
1003
  """
870
1004
  ...
871
1005
 
872
- @typing.overload
873
- def card(*, type: str = 'default', id: typing.Optional[str] = None, options: typing.Dict[str, typing.Any] = {}, timeout: int = 45) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1006
+ def nvct(*, gpu: int, gpu_type: str) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
874
1007
  """
875
- Creates a human-readable report, a Metaflow Card, after this step completes.
876
-
877
- Note that you may add multiple `@card` decorators in a step with different parameters.
1008
+ Specifies that this step should execute on DGX cloud.
878
1009
 
879
1010
 
880
1011
  Parameters
881
1012
  ----------
882
- type : str, default 'default'
883
- Card type.
884
- id : str, optional, default None
885
- If multiple cards are present, use this id to identify this card.
886
- options : Dict[str, Any], default {}
887
- Options passed to the card. The contents depend on the card type.
888
- timeout : int, default 45
889
- Interrupt reporting if it takes more than this many seconds.
1013
+ gpu : int
1014
+ Number of GPUs to use.
1015
+ gpu_type : str
1016
+ Type of Nvidia GPU to use.
890
1017
  """
891
1018
  ...
892
1019
 
893
1020
  @typing.overload
894
- def card(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1021
+ def fast_bakery_internal(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1022
+ """
1023
+ Internal decorator to support Fast bakery
1024
+ """
895
1025
  ...
896
1026
 
897
1027
  @typing.overload
898
- def card(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1028
+ def fast_bakery_internal(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
899
1029
  ...
900
1030
 
901
- def card(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, type: str = 'default', id: typing.Optional[str] = None, options: typing.Dict[str, typing.Any] = {}, timeout: int = 45):
1031
+ def fast_bakery_internal(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
902
1032
  """
903
- Creates a human-readable report, a Metaflow Card, after this step completes.
1033
+ Internal decorator to support Fast bakery
1034
+ """
1035
+ ...
1036
+
1037
+ @typing.overload
1038
+ def retry(*, times: int = 3, minutes_between_retries: int = 2) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1039
+ """
1040
+ Specifies the number of times the task corresponding
1041
+ to a step needs to be retried.
904
1042
 
905
- Note that you may add multiple `@card` decorators in a step with different parameters.
1043
+ This decorator is useful for handling transient errors, such as networking issues.
1044
+ If your task contains operations that can't be retried safely, e.g. database updates,
1045
+ it is advisable to annotate it with `@retry(times=0)`.
1046
+
1047
+ This can be used in conjunction with the `@catch` decorator. The `@catch`
1048
+ decorator will execute a no-op task after all retries have been exhausted,
1049
+ ensuring that the flow execution can continue.
906
1050
 
907
1051
 
908
1052
  Parameters
909
1053
  ----------
910
- type : str, default 'default'
911
- Card type.
912
- id : str, optional, default None
913
- If multiple cards are present, use this id to identify this card.
914
- options : Dict[str, Any], default {}
915
- Options passed to the card. The contents depend on the card type.
916
- timeout : int, default 45
917
- Interrupt reporting if it takes more than this many seconds.
1054
+ times : int, default 3
1055
+ Number of times to retry this task.
1056
+ minutes_between_retries : int, default 2
1057
+ Number of minutes between retries.
918
1058
  """
919
1059
  ...
920
1060
 
921
1061
  @typing.overload
922
- def coreweave_s3_proxy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
923
- """
924
- CoreWeave-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
925
- It exists to make it easier for users to know that this decorator should only be used with
926
- a Neo Cloud like CoreWeave.
927
- """
1062
+ def retry(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
928
1063
  ...
929
1064
 
930
1065
  @typing.overload
931
- def coreweave_s3_proxy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
932
- ...
933
-
934
- def coreweave_s3_proxy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
935
- """
936
- CoreWeave-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
937
- It exists to make it easier for users to know that this decorator should only be used with
938
- a Neo Cloud like CoreWeave.
939
- """
940
- ...
941
-
942
- @typing.overload
943
- def model(*, load: typing.Union[typing.List[str], str, typing.List[typing.Tuple[str, typing.Optional[str]]]] = None, temp_dir_root: str = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
944
- """
945
- Enables loading / saving of models within a step.
946
-
947
- > Examples
948
- - Saving Models
949
- ```python
950
- @model
951
- @step
952
- def train(self):
953
- # current.model.save returns a dictionary reference to the model saved
954
- self.my_model = current.model.save(
955
- path_to_my_model,
956
- label="my_model",
957
- metadata={
958
- "epochs": 10,
959
- "batch-size": 32,
960
- "learning-rate": 0.001,
961
- }
962
- )
963
- self.next(self.test)
964
-
965
- @model(load="my_model")
966
- @step
967
- def test(self):
968
- # `current.model.loaded` returns a dictionary of the loaded models
969
- # where the key is the name of the artifact and the value is the path to the model
970
- print(os.listdir(current.model.loaded["my_model"]))
971
- self.next(self.end)
972
- ```
973
-
974
- - Loading models
975
- ```python
976
- @step
977
- def train(self):
978
- # current.model.load returns the path to the model loaded
979
- checkpoint_path = current.model.load(
980
- self.checkpoint_key,
981
- )
982
- model_path = current.model.load(
983
- self.model,
984
- )
985
- self.next(self.test)
986
- ```
987
-
988
-
989
- Parameters
990
- ----------
991
- load : Union[List[str],str,List[Tuple[str,Union[str,None]]]], default: None
992
- Artifact name/s referencing the models/checkpoints to load. Artifact names refer to the names of the instance variables set to `self`.
993
- These artifact names give to `load` be reference objects or reference `key` string's from objects created by `current.checkpoint` / `current.model` / `current.huggingface_hub`.
994
- If a list of tuples is provided, the first element is the artifact name and the second element is the path the artifact needs be unpacked on
995
- the local filesystem. If the second element is None, the artifact will be unpacked in the current working directory.
996
- If a string is provided, then the artifact corresponding to that name will be loaded in the current working directory.
997
-
998
- temp_dir_root : str, default: None
999
- The root directory under which `current.model.loaded` will store loaded models
1000
- """
1001
- ...
1002
-
1003
- @typing.overload
1004
- def model(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1005
- ...
1006
-
1007
- @typing.overload
1008
- def model(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1066
+ def retry(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1009
1067
  ...
1010
1068
 
1011
- def model(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, load: typing.Union[typing.List[str], str, typing.List[typing.Tuple[str, typing.Optional[str]]]] = None, temp_dir_root: str = None):
1069
+ def retry(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, times: int = 3, minutes_between_retries: int = 2):
1012
1070
  """
1013
- Enables loading / saving of models within a step.
1014
-
1015
- > Examples
1016
- - Saving Models
1017
- ```python
1018
- @model
1019
- @step
1020
- def train(self):
1021
- # current.model.save returns a dictionary reference to the model saved
1022
- self.my_model = current.model.save(
1023
- path_to_my_model,
1024
- label="my_model",
1025
- metadata={
1026
- "epochs": 10,
1027
- "batch-size": 32,
1028
- "learning-rate": 0.001,
1029
- }
1030
- )
1031
- self.next(self.test)
1032
-
1033
- @model(load="my_model")
1034
- @step
1035
- def test(self):
1036
- # `current.model.loaded` returns a dictionary of the loaded models
1037
- # where the key is the name of the artifact and the value is the path to the model
1038
- print(os.listdir(current.model.loaded["my_model"]))
1039
- self.next(self.end)
1040
- ```
1041
-
1042
- - Loading models
1043
- ```python
1044
- @step
1045
- def train(self):
1046
- # current.model.load returns the path to the model loaded
1047
- checkpoint_path = current.model.load(
1048
- self.checkpoint_key,
1049
- )
1050
- model_path = current.model.load(
1051
- self.model,
1052
- )
1053
- self.next(self.test)
1054
- ```
1055
-
1071
+ Specifies the number of times the task corresponding
1072
+ to a step needs to be retried.
1056
1073
 
1057
- Parameters
1058
- ----------
1059
- load : Union[List[str],str,List[Tuple[str,Union[str,None]]]], default: None
1060
- Artifact name/s referencing the models/checkpoints to load. Artifact names refer to the names of the instance variables set to `self`.
1061
- These artifact names give to `load` be reference objects or reference `key` string's from objects created by `current.checkpoint` / `current.model` / `current.huggingface_hub`.
1062
- If a list of tuples is provided, the first element is the artifact name and the second element is the path the artifact needs be unpacked on
1063
- the local filesystem. If the second element is None, the artifact will be unpacked in the current working directory.
1064
- If a string is provided, then the artifact corresponding to that name will be loaded in the current working directory.
1074
+ This decorator is useful for handling transient errors, such as networking issues.
1075
+ If your task contains operations that can't be retried safely, e.g. database updates,
1076
+ it is advisable to annotate it with `@retry(times=0)`.
1065
1077
 
1066
- temp_dir_root : str, default: None
1067
- The root directory under which `current.model.loaded` will store loaded models
1068
- """
1069
- ...
1070
-
1071
- def nvct(*, gpu: int, gpu_type: str) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1072
- """
1073
- Specifies that this step should execute on DGX cloud.
1078
+ This can be used in conjunction with the `@catch` decorator. The `@catch`
1079
+ decorator will execute a no-op task after all retries have been exhausted,
1080
+ ensuring that the flow execution can continue.
1074
1081
 
1075
1082
 
1076
1083
  Parameters
1077
1084
  ----------
1078
- gpu : int
1079
- Number of GPUs to use.
1080
- gpu_type : str
1081
- Type of Nvidia GPU to use.
1082
- """
1083
- ...
1084
-
1085
- @typing.overload
1086
- def nebius_s3_proxy(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1087
- """
1088
- Nebius-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
1089
- It exists to make it easier for users to know that this decorator should only be used with
1090
- a Neo Cloud like Nebius.
1091
- """
1092
- ...
1093
-
1094
- @typing.overload
1095
- def nebius_s3_proxy(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1096
- ...
1097
-
1098
- def nebius_s3_proxy(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
1099
- """
1100
- Nebius-specific S3 Proxy decorator for routing S3 requests through a local proxy service.
1101
- It exists to make it easier for users to know that this decorator should only be used with
1102
- a Neo Cloud like Nebius.
1085
+ times : int, default 3
1086
+ Number of times to retry this task.
1087
+ minutes_between_retries : int, default 2
1088
+ Number of minutes between retries.
1103
1089
  """
1104
1090
  ...
1105
1091
 
@@ -1125,398 +1111,240 @@ def s3_proxy(*, integration_name: typing.Optional[str] = None, write_mode: typin
1125
1111
  """
1126
1112
  ...
1127
1113
 
1128
- def ollama(*, models: list, backend: str, force_pull: bool, cache_update_policy: str, force_cache_update: bool, debug: bool, circuit_breaker_config: dict, timeout_config: dict) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1114
+ def kubernetes(*, cpu: int = 1, memory: int = 4096, disk: int = 10240, image: typing.Optional[str] = None, image_pull_policy: str = 'KUBERNETES_IMAGE_PULL_POLICY', image_pull_secrets: typing.List[str] = [], service_account: str = 'METAFLOW_KUBERNETES_SERVICE_ACCOUNT', secrets: typing.Optional[typing.List[str]] = None, node_selector: typing.Union[typing.Dict[str, str], str, None] = None, namespace: str = 'METAFLOW_KUBERNETES_NAMESPACE', gpu: typing.Optional[int] = None, gpu_vendor: str = 'KUBERNETES_GPU_VENDOR', tolerations: typing.List[typing.Dict[str, str]] = [], labels: typing.Dict[str, str] = 'METAFLOW_KUBERNETES_LABELS', annotations: typing.Dict[str, str] = 'METAFLOW_KUBERNETES_ANNOTATIONS', use_tmpfs: bool = False, tmpfs_tempdir: bool = True, tmpfs_size: typing.Optional[int] = None, tmpfs_path: typing.Optional[str] = '/metaflow_temp', persistent_volume_claims: typing.Optional[typing.Dict[str, str]] = None, shared_memory: typing.Optional[int] = None, port: typing.Optional[int] = None, compute_pool: typing.Optional[str] = None, hostname_resolution_timeout: int = 600, qos: str = 'Burstable', security_context: typing.Optional[typing.Dict[str, typing.Any]] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1129
1115
  """
1130
- This decorator is used to run Ollama APIs as Metaflow task sidecars.
1131
-
1132
- User code call
1133
- --------------
1134
- @ollama(
1135
- models=[...],
1136
- ...
1137
- )
1138
-
1139
- Valid backend options
1140
- ---------------------
1141
- - 'local': Run as a separate process on the local task machine.
1142
- - (TODO) 'managed': Outerbounds hosts and selects compute provider.
1143
- - (TODO) 'remote': Spin up separate instance to serve Ollama models.
1144
-
1145
- Valid model options
1146
- -------------------
1147
- Any model here https://ollama.com/search, e.g. 'llama3.2', 'llama3.3'
1116
+ Specifies that this step should execute on Kubernetes.
1148
1117
 
1149
1118
 
1150
1119
  Parameters
1151
1120
  ----------
1152
- models: list[str]
1153
- List of Ollama containers running models in sidecars.
1154
- backend: str
1155
- Determines where and how to run the Ollama process.
1156
- force_pull: bool
1157
- Whether to run `ollama pull` no matter what, or first check the remote cache in Metaflow datastore for this model key.
1158
- cache_update_policy: str
1159
- Cache update policy: "auto", "force", or "never".
1160
- force_cache_update: bool
1161
- Simple override for "force" cache update policy.
1162
- debug: bool
1163
- Whether to turn on verbose debugging logs.
1164
- circuit_breaker_config: dict
1165
- Configuration for circuit breaker protection. Keys: failure_threshold, recovery_timeout, reset_timeout.
1166
- timeout_config: dict
1167
- Configuration for various operation timeouts. Keys: pull, stop, health_check, install, server_startup.
1168
- """
1169
- ...
1170
-
1171
- @typing.overload
1172
- def checkpoint(*, load_policy: str = 'fresh', temp_dir_root: str = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1173
- """
1174
- Enables checkpointing for a step.
1175
-
1176
- > Examples
1177
-
1178
- - Saving Checkpoints
1179
-
1180
- ```python
1181
- @checkpoint
1182
- @step
1183
- def train(self):
1184
- model = create_model(self.parameters, checkpoint_path = None)
1185
- for i in range(self.epochs):
1186
- # some training logic
1187
- loss = model.train(self.dataset)
1188
- if i % 10 == 0:
1189
- model.save(
1190
- current.checkpoint.directory,
1191
- )
1192
- # saves the contents of the `current.checkpoint.directory` as a checkpoint
1193
- # and returns a reference dictionary to the checkpoint saved in the datastore
1194
- self.latest_checkpoint = current.checkpoint.save(
1195
- name="epoch_checkpoint",
1196
- metadata={
1197
- "epoch": i,
1198
- "loss": loss,
1199
- }
1200
- )
1201
- ```
1202
-
1203
- - Using Loaded Checkpoints
1204
-
1205
- ```python
1206
- @retry(times=3)
1207
- @checkpoint
1208
- @step
1209
- def train(self):
1210
- # Assume that the task has restarted and the previous attempt of the task
1211
- # saved a checkpoint
1212
- checkpoint_path = None
1213
- if current.checkpoint.is_loaded: # Check if a checkpoint is loaded
1214
- print("Loaded checkpoint from the previous attempt")
1215
- checkpoint_path = current.checkpoint.directory
1216
-
1217
- model = create_model(self.parameters, checkpoint_path = checkpoint_path)
1218
- for i in range(self.epochs):
1219
- ...
1220
- ```
1221
-
1222
-
1223
- Parameters
1224
- ----------
1225
- load_policy : str, default: "fresh"
1226
- The policy for loading the checkpoint. The following policies are supported:
1227
- - "eager": Loads the latest available checkpoint within the namespace.
1228
- With this mode, the latest checkpoint written by any previous task (can be even a different run) of the step
1229
- will be loaded at the start of the task.
1230
- - "none": Do not load any checkpoint
1231
- - "fresh": Loads the latest checkpoint created within the running Task.
1232
- This mode helps loading checkpoints across various retry attempts of the same task.
1233
- With this mode, no checkpoint will be loaded at the start of a task but any checkpoints
1234
- created within the task will be loaded when the task retries execution on failure.
1121
+ cpu : int, default 1
1122
+ Number of CPUs required for this step. If `@resources` is
1123
+ also present, the maximum value from all decorators is used.
1124
+ memory : int, default 4096
1125
+ Memory size (in MB) required for this step. If
1126
+ `@resources` is also present, the maximum value from all decorators is
1127
+ used.
1128
+ disk : int, default 10240
1129
+ Disk size (in MB) required for this step. If
1130
+ `@resources` is also present, the maximum value from all decorators is
1131
+ used.
1132
+ image : str, optional, default None
1133
+ Docker image to use when launching on Kubernetes. If not specified, and
1134
+ METAFLOW_KUBERNETES_CONTAINER_IMAGE is specified, that image is used. If
1135
+ not, a default Docker image mapping to the current version of Python is used.
1136
+ image_pull_policy: str, default KUBERNETES_IMAGE_PULL_POLICY
1137
+ If given, the imagePullPolicy to be applied to the Docker image of the step.
1138
+ image_pull_secrets: List[str], default []
1139
+ The default is extracted from METAFLOW_KUBERNETES_IMAGE_PULL_SECRETS.
1140
+ Kubernetes image pull secrets to use when pulling container images
1141
+ in Kubernetes.
1142
+ service_account : str, default METAFLOW_KUBERNETES_SERVICE_ACCOUNT
1143
+ Kubernetes service account to use when launching pod in Kubernetes.
1144
+ secrets : List[str], optional, default None
1145
+ Kubernetes secrets to use when launching pod in Kubernetes. These
1146
+ secrets are in addition to the ones defined in `METAFLOW_KUBERNETES_SECRETS`
1147
+ in Metaflow configuration.
1148
+ node_selector: Union[Dict[str,str], str], optional, default None
1149
+ Kubernetes node selector(s) to apply to the pod running the task.
1150
+ Can be passed in as a comma separated string of values e.g.
1151
+ 'kubernetes.io/os=linux,kubernetes.io/arch=amd64' or as a dictionary
1152
+ {'kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64'}
1153
+ namespace : str, default METAFLOW_KUBERNETES_NAMESPACE
1154
+ Kubernetes namespace to use when launching pod in Kubernetes.
1155
+ gpu : int, optional, default None
1156
+ Number of GPUs required for this step. A value of zero implies that
1157
+ the scheduled node should not have GPUs.
1158
+ gpu_vendor : str, default KUBERNETES_GPU_VENDOR
1159
+ The vendor of the GPUs to be used for this step.
1160
+ tolerations : List[Dict[str,str]], default []
1161
+ The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
1162
+ Kubernetes tolerations to use when launching pod in Kubernetes.
1163
+ labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
1164
+ Kubernetes labels to use when launching pod in Kubernetes.
1165
+ annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
1166
+ Kubernetes annotations to use when launching pod in Kubernetes.
1167
+ use_tmpfs : bool, default False
1168
+ This enables an explicit tmpfs mount for this step.
1169
+ tmpfs_tempdir : bool, default True
1170
+ sets METAFLOW_TEMPDIR to tmpfs_path if set for this step.
1171
+ tmpfs_size : int, optional, default: None
1172
+ The value for the size (in MiB) of the tmpfs mount for this step.
1173
+ This parameter maps to the `--tmpfs` option in Docker. Defaults to 50% of the
1174
+ memory allocated for this step.
1175
+ tmpfs_path : str, optional, default /metaflow_temp
1176
+ Path to tmpfs mount for this step.
1177
+ persistent_volume_claims : Dict[str, str], optional, default None
1178
+ A map (dictionary) of persistent volumes to be mounted to the pod for this step. The map is from persistent
1179
+ volumes to the path to which the volume is to be mounted, e.g., `{'pvc-name': '/path/to/mount/on'}`.
1180
+ shared_memory: int, optional
1181
+ Shared memory size (in MiB) required for this step
1182
+ port: int, optional
1183
+ Port number to specify in the Kubernetes job object
1184
+ compute_pool : str, optional, default None
1185
+ Compute pool to be used for this step.
1186
+ If not specified, any accessible compute pool within the perimeter is used.
1187
+ hostname_resolution_timeout: int, default 10 * 60
1188
+ Timeout in seconds for the worker tasks in the gang-scheduled cluster to resolve the hostname of the control task.
1189
+ Only applicable when @parallel is used.
1190
+ qos: str, default: Burstable
1191
+ Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable, BestEffort
1235
1192
 
1236
- temp_dir_root : str, default: None
1237
- The root directory under which `current.checkpoint.directory` will be created.
1193
+ security_context: Dict[str, Any], optional, default None
1194
+ Container security context. Applies to the task container. Allows the following keys:
1195
+ - privileged: bool, optional, default None
1196
+ - allow_privilege_escalation: bool, optional, default None
1197
+ - run_as_user: int, optional, default None
1198
+ - run_as_group: int, optional, default None
1199
+ - run_as_non_root: bool, optional, default None
1238
1200
  """
1239
1201
  ...
1240
1202
 
1241
- @typing.overload
1242
- def checkpoint(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1243
- ...
1244
-
1245
- @typing.overload
1246
- def checkpoint(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1247
- ...
1248
-
1249
- def checkpoint(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, load_policy: str = 'fresh', temp_dir_root: str = None):
1203
+ def vllm(*, model: str, backend: str, openai_api_server: bool, debug: bool, card_refresh_interval: int, max_retries: int, retry_alert_frequency: int, engine_args: dict) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1250
1204
  """
1251
- Enables checkpointing for a step.
1252
-
1253
- > Examples
1254
-
1255
- - Saving Checkpoints
1205
+ This decorator is used to run vllm APIs as Metaflow task sidecars.
1256
1206
 
1257
- ```python
1258
- @checkpoint
1259
- @step
1260
- def train(self):
1261
- model = create_model(self.parameters, checkpoint_path = None)
1262
- for i in range(self.epochs):
1263
- # some training logic
1264
- loss = model.train(self.dataset)
1265
- if i % 10 == 0:
1266
- model.save(
1267
- current.checkpoint.directory,
1268
- )
1269
- # saves the contents of the `current.checkpoint.directory` as a checkpoint
1270
- # and returns a reference dictionary to the checkpoint saved in the datastore
1271
- self.latest_checkpoint = current.checkpoint.save(
1272
- name="epoch_checkpoint",
1273
- metadata={
1274
- "epoch": i,
1275
- "loss": loss,
1276
- }
1277
- )
1278
- ```
1207
+ User code call
1208
+ --------------
1209
+ @vllm(
1210
+ model="...",
1211
+ ...
1212
+ )
1279
1213
 
1280
- - Using Loaded Checkpoints
1214
+ Valid backend options
1215
+ ---------------------
1216
+ - 'local': Run as a separate process on the local task machine.
1281
1217
 
1282
- ```python
1283
- @retry(times=3)
1284
- @checkpoint
1285
- @step
1286
- def train(self):
1287
- # Assume that the task has restarted and the previous attempt of the task
1288
- # saved a checkpoint
1289
- checkpoint_path = None
1290
- if current.checkpoint.is_loaded: # Check if a checkpoint is loaded
1291
- print("Loaded checkpoint from the previous attempt")
1292
- checkpoint_path = current.checkpoint.directory
1218
+ Valid model options
1219
+ -------------------
1220
+ Any HuggingFace model identifier, e.g. 'meta-llama/Llama-3.2-1B'
1293
1221
 
1294
- model = create_model(self.parameters, checkpoint_path = checkpoint_path)
1295
- for i in range(self.epochs):
1296
- ...
1297
- ```
1222
+ NOTE: vLLM's OpenAI-compatible server serves ONE model per server instance.
1223
+ If you need multiple models, you must create multiple @vllm decorators.
1298
1224
 
1299
1225
 
1300
1226
  Parameters
1301
1227
  ----------
1302
- load_policy : str, default: "fresh"
1303
- The policy for loading the checkpoint. The following policies are supported:
1304
- - "eager": Loads the latest available checkpoint within the namespace.
1305
- With this mode, the latest checkpoint written by any previous task (can be even a different run) of the step
1306
- will be loaded at the start of the task.
1307
- - "none": Do not load any checkpoint
1308
- - "fresh": Loads the latest checkpoint created within the running Task.
1309
- This mode helps loading checkpoints across various retry attempts of the same task.
1310
- With this mode, no checkpoint will be loaded at the start of a task but any checkpoints
1311
- created within the task will be loaded when the task retries execution on failure.
1312
-
1313
- temp_dir_root : str, default: None
1314
- The root directory under which `current.checkpoint.directory` will be created.
1228
+ model: str
1229
+ HuggingFace model identifier to be served by vLLM.
1230
+ backend: str
1231
+ Determines where and how to run the vLLM process.
1232
+ openai_api_server: bool
1233
+ Whether to use OpenAI-compatible API server mode (subprocess) instead of native engine.
1234
+ Default is False (uses native engine).
1235
+ Set to True for backward compatibility with existing code.
1236
+ debug: bool
1237
+ Whether to turn on verbose debugging logs.
1238
+ card_refresh_interval: int
1239
+ Interval in seconds for refreshing the vLLM status card.
1240
+ Only used when openai_api_server=True.
1241
+ max_retries: int
1242
+ Maximum number of retries checking for vLLM server startup.
1243
+ Only used when openai_api_server=True.
1244
+ retry_alert_frequency: int
1245
+ Frequency of alert logs for vLLM server startup retries.
1246
+ Only used when openai_api_server=True.
1247
+ engine_args : dict
1248
+ Additional keyword arguments to pass to the vLLM engine.
1249
+ For example, `tensor_parallel_size=2`.
1315
1250
  """
1316
1251
  ...
1317
1252
 
1318
1253
  @typing.overload
1319
- def environment(*, vars: typing.Dict[str, str] = {}) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1254
+ def resources(*, cpu: int = 1, gpu: typing.Optional[int] = None, disk: typing.Optional[int] = None, memory: int = 4096, shared_memory: typing.Optional[int] = None) -> typing.Callable[[typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]], typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]]]:
1320
1255
  """
1321
- Specifies environment variables to be set prior to the execution of a step.
1256
+ Specifies the resources needed when executing this step.
1257
+
1258
+ Use `@resources` to specify the resource requirements
1259
+ independently of the specific compute layer (`@batch`, `@kubernetes`).
1260
+
1261
+ You can choose the compute layer on the command line by executing e.g.
1262
+ ```
1263
+ python myflow.py run --with batch
1264
+ ```
1265
+ or
1266
+ ```
1267
+ python myflow.py run --with kubernetes
1268
+ ```
1269
+ which executes the flow on the desired system using the
1270
+ requirements specified in `@resources`.
1322
1271
 
1323
1272
 
1324
1273
  Parameters
1325
1274
  ----------
1326
- vars : Dict[str, str], default {}
1327
- Dictionary of environment variables to set.
1275
+ cpu : int, default 1
1276
+ Number of CPUs required for this step.
1277
+ gpu : int, optional, default None
1278
+ Number of GPUs required for this step.
1279
+ disk : int, optional, default None
1280
+ Disk size (in MB) required for this step. Only applies on Kubernetes.
1281
+ memory : int, default 4096
1282
+ Memory size (in MB) required for this step.
1283
+ shared_memory : int, optional, default None
1284
+ The value for the size (in MiB) of the /dev/shm volume for this step.
1285
+ This parameter maps to the `--shm-size` option in Docker.
1328
1286
  """
1329
1287
  ...
1330
1288
 
1331
1289
  @typing.overload
1332
- def environment(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1290
+ def resources(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1333
1291
  ...
1334
1292
 
1335
1293
  @typing.overload
1336
- def environment(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1294
+ def resources(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1337
1295
  ...
1338
1296
 
1339
- def environment(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, vars: typing.Dict[str, str] = {}):
1297
+ def resources(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None, *, cpu: int = 1, gpu: typing.Optional[int] = None, disk: typing.Optional[int] = None, memory: int = 4096, shared_memory: typing.Optional[int] = None):
1340
1298
  """
1341
- Specifies environment variables to be set prior to the execution of a step.
1299
+ Specifies the resources needed when executing this step.
1342
1300
 
1301
+ Use `@resources` to specify the resource requirements
1302
+ independently of the specific compute layer (`@batch`, `@kubernetes`).
1343
1303
 
1344
- Parameters
1345
- ----------
1346
- vars : Dict[str, str], default {}
1347
- Dictionary of environment variables to set.
1348
- """
1349
- ...
1350
-
1351
- def project(*, name: str, branch: typing.Optional[str] = None, production: bool = False) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1352
- """
1353
- Specifies what flows belong to the same project.
1354
-
1355
- A project-specific namespace is created for all flows that
1356
- use the same `@project(name)`.
1304
+ You can choose the compute layer on the command line by executing e.g.
1305
+ ```
1306
+ python myflow.py run --with batch
1307
+ ```
1308
+ or
1309
+ ```
1310
+ python myflow.py run --with kubernetes
1311
+ ```
1312
+ which executes the flow on the desired system using the
1313
+ requirements specified in `@resources`.
1357
1314
 
1358
1315
 
1359
1316
  Parameters
1360
1317
  ----------
1361
- name : str
1362
- Project name. Make sure that the name is unique amongst all
1363
- projects that use the same production scheduler. The name may
1364
- contain only lowercase alphanumeric characters and underscores.
1365
-
1366
- branch : Optional[str], default None
1367
- The branch to use. If not specified, the branch is set to
1368
- `user.<username>` unless `production` is set to `True`. This can
1369
- also be set on the command line using `--branch` as a top-level option.
1370
- It is an error to specify `branch` in the decorator and on the command line.
1371
-
1372
- production : bool, default False
1373
- Whether or not the branch is the production branch. This can also be set on the
1374
- command line using `--production` as a top-level option. It is an error to specify
1375
- `production` in the decorator and on the command line.
1376
- The project branch name will be:
1377
- - if `branch` is specified:
1378
- - if `production` is True: `prod.<branch>`
1379
- - if `production` is False: `test.<branch>`
1380
- - if `branch` is not specified:
1381
- - if `production` is True: `prod`
1382
- - if `production` is False: `user.<username>`
1318
+ cpu : int, default 1
1319
+ Number of CPUs required for this step.
1320
+ gpu : int, optional, default None
1321
+ Number of GPUs required for this step.
1322
+ disk : int, optional, default None
1323
+ Disk size (in MB) required for this step. Only applies on Kubernetes.
1324
+ memory : int, default 4096
1325
+ Memory size (in MB) required for this step.
1326
+ shared_memory : int, optional, default None
1327
+ The value for the size (in MiB) of the /dev/shm volume for this step.
1328
+ This parameter maps to the `--shm-size` option in Docker.
1383
1329
  """
1384
1330
  ...
1385
1331
 
1386
1332
  @typing.overload
1387
- def conda_base(*, packages: typing.Dict[str, str] = {}, libraries: typing.Dict[str, str] = {}, python: typing.Optional[str] = None, disabled: bool = False) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1333
+ def parallel(f: typing.Callable[[FlowSpecDerived, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, StepFlag], None]:
1388
1334
  """
1389
- Specifies the Conda environment for all steps of the flow.
1390
-
1391
- Use `@conda_base` to set common libraries required by all
1392
- steps and use `@conda` to specify step-specific additions.
1393
-
1394
-
1395
- Parameters
1396
- ----------
1397
- packages : Dict[str, str], default {}
1398
- Packages to use for this flow. The key is the name of the package
1399
- and the value is the version to use.
1400
- libraries : Dict[str, str], default {}
1401
- Supported for backward compatibility. When used with packages, packages will take precedence.
1402
- python : str, optional, default None
1403
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
1404
- that the version used will correspond to the version of the Python interpreter used to start the run.
1405
- disabled : bool, default False
1406
- If set to True, disables Conda.
1335
+ Decorator prototype for all step decorators. This function gets specialized
1336
+ and imported for all decorator types by _import_plugin_decorators().
1407
1337
  """
1408
1338
  ...
1409
1339
 
1410
1340
  @typing.overload
1411
- def conda_base(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1412
- ...
1413
-
1414
- def conda_base(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, packages: typing.Dict[str, str] = {}, libraries: typing.Dict[str, str] = {}, python: typing.Optional[str] = None, disabled: bool = False):
1415
- """
1416
- Specifies the Conda environment for all steps of the flow.
1417
-
1418
- Use `@conda_base` to set common libraries required by all
1419
- steps and use `@conda` to specify step-specific additions.
1420
-
1421
-
1422
- Parameters
1423
- ----------
1424
- packages : Dict[str, str], default {}
1425
- Packages to use for this flow. The key is the name of the package
1426
- and the value is the version to use.
1427
- libraries : Dict[str, str], default {}
1428
- Supported for backward compatibility. When used with packages, packages will take precedence.
1429
- python : str, optional, default None
1430
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
1431
- that the version used will correspond to the version of the Python interpreter used to start the run.
1432
- disabled : bool, default False
1433
- If set to True, disables Conda.
1434
- """
1435
- ...
1436
-
1437
- def airflow_external_task_sensor(*, timeout: int, poke_interval: int, mode: str, exponential_backoff: bool, pool: str, soft_fail: bool, name: str, description: str, external_dag_id: str, external_task_ids: typing.List[str], allowed_states: typing.List[str], failed_states: typing.List[str], execution_delta: "datetime.timedelta", check_existence: bool) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1438
- """
1439
- The `@airflow_external_task_sensor` decorator attaches an Airflow [ExternalTaskSensor](https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/external_task/index.html#airflow.sensors.external_task.ExternalTaskSensor) before the start step of the flow.
1440
- This decorator only works when a flow is scheduled on Airflow and is compiled using `airflow create`. More than one `@airflow_external_task_sensor` can be added as flow decorators. Adding more than one decorator will ensure that the `start` step starts only after all sensors finish.
1441
-
1442
-
1443
- Parameters
1444
- ----------
1445
- timeout : int
1446
- Time, in seconds before the task times out and fails. (Default: 3600)
1447
- poke_interval : int
1448
- Time in seconds that the job should wait in between each try. (Default: 60)
1449
- mode : str
1450
- How the sensor operates. Options are: { poke | reschedule }. (Default: "poke")
1451
- exponential_backoff : bool
1452
- allow progressive longer waits between pokes by using exponential backoff algorithm. (Default: True)
1453
- pool : str
1454
- the slot pool this task should run in,
1455
- slot pools are a way to limit concurrency for certain tasks. (Default:None)
1456
- soft_fail : bool
1457
- Set to true to mark the task as SKIPPED on failure. (Default: False)
1458
- name : str
1459
- Name of the sensor on Airflow
1460
- description : str
1461
- Description of sensor in the Airflow UI
1462
- external_dag_id : str
1463
- The dag_id that contains the task you want to wait for.
1464
- external_task_ids : List[str]
1465
- The list of task_ids that you want to wait for.
1466
- If None (default value) the sensor waits for the DAG. (Default: None)
1467
- allowed_states : List[str]
1468
- Iterable of allowed states, (Default: ['success'])
1469
- failed_states : List[str]
1470
- Iterable of failed or dis-allowed states. (Default: None)
1471
- execution_delta : datetime.timedelta
1472
- time difference with the previous execution to look at,
1473
- the default is the same logical date as the current task or DAG. (Default: None)
1474
- check_existence: bool
1475
- Set to True to check if the external task exists or check if
1476
- the DAG to wait for exists. (Default: True)
1477
- """
1341
+ def parallel(f: typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]) -> typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None]:
1478
1342
  ...
1479
1343
 
1480
- def airflow_s3_key_sensor(*, timeout: int, poke_interval: int, mode: str, exponential_backoff: bool, pool: str, soft_fail: bool, name: str, description: str, bucket_key: typing.Union[str, typing.List[str]], bucket_name: str, wildcard_match: bool, aws_conn_id: str, verify: bool) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1344
+ def parallel(f: typing.Union[typing.Callable[[FlowSpecDerived, StepFlag], None], typing.Callable[[FlowSpecDerived, typing.Any, StepFlag], None], None] = None):
1481
1345
  """
1482
- The `@airflow_s3_key_sensor` decorator attaches an Airflow [S3KeySensor](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/_api/airflow/providers/amazon/aws/sensors/s3/index.html#airflow.providers.amazon.aws.sensors.s3.S3KeySensor)
1483
- before the start step of the flow. This decorator only works when a flow is scheduled on Airflow
1484
- and is compiled using `airflow create`. More than one `@airflow_s3_key_sensor` can be
1485
- added as flow decorators. Adding more than one decorator will ensure that the `start` step
1486
- starts only after all sensors finish.
1487
-
1488
-
1489
- Parameters
1490
- ----------
1491
- timeout : int
1492
- Time, in seconds before the task times out and fails. (Default: 3600)
1493
- poke_interval : int
1494
- Time in seconds that the job should wait in between each try. (Default: 60)
1495
- mode : str
1496
- How the sensor operates. Options are: { poke | reschedule }. (Default: "poke")
1497
- exponential_backoff : bool
1498
- allow progressive longer waits between pokes by using exponential backoff algorithm. (Default: True)
1499
- pool : str
1500
- the slot pool this task should run in,
1501
- slot pools are a way to limit concurrency for certain tasks. (Default:None)
1502
- soft_fail : bool
1503
- Set to true to mark the task as SKIPPED on failure. (Default: False)
1504
- name : str
1505
- Name of the sensor on Airflow
1506
- description : str
1507
- Description of sensor in the Airflow UI
1508
- bucket_key : Union[str, List[str]]
1509
- The key(s) being waited on. Supports full s3:// style url or relative path from root level.
1510
- When it's specified as a full s3:// url, please leave `bucket_name` as None
1511
- bucket_name : str
1512
- Name of the S3 bucket. Only needed when bucket_key is not provided as a full s3:// url.
1513
- When specified, all the keys passed to bucket_key refer to this bucket. (Default: None)
1514
- wildcard_match : bool
1515
- whether the bucket_key should be interpreted as a Unix wildcard pattern. (Default: False)
1516
- aws_conn_id : str
1517
- a reference to the s3 connection on Airflow. (Default: None)
1518
- verify : bool
1519
- Whether or not to verify SSL certificates for S3 connection. (Default: None)
1346
+ Decorator prototype for all step decorators. This function gets specialized
1347
+ and imported for all decorator types by _import_plugin_decorators().
1520
1348
  """
1521
1349
  ...
1522
1350
 
@@ -1621,95 +1449,168 @@ def trigger_on_finish(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *
1621
1449
  """
1622
1450
  ...
1623
1451
 
1624
- @typing.overload
1625
- def schedule(*, hourly: bool = False, daily: bool = True, weekly: bool = False, cron: typing.Optional[str] = None, timezone: typing.Optional[str] = None) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1452
+ def with_artifact_store(f: typing.Optional[typing.Type[FlowSpecDerived]] = None):
1626
1453
  """
1627
- Specifies the times when the flow should be run when running on a
1628
- production scheduler.
1629
-
1454
+ Allows setting external datastores to save data for the
1455
+ `@checkpoint`/`@model`/`@huggingface_hub` decorators.
1630
1456
 
1631
- Parameters
1632
- ----------
1633
- hourly : bool, default False
1634
- Run the workflow hourly.
1635
- daily : bool, default True
1636
- Run the workflow daily.
1637
- weekly : bool, default False
1638
- Run the workflow weekly.
1639
- cron : str, optional, default None
1640
- Run the workflow at [a custom Cron schedule](https://docs.aws.amazon.com/eventbridge/latest/userguide/scheduled-events.html#cron-expressions)
1641
- specified by this expression.
1642
- timezone : str, optional, default None
1643
- Timezone on which the schedule runs (default: None). Currently supported only for Argo workflows,
1644
- which accepts timezones in [IANA format](https://nodatime.org/TimeZones).
1645
- """
1646
- ...
1647
-
1648
- @typing.overload
1649
- def schedule(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1650
- ...
1651
-
1652
- def schedule(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, hourly: bool = False, daily: bool = True, weekly: bool = False, cron: typing.Optional[str] = None, timezone: typing.Optional[str] = None):
1653
- """
1654
- Specifies the times when the flow should be run when running on a
1655
- production scheduler.
1457
+ This decorator is useful when users wish to save data to a different datastore
1458
+ than what is configured in Metaflow. This can be for a variety of reasons:
1656
1459
 
1460
+ 1. Data security: The objects need to be stored in a bucket (object storage) that is not accessible by other flows.
1461
+ 2. Data Locality: The location where the task is executing is not located in the same region as the datastore.
1462
+ - Example: Metaflow datastore lives in US East, but the task is executing in Finland datacenters.
1463
+ 3. Data Lifecycle Policies: The objects need to be archived / managed separately from the Metaflow managed objects.
1464
+ - Example: Flow is training very large models that need to be stored separately and will be deleted more aggressively than the Metaflow managed objects.
1657
1465
 
1658
- Parameters
1466
+ Usage:
1659
1467
  ----------
1660
- hourly : bool, default False
1661
- Run the workflow hourly.
1662
- daily : bool, default True
1663
- Run the workflow daily.
1664
- weekly : bool, default False
1665
- Run the workflow weekly.
1666
- cron : str, optional, default None
1667
- Run the workflow at [a custom Cron schedule](https://docs.aws.amazon.com/eventbridge/latest/userguide/scheduled-events.html#cron-expressions)
1668
- specified by this expression.
1669
- timezone : str, optional, default None
1670
- Timezone on which the schedule runs (default: None). Currently supported only for Argo workflows,
1671
- which accepts timezones in [IANA format](https://nodatime.org/TimeZones).
1672
- """
1673
- ...
1674
-
1675
- @typing.overload
1676
- def pypi_base(*, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1677
- """
1678
- Specifies the PyPI packages for all steps of the flow.
1679
1468
 
1680
- Use `@pypi_base` to set common packages required by all
1681
- steps and use `@pypi` to specify step-specific overrides.
1469
+ - Using a custom IAM role to access the datastore.
1682
1470
 
1683
- Parameters
1684
- ----------
1685
- packages : Dict[str, str], default: {}
1686
- Packages to use for this flow. The key is the name of the package
1687
- and the value is the version to use.
1688
- python : str, optional, default: None
1689
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
1690
- that the version used will correspond to the version of the Python interpreter used to start the run.
1691
- """
1692
- ...
1693
-
1694
- @typing.overload
1695
- def pypi_base(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1471
+ ```python
1472
+ @with_artifact_store(
1473
+ type="s3",
1474
+ config=lambda: {
1475
+ "root": "s3://my-bucket-foo/path/to/root",
1476
+ "role_arn": ROLE,
1477
+ },
1478
+ )
1479
+ class MyFlow(FlowSpec):
1480
+
1481
+ @checkpoint
1482
+ @step
1483
+ def start(self):
1484
+ with open("my_file.txt", "w") as f:
1485
+ f.write("Hello, World!")
1486
+ self.external_bucket_checkpoint = current.checkpoint.save("my_file.txt")
1487
+ self.next(self.end)
1488
+
1489
+ ```
1490
+
1491
+ - Using credentials to access the s3-compatible datastore.
1492
+
1493
+ ```python
1494
+ @with_artifact_store(
1495
+ type="s3",
1496
+ config=lambda: {
1497
+ "root": "s3://my-bucket-foo/path/to/root",
1498
+ "client_params": {
1499
+ "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1500
+ "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1501
+ },
1502
+ },
1503
+ )
1504
+ class MyFlow(FlowSpec):
1505
+
1506
+ @checkpoint
1507
+ @step
1508
+ def start(self):
1509
+ with open("my_file.txt", "w") as f:
1510
+ f.write("Hello, World!")
1511
+ self.external_bucket_checkpoint = current.checkpoint.save("my_file.txt")
1512
+ self.next(self.end)
1513
+
1514
+ ```
1515
+
1516
+ - Accessing objects stored in external datastores after task execution.
1517
+
1518
+ ```python
1519
+ run = Run("CheckpointsTestsFlow/8992")
1520
+ with artifact_store_from(run=run, config={
1521
+ "client_params": {
1522
+ "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1523
+ "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1524
+ },
1525
+ }):
1526
+ with Checkpoint() as cp:
1527
+ latest = cp.list(
1528
+ task=run["start"].task
1529
+ )[0]
1530
+ print(latest)
1531
+ cp.load(
1532
+ latest,
1533
+ "test-checkpoints"
1534
+ )
1535
+
1536
+ task = Task("TorchTuneFlow/8484/train/53673")
1537
+ with artifact_store_from(run=run, config={
1538
+ "client_params": {
1539
+ "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1540
+ "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1541
+ },
1542
+ }):
1543
+ load_model(
1544
+ task.data.model_ref,
1545
+ "test-models"
1546
+ )
1547
+ ```
1548
+ Parameters:
1549
+ ----------
1550
+
1551
+ type: str
1552
+ The type of the datastore. Can be one of 's3', 'gcs', 'azure' or any other supported metaflow Datastore.
1553
+
1554
+ config: dict or Callable
1555
+ Dictionary of configuration options for the datastore. The following keys are required:
1556
+ - root: The root path in the datastore where the data will be saved. (needs to be in the format expected by the datastore)
1557
+ - example: 's3://bucket-name/path/to/root'
1558
+ - example: 'gs://bucket-name/path/to/root'
1559
+ - example: 'https://myblockacc.blob.core.windows.net/metaflow/'
1560
+ - role_arn (optional): AWS IAM role to access s3 bucket (only when `type` is 's3')
1561
+ - session_vars (optional): AWS session variables to access s3 bucket (only when `type` is 's3')
1562
+ - client_params (optional): AWS client parameters to access s3 bucket (only when `type` is 's3')
1563
+ """
1696
1564
  ...
1697
1565
 
1698
- def pypi_base(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None):
1566
+ @typing.overload
1567
+ def schedule(*, hourly: bool = False, daily: bool = True, weekly: bool = False, cron: typing.Optional[str] = None, timezone: typing.Optional[str] = None) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1699
1568
  """
1700
- Specifies the PyPI packages for all steps of the flow.
1569
+ Specifies the times when the flow should be run when running on a
1570
+ production scheduler.
1701
1571
 
1702
- Use `@pypi_base` to set common packages required by all
1703
- steps and use `@pypi` to specify step-specific overrides.
1704
1572
 
1705
1573
  Parameters
1706
1574
  ----------
1707
- packages : Dict[str, str], default: {}
1708
- Packages to use for this flow. The key is the name of the package
1709
- and the value is the version to use.
1710
- python : str, optional, default: None
1711
- Version of Python to use, e.g. '3.7.4'. A default value of None implies
1712
- that the version used will correspond to the version of the Python interpreter used to start the run.
1575
+ hourly : bool, default False
1576
+ Run the workflow hourly.
1577
+ daily : bool, default True
1578
+ Run the workflow daily.
1579
+ weekly : bool, default False
1580
+ Run the workflow weekly.
1581
+ cron : str, optional, default None
1582
+ Run the workflow at [a custom Cron schedule](https://docs.aws.amazon.com/eventbridge/latest/userguide/scheduled-events.html#cron-expressions)
1583
+ specified by this expression.
1584
+ timezone : str, optional, default None
1585
+ Timezone on which the schedule runs (default: None). Currently supported only for Argo workflows,
1586
+ which accepts timezones in [IANA format](https://nodatime.org/TimeZones).
1587
+ """
1588
+ ...
1589
+
1590
+ @typing.overload
1591
+ def schedule(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1592
+ ...
1593
+
1594
+ def schedule(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, hourly: bool = False, daily: bool = True, weekly: bool = False, cron: typing.Optional[str] = None, timezone: typing.Optional[str] = None):
1595
+ """
1596
+ Specifies the times when the flow should be run when running on a
1597
+ production scheduler.
1598
+
1599
+
1600
+ Parameters
1601
+ ----------
1602
+ hourly : bool, default False
1603
+ Run the workflow hourly.
1604
+ daily : bool, default True
1605
+ Run the workflow daily.
1606
+ weekly : bool, default False
1607
+ Run the workflow weekly.
1608
+ cron : str, optional, default None
1609
+ Run the workflow at [a custom Cron schedule](https://docs.aws.amazon.com/eventbridge/latest/userguide/scheduled-events.html#cron-expressions)
1610
+ specified by this expression.
1611
+ timezone : str, optional, default None
1612
+ Timezone on which the schedule runs (default: None). Currently supported only for Argo workflows,
1613
+ which accepts timezones in [IANA format](https://nodatime.org/TimeZones).
1713
1614
  """
1714
1615
  ...
1715
1616
 
@@ -1806,117 +1707,216 @@ def trigger(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, event: t
1806
1707
  """
1807
1708
  ...
1808
1709
 
1809
- def with_artifact_store(f: typing.Optional[typing.Type[FlowSpecDerived]] = None):
1710
+ @typing.overload
1711
+ def pypi_base(*, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1810
1712
  """
1811
- Allows setting external datastores to save data for the
1812
- `@checkpoint`/`@model`/`@huggingface_hub` decorators.
1713
+ Specifies the PyPI packages for all steps of the flow.
1813
1714
 
1814
- This decorator is useful when users wish to save data to a different datastore
1815
- than what is configured in Metaflow. This can be for a variety of reasons:
1715
+ Use `@pypi_base` to set common packages required by all
1716
+ steps and use `@pypi` to specify step-specific overrides.
1816
1717
 
1817
- 1. Data security: The objects need to be stored in a bucket (object storage) that is not accessible by other flows.
1818
- 2. Data Locality: The location where the task is executing is not located in the same region as the datastore.
1819
- - Example: Metaflow datastore lives in US East, but the task is executing in Finland datacenters.
1820
- 3. Data Lifecycle Policies: The objects need to be archived / managed separately from the Metaflow managed objects.
1821
- - Example: Flow is training very large models that need to be stored separately and will be deleted more aggressively than the Metaflow managed objects.
1718
+ Parameters
1719
+ ----------
1720
+ packages : Dict[str, str], default: {}
1721
+ Packages to use for this flow. The key is the name of the package
1722
+ and the value is the version to use.
1723
+ python : str, optional, default: None
1724
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
1725
+ that the version used will correspond to the version of the Python interpreter used to start the run.
1726
+ """
1727
+ ...
1728
+
1729
+ @typing.overload
1730
+ def pypi_base(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1731
+ ...
1732
+
1733
+ def pypi_base(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, packages: typing.Dict[str, str] = {}, python: typing.Optional[str] = None):
1734
+ """
1735
+ Specifies the PyPI packages for all steps of the flow.
1822
1736
 
1823
- Usage:
1737
+ Use `@pypi_base` to set common packages required by all
1738
+ steps and use `@pypi` to specify step-specific overrides.
1739
+
1740
+ Parameters
1824
1741
  ----------
1742
+ packages : Dict[str, str], default: {}
1743
+ Packages to use for this flow. The key is the name of the package
1744
+ and the value is the version to use.
1745
+ python : str, optional, default: None
1746
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
1747
+ that the version used will correspond to the version of the Python interpreter used to start the run.
1748
+ """
1749
+ ...
1750
+
1751
+ def airflow_external_task_sensor(*, timeout: int, poke_interval: int, mode: str, exponential_backoff: bool, pool: str, soft_fail: bool, name: str, description: str, external_dag_id: str, external_task_ids: typing.List[str], allowed_states: typing.List[str], failed_states: typing.List[str], execution_delta: "datetime.timedelta", check_existence: bool) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1752
+ """
1753
+ The `@airflow_external_task_sensor` decorator attaches an Airflow [ExternalTaskSensor](https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/external_task/index.html#airflow.sensors.external_task.ExternalTaskSensor) before the start step of the flow.
1754
+ This decorator only works when a flow is scheduled on Airflow and is compiled using `airflow create`. More than one `@airflow_external_task_sensor` can be added as flow decorators. Adding more than one decorator will ensure that the `start` step starts only after all sensors finish.
1825
1755
 
1826
- - Using a custom IAM role to access the datastore.
1827
1756
 
1828
- ```python
1829
- @with_artifact_store(
1830
- type="s3",
1831
- config=lambda: {
1832
- "root": "s3://my-bucket-foo/path/to/root",
1833
- "role_arn": ROLE,
1834
- },
1835
- )
1836
- class MyFlow(FlowSpec):
1757
+ Parameters
1758
+ ----------
1759
+ timeout : int
1760
+ Time, in seconds before the task times out and fails. (Default: 3600)
1761
+ poke_interval : int
1762
+ Time in seconds that the job should wait in between each try. (Default: 60)
1763
+ mode : str
1764
+ How the sensor operates. Options are: { poke | reschedule }. (Default: "poke")
1765
+ exponential_backoff : bool
1766
+ Allow progressively longer waits between pokes by using an exponential backoff algorithm. (Default: True)
1767
+ pool : str
1768
+ the slot pool this task should run in,
1769
+ slot pools are a way to limit concurrency for certain tasks. (Default:None)
1770
+ soft_fail : bool
1771
+ Set to true to mark the task as SKIPPED on failure. (Default: False)
1772
+ name : str
1773
+ Name of the sensor on Airflow
1774
+ description : str
1775
+ Description of sensor in the Airflow UI
1776
+ external_dag_id : str
1777
+ The dag_id that contains the task you want to wait for.
1778
+ external_task_ids : List[str]
1779
+ The list of task_ids that you want to wait for.
1780
+ If None (default value) the sensor waits for the DAG. (Default: None)
1781
+ allowed_states : List[str]
1782
+ Iterable of allowed states, (Default: ['success'])
1783
+ failed_states : List[str]
1784
+ Iterable of failed or dis-allowed states. (Default: None)
1785
+ execution_delta : datetime.timedelta
1786
+ time difference with the previous execution to look at,
1787
+ the default is the same logical date as the current task or DAG. (Default: None)
1788
+ check_existence: bool
1789
+ Set to True to check if the external task exists or check if
1790
+ the DAG to wait for exists. (Default: True)
1791
+ """
1792
+ ...
1793
+
1794
+ def project(*, name: str, branch: typing.Optional[str] = None, production: bool = False) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1795
+ """
1796
+ Specifies what flows belong to the same project.
1837
1797
 
1838
- @checkpoint
1839
- @step
1840
- def start(self):
1841
- with open("my_file.txt", "w") as f:
1842
- f.write("Hello, World!")
1843
- self.external_bucket_checkpoint = current.checkpoint.save("my_file.txt")
1844
- self.next(self.end)
1798
+ A project-specific namespace is created for all flows that
1799
+ use the same `@project(name)`.
1845
1800
 
1846
- ```
1847
1801
 
1848
- - Using credentials to access the s3-compatible datastore.
1802
+ Parameters
1803
+ ----------
1804
+ name : str
1805
+ Project name. Make sure that the name is unique amongst all
1806
+ projects that use the same production scheduler. The name may
1807
+ contain only lowercase alphanumeric characters and underscores.
1849
1808
 
1850
- ```python
1851
- @with_artifact_store(
1852
- type="s3",
1853
- config=lambda: {
1854
- "root": "s3://my-bucket-foo/path/to/root",
1855
- "client_params": {
1856
- "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1857
- "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1858
- },
1859
- },
1860
- )
1861
- class MyFlow(FlowSpec):
1809
+ branch : Optional[str], default None
1810
+ The branch to use. If not specified, the branch is set to
1811
+ `user.<username>` unless `production` is set to `True`. This can
1812
+ also be set on the command line using `--branch` as a top-level option.
1813
+ It is an error to specify `branch` in the decorator and on the command line.
1862
1814
 
1863
- @checkpoint
1864
- @step
1865
- def start(self):
1866
- with open("my_file.txt", "w") as f:
1867
- f.write("Hello, World!")
1868
- self.external_bucket_checkpoint = current.checkpoint.save("my_file.txt")
1869
- self.next(self.end)
1815
+ production : bool, default False
1816
+ Whether or not the branch is the production branch. This can also be set on the
1817
+ command line using `--production` as a top-level option. It is an error to specify
1818
+ `production` in the decorator and on the command line.
1819
+ The project branch name will be:
1820
+ - if `branch` is specified:
1821
+ - if `production` is True: `prod.<branch>`
1822
+ - if `production` is False: `test.<branch>`
1823
+ - if `branch` is not specified:
1824
+ - if `production` is True: `prod`
1825
+ - if `production` is False: `user.<username>`
1826
+ """
1827
+ ...
1828
+
1829
+ def airflow_s3_key_sensor(*, timeout: int, poke_interval: int, mode: str, exponential_backoff: bool, pool: str, soft_fail: bool, name: str, description: str, bucket_key: typing.Union[str, typing.List[str]], bucket_name: str, wildcard_match: bool, aws_conn_id: str, verify: bool) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1830
+ """
1831
+ The `@airflow_s3_key_sensor` decorator attaches an Airflow [S3KeySensor](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/_api/airflow/providers/amazon/aws/sensors/s3/index.html#airflow.providers.amazon.aws.sensors.s3.S3KeySensor)
1832
+ before the start step of the flow. This decorator only works when a flow is scheduled on Airflow
1833
+ and is compiled using `airflow create`. More than one `@airflow_s3_key_sensor` can be
1834
+ added as flow decorators. Adding more than one decorator will ensure that the `start` step
1835
+ starts only after all sensors finish.
1870
1836
 
1871
- ```
1872
1837
 
1873
- - Accessing objects stored in external datastores after task execution.
1838
+ Parameters
1839
+ ----------
1840
+ timeout : int
1841
+ Time, in seconds before the task times out and fails. (Default: 3600)
1842
+ poke_interval : int
1843
+ Time in seconds that the job should wait in between each try. (Default: 60)
1844
+ mode : str
1845
+ How the sensor operates. Options are: { poke | reschedule }. (Default: "poke")
1846
+ exponential_backoff : bool
1847
+ Allow progressively longer waits between pokes by using an exponential backoff algorithm. (Default: True)
1848
+ pool : str
1849
+ the slot pool this task should run in,
1850
+ slot pools are a way to limit concurrency for certain tasks. (Default:None)
1851
+ soft_fail : bool
1852
+ Set to true to mark the task as SKIPPED on failure. (Default: False)
1853
+ name : str
1854
+ Name of the sensor on Airflow
1855
+ description : str
1856
+ Description of sensor in the Airflow UI
1857
+ bucket_key : Union[str, List[str]]
1858
+ The key(s) being waited on. Supports full s3:// style url or relative path from root level.
1859
+ When it's specified as a full s3:// url, please leave `bucket_name` as None
1860
+ bucket_name : str
1861
+ Name of the S3 bucket. Only needed when bucket_key is not provided as a full s3:// url.
1862
+ When specified, all the keys passed to bucket_key refer to this bucket. (Default: None)
1863
+ wildcard_match : bool
1864
+ whether the bucket_key should be interpreted as a Unix wildcard pattern. (Default: False)
1865
+ aws_conn_id : str
1866
+ a reference to the s3 connection on Airflow. (Default: None)
1867
+ verify : bool
1868
+ Whether or not to verify SSL certificates for S3 connection. (Default: None)
1869
+ """
1870
+ ...
1871
+
1872
+ @typing.overload
1873
+ def conda_base(*, packages: typing.Dict[str, str] = {}, libraries: typing.Dict[str, str] = {}, python: typing.Optional[str] = None, disabled: bool = False) -> typing.Callable[[typing.Type[FlowSpecDerived]], typing.Type[FlowSpecDerived]]:
1874
+ """
1875
+ Specifies the Conda environment for all steps of the flow.
1874
1876
 
1875
- ```python
1876
- run = Run("CheckpointsTestsFlow/8992")
1877
- with artifact_store_from(run=run, config={
1878
- "client_params": {
1879
- "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1880
- "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1881
- },
1882
- }):
1883
- with Checkpoint() as cp:
1884
- latest = cp.list(
1885
- task=run["start"].task
1886
- )[0]
1887
- print(latest)
1888
- cp.load(
1889
- latest,
1890
- "test-checkpoints"
1891
- )
1877
+ Use `@conda_base` to set common libraries required by all
1878
+ steps and use `@conda` to specify step-specific additions.
1892
1879
 
1893
- task = Task("TorchTuneFlow/8484/train/53673")
1894
- with artifact_store_from(run=run, config={
1895
- "client_params": {
1896
- "aws_access_key_id": os.environ.get("MY_CUSTOM_ACCESS_KEY"),
1897
- "aws_secret_access_key": os.environ.get("MY_CUSTOM_SECRET_KEY"),
1898
- },
1899
- }):
1900
- load_model(
1901
- task.data.model_ref,
1902
- "test-models"
1903
- )
1904
- ```
1905
- Parameters:
1880
+
1881
+ Parameters
1906
1882
  ----------
1883
+ packages : Dict[str, str], default {}
1884
+ Packages to use for this flow. The key is the name of the package
1885
+ and the value is the version to use.
1886
+ libraries : Dict[str, str], default {}
1887
+ Supported for backward compatibility. When used with packages, packages will take precedence.
1888
+ python : str, optional, default None
1889
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
1890
+ that the version used will correspond to the version of the Python interpreter used to start the run.
1891
+ disabled : bool, default False
1892
+ If set to True, disables Conda.
1893
+ """
1894
+ ...
1895
+
1896
+ @typing.overload
1897
+ def conda_base(f: typing.Type[FlowSpecDerived]) -> typing.Type[FlowSpecDerived]:
1898
+ ...
1899
+
1900
+ def conda_base(f: typing.Optional[typing.Type[FlowSpecDerived]] = None, *, packages: typing.Dict[str, str] = {}, libraries: typing.Dict[str, str] = {}, python: typing.Optional[str] = None, disabled: bool = False):
1901
+ """
1902
+ Specifies the Conda environment for all steps of the flow.
1907
1903
 
1908
- type: str
1909
- The type of the datastore. Can be one of 's3', 'gcs', 'azure' or any other supported metaflow Datastore.
1904
+ Use `@conda_base` to set common libraries required by all
1905
+ steps and use `@conda` to specify step-specific additions.
1910
1906
 
1911
- config: dict or Callable
1912
- Dictionary of configuration options for the datastore. The following keys are required:
1913
- - root: The root path in the datastore where the data will be saved. (needs to be in the format expected by the datastore)
1914
- - example: 's3://bucket-name/path/to/root'
1915
- - example: 'gs://bucket-name/path/to/root'
1916
- - example: 'https://myblockacc.blob.core.windows.net/metaflow/'
1917
- - role_arn (optional): AWS IAM role to access s3 bucket (only when `type` is 's3')
1918
- - session_vars (optional): AWS session variables to access s3 bucket (only when `type` is 's3')
1919
- - client_params (optional): AWS client parameters to access s3 bucket (only when `type` is 's3')
1907
+
1908
+ Parameters
1909
+ ----------
1910
+ packages : Dict[str, str], default {}
1911
+ Packages to use for this flow. The key is the name of the package
1912
+ and the value is the version to use.
1913
+ libraries : Dict[str, str], default {}
1914
+ Supported for backward compatibility. When used with packages, packages will take precedence.
1915
+ python : str, optional, default None
1916
+ Version of Python to use, e.g. '3.7.4'. A default value of None implies
1917
+ that the version used will correspond to the version of the Python interpreter used to start the run.
1918
+ disabled : bool, default False
1919
+ If set to True, disables Conda.
1920
1920
  """
1921
1921
  ...
1922
1922