ob-metaflow-extensions 1.2.3__tar.gz → 1.4.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/PKG-INFO +1 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/__init__.py +5 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +16 -10
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +51 -32
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +2 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +119 -51
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +7 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +3 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +9 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +88 -5
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +66 -4
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +1 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +4 -10
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +29 -18
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +1 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +1 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/utils.py +2 -2
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +21 -6
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- ob_metaflow_extensions-1.2.3/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py → ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +15 -64
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/remote_config.py +8 -3
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +61 -1
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- ob_metaflow_extensions-1.4.34/metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/ob_metaflow_extensions.egg-info/SOURCES.txt +13 -0
- ob_metaflow_extensions-1.4.34/ob_metaflow_extensions.egg-info/requires.txt +3 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/setup.py +2 -2
- ob_metaflow_extensions-1.2.3/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +0 -139
- ob_metaflow_extensions-1.2.3/ob_metaflow_extensions.egg-info/requires.txt +0 -3
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/MANIFEST.in +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/README.md +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/app_utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/consts.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/core/validations.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/aws/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/aws/assume_role.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nim/card.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nim/utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/nvct.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/nvct/utils.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/ollama/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/ollama/constants.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/ollama/ollama.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/ollama/status_card.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/vllm/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/vllm/constants.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/vllm/status_card.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/ob_internal.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob_metaflow_extensions-1.2.3 → ob_metaflow_extensions-1.4.34}/setup.cfg +0 -0
|
@@ -335,10 +335,14 @@ STEP_DECORATORS_DESC = [
|
|
|
335
335
|
("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
|
|
336
336
|
("tensorboard", ".tensorboard.TensorboardDecorator"),
|
|
337
337
|
("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
|
|
338
|
+
("test_append_card", ".profilers.simple_card_decorator.DynamicCardAppendDecorator"),
|
|
338
339
|
("nim", ".nim.nim_decorator.NimDecorator"),
|
|
339
340
|
("ollama", ".ollama.OllamaDecorator"),
|
|
340
341
|
("vllm", ".vllm.VLLMDecorator"),
|
|
341
342
|
("app_deploy", ".apps.app_deploy_decorator.AppDeployDecorator"),
|
|
343
|
+
("s3_proxy", ".s3_proxy.s3_proxy_decorator.S3ProxyDecorator"),
|
|
344
|
+
("nebius_s3_proxy", ".s3_proxy.s3_proxy_decorator.NebiusS3ProxyDecorator"),
|
|
345
|
+
("coreweave_s3_proxy", ".s3_proxy.s3_proxy_decorator.CoreWeaveS3ProxyDecorator"),
|
|
342
346
|
]
|
|
343
347
|
|
|
344
348
|
TOGGLE_STEP_DECORATOR = [
|
|
@@ -357,4 +361,4 @@ SECRETS_PROVIDERS_DESC = [
|
|
|
357
361
|
("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
|
|
358
362
|
]
|
|
359
363
|
# Adding an override here so the library can be imported at the metaflow.plugins level
|
|
360
|
-
__mf_promote_submodules__ = ["snowflake", "ollama", "torchtune"]
|
|
364
|
+
__mf_promote_submodules__ = ["snowflake", "ollama", "torchtune", "optuna"]
|
|
@@ -180,6 +180,7 @@ class WorkerInfoDict(TypedDict):
|
|
|
180
180
|
pending: Dict[str, List[WorkerStatus]]
|
|
181
181
|
running: Dict[str, List[WorkerStatus]]
|
|
182
182
|
crashlooping: Dict[str, List[WorkerStatus]]
|
|
183
|
+
failed: Dict[str, List[WorkerStatus]]
|
|
183
184
|
|
|
184
185
|
|
|
185
186
|
class CurrentWorkerInfo(TypedDict):
|
|
@@ -199,29 +200,29 @@ class DEPLOYMENT_READY_CONDITIONS:
|
|
|
199
200
|
This allows users or platform designers to configure the criteria for deployment readiness.
|
|
200
201
|
|
|
201
202
|
Why do we need deployment readiness conditions?
|
|
202
|
-
|
|
203
|
+
- Deployments might be taking place from a CI/CD-esque environment, In these setups, the downstream build triggers might be depending on a specific criteria for deployment completion. Having readiness conditions allows the CI/CD systems to get a signal of when the deployment is ready.
|
|
203
204
|
- Users might be calling the deployment API under different conditions:
|
|
204
205
|
- Some users might want a cluster of workers ready before serving traffic while others might want just one worker ready to start serving traffic.
|
|
205
206
|
|
|
206
207
|
Some readiness conditions include:
|
|
207
|
-
|
|
208
|
+
1) [at_least_one_running] At least min(min_replicas, 1) workers of the current deployment instance's version have started running.
|
|
208
209
|
- Usecase: Some endpoints may be deployed ephemerally and are considered ready when at least one instance is running; additional instances are for load management.
|
|
209
|
-
|
|
210
|
+
2) [all_running] At least min_replicas number of workers are running for the deployment to be considered ready.
|
|
210
211
|
- Usecase: Operators may require that all replicas are available before traffic is routed. Needed when inference endpoints maybe under some SLA or require a larger load
|
|
211
|
-
|
|
212
|
+
3) [fully_finished] At least min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
|
|
212
213
|
- Usecase: Ensuring endpoint is fully available and no other versions are running or endpoint has been fully scaled down.
|
|
213
214
|
4) [async] The deployment will be assumed ready as soon as the server responds with a 200.
|
|
214
215
|
- Usecase: Operators may only care that the URL is minted for the deployment or the deployment eventually scales down to 0.
|
|
215
216
|
"""
|
|
216
217
|
|
|
217
|
-
# `ATLEAST_ONE_RUNNING` implies that
|
|
218
|
+
# `ATLEAST_ONE_RUNNING` implies that at least one worker of the current deployment instance's version has started running.
|
|
218
219
|
ATLEAST_ONE_RUNNING = "at_least_one_running"
|
|
219
220
|
|
|
220
221
|
# `ALL_RUNNING` implies that all workers of the current deployment instance's version have started running (i.e. all workers aligning to the minimum number of replicas).
|
|
221
222
|
# It doesn't imply that all the workers relating to other deployments have been torn down.
|
|
222
223
|
ALL_RUNNING = "all_running"
|
|
223
224
|
|
|
224
|
-
# `FULLY_FINISHED` implies
|
|
225
|
+
# `FULLY_FINISHED` implies at least min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
|
|
225
226
|
FULLY_FINISHED = "fully_finished"
|
|
226
227
|
|
|
227
228
|
# `ASYNC` implies that the deployment will be assumed ready after the URL is minted and the worker statuses are not checked.
|
|
@@ -442,14 +443,16 @@ def _capsule_worker_semantic_status(
|
|
|
442
443
|
xx[worker_version].append(w)
|
|
443
444
|
return xx
|
|
444
445
|
|
|
446
|
+
# phases can be Pending, Running, Succeeded, Failed, Unknown, CrashLoopBackOff
|
|
445
447
|
pending_workers = _make_version_dict(workers, "Pending")
|
|
446
448
|
running_workers = _make_version_dict(workers, "Running")
|
|
447
449
|
crashlooping_workers = _make_version_dict(workers, "CrashLoopBackOff")
|
|
450
|
+
failed_workers = _make_version_dict(workers, "Failed")
|
|
448
451
|
|
|
449
452
|
# current_status (formulated basis):
|
|
450
|
-
# -
|
|
451
|
-
# -
|
|
452
|
-
# -
|
|
453
|
+
# - at least one pods are pending for `_end_state_capsule_version`
|
|
454
|
+
# - at least one pod is in Running state for `_end_state_capsule_version` (maybe terminal) [Might require health-check thing here]
|
|
455
|
+
# - at least one pod is crashlooping for `_end_state_capsule_version` (maybe terminal)
|
|
453
456
|
# - all pods are running for `_end_state_capsule_version` that match the minimum number of replicas
|
|
454
457
|
# - all pods are running for `_end_state_capsule_version` that match the maximum number of replicas and no other pods of older versions are running
|
|
455
458
|
# - no pods relating to `_end_state_capsule_version` are pending/running/crashlooping
|
|
@@ -464,7 +467,8 @@ def _capsule_worker_semantic_status(
|
|
|
464
467
|
"at_least_one_running": (
|
|
465
468
|
count_for_version(running_workers) >= min(min_replicas, 1)
|
|
466
469
|
),
|
|
467
|
-
"at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
|
|
470
|
+
"at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
|
|
471
|
+
or count_for_version(failed_workers) > 0,
|
|
468
472
|
"none_present": (
|
|
469
473
|
count_for_version(running_workers) == 0
|
|
470
474
|
and count_for_version(pending_workers) == 0
|
|
@@ -484,6 +488,7 @@ def _capsule_worker_semantic_status(
|
|
|
484
488
|
"pending": count_for_version(pending_workers),
|
|
485
489
|
"running": count_for_version(running_workers),
|
|
486
490
|
"crashlooping": count_for_version(crashlooping_workers),
|
|
491
|
+
"failed": count_for_version(failed_workers),
|
|
487
492
|
},
|
|
488
493
|
}
|
|
489
494
|
|
|
@@ -491,6 +496,7 @@ def _capsule_worker_semantic_status(
|
|
|
491
496
|
"pending": pending_workers,
|
|
492
497
|
"running": running_workers,
|
|
493
498
|
"crashlooping": crashlooping_workers,
|
|
499
|
+
"failed": failed_workers,
|
|
494
500
|
}
|
|
495
501
|
|
|
496
502
|
return {
|
|
@@ -239,7 +239,7 @@ def _bake_image(app_config: AppConfig, cache_dir: str, logger):
|
|
|
239
239
|
baking_status.resolved_image,
|
|
240
240
|
)
|
|
241
241
|
app_config.set_state("python_path", baking_status.python_path)
|
|
242
|
-
logger("🐳 Using
|
|
242
|
+
logger("🐳 Using the docker image : %s" % app_config.get_state("image"))
|
|
243
243
|
|
|
244
244
|
|
|
245
245
|
def print_table(data, headers):
|
|
@@ -339,7 +339,7 @@ def deployment_instance_options(func):
|
|
|
339
339
|
"--readiness-wait-time",
|
|
340
340
|
type=int,
|
|
341
341
|
help="The time (in seconds) to monitor the deployment for readiness after the readiness condition is met.",
|
|
342
|
-
default=
|
|
342
|
+
default=15,
|
|
343
343
|
)
|
|
344
344
|
@click.option(
|
|
345
345
|
"--deployment-timeout",
|
|
@@ -374,11 +374,11 @@ def _package_necessary_things(app_config: AppConfig, logger):
|
|
|
374
374
|
# or is it relative to where the caller command is sitting. Ideally it should work
|
|
375
375
|
# like Kustomizations where its relative to where the yaml file sits for simplicity
|
|
376
376
|
# of understanding relationships between config files. Ideally users can pass the src_path
|
|
377
|
-
# from the command line and that will
|
|
377
|
+
# from the command line and that will alleviate any need to package any other directories for
|
|
378
378
|
#
|
|
379
379
|
|
|
380
|
-
|
|
381
|
-
if
|
|
380
|
+
package_dirs = app_config.get_state("packaging_directories")
|
|
381
|
+
if package_dirs is None:
|
|
382
382
|
app_config.set_state("code_package_url", None)
|
|
383
383
|
app_config.set_state("code_package_key", None)
|
|
384
384
|
return
|
|
@@ -391,11 +391,24 @@ def _package_necessary_things(app_config: AppConfig, logger):
|
|
|
391
391
|
datastore_type=DEFAULT_DATASTORE, code_package_prefix=CODE_PACKAGE_PREFIX
|
|
392
392
|
)
|
|
393
393
|
package_url, package_key = packager.store(
|
|
394
|
-
paths_to_include=
|
|
394
|
+
paths_to_include=package_dirs, file_suffixes=suffixes
|
|
395
395
|
)
|
|
396
396
|
app_config.set_state("code_package_url", package_url)
|
|
397
397
|
app_config.set_state("code_package_key", package_key)
|
|
398
|
-
logger("💾 Code
|
|
398
|
+
logger("💾 Code package saved to : %s" % app_config.get_state("code_package_url"))
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _sniff_pyproject_and_requirements(packaging_directories: List[str]):
|
|
402
|
+
pyproject_path = None
|
|
403
|
+
requirements_path = None
|
|
404
|
+
for directory in packaging_directories:
|
|
405
|
+
pyproject_toml = os.path.join(directory, "pyproject.toml")
|
|
406
|
+
requirements_txt = os.path.join(directory, "requirements.txt")
|
|
407
|
+
if os.path.exists(pyproject_toml):
|
|
408
|
+
pyproject_path = pyproject_toml
|
|
409
|
+
elif os.path.exists(requirements_txt):
|
|
410
|
+
requirements_path = requirements_txt
|
|
411
|
+
return pyproject_path, requirements_path
|
|
399
412
|
|
|
400
413
|
|
|
401
414
|
@app.command(help="Deploy an app to the Outerbounds Platform.")
|
|
@@ -449,23 +462,19 @@ def deploy(
|
|
|
449
462
|
system_msg=True,
|
|
450
463
|
)
|
|
451
464
|
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
packaging_directory = os.path.abspath(package_src_path)
|
|
459
|
-
else:
|
|
460
|
-
raise AppConfigError(f"src_path '{package_src_path}' does not exist")
|
|
461
|
-
else:
|
|
462
|
-
# If src_path is None then we assume then we can assume for the moment
|
|
465
|
+
package_src_paths = app_config.get("package", {}).get("src_paths", [])
|
|
466
|
+
if package_src_paths is None:
|
|
467
|
+
package_src_paths = []
|
|
468
|
+
|
|
469
|
+
if len(package_src_paths) == 0:
|
|
470
|
+
# If src_paths is None then we assume then we can assume for the moment
|
|
463
471
|
# that we can package the current working directory.
|
|
464
|
-
|
|
472
|
+
package_src_paths = [os.getcwd()]
|
|
465
473
|
|
|
466
|
-
app_config.set_state("
|
|
474
|
+
app_config.set_state("packaging_directories", package_src_paths)
|
|
467
475
|
logger(
|
|
468
|
-
"📦 Packaging
|
|
476
|
+
"📦 Packaging directories : %s"
|
|
477
|
+
% ", ".join(app_config.get_state("packaging_directories")),
|
|
469
478
|
)
|
|
470
479
|
|
|
471
480
|
if app_config.get("no_deps", False):
|
|
@@ -484,22 +493,32 @@ def deploy(
|
|
|
484
493
|
dependencies.get("conda", None) is None,
|
|
485
494
|
]
|
|
486
495
|
):
|
|
496
|
+
python_version = dependencies.get(
|
|
497
|
+
"python"
|
|
498
|
+
) # python gets a default value so it's always set.
|
|
487
499
|
# The user has not set any dependencies, so we can sniff the packaging directory
|
|
488
500
|
# for a dependencies file.
|
|
489
|
-
requirements_file =
|
|
490
|
-
|
|
501
|
+
pyproject_toml, requirements_file = _sniff_pyproject_and_requirements(
|
|
502
|
+
package_src_paths
|
|
491
503
|
)
|
|
492
|
-
pyproject_toml
|
|
493
|
-
if os.path.exists(pyproject_toml):
|
|
504
|
+
if pyproject_toml:
|
|
494
505
|
app_config.set_state(
|
|
495
|
-
"dependencies",
|
|
506
|
+
"dependencies",
|
|
507
|
+
{
|
|
508
|
+
"from_pyproject_toml": pyproject_toml,
|
|
509
|
+
"python": python_version,
|
|
510
|
+
},
|
|
496
511
|
)
|
|
497
512
|
logger(
|
|
498
513
|
"📦 Using dependencies from pyproject.toml: %s" % pyproject_toml
|
|
499
514
|
)
|
|
500
|
-
elif
|
|
515
|
+
elif requirements_file:
|
|
501
516
|
app_config.set_state(
|
|
502
|
-
"dependencies",
|
|
517
|
+
"dependencies",
|
|
518
|
+
{
|
|
519
|
+
"from_requirements_file": requirements_file,
|
|
520
|
+
"python": python_version,
|
|
521
|
+
},
|
|
503
522
|
)
|
|
504
523
|
logger(
|
|
505
524
|
"📦 Using dependencies from requirements.txt: %s"
|
|
@@ -611,7 +630,7 @@ def deploy(
|
|
|
611
630
|
)
|
|
612
631
|
raise AppConfigError(message)
|
|
613
632
|
capsule_logger(
|
|
614
|
-
f"🚀 {'' if not force_upgrade else 'Force'}
|
|
633
|
+
f"🚀 {'Upgrading' if not force_upgrade else 'Force upgrading'} {capsule.capsule_type.lower()} `{capsule.name}`....",
|
|
615
634
|
color=ColorTheme.INFO_COLOR,
|
|
616
635
|
system_msg=True,
|
|
617
636
|
)
|
|
@@ -632,7 +651,7 @@ def deploy(
|
|
|
632
651
|
capsule_spinner.stop()
|
|
633
652
|
|
|
634
653
|
logger(
|
|
635
|
-
f"💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) deployed! {capsule.capsule_type}
|
|
654
|
+
f"💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) deployed! {capsule.capsule_type} available on the URL: {capsule.url}",
|
|
636
655
|
color=ColorTheme.INFO_COLOR,
|
|
637
656
|
system_msg=True,
|
|
638
657
|
)
|
|
@@ -761,7 +780,7 @@ def list(ctx, project, branch, name, tags, format, auth_type):
|
|
|
761
780
|
def delete(ctx, name, cap_id, project, branch, tags, auto_approve):
|
|
762
781
|
|
|
763
782
|
"""Delete an app/apps from the Outerbounds Platform."""
|
|
764
|
-
#
|
|
783
|
+
# At least one of the args needs to be provided
|
|
765
784
|
if not any(
|
|
766
785
|
[
|
|
767
786
|
name is not None,
|
|
@@ -772,7 +791,7 @@ def delete(ctx, name, cap_id, project, branch, tags, auto_approve):
|
|
|
772
791
|
]
|
|
773
792
|
):
|
|
774
793
|
raise AppConfigError(
|
|
775
|
-
"
|
|
794
|
+
"At least one of the options need to be provided. You can use --name, --id, --project, --branch, --tag"
|
|
776
795
|
)
|
|
777
796
|
|
|
778
797
|
capsule_api = CapsuleApi(ctx.obj.api_url, ctx.obj.perimeter)
|
|
@@ -45,10 +45,11 @@ def _try_loading_yaml(file):
|
|
|
45
45
|
class AuthType:
|
|
46
46
|
BROWSER = "Browser"
|
|
47
47
|
API = "API"
|
|
48
|
+
BROWSER_AND_API = "BrowserAndApi"
|
|
48
49
|
|
|
49
50
|
@classmethod
|
|
50
51
|
def enums(cls):
|
|
51
|
-
return [cls.BROWSER, cls.API]
|
|
52
|
+
return [cls.BROWSER, cls.API, cls.BROWSER_AND_API]
|
|
52
53
|
|
|
53
54
|
@classproperty
|
|
54
55
|
def default(cls):
|
|
@@ -7,7 +7,7 @@ import sys
|
|
|
7
7
|
import time
|
|
8
8
|
from functools import partial
|
|
9
9
|
import shlex
|
|
10
|
-
from typing import Optional, List, Dict, Any, Tuple, Union
|
|
10
|
+
from typing import Optional, List, Dict, Any, Tuple, Union, Callable
|
|
11
11
|
from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
|
|
12
12
|
from .app_config import AppConfig, CAPSULE_DEBUG, AuthType
|
|
13
13
|
from . import experimental
|
|
@@ -44,24 +44,24 @@ class CapsuleStateMachine:
|
|
|
44
44
|
- Happy Path:
|
|
45
45
|
- First time Create :
|
|
46
46
|
- wait for status.updateInProgress to be set to False
|
|
47
|
-
- (
|
|
47
|
+
- (interleaved) Poll the worker endpoints to check their status
|
|
48
48
|
- showcase how many workers are coming up if things are on the cli side.
|
|
49
49
|
- If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
|
|
50
50
|
serve traffic.
|
|
51
51
|
- once the status.updateInProgress is set to False, it means that the replicas are ready
|
|
52
52
|
- Upgrade:
|
|
53
53
|
- wait for status.updateInProgress to be set to False
|
|
54
|
-
- (
|
|
54
|
+
- (interleaved) Poll the worker endpoints to check their status and signal the user the number of replicas coming up
|
|
55
55
|
- If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
|
|
56
56
|
serve traffic.
|
|
57
57
|
- Unhappy Path:
|
|
58
58
|
- First time Create :
|
|
59
59
|
- wait for status.updateInProgress to be set to False,
|
|
60
|
-
- (
|
|
60
|
+
- (interleaved) Poll the workers to check their status.
|
|
61
61
|
- If the worker pertaining to the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.
|
|
62
62
|
- Upgrade:
|
|
63
63
|
- wait for status.updateInProgress to be set to False,
|
|
64
|
-
- (
|
|
64
|
+
- (interleaved) Poll the workers to check their status.
|
|
65
65
|
- If the worker pertaining to the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.
|
|
66
66
|
|
|
67
67
|
"""
|
|
@@ -75,7 +75,6 @@ class CapsuleStateMachine:
|
|
|
75
75
|
return self._status_trail
|
|
76
76
|
|
|
77
77
|
def add_status(self, status: CapsuleStatus):
|
|
78
|
-
assert type(status) == dict, "TODO: Make this check somewhere else"
|
|
79
78
|
self._status_trail.append({"timestamp": time.time(), "status": status})
|
|
80
79
|
|
|
81
80
|
@property
|
|
@@ -116,7 +115,9 @@ class CapsuleStateMachine:
|
|
|
116
115
|
pass
|
|
117
116
|
|
|
118
117
|
def save_debug_info(self, state_dir: str):
|
|
119
|
-
debug_path = os.path.join(
|
|
118
|
+
debug_path = os.path.join(
|
|
119
|
+
state_dir, f"debug_capsule_sm_{self._capsule_id}.json"
|
|
120
|
+
)
|
|
120
121
|
with open(debug_path, "w") as f:
|
|
121
122
|
json.dump(self._status_trail, f, indent=4)
|
|
122
123
|
|
|
@@ -210,9 +211,9 @@ class CapsuleInput:
|
|
|
210
211
|
def construct_exec_command(cls, commands: List[str]):
|
|
211
212
|
commands = ["set -eEuo pipefail"] + commands
|
|
212
213
|
command_string = "\n".join(commands)
|
|
213
|
-
# First
|
|
214
|
+
# First construct a base64 encoded string of the quoted command
|
|
214
215
|
# One of the reasons we don't directly pass the command string to the backend with a `\n` join
|
|
215
|
-
# is because the backend controller
|
|
216
|
+
# is because the backend controller doesn't play nice when the command can be a multi-line string.
|
|
216
217
|
# So we encode it to a base64 string and then decode it back to a command string at runtime to provide to
|
|
217
218
|
# `bash -c`. The ideal thing to have done is to run "bash -c {shlex.quote(command_string)}" and call it a day
|
|
218
219
|
# but the backend controller yields the following error:
|
|
@@ -255,6 +256,12 @@ class CapsuleInput:
|
|
|
255
256
|
replicas.get("min"),
|
|
256
257
|
replicas.get("max"),
|
|
257
258
|
)
|
|
259
|
+
rpm = replicas.get("scaling_policy", {}).get("rpm", None)
|
|
260
|
+
autoscaling_config = {}
|
|
261
|
+
if rpm:
|
|
262
|
+
autoscaling_config = {
|
|
263
|
+
"requestRateBasedAutoscalingConfig": {"targetRequestsPerMinute": rpm}
|
|
264
|
+
}
|
|
258
265
|
if fixed is not None:
|
|
259
266
|
_min, _max = fixed, fixed
|
|
260
267
|
gpu_resource = app_config.get_state("resources").get("gpu")
|
|
@@ -296,6 +303,7 @@ class CapsuleInput:
|
|
|
296
303
|
"autoscalingConfig": {
|
|
297
304
|
"minReplicas": _min,
|
|
298
305
|
"maxReplicas": _max,
|
|
306
|
+
**autoscaling_config,
|
|
299
307
|
},
|
|
300
308
|
**_scheduling_config,
|
|
301
309
|
"containerStartupConfig": {
|
|
@@ -420,7 +428,7 @@ class CapsuleApi:
|
|
|
420
428
|
message="Capsule JSON decode failed",
|
|
421
429
|
)
|
|
422
430
|
|
|
423
|
-
def get(self, capsule_id: str):
|
|
431
|
+
def get(self, capsule_id: str) -> Dict[str, Any]:
|
|
424
432
|
_url = os.path.join(self._base_url, capsule_id)
|
|
425
433
|
response = self._wrapped_api_caller(
|
|
426
434
|
requests.get,
|
|
@@ -439,6 +447,35 @@ class CapsuleApi:
|
|
|
439
447
|
message="Capsule JSON decode failed",
|
|
440
448
|
)
|
|
441
449
|
|
|
450
|
+
# TODO: refactor me since name *currently(9/8/25)* is unique across capsules.
|
|
451
|
+
def get_by_name(self, name: str, most_recent_only: bool = True):
|
|
452
|
+
_url = os.path.join(self._base_url, f"?displayName={name}")
|
|
453
|
+
response = self._wrapped_api_caller(
|
|
454
|
+
requests.get,
|
|
455
|
+
_url,
|
|
456
|
+
retryable_status_codes=[409], # todo : verify me
|
|
457
|
+
conn_error_retries=3,
|
|
458
|
+
)
|
|
459
|
+
try:
|
|
460
|
+
if most_recent_only:
|
|
461
|
+
result = response.json()
|
|
462
|
+
candidates = result["capsules"]
|
|
463
|
+
if not candidates:
|
|
464
|
+
return None
|
|
465
|
+
return sorted(
|
|
466
|
+
candidates, key=lambda x: x["metadata"]["createdAt"], reverse=True
|
|
467
|
+
)[0]
|
|
468
|
+
else:
|
|
469
|
+
return response.json()
|
|
470
|
+
except json.JSONDecodeError as e:
|
|
471
|
+
raise CapsuleApiException(
|
|
472
|
+
_url,
|
|
473
|
+
"get",
|
|
474
|
+
response.status_code,
|
|
475
|
+
response.text,
|
|
476
|
+
message="Capsule JSON decode failed",
|
|
477
|
+
)
|
|
478
|
+
|
|
442
479
|
def list(self):
|
|
443
480
|
response = self._wrapped_api_caller(
|
|
444
481
|
requests.get,
|
|
@@ -641,7 +678,7 @@ class CapsuleDeployer:
|
|
|
641
678
|
auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
|
|
642
679
|
if auth_type == AuthType.BROWSER:
|
|
643
680
|
return "App"
|
|
644
|
-
elif auth_type == AuthType.API:
|
|
681
|
+
elif auth_type == AuthType.API or auth_type == AuthType.BROWSER_AND_API:
|
|
645
682
|
return "Endpoint"
|
|
646
683
|
else:
|
|
647
684
|
raise TODOException(f"Unknown auth type: {auth_type}")
|
|
@@ -682,7 +719,7 @@ class CapsuleDeployer:
|
|
|
682
719
|
"""
|
|
683
720
|
- `capsule_response.version` contains the version of the object present in the database
|
|
684
721
|
- `current_deployment_instance_version` contains the version of the object that was deployed by this instance of the deployer.
|
|
685
|
-
In the
|
|
722
|
+
In the situation that the versions of the objects mismatch, it means that the current deployment process is not giving the user the
|
|
686
723
|
output that they desire.
|
|
687
724
|
"""
|
|
688
725
|
if capsule_response.get("version", None) != current_deployment_instance_version:
|
|
@@ -691,17 +728,51 @@ class CapsuleDeployer:
|
|
|
691
728
|
f"A capsule upgrade was triggered outside current deployment instance. Current deployment version was discarded. Current deployment version: {current_deployment_instance_version} and new version: {capsule_response.get('version', None)}",
|
|
692
729
|
)
|
|
693
730
|
|
|
731
|
+
def _update_capsule_and_worker_sm(
|
|
732
|
+
self,
|
|
733
|
+
capsule_sm: "CapsuleStateMachine",
|
|
734
|
+
workers_sm: "CapsuleWorkersStateMachine",
|
|
735
|
+
logger: Callable[[str], None],
|
|
736
|
+
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
|
|
737
|
+
capsule_response = self.get()
|
|
738
|
+
capsule_sm.add_status(capsule_response.get("status", {})) # type: ignore
|
|
739
|
+
|
|
740
|
+
# We need to check if someone has not upgraded the capsule under the hood and
|
|
741
|
+
# the current deployment instance is invalid.
|
|
742
|
+
self._backend_version_mismatch_check(
|
|
743
|
+
capsule_response, self.current_deployment_instance_version # type: ignore
|
|
744
|
+
)
|
|
745
|
+
workers_response = self.get_workers()
|
|
746
|
+
capsule_sm.report_current_status(logger)
|
|
747
|
+
workers_sm.add_status(workers_response)
|
|
748
|
+
workers_sm.report_current_status(logger)
|
|
749
|
+
return capsule_response, workers_response
|
|
750
|
+
|
|
751
|
+
def _publish_capsule_debug_info(
|
|
752
|
+
self,
|
|
753
|
+
capsule_sm: "CapsuleStateMachine",
|
|
754
|
+
workers_sm: "CapsuleWorkersStateMachine",
|
|
755
|
+
capsule_response: Dict[str, Any],
|
|
756
|
+
):
|
|
757
|
+
if CAPSULE_DEBUG and self._debug_dir:
|
|
758
|
+
capsule_sm.save_debug_info(self._debug_dir)
|
|
759
|
+
workers_sm.save_debug_info(self._debug_dir)
|
|
760
|
+
debug_path = os.path.join(
|
|
761
|
+
self._debug_dir, f"debug_capsule_{self.identifier}.json"
|
|
762
|
+
)
|
|
763
|
+
with open(debug_path, "w") as f:
|
|
764
|
+
f.write(json.dumps(capsule_response, indent=4))
|
|
765
|
+
|
|
694
766
|
def _monitor_worker_readiness(
|
|
695
767
|
self,
|
|
696
768
|
workers_sm: "CapsuleWorkersStateMachine",
|
|
769
|
+
capsule_sm: "CapsuleStateMachine",
|
|
697
770
|
):
|
|
698
771
|
"""returns True if the worker is crashlooping, False otherwise"""
|
|
699
772
|
logger = self._logger_fn or partial(print, file=sys.stderr)
|
|
700
773
|
for i in range(self._readiness_wait_time):
|
|
701
774
|
time.sleep(1)
|
|
702
|
-
|
|
703
|
-
workers_sm.add_status(workers_response)
|
|
704
|
-
workers_sm.report_current_status(logger)
|
|
775
|
+
self._update_capsule_and_worker_sm(capsule_sm, workers_sm, logger)
|
|
705
776
|
if workers_sm.is_crashlooping:
|
|
706
777
|
return True
|
|
707
778
|
return False
|
|
@@ -713,7 +784,7 @@ class CapsuleDeployer:
|
|
|
713
784
|
workers_status: List[WorkerStatus],
|
|
714
785
|
):
|
|
715
786
|
for worker in workers_status:
|
|
716
|
-
if worker["phase"] == "CrashLoopBackOff":
|
|
787
|
+
if worker["phase"] == "CrashLoopBackOff" or worker["phase"] == "Failed":
|
|
717
788
|
return worker["workerId"]
|
|
718
789
|
return None
|
|
719
790
|
|
|
@@ -747,21 +818,19 @@ class CapsuleDeployer:
|
|
|
747
818
|
minimum_replicas=min_replicas,
|
|
748
819
|
)
|
|
749
820
|
self.status = state_machine
|
|
821
|
+
|
|
822
|
+
# This loop will check all the conditions that help verify the terminal state.
|
|
823
|
+
# How it works is by extracting the statuses of the capsule and workers and
|
|
824
|
+
# then adding them as a part of a state-machine that helps track transitions and
|
|
825
|
+
# helps derive terminal states.
|
|
826
|
+
# We will first keep checking for terminal conditions or outright failure conditions
|
|
827
|
+
# If we reach a terminal condition as described in `DEPLOYMENT_READY_CONDITIONS`, then
|
|
828
|
+
# we will further check for readiness conditions.
|
|
750
829
|
for i in range(self._create_timeout):
|
|
751
830
|
time.sleep(1)
|
|
752
|
-
capsule_response = self.
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
# We first need to check if someone has not upgraded the capsule under the hood and
|
|
756
|
-
# the current deployment instance is invalid.
|
|
757
|
-
self._backend_version_mismatch_check(
|
|
758
|
-
capsule_response, self.current_deployment_instance_version # type: ignore
|
|
831
|
+
capsule_response, _ = self._update_capsule_and_worker_sm(
|
|
832
|
+
state_machine, workers_state_machine, logger
|
|
759
833
|
)
|
|
760
|
-
state_machine.add_status(capsule_response.get("status", {})) # type: ignore
|
|
761
|
-
workers_state_machine.add_status(workers_response)
|
|
762
|
-
state_machine.report_current_status(logger)
|
|
763
|
-
|
|
764
|
-
workers_state_machine.report_current_status(logger)
|
|
765
834
|
# Deployment readiness checks will determine what is the terminal state
|
|
766
835
|
# of the workerstate machine. If we detect a terminal state in the workers,
|
|
767
836
|
# then even if the capsule upgrade is still in progress we will end up crashing
|
|
@@ -783,28 +852,29 @@ class CapsuleDeployer:
|
|
|
783
852
|
)
|
|
784
853
|
if capsule_ready or failure_condition_satisfied:
|
|
785
854
|
logger(
|
|
786
|
-
"💊 %s deployment status: %s
|
|
855
|
+
"💊 %s deployment status: %s "
|
|
787
856
|
% (
|
|
788
857
|
self.capsule_type.title(),
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
858
|
+
(
|
|
859
|
+
"in progress"
|
|
860
|
+
if state_machine.update_in_progress
|
|
861
|
+
else "completed"
|
|
862
|
+
),
|
|
794
863
|
)
|
|
795
864
|
)
|
|
796
865
|
_further_readiness_check_failed = False
|
|
797
866
|
if further_check_worker_readiness:
|
|
798
867
|
# HACK : monitor the workers for N seconds to make sure they are healthy
|
|
799
|
-
# this is a hack. Ideally we should
|
|
868
|
+
# this is a hack. Ideally we should implement a healthcheck as a first class citizen
|
|
800
869
|
# but it will take some time to do that so in the meanwhile a timeout set on the cli
|
|
801
870
|
# side will be really helpful.
|
|
802
871
|
logger(
|
|
803
|
-
"💊
|
|
872
|
+
"💊 Running last minute readiness check for %s..."
|
|
804
873
|
% self.identifier
|
|
805
874
|
)
|
|
806
875
|
_further_readiness_check_failed = self._monitor_worker_readiness(
|
|
807
|
-
workers_state_machine
|
|
876
|
+
workers_state_machine,
|
|
877
|
+
state_machine,
|
|
808
878
|
)
|
|
809
879
|
|
|
810
880
|
if CAPSULE_DEBUG:
|
|
@@ -848,13 +918,18 @@ class CapsuleDeployer:
|
|
|
848
918
|
|
|
849
919
|
break
|
|
850
920
|
|
|
851
|
-
|
|
852
|
-
state_machine
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
921
|
+
self._publish_capsule_debug_info(
|
|
922
|
+
state_machine, workers_state_machine, capsule_response
|
|
923
|
+
)
|
|
924
|
+
|
|
925
|
+
if CAPSULE_DEBUG and i % 3 == 0: # Every 3 seconds report the status
|
|
926
|
+
logger(
|
|
927
|
+
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status} | capsule_ready : {capsule_ready} | further_check_worker_readiness {further_check_worker_readiness}"
|
|
928
|
+
)
|
|
929
|
+
|
|
930
|
+
self._publish_capsule_debug_info(
|
|
931
|
+
state_machine, workers_state_machine, capsule_response
|
|
932
|
+
)
|
|
858
933
|
|
|
859
934
|
# We will only check ready_to_serve_traffic under the following conditions:
|
|
860
935
|
# If the readiness condition is not Async and min_replicas in this deployment
|
|
@@ -872,13 +947,6 @@ class CapsuleDeployer:
|
|
|
872
947
|
f"Capsule {self.identifier} failed to be ready to serve traffic",
|
|
873
948
|
)
|
|
874
949
|
|
|
875
|
-
if CAPSULE_DEBUG and self._debug_dir:
|
|
876
|
-
state_machine.save_debug_info(self._debug_dir)
|
|
877
|
-
workers_state_machine.save_debug_info(self._debug_dir)
|
|
878
|
-
logger(
|
|
879
|
-
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
|
|
880
|
-
)
|
|
881
|
-
|
|
882
950
|
return dict(
|
|
883
951
|
id=self.identifier,
|
|
884
952
|
auth_type=self.capsule_type,
|