metaflow 2.11.1__py2.py3-none-any.whl → 2.11.3__py2.py3-none-any.whl
This diff shows the changes between two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- metaflow/flowspec.py +7 -3
- metaflow/metaflow_config.py +11 -1
- metaflow/parameters.py +6 -0
- metaflow/plugins/argo/argo_workflows.py +101 -23
- metaflow/plugins/aws/batch/batch.py +2 -0
- metaflow/plugins/aws/batch/batch_client.py +10 -2
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +28 -6
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +219 -4
- metaflow/plugins/aws/step_functions/step_functions_cli.py +104 -6
- metaflow/plugins/aws/step_functions/step_functions_client.py +8 -3
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -1
- metaflow/plugins/cards/card_cli.py +2 -2
- metaflow/plugins/kubernetes/kubernetes.py +2 -0
- metaflow/plugins/kubernetes/kubernetes_cli.py +3 -0
- metaflow/plugins/kubernetes/kubernetes_client.py +10 -2
- metaflow/plugins/kubernetes/kubernetes_decorator.py +17 -0
- metaflow/plugins/kubernetes/kubernetes_job.py +27 -0
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/conda_decorator.py +21 -1
- metaflow/plugins/pypi/conda_environment.py +21 -4
- metaflow/version.py +1 -1
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/METADATA +2 -2
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/RECORD +28 -28
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/LICENSE +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/WHEEL +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/entry_points.txt +0 -0
- {metaflow-2.11.1.dist-info → metaflow-2.11.3.dist-info}/top_level.txt +0 -0
metaflow/plugins/aws/step_functions/step_functions.py:

```diff
@@ -15,6 +15,7 @@ from metaflow.metaflow_config import (
     SFN_DYNAMO_DB_TABLE,
     SFN_EXECUTION_LOG_GROUP_ARN,
     SFN_IAM_ROLE,
+    SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH,
 )
 from metaflow.parameters import deploy_time_eval
 from metaflow.util import dict_to_cli_options, to_pascalcase
@@ -52,6 +53,7 @@ class StepFunctions(object):
         max_workers=None,
         workflow_timeout=None,
         is_project=False,
+        use_distributed_map=False,
     ):
         self.name = name
         self.graph = graph
@@ -70,6 +72,9 @@ class StepFunctions(object):
         self.max_workers = max_workers
         self.workflow_timeout = workflow_timeout
 
+        # https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/
+        self.use_distributed_map = use_distributed_map
+
         self._client = StepFunctionsClient()
         self._workflow = self._compile()
         self._cron = self._cron()
@@ -166,6 +171,13 @@ class StepFunctions(object):
 
         return schedule_deleted, sfn_deleted
 
+    @classmethod
+    def terminate(cls, flow_name, name):
+        client = StepFunctionsClient()
+        execution_arn, _, _, _ = cls.get_execution(flow_name, name)
+        response = client.terminate_execution(execution_arn)
+        return response
+
     @classmethod
     def trigger(cls, name, parameters):
         try:
@@ -234,6 +246,50 @@ class StepFunctions(object):
             )
             return None
 
+    @classmethod
+    def get_execution(cls, state_machine_name, name):
+        client = StepFunctionsClient()
+        try:
+            state_machine = client.get(state_machine_name)
+        except Exception as e:
+            raise StepFunctionsException(repr(e))
+        if state_machine is None:
+            raise StepFunctionsException(
+                "The state machine *%s* doesn't exist on AWS Step Functions."
+                % state_machine_name
+            )
+        try:
+            state_machine_arn = state_machine.get("stateMachineArn")
+            environment_vars = (
+                json.loads(state_machine.get("definition"))
+                .get("States")
+                .get("start")
+                .get("Parameters")
+                .get("ContainerOverrides")
+                .get("Environment")
+            )
+            parameters = {
+                item.get("Name"): item.get("Value") for item in environment_vars
+            }
+            executions = client.list_executions(state_machine_arn, states=["RUNNING"])
+            for execution in executions:
+                if execution.get("name") == name:
+                    try:
+                        return (
+                            execution.get("executionArn"),
+                            parameters.get("METAFLOW_OWNER"),
+                            parameters.get("METAFLOW_PRODUCTION_TOKEN"),
+                            parameters.get("SFN_STATE_MACHINE"),
+                        )
+                    except KeyError:
+                        raise StepFunctionsException(
+                            "A non-metaflow workflow *%s* already exists in AWS Step Functions."
+                            % name
+                        )
+            return None
+        except Exception as e:
+            raise StepFunctionsException(repr(e))
+
     def _compile(self):
         if self.flow._flow_decorators.get("trigger") or self.flow._flow_decorators.get(
             "trigger_on_finish"
@@ -314,17 +370,80 @@ class StepFunctions(object):
                     .parameter("SplitParentTaskId.$", "$.JobId")
                     .parameter("Parameters.$", "$.Parameters")
                     .parameter("Index.$", "$$.Map.Item.Value")
-                    .next(node.matching_join)
+                    .next(
+                        "%s_*GetManifest" % iterator_name
+                        if self.use_distributed_map
+                        else node.matching_join
+                    )
                     .iterator(
                         _visit(
                             self.graph[node.out_funcs[0]],
-                            Workflow(node.out_funcs[0]).start_at(node.out_funcs[0]),
+                            Workflow(node.out_funcs[0])
+                            .start_at(node.out_funcs[0])
+                            .mode(
+                                "DISTRIBUTED" if self.use_distributed_map else "INLINE"
+                            ),
                             node.matching_join,
                         )
                     )
                     .max_concurrency(self.max_workers)
-                    .output_path("$.[0]")
+                    # AWS Step Functions has a short coming for DistributedMap at the
+                    # moment that does not allow us to subset the output of for-each
+                    # to just a single element. We have to rely on a rather terrible
+                    # hack and resort to using ResultWriter to write the state to
+                    # Amazon S3 and process it in another task. But, well what can we
+                    # do...
+                    .result_writer(
+                        *(
+                            (
+                                (
+                                    SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH[len("s3://") :]
+                                    if SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH.startswith(
+                                        "s3://"
+                                    )
+                                    else SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH
+                                ).split("/", 1)
+                                + [""]
+                            )[:2]
+                            if self.use_distributed_map
+                            else (None, None)
+                        )
+                    )
+                    .output_path("$" if self.use_distributed_map else "$.[0]")
                 )
+                if self.use_distributed_map:
+                    workflow.add_state(
+                        State("%s_*GetManifest" % iterator_name)
+                        .resource("arn:aws:states:::aws-sdk:s3:getObject")
+                        .parameter("Bucket.$", "$.ResultWriterDetails.Bucket")
+                        .parameter("Key.$", "$.ResultWriterDetails.Key")
+                        .next("%s_*Map" % iterator_name)
+                        .result_selector("Body.$", "States.StringToJson($.Body)")
+                    )
+                    workflow.add_state(
+                        Map("%s_*Map" % iterator_name)
+                        .iterator(
+                            Workflow("%s_*PassWorkflow" % iterator_name)
+                            .mode("DISTRIBUTED")
+                            .start_at("%s_*Pass" % iterator_name)
+                            .add_state(
+                                Pass("%s_*Pass" % iterator_name)
+                                .end()
+                                .parameter("Output.$", "States.StringToJson($.Output)")
+                                .output_path("$.Output")
+                            )
+                        )
+                        .next(node.matching_join)
+                        .max_concurrency(1000)
+                        .item_reader(
+                            JSONItemReader()
+                            .resource("arn:aws:states:::s3:getObject")
+                            .parameter("Bucket.$", "$.Body.DestinationBucket")
+                            .parameter("Key.$", "$.Body.ResultFiles.SUCCEEDED.[0].Key")
+                        )
+                        .output_path("$.[0]")
+                    )
+
                 # Continue the traversal from the matching_join.
                 _visit(self.graph[node.matching_join], workflow, exit_node)
             # We shouldn't ideally ever get here.
```
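The starred argument to `.result_writer(...)` above reduces the configured `SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH` to a `(bucket, prefix)` pair. A quick worked example of that expression, using made-up values:

```python
# Worked example of the bucket/prefix split performed by .result_writer(...) above.
# The S3 paths below are illustrative, not real configuration values.
def split_s3_output_path(path):
    bucket, prefix = (
        (path[len("s3://"):] if path.startswith("s3://") else path).split("/", 1)
        + [""]
    )[:2]
    return bucket, prefix

print(split_s3_output_path("s3://my-metaflow-bucket/sfn-map-output"))
# ('my-metaflow-bucket', 'sfn-map-output')
print(split_s3_output_path("my-metaflow-bucket"))
# ('my-metaflow-bucket', '')
```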
metaflow/plugins/aws/step_functions/step_functions.py (continued):

```diff
@@ -393,7 +512,6 @@ class StepFunctions(object):
             "metaflow.owner": self.username,
             "metaflow.flow_name": self.flow.name,
             "metaflow.step_name": node.name,
-            "metaflow.run_id.$": "$$.Execution.Name",
             # Unfortunately we can't set the task id here since AWS Step
             # Functions lacks any notion of run-scoped task identifiers. We
             # instead co-opt the AWS Batch job id as the task id. This also
@@ -405,6 +523,10 @@ class StepFunctions(object):
             # `$$.State.RetryCount` resolves to an int dynamically and
             # AWS Batch job specification only accepts strings. We handle
             # retries/catch within AWS Batch to get around this limitation.
+            # And, we also cannot set the run id here since the run id maps to
+            # the execution name of the AWS Step Functions State Machine, which
+            # is different when executing inside a distributed map. We set it once
+            # in the start step and move it along to be consumed by all the children.
             "metaflow.version": self.environment.get_environment_info()[
                 "metaflow_version"
             ],
@@ -441,6 +563,12 @@ class StepFunctions(object):
             env["METAFLOW_S3_ENDPOINT_URL"] = S3_ENDPOINT_URL
 
         if node.name == "start":
+            # metaflow.run_id maps to AWS Step Functions State Machine Execution in all
+            # cases except for when within a for-each construct that relies on
+            # Distributed Map. To work around this issue, we pass the run id from the
+            # start step to all subsequent tasks.
+            attrs["metaflow.run_id.$"] = "$$.Execution.Name"
+
             # Initialize parameters for the flow in the `start` step.
             parameters = self._process_parameters()
             if parameters:
@@ -499,6 +627,8 @@ class StepFunctions(object):
                 env["METAFLOW_SPLIT_PARENT_TASK_ID"] = (
                     "$.Parameters.split_parent_task_id_%s" % node.split_parents[-1]
                 )
+                # Inherit the run id from the parent and pass it along to children.
+                attrs["metaflow.run_id.$"] = "$.Parameters.['metaflow.run_id']"
             else:
                 # Set appropriate environment variables for runtime replacement.
                 if len(node.in_funcs) == 1:
@@ -507,6 +637,8 @@ class StepFunctions(object):
                         % node.in_funcs[0]
                     )
                     env["METAFLOW_PARENT_TASK_ID"] = "$.JobId"
+                    # Inherit the run id from the parent and pass it along to children.
+                    attrs["metaflow.run_id.$"] = "$.Parameters.['metaflow.run_id']"
                 else:
                     # Generate the input paths in a quasi-compressed format.
                     # See util.decompress_list for why this is written the way
@@ -516,6 +648,8 @@ class StepFunctions(object):
                         "${METAFLOW_PARENT_%s_TASK_ID}" % (idx, idx)
                         for idx, _ in enumerate(node.in_funcs)
                     )
+                    # Inherit the run id from the parent and pass it along to children.
+                    attrs["metaflow.run_id.$"] = "$.[0].Parameters.['metaflow.run_id']"
                     for idx, _ in enumerate(node.in_funcs):
                         env["METAFLOW_PARENT_%s_TASK_ID" % idx] = "$.[%s].JobId" % idx
                         env["METAFLOW_PARENT_%s_STEP" % idx] = (
@@ -842,6 +976,12 @@ class Workflow(object):
         tree = lambda: defaultdict(tree)
         self.payload = tree()
 
+    def mode(self, mode):
+        self.payload["ProcessorConfig"] = {"Mode": mode}
+        if mode == "DISTRIBUTED":
+            self.payload["ProcessorConfig"]["ExecutionType"] = "STANDARD"
+        return self
+
     def start_at(self, start_at):
         self.payload["StartAt"] = start_at
         return self
@@ -889,10 +1029,18 @@ class State(object):
         self.payload["ResultPath"] = result_path
         return self
 
+    def result_selector(self, name, value):
+        self.payload["ResultSelector"][name] = value
+        return self
+
     def _partition(self):
         # This is needed to support AWS Gov Cloud and AWS CN regions
         return SFN_IAM_ROLE.split(":")[1]
 
+    def retry_strategy(self, retry_strategy):
+        self.payload["Retry"] = [retry_strategy]
+        return self
+
     def batch(self, job):
         self.resource(
             "arn:%s:states:::batch:submitJob.sync" % self._partition()
@@ -912,6 +1060,19 @@ class State(object):
         # tags may not be present in all scenarios
         if "tags" in job.payload:
             self.parameter("Tags", job.payload["tags"])
+        # set retry strategy for AWS Batch job submission to account for the
+        # measily 50 jobs / second queue admission limit which people can
+        # run into very quickly.
+        self.retry_strategy(
+            {
+                "ErrorEquals": ["Batch.AWSBatchException"],
+                "BackoffRate": 2,
+                "IntervalSeconds": 2,
+                "MaxDelaySeconds": 60,
+                "MaxAttempts": 10,
+                "JitterStrategy": "FULL",
+            }
+        )
         return self
 
     def dynamo_db(self, table_name, primary_key, values):
```
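The retry policy registered in `State.batch(...)` above follows the standard Step Functions `Retry` semantics as I understand them: the n-th retry waits roughly `IntervalSeconds * BackoffRate**(n - 1)` seconds, capped at `MaxDelaySeconds`, and `JitterStrategy: FULL` then randomizes each wait downwards. A back-of-the-envelope view of the worst-case schedule these values imply:

```python
# Worst-case retry schedule implied by the policy above
# (IntervalSeconds=2, BackoffRate=2, MaxDelaySeconds=60, MaxAttempts=10);
# full jitter makes the actual delays shorter.
delays = [min(2 * 2 ** (n - 1), 60) for n in range(1, 11)]
print(delays)       # [2, 4, 8, 16, 32, 60, 60, 60, 60, 60]
print(sum(delays))  # 362 seconds of cumulative backoff at most
```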
metaflow/plugins/aws/step_functions/step_functions.py (continued):

```diff
@@ -925,6 +1086,26 @@ class State(object):
         return self
 
 
+class Pass(object):
+    def __init__(self, name):
+        self.name = name
+        tree = lambda: defaultdict(tree)
+        self.payload = tree()
+        self.payload["Type"] = "Pass"
+
+    def end(self):
+        self.payload["End"] = True
+        return self
+
+    def parameter(self, name, value):
+        self.payload["Parameters"][name] = value
+        return self
+
+    def output_path(self, output_path):
+        self.payload["OutputPath"] = output_path
+        return self
+
+
 class Parallel(object):
     def __init__(self, name):
         self.name = name
@@ -986,3 +1167,37 @@ class Map(object):
     def result_path(self, result_path):
         self.payload["ResultPath"] = result_path
         return self
+
+    def item_reader(self, item_reader):
+        self.payload["ItemReader"] = item_reader.payload
+        return self
+
+    def result_writer(self, bucket, prefix):
+        if bucket is not None and prefix is not None:
+            self.payload["ResultWriter"] = {
+                "Resource": "arn:aws:states:::s3:putObject",
+                "Parameters": {
+                    "Bucket": bucket,
+                    "Prefix": prefix,
+                },
+            }
+        return self
+
+
+class JSONItemReader(object):
+    def __init__(self):
+        tree = lambda: defaultdict(tree)
+        self.payload = tree()
+        self.payload["ReaderConfig"] = {"InputType": "JSON", "MaxItems": 1}
+
+    def resource(self, resource):
+        self.payload["Resource"] = resource
+        return self
+
+    def parameter(self, name, value):
+        self.payload["Parameters"][name] = value
+        return self
+
+    def output_path(self, output_path):
+        self.payload["OutputPath"] = output_path
+        return self
```
metaflow/plugins/aws/step_functions/step_functions_cli.py:

```diff
@@ -1,23 +1,23 @@
 import base64
-from metaflow._vendor import click
-from hashlib import sha1
 import json
 import re
+from hashlib import sha1
 
-from metaflow import current, decorators, parameters
+from metaflow import JSONType, current, decorators, parameters
+from metaflow._vendor import click
+from metaflow.exception import MetaflowException, MetaflowInternalError
 from metaflow.metaflow_config import (
     SERVICE_VERSION_CHECK,
     SFN_STATE_MACHINE_PREFIX,
     UI_URL,
 )
-from metaflow.exception import MetaflowException, MetaflowInternalError
 from metaflow.package import MetaflowPackage
 from metaflow.plugins.aws.batch.batch_decorator import BatchDecorator
 from metaflow.tagging_util import validate_tags
 from metaflow.util import get_username, to_bytes, to_unicode, version_parse
 
+from .production_token import load_token, new_token, store_token
 from .step_functions import StepFunctions
-from .production_token import load_token, store_token, new_token
 
 VALID_NAME = re.compile(r"[^a-zA-Z0-9_\-\.]")
 
@@ -26,6 +26,10 @@ class IncorrectProductionToken(MetaflowException):
     headline = "Incorrect production token"
 
 
+class RunIdMismatch(MetaflowException):
+    headline = "Run ID mismatch"
+
+
 class IncorrectMetadataServiceVersion(MetaflowException):
     headline = "Incorrect version for metaflow service"
 
@@ -120,6 +124,12 @@ def step_functions(obj, name=None):
     help="Log AWS Step Functions execution history to AWS CloudWatch "
     "Logs log group.",
 )
+@click.option(
+    "--use-distributed-map/--no-use-distributed-map",
+    is_flag=True,
+    help="Use AWS Step Functions Distributed Map instead of Inline Map for "
+    "defining foreach tasks in Amazon State Language.",
+)
 @click.pass_obj
 def create(
     obj,
```
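The new `--use-distributed-map` flag switches the generated foreach `Map` state from Inline to Distributed mode, which targets very wide fan-outs that run into the Inline mode's concurrency and payload limits. A minimal, illustrative sketch of the kind of flow this is aimed at (flow, step, and value names are made up); it would be deployed with something like `python wide_fanout_flow.py step-functions create --use-distributed-map`:

```python
from metaflow import FlowSpec, step


class WideFanoutFlow(FlowSpec):
    """Illustrative flow with a very wide foreach fan-out."""

    @step
    def start(self):
        self.shards = list(range(50_000))  # far wider than an inline Map comfortably handles
        self.next(self.process, foreach="shards")

    @step
    def process(self):
        self.result = self.input * 2  # self.input is the current foreach item
        self.next(self.join)

    @step
    def join(self, inputs):
        self.total = sum(i.result for i in inputs)
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    WideFanoutFlow()
```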
metaflow/plugins/aws/step_functions/step_functions_cli.py (continued):

```diff
@@ -132,6 +142,7 @@ def create(
     max_workers=None,
     workflow_timeout=None,
     log_execution_history=False,
+    use_distributed_map=False,
 ):
     validate_tags(tags)
 
@@ -161,6 +172,7 @@ def create(
         max_workers,
         workflow_timeout,
         obj.is_project,
+        use_distributed_map,
     )
 
     if only_json:
@@ -269,7 +281,15 @@ def resolve_state_machine_name(obj, name):
 
 
 def make_flow(
-    obj, token, name, tags, namespace, max_workers, workflow_timeout, is_project
+    obj,
+    token,
+    name,
+    tags,
+    namespace,
+    max_workers,
+    workflow_timeout,
+    is_project,
+    use_distributed_map,
 ):
     if obj.flow_datastore.TYPE != "s3":
         raise MetaflowException("AWS Step Functions requires --datastore=s3.")
@@ -305,6 +325,7 @@ def make_flow(
         username=get_username(),
         workflow_timeout=workflow_timeout,
         is_project=is_project,
+        use_distributed_map=use_distributed_map,
     )
 
 
@@ -614,6 +635,83 @@ def delete(obj, authorize=None):
     )
 
 
+@step_functions.command(help="Terminate flow execution on Step Functions.")
+@click.option(
+    "--authorize",
+    default=None,
+    type=str,
+    help="Authorize the termination with a production token",
+)
+@click.argument("run-id", required=True, type=str)
+@click.pass_obj
+def terminate(obj, run_id, authorize=None):
+    def _token_instructions(flow_name, prev_user):
+        obj.echo(
+            "There is an existing version of *%s* on AWS Step Functions which was "
+            "deployed by the user *%s*." % (flow_name, prev_user)
+        )
+        obj.echo(
+            "To terminate this flow, you need to use the same production token that they used."
+        )
+        obj.echo(
+            "Please reach out to them to get the token. Once you have it, call "
+            "this command:"
+        )
+        obj.echo(" step-functions terminate --authorize MY_TOKEN RUN_ID", fg="green")
+        obj.echo(
+            'See "Organizing Results" at docs.metaflow.org for more information '
+            "about production tokens."
+        )
+
+    validate_run_id(
+        obj.state_machine_name, obj.token_prefix, authorize, run_id, _token_instructions
+    )
+
+    # Trim prefix from run_id
+    name = run_id[4:]
+    obj.echo(
+        "Terminating run *{run_id}* for {flow_name} ...".format(
+            run_id=run_id, flow_name=obj.flow.name
+        ),
+        bold=True,
+    )
+
+    terminated = StepFunctions.terminate(obj.state_machine_name, name)
+    if terminated:
+        obj.echo("\nRun terminated at %s." % terminated.get("stopDate"))
+
+
+def validate_run_id(
+    state_machine_name, token_prefix, authorize, run_id, instructions_fn=None
+):
+    if not run_id.startswith("sfn-"):
+        raise RunIdMismatch(
+            "Run IDs for flows executed through AWS Step Functions begin with 'sfn-'"
+        )
+
+    name = run_id[4:]
+    execution = StepFunctions.get_execution(state_machine_name, name)
+    if execution is None:
+        raise MetaflowException(
+            "Could not find the execution *%s* (in RUNNING state) for the state machine *%s* on AWS Step Functions"
+            % (name, state_machine_name)
+        )
+
+    _, owner, token, _ = execution
+
+    if authorize is None:
+        authorize = load_token(token_prefix)
+    elif authorize.startswith("production:"):
+        authorize = authorize[11:]
+
+    if owner != get_username() and authorize != token:
+        if instructions_fn:
+            instructions_fn(flow_name=name, prev_user=owner)
+        raise IncorrectProductionToken("Try again with the correct production token.")
+
+    return True
+
+
 def validate_token(name, token_prefix, authorize, instruction_fn=None):
     """
     Validate that the production token matches that of the deployed flow.
```
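Putting the pieces together, the new `terminate` command validates the production token via `validate_run_id`, strips the `sfn-` prefix from the run id, and delegates to `StepFunctions.terminate`. A hedged sketch of the equivalent programmatic path (the state machine name and run id are made up, and a RUNNING execution plus AWS credentials are assumed):

```python
# Hedged sketch of what `python flow.py step-functions terminate sfn-<name>` ends up
# doing; the state machine name and run id below are illustrative.
from metaflow.plugins.aws.step_functions.step_functions import StepFunctions

run_id = "sfn-0f4e9c2adeadbeef"   # Step Functions run ids carry an "sfn-" prefix
name = run_id[len("sfn-"):]       # the CLI trims the prefix (run_id[4:]) before calling AWS

response = StepFunctions.terminate("HelloFlow", name)  # first arg: deployed state machine name
if response:
    print("Run terminated at %s" % response.get("stopDate"))
```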
metaflow/plugins/aws/step_functions/step_functions_client.py:

```diff
@@ -81,9 +81,14 @@ class StepFunctionsClient(object):
             for execution in page["executions"]
         )
 
-    def terminate_execution(self,
-
-
+    def terminate_execution(self, execution_arn):
+        try:
+            response = self._client.stop_execution(executionArn=execution_arn)
+            return response
+        except self._client.exceptions.ExecutionDoesNotExist:
+            raise ValueError("The execution ARN %s does not exist." % execution_arn)
+        except Exception as e:
+            raise e
 
     def _default_logging_configuration(self, log_execution_history):
         if log_execution_history:
```
metaflow/plugins/cards/card_cli.py:

```diff
@@ -17,6 +17,7 @@ import random
 from contextlib import contextmanager
 from functools import wraps
 from metaflow.exception import MetaflowNamespaceMismatch
+
 from .card_datastore import CardDatastore, NUM_SHORT_HASH_CHARS
 from .exception import (
     CardClassFoundException,
@@ -736,8 +737,7 @@ def create(
 
     if error_stack_trace is not None and mode != "refresh":
         rendered_content = error_card().render(task, stack_trace=error_stack_trace)
-
-    if (
+    elif (
         rendered_info.is_implemented
         and rendered_info.timed_out
         and mode != "refresh"
```
metaflow/plugins/kubernetes/kubernetes.py:

```diff
@@ -174,6 +174,7 @@ class Kubernetes(object):
         persistent_volume_claims=None,
         tolerations=None,
         labels=None,
+        shared_memory=None,
     ):
         if env is None:
             env = {}
@@ -213,6 +214,7 @@ class Kubernetes(object):
                 tmpfs_size=tmpfs_size,
                 tmpfs_path=tmpfs_path,
                 persistent_volume_claims=persistent_volume_claims,
+                shared_memory=shared_memory,
             )
             .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
             .environment_variable("METAFLOW_CODE_URL", code_package_url)
```
metaflow/plugins/kubernetes/kubernetes_cli.py:

```diff
@@ -107,6 +107,7 @@ def kubernetes():
     type=JSONTypeClass(),
     multiple=False,
 )
+@click.option("--shared-memory", default=None, help="Size of shared memory in MiB")
 @click.pass_context
 def step(
     ctx,
@@ -132,6 +133,7 @@ def step(
     run_time_limit=None,
     persistent_volume_claims=None,
     tolerations=None,
+    shared_memory=None,
     **kwargs
 ):
     def echo(msg, stream="stderr", job_id=None, **kwargs):
@@ -245,6 +247,7 @@ def step(
             env=env,
             persistent_volume_claims=persistent_volume_claims,
             tolerations=tolerations,
+            shared_memory=shared_memory,
         )
     except Exception as e:
         traceback.print_exc(chain=False)
```
metaflow/plugins/kubernetes/kubernetes_client.py:

```diff
@@ -6,6 +6,7 @@ from metaflow.exception import MetaflowException
 
 from .kubernetes_job import KubernetesJob
 
+
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
 
 
@@ -32,11 +33,18 @@ class KubernetesClient(object):
     def _refresh_client(self):
         from kubernetes import client, config
 
-        if os.getenv("KUBERNETES_SERVICE_HOST"):
+        if os.getenv("KUBECONFIG"):
+            # There are cases where we're running inside a pod, but can't use
+            # the kubernetes client for that pod's cluster: for example when
+            # running in Bitbucket Cloud or other CI system.
+            # In this scenario, the user can set a KUBECONFIG environment variable
+            # to load the kubeconfig, regardless of whether we're in a pod or not.
+            config.load_kube_config()
+        elif os.getenv("KUBERNETES_SERVICE_HOST"):
             # We are inside a pod, authenticate via ServiceAccount assigned to us
             config.load_incluster_config()
         else:
-            #
+            # Default to using kubeconfig, likely $HOME/.kube/config
            # TODO (savin):
            # 1. Support generating kubeconfig on the fly using boto3
            # 2. Support auth via OIDC - https://docs.aws.amazon.com/eks/latest/userguide/authenticate-oidc-identity-provider.html
```
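The change above lets an explicitly set `KUBECONFIG` win even when the process is itself running inside a pod (for example a CI runner), before falling back to in-cluster authentication and then to the default kubeconfig. A hedged example of using that escape hatch from a CI job; the path and flow file are made up:

```python
# Illustrative only: point Metaflow at a specific cluster from a CI pod by
# exporting KUBECONFIG before launching the run.
import os
import subprocess

env = dict(os.environ, KUBECONFIG="/ci/secrets/kubeconfig")  # hypothetical path
subprocess.run(
    ["python", "flow.py", "run", "--with", "kubernetes"],
    env=env,
    check=True,
)
```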
metaflow/plugins/kubernetes/kubernetes_decorator.py:

```diff
@@ -20,6 +20,7 @@ from metaflow.metaflow_config import (
     KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
     KUBERNETES_TOLERATIONS,
     KUBERNETES_SERVICE_ACCOUNT,
+    KUBERNETES_SHARED_MEMORY,
 )
 from metaflow.plugins.resources_decorator import ResourcesDecorator
 from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -87,6 +88,8 @@ class KubernetesDecorator(StepDecorator):
     persistent_volume_claims : Dict[str, str], optional, default None
         A map (dictionary) of persistent volumes to be mounted to the pod for this step. The map is from persistent
         volumes to the path to which the volume is to be mounted, e.g., `{'pvc-name': '/path/to/mount/on'}`.
+    shared_memory: int, optional
+        Shared memory size (in MiB) required for this step
     """
 
     name = "kubernetes"
@@ -109,6 +112,7 @@ class KubernetesDecorator(StepDecorator):
         "tmpfs_size": None,
         "tmpfs_path": "/metaflow_temp",
         "persistent_volume_claims": None, # e.g., {"pvc-name": "/mnt/vol", "another-pvc": "/mnt/vol2"}
+        "shared_memory": None,
     }
     package_url = None
     package_sha = None
@@ -194,6 +198,8 @@ class KubernetesDecorator(StepDecorator):
         if not self.attributes["tmpfs_size"]:
             # default tmpfs behavior - https://man7.org/linux/man-pages/man5/tmpfs.5.html
             self.attributes["tmpfs_size"] = int(self.attributes["memory"]) // 2
+        if not self.attributes["shared_memory"]:
+            self.attributes["shared_memory"] = KUBERNETES_SHARED_MEMORY
 
     # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
     def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
@@ -289,6 +295,17 @@ class KubernetesDecorator(StepDecorator):
                     )
                 )
 
+        if self.attributes["shared_memory"]:
+            if not (
+                isinstance(self.attributes["shared_memory"], int)
+                and int(self.attributes["shared_memory"]) > 0
+            ):
+                raise KubernetesException(
+                    "Invalid shared_memory value: *{size}* for step *{step}* (should be an integer greater than 0)".format(
+                        size=self.attributes["shared_memory"], step=step
+                    )
+                )
+
     def package_init(self, flow, step_name, environment):
         try:
             # Kubernetes is a soft dependency.
```
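The new `shared_memory` attribute sizes the pod's shared memory in MiB (presumably backing `/dev/shm`, per the companion changes in `kubernetes_job.py`), defaults to the `KUBERNETES_SHARED_MEMORY` config value, and is rejected unless it is a positive integer. An illustrative use on a step; the flow, step, and sizes are made up:

```python
# Illustrative use of the new knob added in this release.
from metaflow import FlowSpec, kubernetes, step


class TrainFlow(FlowSpec):
    @kubernetes(memory=16000, shared_memory=1024)  # request ~1 GiB of shared memory
    @step
    def start(self):
        # e.g. PyTorch DataLoader workers exchanging tensors through shared memory
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    TrainFlow()
```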