ob-metaflow 2.10.7.4__py2.py3-none-any.whl → 2.10.9.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic.
- metaflow/cards.py +2 -0
- metaflow/decorators.py +1 -1
- metaflow/metaflow_config.py +4 -0
- metaflow/plugins/__init__.py +4 -0
- metaflow/plugins/airflow/airflow_cli.py +1 -1
- metaflow/plugins/argo/argo_workflows.py +5 -0
- metaflow/plugins/argo/argo_workflows_cli.py +1 -1
- metaflow/plugins/aws/aws_utils.py +1 -1
- metaflow/plugins/aws/batch/batch.py +4 -0
- metaflow/plugins/aws/batch/batch_cli.py +3 -0
- metaflow/plugins/aws/batch/batch_client.py +40 -11
- metaflow/plugins/aws/batch/batch_decorator.py +1 -0
- metaflow/plugins/aws/step_functions/step_functions.py +1 -0
- metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/cards/card_cli.py +413 -28
- metaflow/plugins/cards/card_client.py +16 -7
- metaflow/plugins/cards/card_creator.py +228 -0
- metaflow/plugins/cards/card_datastore.py +124 -26
- metaflow/plugins/cards/card_decorator.py +40 -86
- metaflow/plugins/cards/card_modules/base.html +12 -0
- metaflow/plugins/cards/card_modules/basic.py +74 -8
- metaflow/plugins/cards/card_modules/bundle.css +1 -170
- metaflow/plugins/cards/card_modules/card.py +65 -0
- metaflow/plugins/cards/card_modules/components.py +446 -81
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +9 -3
- metaflow/plugins/cards/card_modules/main.js +250 -21
- metaflow/plugins/cards/card_modules/test_cards.py +117 -0
- metaflow/plugins/cards/card_resolver.py +0 -2
- metaflow/plugins/cards/card_server.py +361 -0
- metaflow/plugins/cards/component_serializer.py +506 -42
- metaflow/plugins/cards/exception.py +20 -1
- metaflow/plugins/datastores/azure_storage.py +1 -2
- metaflow/plugins/datastores/gs_storage.py +1 -2
- metaflow/plugins/datastores/s3_storage.py +2 -1
- metaflow/plugins/datatools/s3/s3.py +24 -11
- metaflow/plugins/env_escape/client.py +2 -12
- metaflow/plugins/env_escape/client_modules.py +18 -14
- metaflow/plugins/env_escape/server.py +18 -11
- metaflow/plugins/env_escape/utils.py +12 -0
- metaflow/plugins/gcp/gs_exceptions.py +1 -1
- metaflow/plugins/gcp/gs_utils.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +43 -6
- metaflow/plugins/kubernetes/kubernetes_cli.py +40 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +73 -6
- metaflow/plugins/kubernetes/kubernetes_job.py +536 -161
- metaflow/plugins/pypi/conda_environment.py +5 -6
- metaflow/plugins/pypi/pip.py +2 -2
- metaflow/plugins/pypi/utils.py +15 -0
- metaflow/task.py +1 -0
- metaflow/version.py +1 -1
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/METADATA +1 -1
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/RECORD +57 -55
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.10.7.4.dist-info → ob_metaflow-2.10.9.2.dist-info}/top_level.txt +0 -0
@@ -2,20 +2,18 @@ import json
 import math
 import random
 import time
-
-
-
+import os
+import socket
+import copy
 
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
 
-
 class KubernetesJobException(MetaflowException):
     headline = "Kubernetes job error"
 
-
 # Implements truncated exponential backoff from
 # https://cloud.google.com/storage/docs/retry-strategy#exponential-backoff
 def k8s_retry(deadline_seconds=60, max_backoff=32):
@@ -78,107 +76,260 @@ class KubernetesJob(object):
         tmpfs_size = self._kwargs["tmpfs_size"]
         tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        jobset_name = "js-%s" % self._kwargs["attrs"]["metaflow.task_id"].split('-')[-1]
+        main_job_name = "control"
+        main_job_index = 0
+        main_pod_index = 0
+        subdomain = jobset_name
+        master_port = int(self._kwargs['port']) if self._kwargs['port'] else None
+
+        passwordless_ssh = self._kwargs["attrs"]["requires_passwordless_ssh"]
+        if passwordless_ssh:
+            passwordless_ssh_service_name = subdomain
+            passwordless_ssh_service_selector = {
+                "passwordless-ssh-jobset": "true"
+            }
+        else:
+            passwordless_ssh_service_name = None
+            passwordless_ssh_service_selector = {}
+
+        fqdn_suffix = "%s.svc.cluster.local" % self._kwargs["namespace"]
+        jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
+            jobset_name,
+            main_job_name,
+            main_job_index,
+            main_pod_index,
+            subdomain,
+            fqdn_suffix,
+        )
+
+        def _install_jobset(
+            repo_url="https://github.com/kubernetes-sigs/jobset",
+            python_sdk_path="jobset/sdk/python",
+        ):
+
+            # TODO (Eddie): Remove this and suggest to user.
+
+            import subprocess
+            import tempfile
+            import shutil
+            import os
+
+            with open(os.devnull, "wb") as devnull:
+                cwd = os.getcwd()
+                tmp_dir = tempfile.mkdtemp()
+                os.chdir(tmp_dir)
+                subprocess.check_call(
+                    ["git", "clone", repo_url], stdout=devnull, stderr=subprocess.STDOUT
+                )
+                tmp_python_sdk_path = os.path.join(tmp_dir, python_sdk_path)
+                os.chdir(tmp_python_sdk_path)
+                subprocess.check_call(
+                    ["pip", "install", "."], stdout=devnull, stderr=subprocess.STDOUT
+                )
+                os.chdir(cwd)
+                shutil.rmtree(tmp_dir)
+
+        def _get_passwordless_ssh_service():
+
+            return client.V1Service(
+                api_version="v1",
+                kind="Service",
+                metadata=client.V1ObjectMeta(
+                    name=passwordless_ssh_service_name,
+                    namespace=self._kwargs["namespace"]
+                ),
+                spec=client.V1ServiceSpec(
+                    cluster_ip="None",
+                    internal_traffic_policy="Cluster",
+                    ip_families=["IPv4"],
+                    ip_family_policy="SingleStack",
+                    selector=passwordless_ssh_service_selector,
+                    session_affinity="None",
+                    type="ClusterIP",
+                    ports=[
+                        client.V1ServicePort(
+                            name="control",
+                            port=22,
+                            protocol="TCP",
+                            target_port=22
+                        )
+                    ]
+                )
+            )
+
+        def _get_replicated_job(job_name, parallelism, command):
+            return jobset.models.jobset_v1alpha2_replicated_job.JobsetV1alpha2ReplicatedJob(
+                name=job_name,
+                template=client.V1JobTemplateSpec(
                     metadata=client.V1ObjectMeta(
                         annotations=self._kwargs.get("annotations", {}),
                         labels=self._kwargs.get("labels", {}),
                         namespace=self._kwargs["namespace"],
                     ),
-                    spec=client.
-                        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    spec=client.V1JobSpec(
+                        parallelism=parallelism, # how many jobs can run at once
+                        completions=parallelism, # how many Pods the JobSet creates in total
+                        backoff_limit=0,
+                        ttl_seconds_after_finished=7
+                        * 60
+                        * 60
+                        * 24,
+                        template=client.V1PodTemplateSpec(
+                            metadata=client.V1ObjectMeta(
+                                annotations=self._kwargs.get("annotations", {}),
+                                labels={
+                                    **self._kwargs.get("labels", {}),
+                                    **passwordless_ssh_service_selector, # TODO: necessary?
+                                    # TODO: cluster-name, app.kubernetes.io/name necessary?
+                                },
+                                namespace=self._kwargs["namespace"],
+                            ),
+                            spec=client.V1PodSpec(
+                                active_deadline_seconds=self._kwargs[
+                                    "timeout_in_seconds"
+                                ],
+                                containers=[
+                                    client.V1Container(
+                                        command=command,
+                                        ports=[client.V1ContainerPort(container_port=master_port)] if master_port and job_name=="control" else [],
+                                        env=[
+                                            client.V1EnvVar(name=k, value=str(v))
+                                            for k, v in self._kwargs.get(
+                                                "environment_variables", {}
+                                            ).items()
+                                        ]
+                                        + [
+                                            client.V1EnvVar(
+                                                name=k,
+                                                value_from=client.V1EnvVarSource(
+                                                    field_ref=client.V1ObjectFieldSelector(
+                                                        field_path=str(v)
+                                                    )
+                                                ),
+                                            )
+                                            for k, v in {
+                                                "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                                "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                                "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                                "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                            }.items()
+                                        ]
+                                        # Mimicking the AWS Batch Multinode env vars.
+                                        + [
+                                            client.V1EnvVar(
+                                                name="MASTER_ADDR",
+                                                value=jobset_main_addr,
+                                            ),
+                                            client.V1EnvVar(
+                                                name="MASTER_PORT",
+                                                value=str(master_port),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="RANK",
+                                                value_from=client.V1EnvVarSource(
+                                                    field_ref=client.V1ObjectFieldSelector(
+                                                        field_path="metadata.annotations['batch.kubernetes.io/job-completion-index']"
+                                                    )
+                                                ),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="WORLD_SIZE",
+                                                value=str(self._kwargs["num_parallel"]),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="PYTHONUNBUFFERED",
+                                                value="0",
+                                            ),
+                                        ],
+                                        env_from=[
+                                            client.V1EnvFromSource(
+                                                secret_ref=client.V1SecretEnvSource(
+                                                    name=str(k),
+                                                    # optional=True
+                                                )
                                             )
+                                            for k in list(
+                                                self._kwargs.get("secrets", [])
+                                            )
+                                            + KUBERNETES_SECRETS.split(",")
+                                            if k
+                                        ],
+                                        image=self._kwargs["image"],
+                                        image_pull_policy=self._kwargs[
+                                            "image_pull_policy"
+                                        ],
+                                        name=self._kwargs["step_name"].replace(
+                                            "_", "-"
                                         ),
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                                        resources=client.V1ResourceRequirements(
+                                            requests={
+                                                "cpu": str(self._kwargs["cpu"]),
+                                                "memory": "%sM"
+                                                % str(self._kwargs["memory"]),
+                                                "ephemeral-storage": "%sM"
+                                                % str(self._kwargs["disk"]),
+                                            },
+                                            limits={
+                                                "%s.com/gpu".lower()
+                                                % self._kwargs["gpu_vendor"]: str(
+                                                    self._kwargs["gpu"]
+                                                )
+                                                for k in [0]
+                                                # Don't set GPU limits if gpu isn't specified.
+                                                if self._kwargs["gpu"] is not None
+                                            },
+                                        ),
+                                        volume_mounts=(
+                                            [
+                                                client.V1VolumeMount(
+                                                    mount_path=self._kwargs.get(
+                                                        "tmpfs_path"
+                                                    ),
+                                                    name="tmpfs-ephemeral-volume",
+                                                )
+                                            ]
+                                            if tmpfs_enabled
+                                            else []
                                         )
+                                        + (
+                                            [
+                                                client.V1VolumeMount(
+                                                    mount_path=path, name=claim
+                                                )
+                                                for claim, path in self._kwargs[
+                                                    "persistent_volume_claims"
+                                                ].items()
+                                            ]
+                                            if self._kwargs["persistent_volume_claims"]
+                                            is not None
+                                            else []
+                                        ),
                                     )
-                                            for k in list(self._kwargs.get("secrets", []))
-                                            + KUBERNETES_SECRETS.split(",")
-                                            if k
                                 ],
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                            for k in [0]
-                                            # Don't set GPU limits if gpu isn't specified.
-                                            if self._kwargs["gpu"] is not None
-                                        },
-                                    ),
-                                    volume_mounts=(
+                                node_selector=self._kwargs.get("node_selector"),
+                                restart_policy="Never",
+
+                                set_hostname_as_fqdn=True, # configure pod hostname as pod's FQDN
+                                share_process_namespace=False, # default
+                                subdomain=subdomain, # FQDN = <hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>
+
+                                service_account_name=self._kwargs["service_account"],
+                                termination_grace_period_seconds=0,
+                                tolerations=[
+                                    client.V1Toleration(**toleration)
+                                    for toleration in self._kwargs.get("tolerations")
+                                    or []
+                                ],
+                                volumes=(
                                     [
-                                        client.
-                                            mount_path=self._kwargs.get("tmpfs_path"),
+                                        client.V1Volume(
                                             name="tmpfs-ephemeral-volume",
+                                            empty_dir=client.V1EmptyDirVolumeSource(
+                                                medium="Memory",
+                                                size_limit="{}Mi".format(tmpfs_size),
+                                            ),
                                         )
                                     ]
                                     if tmpfs_enabled
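The block above derives the control pod's stable address from the JobSet naming scheme: pods of a replicated job are named <jobset>-<replicatedJob>-<jobIndex>-<podIndex> and, together with the headless subdomain, resolve under <namespace>.svc.cluster.local. A minimal sketch of the string that jobset_main_addr ends up holding, assuming a task id ending in "abc123" and the "default" namespace (both hypothetical example values):

    jobset_name = "js-abc123"        # "js-%s" % task_id.split("-")[-1]
    main_job_name = "control"
    main_job_index = 0
    main_pod_index = 0
    subdomain = jobset_name          # doubles as the headless Service name
    fqdn_suffix = "default.svc.cluster.local"

    jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
        jobset_name,
        main_job_name,
        main_job_index,
        main_pod_index,
        subdomain,
        fqdn_suffix,
    )
    # -> "js-abc123-control-0-0.js-abc123.default.svc.cluster.local"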
@@ -186,72 +337,264 @@ class KubernetesJob(object):
                                 )
                                 + (
                                     [
-                                        client.
-
+                                        client.V1Volume(
+                                            name=claim,
+                                            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                                claim_name=claim
+                                            ),
                                         )
-                                        for claim
+                                        for claim in self._kwargs[
                                             "persistent_volume_claims"
-                                        ].
+                                        ].keys()
                                     ]
                                     if self._kwargs["persistent_volume_claims"]
                                     is not None
                                     else []
                                 ),
-                            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                            ),
+                        ),
+                    ),
+                ),
+            )
+
+        if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
+
+            try:
+                import jobset
+            except ImportError:
+                _install_jobset()
+                import jobset
+
+            main_commands = copy.copy(self._kwargs["command"])
+            main_commands[-1] = main_commands[-1].replace(
+                "[multinode-args]", "--split-index 0"
+            )
+
+            task_id = self._kwargs["attrs"]["metaflow.task_id"]
+            secondary_commands = copy.copy(self._kwargs["command"])
+            # RANK needs +1 because control node is not in the worker index group, yet we want global nodes.
+            # Technically, control and worker could be same replicated job type, but cleaner to separate for future use cases.
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                "[multinode-args]", "--split-index `expr $RANK + 1`"
+            )
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                "ubf_control", "ubf_task"
+            )
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                task_id,
+                task_id.replace("control-", "") + "-node-`expr $RANK + 1`",
+            )
+
+            if passwordless_ssh:
+                if not os.path.exists("/usr/sbin/sshd"):
+                    raise KubernetesJobException(
+                        "This @parallel decorator requires sshd to be installed in the container image."
+                        "Please install OpenSSH."
+                    )
+
+                # run sshd in background
+                main_commands[-1] = "/usr/sbin/sshd -D & %s" % main_commands[-1]
+                secondary_commands[-1] = "/usr/sbin/sshd -D & %s" % secondary_commands[-1]
+
+            self._jobset = jobset.models.jobset_v1alpha2_job_set.JobsetV1alpha2JobSet(
+                api_version="jobset.x-k8s.io/v1alpha2",
+                kind="JobSet",
+                metadata=client.V1ObjectMeta(
+                    annotations=self._kwargs.get("annotations", {}),
+                    labels=self._kwargs.get("labels", {}),
+                    name=jobset_name,
+                    namespace=self._kwargs["namespace"],
+                ),
+                spec=jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec(
+                    network=jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network(
+                        enable_dns_hostnames=True if not self._kwargs['attrs']['requires_passwordless_ssh'] else False,
+                        subdomain=subdomain
+                    ),
+                    replicated_jobs=[
+                        _get_replicated_job("control", 1, main_commands),
+                        _get_replicated_job(
+                            "worker",
+                            self._kwargs["num_parallel"] - 1,
+                            secondary_commands,
+                        ),
+                    ],
+                ),
+            )
+            self._passwordless_ssh_service = _get_passwordless_ssh_service()
+        else:
+            self._job = client.V1Job(
+                api_version="batch/v1",
+                kind="Job",
+                metadata=client.V1ObjectMeta(
+                    # Annotations are for humans
+                    annotations=self._kwargs.get("annotations", {}),
+                    # While labels are for Kubernetes
+                    labels=self._kwargs.get("labels", {}),
+                    generate_name=self._kwargs["generate_name"],
+                    namespace=self._kwargs["namespace"],  # Defaults to `default`
+                ),
+                spec=client.V1JobSpec(
+                    # Retries are handled by Metaflow when it is responsible for
+                    # executing the flow. The responsibility is moved to Kubernetes
+                    # when Argo Workflows is responsible for the execution.
+                    backoff_limit=self._kwargs.get("retries", 0),
+                    completions=1,  # A single non-indexed pod job
+                    ttl_seconds_after_finished=7
+                    * 60
+                    * 60  # Remove job after a week. TODO: Make this configurable
+                    * 24,
+                    template=client.V1PodTemplateSpec(
+                        metadata=client.V1ObjectMeta(
+                            annotations=self._kwargs.get("annotations", {}),
+                            labels=self._kwargs.get("labels", {}),
+                            namespace=self._kwargs["namespace"],
+                        ),
+                        spec=client.V1PodSpec(
+                            # Timeout is set on the pod and not the job (important!)
+                            active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                            # TODO (savin): Enable affinities for GPU scheduling.
+                            # affinity=?,
+                            containers=[
+                                client.V1Container(
+                                    command=self._kwargs["command"],
+                                    env=[
+                                        client.V1EnvVar(name=k, value=str(v))
+                                        for k, v in self._kwargs.get(
+                                            "environment_variables", {}
+                                        ).items()
+                                    ]
+                                    # And some downward API magic. Add (key, value)
+                                    # pairs below to make pod metadata available
+                                    # within Kubernetes container.
+                                    + [
+                                        client.V1EnvVar(
+                                            name=k,
+                                            value_from=client.V1EnvVarSource(
+                                                field_ref=client.V1ObjectFieldSelector(
+                                                    field_path=str(v)
+                                                )
+                                            ),
+                                        )
+                                        for k, v in {
+                                            "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                            "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                            "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                            "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                            "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                        }.items()
+                                    ],
+                                    env_from=[
+                                        client.V1EnvFromSource(
+                                            secret_ref=client.V1SecretEnvSource(
+                                                name=str(k),
+                                                # optional=True
+                                            )
+                                        )
+                                        for k in list(self._kwargs.get("secrets", []))
+                                        + KUBERNETES_SECRETS.split(",")
+                                        if k
+                                    ],
+                                    image=self._kwargs["image"],
+                                    image_pull_policy=self._kwargs["image_pull_policy"],
+                                    name=self._kwargs["step_name"].replace("_", "-"),
+                                    resources=client.V1ResourceRequirements(
+                                        requests={
+                                            "cpu": str(self._kwargs["cpu"]),
+                                            "memory": "%sM"
+                                            % str(self._kwargs["memory"]),
+                                            "ephemeral-storage": "%sM"
+                                            % str(self._kwargs["disk"]),
+                                        },
+                                        limits={
+                                            "%s.com/gpu".lower()
+                                            % self._kwargs["gpu_vendor"]: str(
+                                                self._kwargs["gpu"]
+                                            )
+                                            for k in [0]
+                                            # Don't set GPU limits if gpu isn't specified.
+                                            if self._kwargs["gpu"] is not None
+                                        },
                                     ),
-
-
-
-
-
-
-
-
-
-
-
+                                    volume_mounts=(
+                                        [
+                                            client.V1VolumeMount(
+                                                mount_path=self._kwargs.get(
+                                                    "tmpfs_path"
+                                                ),
+                                                name="tmpfs-ephemeral-volume",
+                                            )
+                                        ]
+                                        if tmpfs_enabled
+                                        else []
+                                    )
+                                    + (
+                                        [
+                                            client.V1VolumeMount(
+                                                mount_path=path, name=claim
+                                            )
+                                            for claim, path in self._kwargs[
+                                                "persistent_volume_claims"
+                                            ].items()
+                                        ]
+                                        if self._kwargs["persistent_volume_claims"]
+                                        is not None
+                                        else []
                                     ),
                                 )
-
-
-
-
-
-
+                            ],
+                            node_selector=self._kwargs.get("node_selector"),
+                            # TODO (savin): Support image_pull_secrets
+                            # image_pull_secrets=?,
+                            # TODO (savin): Support preemption policies
+                            # preemption_policy=?,
+                            #
+                            # A Container in a Pod may fail for a number of
+                            # reasons, such as because the process in it exited
+                            # with a non-zero exit code, or the Container was
+                            # killed due to OOM etc. If this happens, fail the pod
+                            # and let Metaflow handle the retries.
+                            restart_policy="Never",
+                            service_account_name=self._kwargs["service_account"],
+                            # Terminate the container immediately on SIGTERM
+                            termination_grace_period_seconds=0,
+                            tolerations=[
+                                client.V1Toleration(**toleration)
+                                for toleration in self._kwargs.get("tolerations") or []
+                            ],
+                            volumes=(
+                                [
+                                    client.V1Volume(
+                                        name="tmpfs-ephemeral-volume",
+                                        empty_dir=client.V1EmptyDirVolumeSource(
+                                            medium="Memory",
+                                            # Add default unit as ours differs from Kubernetes default.
+                                            size_limit="{}Mi".format(tmpfs_size),
+                                        ),
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1Volume(
+                                        name=claim,
+                                        persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                            claim_name=claim
+                                        ),
+                                    )
+                                    for claim in self._kwargs[
+                                        "persistent_volume_claims"
+                                    ].keys()
+                                ]
+                                if self._kwargs["persistent_volume_claims"] is not None
+                                else []
+                            ),
+                            # TODO (savin): Set termination_message_policy
                         ),
-                        # TODO (savin): Set termination_message_policy
                     ),
                 ),
-            )
-        )
+            )
         return self
 
     def execute(self):
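The parallel branch mirrors the AWS Batch multi-node environment: MASTER_ADDR and MASTER_PORT point at the control pod, RANK is injected from the pod's batch.kubernetes.io/job-completion-index annotation via the downward API, and WORLD_SIZE carries num_parallel. As a hypothetical illustration only (this consumer code is not part of the package), a worker task could join a process group from those variables, assuming PyTorch happens to be installed in the image:

    import os

    import torch.distributed as dist

    world_size = int(os.environ["WORLD_SIZE"])   # num_parallel
    # Worker pods add 1 because the control replicated job holds global rank 0,
    # matching the `expr $RANK + 1` rewrite applied to secondary_commands above.
    worker_rank = int(os.environ["RANK"]) + 1

    dist.init_process_group(
        backend="gloo",
        init_method="env://",  # env:// reads MASTER_ADDR / MASTER_PORT from the environment
        rank=worker_rank,
        world_size=world_size,
    )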
@@ -262,19 +605,53 @@ class KubernetesJob(object):
             # achieve the guarantees that we are seeking.
             # https://github.com/kubernetes/enhancements/issues/1040
             # Hopefully, we will be able to get creative with kube-batch
-
-
-            .
-
+
+            if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
+                # TODO (Eddie): this is kinda gross. fix it.
+                if self._kwargs["attrs"]["requires_passwordless_ssh"]:
+                    api_instance = client.CoreV1Api()
+                    api_response = api_instance.create_namespaced_service(namespace=self._kwargs['namespace'], body=self._passwordless_ssh_service)
+
+                with client.ApiClient() as api_client:
+                    api_instance = client.CustomObjectsApi(api_client)
+
+                    response = api_instance.create_namespaced_custom_object(
+                        body=self._jobset,
+                        group="jobset.x-k8s.io",
+                        version="v1alpha2",
+                        namespace=self._kwargs["namespace"],
+                        plural="jobsets",
                     )
-
-
-
-
-
-
-
-
+
+                    # HACK: Give K8s some time to actually create the job
+                    time.sleep(10)
+
+                    # TODO (Eddie): Remove hack and make RunningJobSet.
+                    # There are many jobs running that should be monitored.
+                    job_name = "%s-control-0" % response["metadata"]["name"]
+                    fake_id = 123
+                    return RunningJob(
+                        client=self._client,
+                        name=job_name,
+                        uid=fake_id,
+                        namespace=response["metadata"]["namespace"],
+                    )
+
+            else:
+                response = (
+                    client.BatchV1Api()
+                    .create_namespaced_job(
+                        body=self._job, namespace=self._kwargs["namespace"]
+                    )
+                    .to_dict()
+                )
+                return RunningJob(
+                    client=self._client,
+                    name=response["metadata"]["name"],
+                    uid=response["metadata"]["uid"],
+                    namespace=response["metadata"]["namespace"],
+                )
+
         except client.rest.ApiException as e:
             raise KubernetesJobException(
                 "Unable to launch Kubernetes job.\n %s"
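As the TODOs in this hunk note, only the control pod's Job is wrapped in RunningJob for monitoring. A sketch of how the JobSet created here could be polled with the same CustomObjectsApi coordinates; the namespace and name are example values, and reading status.conditions is an assumption about the JobSet CRD rather than something this diff relies on:

    from kubernetes import client, config

    config.load_incluster_config()  # or config.load_kube_config() outside the cluster
    api = client.CustomObjectsApi()

    js = api.get_namespaced_custom_object(
        group="jobset.x-k8s.io",
        version="v1alpha2",
        namespace="default",   # example namespace
        plural="jobsets",
        name="js-abc123",      # example JobSet name
    )
    # Terminal JobSets typically report a "Completed" or "Failed" condition.
    print([c.get("type") for c in js.get("status", {}).get("conditions", [])])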
@@ -330,7 +707,6 @@ class KubernetesJob(object):
 
 
 class RunningJob(object):
-
     # State Machine implementation for the lifecycle behavior documented in
     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
     #
@@ -450,7 +826,6 @@ class RunningJob(object):
         client = self._client.get()
         if not self.is_done:
             if self.is_running:
-
                 # Case 1.
                 from kubernetes.stream import stream
 