ob-metaflow 2.15.7.1__py2.py3-none-any.whl → 2.15.10.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/cli.py +8 -0
- metaflow/cli_components/run_cmds.py +2 -2
- metaflow/cmd/main_cli.py +1 -1
- metaflow/metadata_provider/metadata.py +35 -0
- metaflow/metaflow_config.py +6 -0
- metaflow/metaflow_environment.py +6 -1
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_version.py +2 -2
- metaflow/plugins/__init__.py +1 -0
- metaflow/plugins/argo/argo_workflows.py +43 -6
- metaflow/plugins/argo/argo_workflows_cli.py +12 -0
- metaflow/plugins/aws/aws_client.py +4 -3
- metaflow/plugins/datatools/s3/s3.py +54 -45
- metaflow/plugins/datatools/s3/s3op.py +149 -62
- metaflow/plugins/kubernetes/kubernetes.py +4 -0
- metaflow/plugins/kubernetes/kubernetes_cli.py +8 -0
- metaflow/plugins/kubernetes/kubernetes_decorator.py +10 -0
- metaflow/plugins/kubernetes/kubernetes_job.py +8 -0
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +7 -0
- metaflow/plugins/pypi/conda_decorator.py +2 -1
- metaflow/plugins/pypi/conda_environment.py +1 -0
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +100 -0
- metaflow/plugins/uv/uv_environment.py +70 -0
- metaflow/runner/deployer.py +8 -2
- metaflow/runner/deployer_impl.py +6 -2
- metaflow/runner/metaflow_runner.py +7 -2
- metaflow/version.py +1 -1
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/Makefile +2 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/RECORD +37 -33
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/Tiltfile +0 -0
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/licenses/LICENSE +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/top_level.txt +0 -0
metaflow/cli.py
CHANGED
@@ -17,6 +17,7 @@ from .flowspec import _FlowState
 from .graph import FlowGraph
 from .metaflow_config import (
     DEFAULT_DATASTORE,
+    DEFAULT_DECOSPECS,
     DEFAULT_ENVIRONMENT,
     DEFAULT_EVENT_LOGGER,
     DEFAULT_METADATA,
@@ -509,9 +510,16 @@ def start(
 ):
     # run/resume are special cases because they can add more decorators with --with,
     # so they have to take care of themselves.
+
     all_decospecs = ctx.obj.tl_decospecs + list(
         ctx.obj.environment.decospecs() or []
     )
+
+    # We add the default decospecs for everything except init and step since in those
+    # cases, the decospecs will already have been handled by either a run/resume
+    # or a scheduler setting them up in their own way.
+    if ctx.saved_args[0] not in ("step", "init"):
+        all_decospecs += DEFAULT_DECOSPECS.split()
     if all_decospecs:
         decorators._attach_decorators(ctx.obj.flow, all_decospecs)
         decorators._init(ctx.obj.flow)
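The default-decospec handling above can be illustrated with a small standalone sketch (hypothetical helper and variable names; only DEFAULT_DECOSPECS and the step/init exclusion come from the diff):

def combined_decospecs(tl_decospecs, env_decospecs, default_decospecs, subcommand):
    # decorators requested on the command line plus those demanded by the environment
    all_decospecs = list(tl_decospecs) + list(env_decospecs or [])
    # step/init are skipped because run/resume or a scheduler set these up already
    if subcommand not in ("step", "init"):
        all_decospecs += default_decospecs.split()
    return all_decospecs

assert combined_decospecs([], [], "retry environment", "run") == ["retry", "environment"]
assert combined_decospecs([], [], "retry environment", "step") == []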
metaflow/cli_components/run_cmds.py
CHANGED
@@ -71,7 +71,7 @@ def write_file(file_path, content):
         f.write(str(content))


-def
+def config_callback(ctx, param, value):
     # Callback to:
     # - read the Click auto_envvar variable from both the
     # environment AND the configuration
@@ -127,7 +127,7 @@ def common_run_options(func):
         help="Add a decorator to all steps. You can specify this "
         "option multiple times to attach multiple decorators "
         "in steps.",
-        callback=
+        callback=config_callback,
     )
     @click.option(
         "--run-id-file",
metaflow/cmd/main_cli.py
CHANGED
@@ -94,7 +94,7 @@ def start(ctx):
         echo("(%s)\n" % version, fg="magenta", bold=False)

     if ctx.invoked_subcommand is None:
-        echo("More
+        echo("More AI, less engineering\n", fg="magenta")

         lnk_sz = max(len(lnk) for lnk in CONTACT_INFO.values()) + 1
         for what, lnk in CONTACT_INFO.items():
metaflow/metadata_provider/metadata.py
CHANGED
@@ -630,6 +630,20 @@ class MetadataProvider(object):
             sys_info["r_version"] = env["r_version_code"]
         return sys_info

+    def _get_git_info_as_dict(self):
+        git_info = {}
+        env = self._environment.get_environment_info()
+        for key in [
+            "repo_url",
+            "branch_name",
+            "commit_sha",
+            "has_uncommitted_changes",
+        ]:
+            if key in env and env[key]:
+                git_info[key] = env[key]
+
+        return git_info
+
     def _get_system_tags(self):
         """Convert system info dictionary into a list of system tags"""
         return [
@@ -670,6 +684,27 @@ class MetadataProvider(object):
                 tags=["attempt_id:{0}".format(attempt)],
             )
         )
+        # Add script name as metadata
+        script_name = self._environment.get_environment_info()["script"]
+        metadata.append(
+            MetaDatum(
+                field="script-name",
+                value=script_name,
+                type="script-name",
+                tags=["attempt_id:{0}".format(attempt)],
+            )
+        )
+        # And add git metadata
+        git_info = self._get_git_info_as_dict()
+        if git_info:
+            metadata.append(
+                MetaDatum(
+                    field="git-info",
+                    value=json.dumps(git_info),
+                    type="git-info",
+                    tags=["attempt_id:{0}".format(attempt)],
+                )
+            )
         if metadata:
             self.register_metadata(run_id, step_name, task_id, metadata)
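For reference, the git information ends up as a single JSON-encoded MetaDatum of type "git-info" next to the new "script-name" entry. A hedged sketch of the payload (values are hypothetical), with an assumed read path through the Metaflow client left as a comment:

import json

git_info = {
    "repo_url": "https://github.com/example/repo",  # hypothetical values
    "branch_name": "main",
    "commit_sha": "0123abcd",
    "has_uncommitted_changes": False,
}
payload = json.dumps(git_info)  # what gets stored as the "git-info" value

# Assumed way to read it back later (client usage not shown in this diff):
# from metaflow import Task
# Task("MyFlow/123/start/456").metadata_dict.get("git-info")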
metaflow/metaflow_config.py
CHANGED
@@ -109,6 +109,12 @@ S3_WORKER_COUNT = from_conf("S3_WORKER_COUNT", 64)
 # top-level retries)
 S3_TRANSIENT_RETRY_COUNT = from_conf("S3_TRANSIENT_RETRY_COUNT", 20)

+# S3 retry configuration used in the aws client
+# Use the adaptive retry strategy by default
+S3_CLIENT_RETRY_CONFIG = from_conf(
+    "S3_CLIENT_RETRY_CONFIG", {"max_attempts": 10, "mode": "adaptive"}
+)
+
 # Threshold to start printing warnings for an AWS retry
 RETRY_WARNING_THRESHOLD = 3

metaflow/metaflow_environment.py
CHANGED
@@ -4,6 +4,7 @@ import sys

 from .util import get_username
 from . import metaflow_version
+from . import metaflow_git
 from metaflow.exception import MetaflowException
 from metaflow.extension_support import dump_module_info
 from metaflow.mflog import BASH_MFLOG, BASH_FLUSH_LOGS
@@ -197,6 +198,10 @@ class MetaflowEnvironment(object):
             "python_version_code": "%d.%d.%d" % sys.version_info[:3],
             "metaflow_version": metaflow_version.get_version(),
             "script": os.path.basename(os.path.abspath(sys.argv[0])),
+            # Add git info
+            **metaflow_git.get_repository_info(
+                path=os.path.dirname(os.path.abspath(sys.argv[0]))
+            ),
         }
         if R.use_r():
             env["metaflow_r_version"] = R.metaflow_r_version()
@@ -206,7 +211,7 @@ class MetaflowEnvironment(object):
         # Information about extension modules (to load them in the proper order)
         ext_key, ext_val = dump_module_info()
         env[ext_key] = ext_val
-        return env
+        return {k: v for k, v in env.items() if v is not None and v != ""}

     def executable(self, step_name, default=None):
         if default is not None:
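The net effect of the environment change is that git keys are merged into the info dict only when available, and empty values are dropped before the dict is returned; a minimal sketch with hypothetical values:

env = {
    "script": "flow.py",
    "metaflow_version": "2.15.10",
    "repo_url": None,   # e.g. the script does not live in a git checkout
    "branch_name": "",
}
filtered = {k: v for k, v in env.items() if v is not None and v != ""}
assert filtered == {"script": "flow.py", "metaflow_version": "2.15.10"}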
metaflow/metaflow_git.py
ADDED
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+"""Get git repository information for the package
+
+Functions to retrieve git repository details like URL, branch name,
+and commit SHA for Metaflow code provenance tracking.
+"""
+
+import os
+import subprocess
+from typing import Dict, List, Optional, Tuple, Union
+
+# Cache for git information to avoid repeated subprocess calls
+_git_info_cache = None
+
+__all__ = ("get_repository_info",)
+
+
+def _call_git(
+    args: List[str], path=Union[str, os.PathLike]
+) -> Tuple[Optional[str], Optional[int], bool]:
+    """
+    Call git with provided args.
+
+    Returns
+    -------
+    tuple : Tuple containing
+        (stdout, exitcode, failure) of the call
+    """
+    try:
+        result = subprocess.run(
+            ["git", *args],
+            cwd=path,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        return result.stdout.strip(), result.returncode, False
+    except (OSError, subprocess.SubprocessError):
+        # Covers subprocess timeouts and other errors which would not lead to an exit code
+        return None, None, True
+
+
+def _get_repo_url(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the repository URL from git config"""
+    stdout, returncode, _failed = _call_git(
+        ["config", "--get", "remote.origin.url"], path
+    )
+    if returncode == 0:
+        url = stdout
+        # Convert SSH URLs to HTTPS for clickable links
+        if url.startswith("git@"):
+            parts = url.split(":", 1)
+            if len(parts) == 2:
+                domain = parts[0].replace("git@", "")
+                repo_path = parts[1]
+                url = f"https://{domain}/{repo_path}"
+        return url
+    return None
+
+
+def _get_branch_name(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the current git branch name"""
+    stdout, returncode, _failed = _call_git(["rev-parse", "--abbrev-ref", "HEAD"], path)
+    return stdout if returncode == 0 else None
+
+
+def _get_commit_sha(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the current git commit SHA"""
+    stdout, returncode, _failed = _call_git(["rev-parse", "HEAD"], path)
+    return stdout if returncode == 0 else None
+
+
+def _is_in_git_repo(path: Union[str, os.PathLike]) -> bool:
+    """Check if we're currently in a git repository"""
+    stdout, returncode, _failed = _call_git(
+        ["rev-parse", "--is-inside-work-tree"], path
+    )
+    return returncode == 0 and stdout == "true"
+
+
+def _has_uncommitted_changes(path: Union[str, os.PathLike]) -> Optional[bool]:
+    """Check if the git repository has uncommitted changes"""
+    _stdout, returncode, failed = _call_git(
+        ["diff-index", "--quiet", "HEAD", "--"], path
+    )
+    if failed:
+        return None
+    return returncode != 0
+
+
+def get_repository_info(path: Union[str, os.PathLike]) -> Dict[str, Union[str, bool]]:
+    """Get git repository information for a path
+
+    Returns:
+        dict: Dictionary containing:
+            repo_url: Repository URL (converted to HTTPS if from SSH)
+            branch_name: Current branch name
+            commit_sha: Current commit SHA
+            has_uncommitted_changes: Boolean indicating if there are uncommitted changes
+    """
+    global _git_info_cache
+
+    if _git_info_cache is not None:
+        return _git_info_cache
+
+    _git_info_cache = {}
+    if _is_in_git_repo(path):
+        _git_info_cache = {
+            "repo_url": _get_repo_url(path),
+            "branch_name": _get_branch_name(path),
+            "commit_sha": _get_commit_sha(path),
+            "has_uncommitted_changes": _has_uncommitted_changes(path),
+        }
+
+    return _git_info_cache
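A hedged usage sketch for the new module (assuming the installed metaflow package layout): outside a git work tree the function returns an empty dict, inside one it returns the four keys above, and the module-level cache means repeated calls reuse the first result.

import os
from metaflow import metaflow_git

info = metaflow_git.get_repository_info(path=os.getcwd())
if info:
    print(info["repo_url"], info["branch_name"], info["commit_sha"],
          info["has_uncommitted_changes"])
else:
    print("not inside a git repository")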
metaflow/metaflow_version.py
CHANGED
@@ -27,11 +27,11 @@ if name == "nt":
         """find the path to the git executable on Windows"""
         # first see if git is in the path
         try:
-            check_output(["where", "/Q", "git"])
+            subprocess.check_output(["where", "/Q", "git"])
             # if this command succeeded, git is in the path
             return "git"
         # catch the exception thrown if git was not found
-        except CalledProcessError:
+        except subprocess.CalledProcessError:
             pass
         # There are several locations where git.exe may be hiding
         possible_locations = []
metaflow/plugins/__init__.py
CHANGED

metaflow/plugins/argo/argo_workflows.py
CHANGED
@@ -7,6 +7,7 @@ import sys
 from collections import defaultdict
 from hashlib import sha1
 from math import inf
+from typing import List

 from metaflow import JSONType, current
 from metaflow.decorators import flow_decorators
@@ -111,6 +112,7 @@ class ArgoWorkflows(object):
         notify_pager_duty_integration_key=None,
         notify_incident_io_api_key=None,
         incident_io_alert_source_config_id=None,
+        incident_io_metadata: List[str] = None,
         enable_heartbeat_daemon=True,
         enable_error_msg_capture=False,
     ):
@@ -162,6 +164,9 @@ class ArgoWorkflows(object):
         self.notify_pager_duty_integration_key = notify_pager_duty_integration_key
         self.notify_incident_io_api_key = notify_incident_io_api_key
         self.incident_io_alert_source_config_id = incident_io_alert_source_config_id
+        self.incident_io_metadata = self.parse_incident_io_metadata(
+            incident_io_metadata
+        )
         self.enable_heartbeat_daemon = enable_heartbeat_daemon
         self.enable_error_msg_capture = enable_error_msg_capture
         self.parameters = self._process_parameters()
@@ -288,6 +293,21 @@ class ArgoWorkflows(object):

         return True

+    @staticmethod
+    def parse_incident_io_metadata(metadata: List[str] = None):
+        "parse key value pairs into a dict for incident.io metadata if given"
+        parsed_metadata = None
+        if metadata is not None:
+            parsed_metadata = {}
+            for kv in metadata:
+                key, value = kv.split("=", 1)
+                if key in parsed_metadata:
+                    raise MetaflowException(
+                        "Incident.io Metadata *%s* provided multiple times" % key
+                    )
+                parsed_metadata[key] = value
+        return parsed_metadata
+
     @classmethod
     def trigger(cls, name, parameters=None):
         if parameters is None:
@@ -1972,6 +1992,15 @@ class ArgoWorkflows(object):
                 resources["disk"],
             )

+            security_context = resources.get("security_context", None)
+            _security_context = {}
+            if security_context is not None and len(security_context) > 0:
+                _security_context = {
+                    "security_context": kubernetes_sdk.V1SecurityContext(
+                        **security_context
+                    )
+                }
+
             # Create a ContainerTemplate for this node. Ideally, we would have
             # liked to inline this ContainerTemplate and avoid scanning the workflow
             # twice, but due to issues with variable substitution, we will have to
@@ -2028,6 +2057,7 @@ class ArgoWorkflows(object):
                 shared_memory=shared_memory,
                 port=port,
                 qos=resources["qos"],
+                security_context=security_context,
             )

             for k, v in env.items():
@@ -2313,6 +2343,7 @@ class ArgoWorkflows(object):
                             is not None
                             else []
                         ),
+                        **_security_context,
                     ).to_dict()
                 )
             )
@@ -2575,9 +2606,12 @@ class ArgoWorkflows(object):
                         else None
                     ),
                     "metadata": {
-
-
-
+                        **(self.incident_io_metadata or {}),
+                        **{
+                            "run_status": "failed",
+                            "flow_name": self.flow.name,
+                            "run_id": "argo-{{workflow.name}}",
+                        },
                     },
                 }
             )
@@ -2626,9 +2660,12 @@ class ArgoWorkflows(object):
                         else None
                     ),
                     "metadata": {
-
-
-
+                        **(self.incident_io_metadata or {}),
+                        **{
+                            "run_status": "succeeded",
+                            "flow_name": self.flow.name,
+                            "run_id": "argo-{{workflow.name}}",
+                        },
                     },
                 }
             )
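The --incident-io-metadata values are parsed by parse_incident_io_metadata above; here is a standalone copy of that key=value parsing for illustration (only the split and duplicate-check logic is taken from the diff):

def parse_metadata(pairs):
    if pairs is None:
        return None
    parsed = {}
    for kv in pairs:
        key, value = kv.split("=", 1)  # split only on the first '='
        if key in parsed:
            raise ValueError("Incident.io Metadata *%s* provided multiple times" % key)
        parsed[key] = value
    return parsed

# e.g. --incident-io-metadata team=ml --incident-io-metadata env=prod
assert parse_metadata(["team=ml", "env=prod"]) == {"team": "ml", "env": "prod"}

The parsed dict is then merged into the alert metadata next to run_status, flow_name and run_id.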
metaflow/plugins/argo/argo_workflows_cli.py
CHANGED
@@ -40,6 +40,7 @@ unsupported_decorators = {
     "snowpark": "Step *%s* is marked for execution on Snowpark with Argo Workflows which isn't currently supported.",
     "slurm": "Step *%s* is marked for execution on Slurm with Argo Workflows which isn't currently supported.",
     "nvidia": "Step *%s* is marked for execution on Nvidia with Argo Workflows which isn't currently supported.",
+    "nvct": "Step *%s* is marked for execution on Nvct with Argo Workflows which isn't currently supported.",
 }


@@ -187,6 +188,13 @@ def argo_workflows(obj, name=None):
     default=None,
     help="Incident.io Alert source config ID. Example '01GW2G3V0S59R238FAHPDS1R66'",
 )
+@click.option(
+    "--incident-io-metadata",
+    default=None,
+    type=str,
+    multiple=True,
+    help="Incident.io Alert Custom Metadata field in the form of Key=Value",
+)
 @click.option(
     "--enable-heartbeat-daemon/--no-enable-heartbeat-daemon",
     default=False,
@@ -226,6 +234,7 @@ def create(
     notify_pager_duty_integration_key=None,
     notify_incident_io_api_key=None,
     incident_io_alert_source_config_id=None,
+    incident_io_metadata=None,
     enable_heartbeat_daemon=True,
     deployer_attribute_file=None,
     enable_error_msg_capture=False,
@@ -283,6 +292,7 @@ def create(
         notify_pager_duty_integration_key,
         notify_incident_io_api_key,
         incident_io_alert_source_config_id,
+        incident_io_metadata,
         enable_heartbeat_daemon,
         enable_error_msg_capture,
     )
@@ -459,6 +469,7 @@ def make_flow(
     notify_pager_duty_integration_key,
     notify_incident_io_api_key,
     incident_io_alert_source_config_id,
+    incident_io_metadata,
     enable_heartbeat_daemon,
     enable_error_msg_capture,
 ):
@@ -538,6 +549,7 @@ def make_flow(
         notify_pager_duty_integration_key=notify_pager_duty_integration_key,
         notify_incident_io_api_key=notify_incident_io_api_key,
         incident_io_alert_source_config_id=incident_io_alert_source_config_id,
+        incident_io_metadata=incident_io_metadata,
         enable_heartbeat_daemon=enable_heartbeat_daemon,
         enable_error_msg_capture=enable_error_msg_capture,
     )
metaflow/plugins/aws/aws_client.py
CHANGED
@@ -14,6 +14,7 @@ class Boto3ClientProvider(object):
             AWS_SANDBOX_ENABLED,
             AWS_SANDBOX_STS_ENDPOINT_URL,
             AWS_SANDBOX_API_KEY,
+            S3_CLIENT_RETRY_CONFIG,
         )

         if session_vars is None:
@@ -37,10 +38,10 @@ class Boto3ClientProvider(object):
         if module == "s3" and (
             "config" not in client_params or client_params["config"].retries is None
         ):
-            #
-            # the user has already set something
+            # do not set anything if the user has already set something
             config = client_params.get("config", Config())
-            config.retries =
+            config.retries = S3_CLIENT_RETRY_CONFIG
+            client_params["config"] = config

         if AWS_SANDBOX_ENABLED:
             # role is ignored in the sandbox
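In boto3 terms the change amounts to the following: when the caller has not supplied a retry configuration, the S3 client is built with the dict from S3_CLIENT_RETRY_CONFIG (default {"max_attempts": 10, "mode": "adaptive"}). A minimal standalone sketch, not the provider code itself:

import boto3
from botocore.config import Config

config = Config()
if config.retries is None:
    # same default as S3_CLIENT_RETRY_CONFIG
    config.retries = {"max_attempts": 10, "mode": "adaptive"}

s3 = boto3.client("s3", config=config)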
metaflow/plugins/datatools/s3/s3.py
CHANGED
@@ -1,3 +1,4 @@
+import errno
 import json
 import os
 import re
@@ -18,6 +19,7 @@ from metaflow.metaflow_config import (
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
     S3_SERVER_SIDE_ENCRYPTION,
+    S3_WORKER_COUNT,
     TEMPDIR,
 )
 from metaflow.util import (
@@ -136,6 +138,10 @@ class MetaflowS3InvalidRange(MetaflowException):
     headline = "S3 invalid range"


+class MetaflowS3InsufficientDiskSpace(MetaflowException):
+    headline = "Insufficient disk space"
+
+
 class S3Object(object):
     """
     This object represents a path or an object in S3,
@@ -1376,8 +1382,10 @@ class S3(object):
             elif error_code == "NoSuchBucket":
                 raise MetaflowS3URLException("Specified S3 bucket doesn't exist.")
             error = str(err)
+        except OSError as e:
+            if e.errno == errno.ENOSPC:
+                raise MetaflowS3InsufficientDiskSpace(str(e))
         except Exception as ex:
-            # TODO specific error message for out of disk space
             error = str(ex)
         if tmp:
             os.unlink(tmp.name)
@@ -1390,9 +1398,31 @@ class S3(object):
         )

     # add some jitter to make sure retries are not synchronized
-    def _jitter_sleep(
-
-
+    def _jitter_sleep(
+        self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
+    ) -> None:
+        """
+        Sleep for an exponentially increasing interval with added jitter.
+
+        Parameters
+        ----------
+        trynum: The current retry attempt number.
+        base: The base multiplier for the exponential backoff.
+        cap: The maximum interval to sleep.
+        jitter: The maximum jitter percentage to add to the interval.
+        """
+        # Calculate the exponential backoff interval
+        interval = min(cap, base**trynum)
+
+        # Add random jitter
+        jitter_value = interval * jitter * random.uniform(-1, 1)
+        interval_with_jitter = interval + jitter_value
+
+        # Ensure the interval is not negative
+        interval_with_jitter = max(0, interval_with_jitter)
+
+        # Sleep for the calculated interval
+        time.sleep(interval_with_jitter)

     # NOTE: re: _read_many_files and _put_many_files
     # All file IO is through binary files - we write bytes, we read
@@ -1480,20 +1510,17 @@ class S3(object):
         # - a known transient failure (SlowDown for example) in which case we will
         #   retry *only* the inputs that have this transient failure.
         # - an unknown failure (something went wrong but we cannot say if it was
-        #   a known permanent failure or something else). In this case, we
-        #
+        #   a known permanent failure or something else). In this case, we assume
+        #   it's a transient failure and retry only those inputs (same as above).
         #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #   transient_retry_count * retry_count tries).
-        #   Finally, if on transient failures, we make NO progress (ie: no input is
-        #   successfully processed), that counts as an "unknown" failure.
+        # NOTES(npow): 2025-05-13
+        # Previously, this code would also retry the fatal failures, including no_progress
+        # and unknown failures, from the beginning. This is not ideal because:
+        # 1. Fatal errors are not supposed to be retried.
+        # 2. Retrying from the beginning does not improve the situation, and is
+        #    wasteful since we have already uploaded some files.
+        # 3. The number of transient errors is far more than fatal errors, so we
+        #    can be optimistic and assume the unknown errors are transient.
         cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
         recursive_get = False
         for key, value in options.items():
@@ -1528,7 +1555,6 @@ class S3(object):
             # Otherwise, we cap the failure rate at 90%
             return min(90, self._s3_inject_failures)

-        retry_count = 0  # Number of retries (excluding transient failures)
         transient_retry_count = 0  # Number of transient retries (per top-level retry)
         inject_failures = _inject_failure_rate()
         out_lines = []  # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1621,12 @@ class S3(object):
                 # things, this will shrink more and more until we are doing a
                 # single operation at a time. If things start going better, it
                 # will increase by 20% every round.
-
+                #
+                # If we made no progress (last_ok_count == 0) we retry at most
+                # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
+                max_count = min(
+                    int(last_ok_count * 1.2), len(pending_retries)
+                ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
                 tmp_input.writelines(pending_retries[:max_count])
                 tmp_input.flush()
                 debug.s3client_exec(
@@ -1712,38 +1743,16 @@ class S3(object):
             _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
             return 0, 0, inject_failures, err_out

-        while
+        while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
             (
                 last_ok_count,
                 last_retry_count,
                 inject_failures,
                 err_out,
             ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
-            if err_out
-
-
-                last_ok_count == 0
-                or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
-            )
-            ):
-                # We had a fatal failure (err_out is not None)
-                # or we made no progress (last_ok_count is 0)
-                # or we are out of transient retries
-                # so we will restart from scratch (being very conservative)
-                retry_count += 1
-                err_msg = err_out
-                if err_msg is None and last_ok_count == 0:
-                    err_msg = "No progress"
-                if err_msg is None:
-                    err_msg = "Too many transient errors"
-                print(
-                    "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
-                )
-                _reset()
-                if retry_count <= S3_RETRY_COUNT:
-                    self._jitter_sleep(retry_count)
-                    continue
-            elif last_retry_count != 0:
+            if err_out:
+                break
+            if last_retry_count != 0:
                 # During our last try, we did not manage to process everything we wanted
                 # due to a transient failure so we try again.
                 transient_retry_count += 1