ob-metaflow 2.15.7.1__py2.py3-none-any.whl → 2.15.10.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/cli.py +8 -0
- metaflow/cli_components/run_cmds.py +2 -2
- metaflow/cmd/main_cli.py +1 -1
- metaflow/metadata_provider/metadata.py +35 -0
- metaflow/metaflow_config.py +6 -0
- metaflow/metaflow_environment.py +6 -1
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_version.py +2 -2
- metaflow/plugins/__init__.py +1 -0
- metaflow/plugins/argo/argo_workflows.py +43 -6
- metaflow/plugins/argo/argo_workflows_cli.py +12 -0
- metaflow/plugins/aws/aws_client.py +4 -3
- metaflow/plugins/datatools/s3/s3.py +54 -45
- metaflow/plugins/datatools/s3/s3op.py +149 -62
- metaflow/plugins/kubernetes/kubernetes.py +4 -0
- metaflow/plugins/kubernetes/kubernetes_cli.py +8 -0
- metaflow/plugins/kubernetes/kubernetes_decorator.py +10 -0
- metaflow/plugins/kubernetes/kubernetes_job.py +8 -0
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +7 -0
- metaflow/plugins/pypi/conda_decorator.py +2 -1
- metaflow/plugins/pypi/conda_environment.py +1 -0
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +100 -0
- metaflow/plugins/uv/uv_environment.py +70 -0
- metaflow/runner/deployer.py +8 -2
- metaflow/runner/deployer_impl.py +6 -2
- metaflow/runner/metaflow_runner.py +7 -2
- metaflow/version.py +1 -1
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/Makefile +2 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/RECORD +37 -33
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/Tiltfile +0 -0
- {ob_metaflow-2.15.7.1.data → ob_metaflow-2.15.10.1.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/licenses/LICENSE +0 -0
- {ob_metaflow-2.15.7.1.dist-info → ob_metaflow-2.15.10.1.dist-info}/top_level.txt +0 -0
metaflow/cli.py
CHANGED
@@ -17,6 +17,7 @@ from .flowspec import _FlowState
 from .graph import FlowGraph
 from .metaflow_config import (
     DEFAULT_DATASTORE,
+    DEFAULT_DECOSPECS,
     DEFAULT_ENVIRONMENT,
     DEFAULT_EVENT_LOGGER,
     DEFAULT_METADATA,
@@ -509,9 +510,16 @@ def start(
 ):
     # run/resume are special cases because they can add more decorators with --with,
     # so they have to take care of themselves.
+
     all_decospecs = ctx.obj.tl_decospecs + list(
         ctx.obj.environment.decospecs() or []
     )
+
+    # We add the default decospecs for everything except init and step since in those
+    # cases, the decospecs will already have been handled by either a run/resume
+    # or a scheduler setting them up in their own way.
+    if ctx.saved_args[0] not in ("step", "init"):
+        all_decospecs += DEFAULT_DECOSPECS.split()
     if all_decospecs:
         decorators._attach_decorators(ctx.obj.flow, all_decospecs)
         decorators._init(ctx.obj.flow)
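The default-decospec handling above can be illustrated with a small standalone sketch (hypothetical helper and variable names; only DEFAULT_DECOSPECS and the step/init exclusion come from the diff):

def combined_decospecs(tl_decospecs, env_decospecs, default_decospecs, subcommand):
    # decorators requested on the command line plus those demanded by the environment
    all_decospecs = list(tl_decospecs) + list(env_decospecs or [])
    # step/init are skipped because run/resume or a scheduler set these up already
    if subcommand not in ("step", "init"):
        all_decospecs += default_decospecs.split()
    return all_decospecs

assert combined_decospecs([], [], "retry environment", "run") == ["retry", "environment"]
assert combined_decospecs([], [], "retry environment", "step") == []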
metaflow/cli_components/run_cmds.py
CHANGED
@@ -71,7 +71,7 @@ def write_file(file_path, content):
         f.write(str(content))


-def
+def config_callback(ctx, param, value):
     # Callback to:
     # - read the Click auto_envvar variable from both the
     # environment AND the configuration
@@ -127,7 +127,7 @@ def common_run_options(func):
         help="Add a decorator to all steps. You can specify this "
         "option multiple times to attach multiple decorators "
         "in steps.",
-        callback=
+        callback=config_callback,
     )
     @click.option(
         "--run-id-file",
metaflow/cmd/main_cli.py
CHANGED
@@ -94,7 +94,7 @@ def start(ctx):
         echo("(%s)\n" % version, fg="magenta", bold=False)

     if ctx.invoked_subcommand is None:
-        echo("More
+        echo("More AI, less engineering\n", fg="magenta")

         lnk_sz = max(len(lnk) for lnk in CONTACT_INFO.values()) + 1
         for what, lnk in CONTACT_INFO.items():
metaflow/metadata_provider/metadata.py
CHANGED
@@ -630,6 +630,20 @@ class MetadataProvider(object):
             sys_info["r_version"] = env["r_version_code"]
         return sys_info

+    def _get_git_info_as_dict(self):
+        git_info = {}
+        env = self._environment.get_environment_info()
+        for key in [
+            "repo_url",
+            "branch_name",
+            "commit_sha",
+            "has_uncommitted_changes",
+        ]:
+            if key in env and env[key]:
+                git_info[key] = env[key]
+
+        return git_info
+
     def _get_system_tags(self):
         """Convert system info dictionary into a list of system tags"""
         return [
@@ -670,6 +684,27 @@ class MetadataProvider(object):
                 tags=["attempt_id:{0}".format(attempt)],
             )
         )
+        # Add script name as metadata
+        script_name = self._environment.get_environment_info()["script"]
+        metadata.append(
+            MetaDatum(
+                field="script-name",
+                value=script_name,
+                type="script-name",
+                tags=["attempt_id:{0}".format(attempt)],
+            )
+        )
+        # And add git metadata
+        git_info = self._get_git_info_as_dict()
+        if git_info:
+            metadata.append(
+                MetaDatum(
+                    field="git-info",
+                    value=json.dumps(git_info),
+                    type="git-info",
+                    tags=["attempt_id:{0}".format(attempt)],
+                )
+            )
         if metadata:
             self.register_metadata(run_id, step_name, task_id, metadata)
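For reference, the git information ends up as a single JSON-encoded MetaDatum of type "git-info" next to the new "script-name" entry. A hedged sketch of the payload (values are hypothetical), with an assumed read path through the Metaflow client left as a comment:

import json

git_info = {
    "repo_url": "https://github.com/example/repo",  # hypothetical values
    "branch_name": "main",
    "commit_sha": "0123abcd",
    "has_uncommitted_changes": False,
}
payload = json.dumps(git_info)  # what gets stored as the "git-info" value

# Assumed way to read it back later (client usage not shown in this diff):
# from metaflow import Task
# Task("MyFlow/123/start/456").metadata_dict.get("git-info")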
metaflow/metaflow_config.py
CHANGED
@@ -109,6 +109,12 @@ S3_WORKER_COUNT = from_conf("S3_WORKER_COUNT", 64)
 # top-level retries)
 S3_TRANSIENT_RETRY_COUNT = from_conf("S3_TRANSIENT_RETRY_COUNT", 20)

+# S3 retry configuration used in the aws client
+# Use the adaptive retry strategy by default
+S3_CLIENT_RETRY_CONFIG = from_conf(
+    "S3_CLIENT_RETRY_CONFIG", {"max_attempts": 10, "mode": "adaptive"}
+)
+
 # Threshold to start printing warnings for an AWS retry
 RETRY_WARNING_THRESHOLD = 3

metaflow/metaflow_environment.py
CHANGED
@@ -4,6 +4,7 @@ import sys

 from .util import get_username
 from . import metaflow_version
+from . import metaflow_git
 from metaflow.exception import MetaflowException
 from metaflow.extension_support import dump_module_info
 from metaflow.mflog import BASH_MFLOG, BASH_FLUSH_LOGS
@@ -197,6 +198,10 @@ class MetaflowEnvironment(object):
             "python_version_code": "%d.%d.%d" % sys.version_info[:3],
             "metaflow_version": metaflow_version.get_version(),
             "script": os.path.basename(os.path.abspath(sys.argv[0])),
+            # Add git info
+            **metaflow_git.get_repository_info(
+                path=os.path.dirname(os.path.abspath(sys.argv[0]))
+            ),
         }
         if R.use_r():
             env["metaflow_r_version"] = R.metaflow_r_version()
@@ -206,7 +211,7 @@ class MetaflowEnvironment(object):
         # Information about extension modules (to load them in the proper order)
         ext_key, ext_val = dump_module_info()
         env[ext_key] = ext_val
-        return env
+        return {k: v for k, v in env.items() if v is not None and v != ""}

     def executable(self, step_name, default=None):
         if default is not None:
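The net effect of the environment change is that git keys are merged into the info dict only when available, and empty values are dropped before the dict is returned; a minimal sketch with hypothetical values:

env = {
    "script": "flow.py",
    "metaflow_version": "2.15.10",
    "repo_url": None,   # e.g. the script does not live in a git checkout
    "branch_name": "",
}
filtered = {k: v for k, v in env.items() if v is not None and v != ""}
assert filtered == {"script": "flow.py", "metaflow_version": "2.15.10"}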
metaflow/metaflow_git.py
ADDED
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+"""Get git repository information for the package
+
+Functions to retrieve git repository details like URL, branch name,
+and commit SHA for Metaflow code provenance tracking.
+"""
+
+import os
+import subprocess
+from typing import Dict, List, Optional, Tuple, Union
+
+# Cache for git information to avoid repeated subprocess calls
+_git_info_cache = None
+
+__all__ = ("get_repository_info",)
+
+
+def _call_git(
+    args: List[str], path=Union[str, os.PathLike]
+) -> Tuple[Optional[str], Optional[int], bool]:
+    """
+    Call git with provided args.
+
+    Returns
+    -------
+    tuple : Tuple containing
+        (stdout, exitcode, failure) of the call
+    """
+    try:
+        result = subprocess.run(
+            ["git", *args],
+            cwd=path,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        return result.stdout.strip(), result.returncode, False
+    except (OSError, subprocess.SubprocessError):
+        # Covers subprocess timeouts and other errors which would not lead to an exit code
+        return None, None, True
+
+
+def _get_repo_url(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the repository URL from git config"""
+    stdout, returncode, _failed = _call_git(
+        ["config", "--get", "remote.origin.url"], path
+    )
+    if returncode == 0:
+        url = stdout
+        # Convert SSH URLs to HTTPS for clickable links
+        if url.startswith("git@"):
+            parts = url.split(":", 1)
+            if len(parts) == 2:
+                domain = parts[0].replace("git@", "")
+                repo_path = parts[1]
+                url = f"https://{domain}/{repo_path}"
+        return url
+    return None
+
+
+def _get_branch_name(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the current git branch name"""
+    stdout, returncode, _failed = _call_git(["rev-parse", "--abbrev-ref", "HEAD"], path)
+    return stdout if returncode == 0 else None
+
+
+def _get_commit_sha(path: Union[str, os.PathLike]) -> Optional[str]:
+    """Get the current git commit SHA"""
+    stdout, returncode, _failed = _call_git(["rev-parse", "HEAD"], path)
+    return stdout if returncode == 0 else None
+
+
+def _is_in_git_repo(path: Union[str, os.PathLike]) -> bool:
+    """Check if we're currently in a git repository"""
+    stdout, returncode, _failed = _call_git(
+        ["rev-parse", "--is-inside-work-tree"], path
+    )
+    return returncode == 0 and stdout == "true"
+
+
+def _has_uncommitted_changes(path: Union[str, os.PathLike]) -> Optional[bool]:
+    """Check if the git repository has uncommitted changes"""
+    _stdout, returncode, failed = _call_git(
+        ["diff-index", "--quiet", "HEAD", "--"], path
+    )
+    if failed:
+        return None
+    return returncode != 0
+
+
+def get_repository_info(path: Union[str, os.PathLike]) -> Dict[str, Union[str, bool]]:
+    """Get git repository information for a path
+
+    Returns:
+        dict: Dictionary containing:
+            repo_url: Repository URL (converted to HTTPS if from SSH)
+            branch_name: Current branch name
+            commit_sha: Current commit SHA
+            has_uncommitted_changes: Boolean indicating if there are uncommitted changes
+    """
+    global _git_info_cache
+
+    if _git_info_cache is not None:
+        return _git_info_cache
+
+    _git_info_cache = {}
+    if _is_in_git_repo(path):
+        _git_info_cache = {
+            "repo_url": _get_repo_url(path),
+            "branch_name": _get_branch_name(path),
+            "commit_sha": _get_commit_sha(path),
+            "has_uncommitted_changes": _has_uncommitted_changes(path),
+        }
+
+    return _git_info_cache
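A hedged usage sketch for the new module (assuming the installed metaflow package layout): outside a git work tree the function returns an empty dict, inside one it returns the four keys above, and the module-level cache means repeated calls reuse the first result.

import os
from metaflow import metaflow_git

info = metaflow_git.get_repository_info(path=os.getcwd())
if info:
    print(info["repo_url"], info["branch_name"], info["commit_sha"],
          info["has_uncommitted_changes"])
else:
    print("not inside a git repository")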
metaflow/metaflow_version.py
CHANGED
@@ -27,11 +27,11 @@ if name == "nt":
         """find the path to the git executable on Windows"""
         # first see if git is in the path
         try:
-            check_output(["where", "/Q", "git"])
+            subprocess.check_output(["where", "/Q", "git"])
             # if this command succeeded, git is in the path
             return "git"
         # catch the exception thrown if git was not found
-        except CalledProcessError:
+        except subprocess.CalledProcessError:
             pass
         # There are several locations where git.exe may be hiding
         possible_locations = []
metaflow/plugins/__init__.py
CHANGED

metaflow/plugins/argo/argo_workflows.py
CHANGED
@@ -7,6 +7,7 @@ import sys
 from collections import defaultdict
 from hashlib import sha1
 from math import inf
+from typing import List

 from metaflow import JSONType, current
 from metaflow.decorators import flow_decorators
@@ -111,6 +112,7 @@ class ArgoWorkflows(object):
         notify_pager_duty_integration_key=None,
         notify_incident_io_api_key=None,
         incident_io_alert_source_config_id=None,
+        incident_io_metadata: List[str] = None,
         enable_heartbeat_daemon=True,
         enable_error_msg_capture=False,
     ):
@@ -162,6 +164,9 @@ class ArgoWorkflows(object):
         self.notify_pager_duty_integration_key = notify_pager_duty_integration_key
         self.notify_incident_io_api_key = notify_incident_io_api_key
         self.incident_io_alert_source_config_id = incident_io_alert_source_config_id
+        self.incident_io_metadata = self.parse_incident_io_metadata(
+            incident_io_metadata
+        )
         self.enable_heartbeat_daemon = enable_heartbeat_daemon
         self.enable_error_msg_capture = enable_error_msg_capture
         self.parameters = self._process_parameters()
@@ -288,6 +293,21 @@ class ArgoWorkflows(object):

         return True

+    @staticmethod
+    def parse_incident_io_metadata(metadata: List[str] = None):
+        "parse key value pairs into a dict for incident.io metadata if given"
+        parsed_metadata = None
+        if metadata is not None:
+            parsed_metadata = {}
+            for kv in metadata:
+                key, value = kv.split("=", 1)
+                if key in parsed_metadata:
+                    raise MetaflowException(
+                        "Incident.io Metadata *%s* provided multiple times" % key
+                    )
+                parsed_metadata[key] = value
+        return parsed_metadata
+
     @classmethod
     def trigger(cls, name, parameters=None):
         if parameters is None:
@@ -1972,6 +1992,15 @@ class ArgoWorkflows(object):
                 resources["disk"],
             )

+            security_context = resources.get("security_context", None)
+            _security_context = {}
+            if security_context is not None and len(security_context) > 0:
+                _security_context = {
+                    "security_context": kubernetes_sdk.V1SecurityContext(
+                        **security_context
+                    )
+                }
+
             # Create a ContainerTemplate for this node. Ideally, we would have
             # liked to inline this ContainerTemplate and avoid scanning the workflow
             # twice, but due to issues with variable substitution, we will have to
@@ -2028,6 +2057,7 @@ class ArgoWorkflows(object):
                 shared_memory=shared_memory,
                 port=port,
                 qos=resources["qos"],
+                security_context=security_context,
             )

             for k, v in env.items():
@@ -2313,6 +2343,7 @@ class ArgoWorkflows(object):
                             is not None
                             else []
                         ),
+                        **_security_context,
                     ).to_dict()
                 )
             )
@@ -2575,9 +2606,12 @@ class ArgoWorkflows(object):
                         else None
                     ),
                     "metadata": {
-
-
-
+                        **(self.incident_io_metadata or {}),
+                        **{
+                            "run_status": "failed",
+                            "flow_name": self.flow.name,
+                            "run_id": "argo-{{workflow.name}}",
+                        },
                     },
                 }
             )
@@ -2626,9 +2660,12 @@ class ArgoWorkflows(object):
                         else None
                     ),
                     "metadata": {
-
-
-
+                        **(self.incident_io_metadata or {}),
+                        **{
+                            "run_status": "succeeded",
+                            "flow_name": self.flow.name,
+                            "run_id": "argo-{{workflow.name}}",
+                        },
                     },
                 }
             )
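The --incident-io-metadata values are parsed by parse_incident_io_metadata above; here is a standalone copy of that key=value parsing for illustration (only the split and duplicate-check logic is taken from the diff):

def parse_metadata(pairs):
    if pairs is None:
        return None
    parsed = {}
    for kv in pairs:
        key, value = kv.split("=", 1)  # split only on the first '='
        if key in parsed:
            raise ValueError("Incident.io Metadata *%s* provided multiple times" % key)
        parsed[key] = value
    return parsed

# e.g. --incident-io-metadata team=ml --incident-io-metadata env=prod
assert parse_metadata(["team=ml", "env=prod"]) == {"team": "ml", "env": "prod"}

The parsed dict is then merged into the alert metadata next to run_status, flow_name and run_id.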
metaflow/plugins/argo/argo_workflows_cli.py
CHANGED
@@ -40,6 +40,7 @@ unsupported_decorators = {
     "snowpark": "Step *%s* is marked for execution on Snowpark with Argo Workflows which isn't currently supported.",
     "slurm": "Step *%s* is marked for execution on Slurm with Argo Workflows which isn't currently supported.",
     "nvidia": "Step *%s* is marked for execution on Nvidia with Argo Workflows which isn't currently supported.",
+    "nvct": "Step *%s* is marked for execution on Nvct with Argo Workflows which isn't currently supported.",
 }


@@ -187,6 +188,13 @@ def argo_workflows(obj, name=None):
     default=None,
     help="Incident.io Alert source config ID. Example '01GW2G3V0S59R238FAHPDS1R66'",
 )
+@click.option(
+    "--incident-io-metadata",
+    default=None,
+    type=str,
+    multiple=True,
+    help="Incident.io Alert Custom Metadata field in the form of Key=Value",
+)
 @click.option(
     "--enable-heartbeat-daemon/--no-enable-heartbeat-daemon",
     default=False,
@@ -226,6 +234,7 @@ def create(
     notify_pager_duty_integration_key=None,
     notify_incident_io_api_key=None,
     incident_io_alert_source_config_id=None,
+    incident_io_metadata=None,
     enable_heartbeat_daemon=True,
     deployer_attribute_file=None,
     enable_error_msg_capture=False,
@@ -283,6 +292,7 @@ def create(
         notify_pager_duty_integration_key,
         notify_incident_io_api_key,
         incident_io_alert_source_config_id,
+        incident_io_metadata,
         enable_heartbeat_daemon,
         enable_error_msg_capture,
     )
@@ -459,6 +469,7 @@ def make_flow(
     notify_pager_duty_integration_key,
     notify_incident_io_api_key,
     incident_io_alert_source_config_id,
+    incident_io_metadata,
     enable_heartbeat_daemon,
     enable_error_msg_capture,
 ):
@@ -538,6 +549,7 @@ def make_flow(
         notify_pager_duty_integration_key=notify_pager_duty_integration_key,
         notify_incident_io_api_key=notify_incident_io_api_key,
         incident_io_alert_source_config_id=incident_io_alert_source_config_id,
+        incident_io_metadata=incident_io_metadata,
         enable_heartbeat_daemon=enable_heartbeat_daemon,
         enable_error_msg_capture=enable_error_msg_capture,
     )
metaflow/plugins/aws/aws_client.py
CHANGED
@@ -14,6 +14,7 @@ class Boto3ClientProvider(object):
             AWS_SANDBOX_ENABLED,
             AWS_SANDBOX_STS_ENDPOINT_URL,
             AWS_SANDBOX_API_KEY,
+            S3_CLIENT_RETRY_CONFIG,
         )

         if session_vars is None:
@@ -37,10 +38,10 @@ class Boto3ClientProvider(object):
         if module == "s3" and (
             "config" not in client_params or client_params["config"].retries is None
         ):
-            #
-            # the user has already set something
+            # do not set anything if the user has already set something
             config = client_params.get("config", Config())
-            config.retries =
+            config.retries = S3_CLIENT_RETRY_CONFIG
+            client_params["config"] = config

         if AWS_SANDBOX_ENABLED:
             # role is ignored in the sandbox
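In boto3 terms the change amounts to the following: when the caller has not supplied a retry configuration, the S3 client is built with the dict from S3_CLIENT_RETRY_CONFIG (default {"max_attempts": 10, "mode": "adaptive"}). A minimal standalone sketch, not the provider code itself:

import boto3
from botocore.config import Config

config = Config()
if config.retries is None:
    # same default as S3_CLIENT_RETRY_CONFIG
    config.retries = {"max_attempts": 10, "mode": "adaptive"}

s3 = boto3.client("s3", config=config)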
metaflow/plugins/datatools/s3/s3.py
CHANGED
@@ -1,3 +1,4 @@
+import errno
 import json
 import os
 import re
@@ -18,6 +19,7 @@ from metaflow.metaflow_config import (
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
     S3_SERVER_SIDE_ENCRYPTION,
+    S3_WORKER_COUNT,
     TEMPDIR,
 )
 from metaflow.util import (
@@ -136,6 +138,10 @@ class MetaflowS3InvalidRange(MetaflowException):
     headline = "S3 invalid range"


+class MetaflowS3InsufficientDiskSpace(MetaflowException):
+    headline = "Insufficient disk space"
+
+
 class S3Object(object):
     """
     This object represents a path or an object in S3,
@@ -1376,8 +1382,10 @@ class S3(object):
             elif error_code == "NoSuchBucket":
                 raise MetaflowS3URLException("Specified S3 bucket doesn't exist.")
             error = str(err)
+        except OSError as e:
+            if e.errno == errno.ENOSPC:
+                raise MetaflowS3InsufficientDiskSpace(str(e))
         except Exception as ex:
-            # TODO specific error message for out of disk space
             error = str(ex)
         if tmp:
             os.unlink(tmp.name)
@@ -1390,9 +1398,31 @@ class S3(object):
         )

     # add some jitter to make sure retries are not synchronized
-    def _jitter_sleep(
-
-
+    def _jitter_sleep(
+        self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
+    ) -> None:
+        """
+        Sleep for an exponentially increasing interval with added jitter.
+
+        Parameters
+        ----------
+        trynum: The current retry attempt number.
+        base: The base multiplier for the exponential backoff.
+        cap: The maximum interval to sleep.
+        jitter: The maximum jitter percentage to add to the interval.
+        """
+        # Calculate the exponential backoff interval
+        interval = min(cap, base**trynum)
+
+        # Add random jitter
+        jitter_value = interval * jitter * random.uniform(-1, 1)
+        interval_with_jitter = interval + jitter_value
+
+        # Ensure the interval is not negative
+        interval_with_jitter = max(0, interval_with_jitter)
+
+        # Sleep for the calculated interval
+        time.sleep(interval_with_jitter)

     # NOTE: re: _read_many_files and _put_many_files
     # All file IO is through binary files - we write bytes, we read
@@ -1480,20 +1510,17 @@ class S3(object):
         # - a known transient failure (SlowDown for example) in which case we will
         #   retry *only* the inputs that have this transient failure.
         # - an unknown failure (something went wrong but we cannot say if it was
-        #   a known permanent failure or something else). In this case, we
-        #
+        #   a known permanent failure or something else). In this case, we assume
+        #   it's a transient failure and retry only those inputs (same as above).
         #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #   transient_retry_count * retry_count tries).
-        #   Finally, if on transient failures, we make NO progress (ie: no input is
-        #   successfully processed), that counts as an "unknown" failure.
+        # NOTES(npow): 2025-05-13
+        # Previously, this code would also retry the fatal failures, including no_progress
+        # and unknown failures, from the beginning. This is not ideal because:
+        # 1. Fatal errors are not supposed to be retried.
+        # 2. Retrying from the beginning does not improve the situation, and is
+        #    wasteful since we have already uploaded some files.
+        # 3. The number of transient errors is far more than fatal errors, so we
+        #    can be optimistic and assume the unknown errors are transient.
         cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
         recursive_get = False
         for key, value in options.items():
@@ -1528,7 +1555,6 @@ class S3(object):
             # Otherwise, we cap the failure rate at 90%
             return min(90, self._s3_inject_failures)

-        retry_count = 0  # Number of retries (excluding transient failures)
         transient_retry_count = 0  # Number of transient retries (per top-level retry)
         inject_failures = _inject_failure_rate()
         out_lines = []  # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1621,12 @@ class S3(object):
                 # things, this will shrink more and more until we are doing a
                 # single operation at a time. If things start going better, it
                 # will increase by 20% every round.
-
+                #
+                # If we made no progress (last_ok_count == 0) we retry at most
+                # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
+                max_count = min(
+                    int(last_ok_count * 1.2), len(pending_retries)
+                ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
                 tmp_input.writelines(pending_retries[:max_count])
                 tmp_input.flush()
                 debug.s3client_exec(
@@ -1712,38 +1743,16 @@ class S3(object):
             _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
             return 0, 0, inject_failures, err_out

-        while
+        while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
             (
                 last_ok_count,
                 last_retry_count,
                 inject_failures,
                 err_out,
             ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
-            if err_out
-
-
-                last_ok_count == 0
-                or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
-            )
-            ):
-                # We had a fatal failure (err_out is not None)
-                # or we made no progress (last_ok_count is 0)
-                # or we are out of transient retries
-                # so we will restart from scratch (being very conservative)
-                retry_count += 1
-                err_msg = err_out
-                if err_msg is None and last_ok_count == 0:
-                    err_msg = "No progress"
-                if err_msg is None:
-                    err_msg = "Too many transient errors"
-                print(
-                    "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
-                )
-                _reset()
-                if retry_count <= S3_RETRY_COUNT:
-                    self._jitter_sleep(retry_count)
-                    continue
-            elif last_retry_count != 0:
+            if err_out:
+                break
+            if last_retry_count != 0:
                 # During our last try, we did not manage to process everything we wanted
                 # due to a transient failure so we try again.
                 transient_retry_count += 1