metaflow 2.18.13__py2.py3-none-any.whl → 2.19.1__py2.py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (50)
  1. metaflow/__init__.py +1 -0
  2. metaflow/cli.py +78 -13
  3. metaflow/cli_components/run_cmds.py +182 -39
  4. metaflow/cli_components/step_cmd.py +160 -4
  5. metaflow/client/__init__.py +1 -0
  6. metaflow/client/core.py +162 -99
  7. metaflow/client/filecache.py +59 -32
  8. metaflow/cmd/code/__init__.py +2 -1
  9. metaflow/datastore/__init__.py +1 -0
  10. metaflow/datastore/content_addressed_store.py +40 -9
  11. metaflow/datastore/datastore_set.py +10 -1
  12. metaflow/datastore/flow_datastore.py +123 -4
  13. metaflow/datastore/spin_datastore.py +91 -0
  14. metaflow/datastore/task_datastore.py +86 -2
  15. metaflow/decorators.py +75 -6
  16. metaflow/extension_support/__init__.py +372 -305
  17. metaflow/flowspec.py +3 -2
  18. metaflow/metaflow_config.py +41 -0
  19. metaflow/metaflow_profile.py +18 -0
  20. metaflow/packaging_sys/utils.py +2 -39
  21. metaflow/packaging_sys/v1.py +63 -16
  22. metaflow/plugins/__init__.py +2 -0
  23. metaflow/plugins/argo/argo_client.py +1 -0
  24. metaflow/plugins/argo/argo_workflows.py +3 -1
  25. metaflow/plugins/cards/card_datastore.py +9 -3
  26. metaflow/plugins/cards/card_decorator.py +1 -0
  27. metaflow/plugins/cards/card_modules/basic.py +9 -3
  28. metaflow/plugins/datastores/local_storage.py +12 -6
  29. metaflow/plugins/datastores/spin_storage.py +12 -0
  30. metaflow/plugins/datatools/s3/s3.py +29 -10
  31. metaflow/plugins/datatools/s3/s3op.py +90 -62
  32. metaflow/plugins/metadata_providers/local.py +76 -82
  33. metaflow/plugins/metadata_providers/spin.py +16 -0
  34. metaflow/runner/metaflow_runner.py +210 -19
  35. metaflow/runtime.py +348 -21
  36. metaflow/task.py +61 -12
  37. metaflow/user_configs/config_parameters.py +2 -4
  38. metaflow/user_decorators/mutable_flow.py +1 -1
  39. metaflow/user_decorators/user_step_decorator.py +10 -1
  40. metaflow/util.py +191 -1
  41. metaflow/version.py +1 -1
  42. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/Makefile +10 -0
  43. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/METADATA +2 -4
  44. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/RECORD +50 -47
  45. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  46. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  47. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/WHEEL +0 -0
  48. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/entry_points.txt +0 -0
  49. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/licenses/LICENSE +0 -0
  50. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/top_level.txt +0 -0
metaflow/plugins/datatools/s3/s3op.py

@@ -285,7 +285,9 @@ def worker(result_file_name, queue, mode, s3config):
  "%d %d\n" % (idx, -ERROR_OUT_OF_DISK_SPACE)
  )
  else:
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, "OSError")
+ )
  result_file.flush()
  continue
  except MetaflowException:
@@ -297,7 +299,9 @@ def worker(result_file_name, queue, mode, s3config):
  tmp.close()
  os.unlink(tmp.name)
  # assume anything else is transient
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+ )
  result_file.flush()
  continue
  # If we need the metadata, get it and write it out
@@ -368,7 +372,9 @@ def worker(result_file_name, queue, mode, s3config):
  raise
  except (SSLError, Exception) as e:
  # assume anything else is transient
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+ )
  result_file.flush()
  continue
  except:
@@ -399,6 +405,8 @@ def handle_client_error(err, idx, result_file):
  raise err

  error_code = normalize_client_error(err)
+ original_error_code = err.response["Error"]["Code"]
+
  if error_code == 404:
  result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
  result_file.flush()
@@ -406,13 +414,12 @@ def handle_client_error(err, idx, result_file):
  result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
  result_file.flush()
  elif error_code == 503:
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
  result_file.flush()
  else:
  # optimistically assume it is a transient error
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
  result_file.flush()
- # TODO specific error message for out of disk space


  def start_workers(mode, urls, num_workers, inject_failure, s3config):
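Taken together, the worker and `handle_client_error` hunks above extend the result-file protocol: a transient failure line now carries a third field naming the failure, either the exception class name (`type(e).__name__`) or the raw S3 error code from `err.response["Error"]["Code"]`. A minimal sketch of writing such a line, assuming a botocore-style `ClientError` and a placeholder value for the `ERROR_TRANSIENT` constant (the helper name below is invented for illustration, not part of s3op.py):

```python
ERROR_TRANSIENT = 10  # placeholder; s3op.py defines its own error constants

def write_transient_result(result_file, idx, err):
    # err is expected to look like botocore's ClientError:
    # err.response["Error"]["Code"] holds the raw S3 error code (e.g. "SlowDown").
    original_error_code = err.response["Error"]["Code"]
    # New line format: "<idx> <negative error marker> <error type>"
    result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
    result_file.flush()
```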
@@ -424,6 +431,7 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
  random.seed()

  sz_results = []
+ transient_error_type = None
  # 1. push sources and destinations to the queue
  # We only push if we don't inject a failure; otherwise, we already set the sz_results
  # appropriately with the result of the injected failure.
@@ -478,13 +486,19 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
  # Read the output file if all went well
  with open(out_path, "r") as out_file:
  for line in out_file:
- line_split = line.split(" ")
- sz_results[int(line_split[0])] = int(line_split[1])
+ line_split = line.split(" ", 2)
+ idx = int(line_split[0])
+ size = int(line_split[1])
+ sz_results[idx] = size
+
+ # For transient errors, store the transient error type (should be the same for all)
+ if size == -ERROR_TRANSIENT and len(line_split) > 2:
+ transient_error_type = line_split[2].strip()
  else:
  # Put this process back in the processes to check
  new_procs[proc] = out_path
  procs = new_procs
- return sz_results
+ return sz_results, transient_error_type


  def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
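`start_workers` now reads that third field back out of the worker result files and returns it alongside `sz_results`. A self-contained sketch of the parsing, with made-up result lines and a placeholder `ERROR_TRANSIENT` value:

```python
ERROR_TRANSIENT = 10  # placeholder; s3op.py defines the real constant

# Hypothetical result-file contents; the third field is only present for
# transient errors and names the underlying failure.
lines = [
    "0 2048\n",          # idx 0 succeeded, 2048 bytes transferred
    "1 -10 SlowDown\n",  # idx 1 hit a transient S3 "SlowDown" error
]

sz_results = [None, None]
transient_error_type = None
for line in lines:
    line_split = line.split(" ", 2)  # at most 3 fields: idx, size, error type
    idx, size = int(line_split[0]), int(line_split[1])
    sz_results[idx] = size
    if size == -ERROR_TRANSIENT and len(line_split) > 2:
        transient_error_type = line_split[2].strip()

print(sz_results, transient_error_type)  # [2048, -10] 'SlowDown'
```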
@@ -493,7 +507,9 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
  print("%sing %d files.." % (mode.capitalize(), len(urls)), file=sys.stderr)

  start = time.time()
- sz_results = start_workers(mode, urls, num_workers, inject_failure, s3config)
+ sz_results, transient_error_type = start_workers(
+ mode, urls, num_workers, inject_failure, s3config
+ )
  end = time.time()

  if verbose:
@@ -510,7 +526,7 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
  ),
  file=sys.stderr,
  )
- return sz_results
+ return sz_results, transient_error_type


  # Utility functions
@@ -719,9 +735,21 @@ def generate_local_path(url, range="whole", suffix=None):
  quoted = url_quote(url)
  fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_")
  sha = sha1(quoted).hexdigest()
+
+ # Truncate fname to ensure the final filename doesn't exceed filesystem limits.
+ # Most filesystems have a 255 character limit. The structure is:
+ # <40-char-sha>-<fname>-<range>[-<suffix>]
+ # We need to leave room for: sha (40) + hyphens (2-3) + range (~10) + suffix (~10)
+ # This leaves roughly 190 characters for fname. We use 150 to be safe.
+ fname_decoded = fname.decode("utf-8")
+ max_fname_len = 150
+ if len(fname_decoded) > max_fname_len:
+ # Truncate and add an ellipsis to indicate truncation
+ fname_decoded = fname_decoded[:max_fname_len] + "..."
+
  if suffix:
- return "-".join((sha, fname.decode("utf-8"), range, suffix))
- return "-".join((sha, fname.decode("utf-8"), range))
+ return "-".join((sha, fname_decoded, range, suffix))
+ return "-".join((sha, fname_decoded, range))


  def parallel_op(op, lst, num_workers):
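The truncation added to `generate_local_path` keeps cached filenames under common 255-character filesystem limits. The sketch below reproduces the idea with the standard library's `quote` standing in for s3op.py's `url_quote` helper; the function name `local_name` is invented for illustration:

```python
from hashlib import sha1
from urllib.parse import quote

def local_name(url, range="whole", suffix=None, max_fname_len=150):
    # Simplified stand-in for generate_local_path(): hash the full URL, then
    # build <sha>-<fname>-<range>[-<suffix>], truncating very long key names
    # so the result stays under typical 255-character filename limits.
    quoted = quote(url).encode("utf-8")
    fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_").decode("utf-8")
    sha = sha1(quoted).hexdigest()
    if len(fname) > max_fname_len:
        fname = fname[:max_fname_len] + "..."  # mark that truncation happened
    parts = [sha, fname, range] + ([suffix] if suffix else [])
    return "-".join(parts)

very_long_key = "s3://bucket/" + "x" * 400 + ".csv"
print(len(local_name(very_long_key)))  # comfortably under 255
```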
@@ -858,7 +886,7 @@ def lst(
  urllist = []
  to_iterate, _ = _populate_prefixes(prefixes, inputs)
  for _, prefix, url, _ in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
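The same `urlparse(..., allow_fragments=False)` change is repeated in `put`, `get`, and `info` below, presumably so that S3 keys containing `#` are no longer truncated at what the default parser treats as a URL fragment. A quick illustration with the standard library (the bucket and key are made up):

```python
from urllib.parse import urlparse

url = "s3://my-bucket/prefix/file#v2.csv"

# Default behavior: everything after '#' is parsed as a fragment and lost from the path.
print(urlparse(url).path)                         # '/prefix/file'

# With allow_fragments=False the '#' stays part of the key.
print(urlparse(url, allow_fragments=False).path)  # '/prefix/file#v2.csv'
```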
@@ -964,7 +992,7 @@ def put(
  yield input_line_idx, local, url, content_type, metadata, encryption

  def _make_url(idx, local, user_url, content_type, metadata, encryption):
- src = urlparse(user_url)
+ src = urlparse(user_url, allow_fragments=False)
  url = S3Url(
  url=user_url,
  bucket=src.netloc,
@@ -992,7 +1020,7 @@ def put(
  ul_op = "upload"
  if not overwrite:
  ul_op = "info_upload"
- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  ul_op, urls, verbose, inject_failure, num_workers, s3config
  )
  retry_lines = []
@@ -1010,19 +1038,17 @@ def put(
  elif listing and sz == 0:
  out_lines.append(format_result_line(url.idx, url.url) + "\n")
  elif sz == -ERROR_TRANSIENT:
- retry_lines.append(
- json.dumps(
- {
- "idx": url.idx,
- "url": url.url,
- "local": url.local,
- "content_type": url.content_type,
- "metadata": url.metadata,
- "encryption": url.encryption,
- }
- )
- + "\n"
- )
+ retry_data = {
+ "idx": url.idx,
+ "url": url.url,
+ "local": url.local,
+ "content_type": url.content_type,
+ "metadata": url.metadata,
+ "encryption": url.encryption,
+ }
+ if transient_error_type:
+ retry_data["transient_error_type"] = transient_error_type
+ retry_lines.append(json.dumps(retry_data) + "\n")
  # Output something to get a total count the first time around
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1060,22 +1086,21 @@ def _populate_prefixes(prefixes, inputs):
  for idx, l in enumerate(f, start=len(prefixes)):
  s = l.split(b" ")
  if len(s) == 1:
+ # User input format: <url>
  url = url_unquote(s[0].strip())
  prefixes.append((idx, url, url, None))
  elif len(s) == 2:
+ # User input format: <url> <range>
  url = url_unquote(s[0].strip())
  prefixes.append((idx, url, url, url_unquote(s[1].strip())))
- else:
+ elif len(s) in (4, 5):
+ # Retry format: <idx> <prefix> <url> <range> [<transient_error_type>]
+ # The transient_error_type (5th field) is optional and only used for logging.
+ # Lines with other field counts (e.g., 3) are silently ignored as invalid.
  is_transient_retry = True
- if len(s) == 3:
- prefix = url = url_unquote(s[1].strip())
- range_info = url_unquote(s[2].strip())
- else:
- # Special case when we have both prefix and URL -- this is
- # used in recursive gets for example
- prefix = url_unquote(s[1].strip())
- url = url_unquote(s[2].strip())
- range_info = url_unquote(s[3].strip())
+ prefix = url_unquote(s[1].strip())
+ url = url_unquote(s[2].strip())
+ range_info = url_unquote(s[3].strip())
  if range_info == "<norange>":
  range_info = None
  prefixes.append(
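`_populate_prefixes` now accepts exactly three input-line shapes. The sketch below parses hypothetical examples of each; byte handling is simplified (plain `decode()` instead of `url_unquote`), and reusing the index carried on the retry line is an assumption, since the tuple actually appended lies outside the hunk:

```python
# Hypothetical stdin lines fed to _populate_prefixes-style parsing.
lines = [
    b"s3://bucket/key1",               # <url>
    b"s3://bucket/key2 bytes=0-1023",  # <url> <range>
    b"3 s3://bucket/pre s3://bucket/pre/key3 <norange> SlowDown",
    # ^ retry line: <idx> <prefix> <url> <range> [<transient_error_type>]
]

prefixes = []
is_transient_retry = False
for idx, l in enumerate(lines):
    s = l.split(b" ")
    if len(s) == 1:
        url = s[0].strip().decode()
        prefixes.append((idx, url, url, None))
    elif len(s) == 2:
        url = s[0].strip().decode()
        prefixes.append((idx, url, url, s[1].strip().decode()))
    elif len(s) in (4, 5):
        # The optional 5th field is accepted but not used by the parser itself.
        is_transient_retry = True
        prefix, url = s[1].strip().decode(), s[2].strip().decode()
        range_info = s[3].strip().decode()
        if range_info == "<norange>":
            range_info = None
        prefixes.append((int(s[0].decode()), prefix, url, range_info))

print(is_transient_retry, prefixes[-1])
# True (3, 's3://bucket/pre', 's3://bucket/pre/key3', None)
```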
@@ -1139,7 +1164,7 @@ def get(
  urllist = []
  to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
  for idx, prefix, url, r in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
@@ -1186,7 +1211,7 @@ def get(

  # exclude the non-existent files from loading
  to_load = [url for url, size in urls if size is not None]
- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  dl_op, to_load, verbose, inject_failure, num_workers, s3config
  )
  # We check if there is any access denied
@@ -1222,21 +1247,19 @@ def get(
  break
  out_lines.append(format_result_line(url.idx, url.url) + "\n")
  elif sz == -ERROR_TRANSIENT:
- retry_lines.append(
- " ".join(
- [
- str(url.idx),
- url_quote(url.prefix).decode(encoding="utf-8"),
- url_quote(url.url).decode(encoding="utf-8"),
- (
- url_quote(url.range).decode(encoding="utf-8")
- if url.range
- else "<norange>"
- ),
- ]
- )
- + "\n"
- )
+ retry_line_parts = [
+ str(url.idx),
+ url_quote(url.prefix).decode(encoding="utf-8"),
+ url_quote(url.url).decode(encoding="utf-8"),
+ (
+ url_quote(url.range).decode(encoding="utf-8")
+ if url.range
+ else "<norange>"
+ ),
+ ]
+ if transient_error_type:
+ retry_line_parts.append(transient_error_type)
+ retry_lines.append(" ".join(retry_line_parts) + "\n")
  # First time around, we output something to indicate the total length
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1288,7 +1311,7 @@ def info(
  urllist = []
  to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
  for idx, prefix, url, _ in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
@@ -1302,7 +1325,7 @@ def info(
  exit(ERROR_INVALID_URL, url)
  urllist.append(url)

- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  "info", urllist, verbose, inject_failure, num_workers, s3config
  )

@@ -1315,10 +1338,15 @@ def info(
  format_result_line(url.idx, url.prefix, url.url, url.local) + "\n"
  )
  else:
- retry_lines.append(
- "%d %s <norange>\n"
- % (url.idx, url_quote(url.url).decode(encoding="utf-8"))
- )
+ retry_line_parts = [
+ str(url.idx),
+ url_quote(url.prefix).decode(encoding="utf-8"),
+ url_quote(url.url).decode(encoding="utf-8"),
+ "<norange>",
+ ]
+ if transient_error_type:
+ retry_line_parts.append(transient_error_type)
+ retry_lines.append(" ".join(retry_line_parts) + "\n")
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))