metaflow 2.18.13__py2.py3-none-any.whl → 2.19.1__py2.py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (50)
  1. metaflow/__init__.py +1 -0
  2. metaflow/cli.py +78 -13
  3. metaflow/cli_components/run_cmds.py +182 -39
  4. metaflow/cli_components/step_cmd.py +160 -4
  5. metaflow/client/__init__.py +1 -0
  6. metaflow/client/core.py +162 -99
  7. metaflow/client/filecache.py +59 -32
  8. metaflow/cmd/code/__init__.py +2 -1
  9. metaflow/datastore/__init__.py +1 -0
  10. metaflow/datastore/content_addressed_store.py +40 -9
  11. metaflow/datastore/datastore_set.py +10 -1
  12. metaflow/datastore/flow_datastore.py +123 -4
  13. metaflow/datastore/spin_datastore.py +91 -0
  14. metaflow/datastore/task_datastore.py +86 -2
  15. metaflow/decorators.py +75 -6
  16. metaflow/extension_support/__init__.py +372 -305
  17. metaflow/flowspec.py +3 -2
  18. metaflow/metaflow_config.py +41 -0
  19. metaflow/metaflow_profile.py +18 -0
  20. metaflow/packaging_sys/utils.py +2 -39
  21. metaflow/packaging_sys/v1.py +63 -16
  22. metaflow/plugins/__init__.py +2 -0
  23. metaflow/plugins/argo/argo_client.py +1 -0
  24. metaflow/plugins/argo/argo_workflows.py +3 -1
  25. metaflow/plugins/cards/card_datastore.py +9 -3
  26. metaflow/plugins/cards/card_decorator.py +1 -0
  27. metaflow/plugins/cards/card_modules/basic.py +9 -3
  28. metaflow/plugins/datastores/local_storage.py +12 -6
  29. metaflow/plugins/datastores/spin_storage.py +12 -0
  30. metaflow/plugins/datatools/s3/s3.py +29 -10
  31. metaflow/plugins/datatools/s3/s3op.py +90 -62
  32. metaflow/plugins/metadata_providers/local.py +76 -82
  33. metaflow/plugins/metadata_providers/spin.py +16 -0
  34. metaflow/runner/metaflow_runner.py +210 -19
  35. metaflow/runtime.py +348 -21
  36. metaflow/task.py +61 -12
  37. metaflow/user_configs/config_parameters.py +2 -4
  38. metaflow/user_decorators/mutable_flow.py +1 -1
  39. metaflow/user_decorators/user_step_decorator.py +10 -1
  40. metaflow/util.py +191 -1
  41. metaflow/version.py +1 -1
  42. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/Makefile +10 -0
  43. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/METADATA +2 -4
  44. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/RECORD +50 -47
  45. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  46. {metaflow-2.18.13.data → metaflow-2.19.1.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  47. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/WHEEL +0 -0
  48. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/entry_points.txt +0 -0
  49. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/licenses/LICENSE +0 -0
  50. {metaflow-2.18.13.dist-info → metaflow-2.19.1.dist-info}/top_level.txt +0 -0
metaflow/plugins/datatools/s3/s3op.py

@@ -285,7 +285,9 @@ def worker(result_file_name, queue, mode, s3config):
  "%d %d\n" % (idx, -ERROR_OUT_OF_DISK_SPACE)
  )
  else:
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, "OSError")
+ )
  result_file.flush()
  continue
  except MetaflowException:
@@ -297,7 +299,9 @@ def worker(result_file_name, queue, mode, s3config):
  tmp.close()
  os.unlink(tmp.name)
  # assume anything else is transient
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+ )
  result_file.flush()
  continue
  # If we need the metadata, get it and write it out
@@ -368,7 +372,9 @@ def worker(result_file_name, queue, mode, s3config):
  raise
  except (SSLError, Exception) as e:
  # assume anything else is transient
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write(
+ "%d %d %s\n" % (idx, -ERROR_TRANSIENT, type(e).__name__)
+ )
  result_file.flush()
  continue
  except:
@@ -399,6 +405,8 @@ def handle_client_error(err, idx, result_file):
  raise err

  error_code = normalize_client_error(err)
+ original_error_code = err.response["Error"]["Code"]
+
  if error_code == 404:
  result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
  result_file.flush()
@@ -406,13 +414,12 @@ def handle_client_error(err, idx, result_file):
  result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
  result_file.flush()
  elif error_code == 503:
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
  result_file.flush()
  else:
  # optimistically assume it is a transient error
- result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+ result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
  result_file.flush()
- # TODO specific error message for out of disk space


  def start_workers(mode, urls, num_workers, inject_failure, s3config):
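Taken together, the worker and `handle_client_error` hunks above extend the result-file protocol: a transient failure line now carries a third field naming the failure, either the exception class name (`type(e).__name__`) or the raw S3 error code from `err.response["Error"]["Code"]`. A minimal sketch of writing such a line, assuming a botocore-style `ClientError` and a placeholder value for the `ERROR_TRANSIENT` constant (the helper name below is invented for illustration, not part of s3op.py):

```python
ERROR_TRANSIENT = 10  # placeholder; s3op.py defines its own error constants

def write_transient_result(result_file, idx, err):
    # err is expected to look like botocore's ClientError:
    # err.response["Error"]["Code"] holds the raw S3 error code (e.g. "SlowDown").
    original_error_code = err.response["Error"]["Code"]
    # New line format: "<idx> <negative error marker> <error type>"
    result_file.write("%d %d %s\n" % (idx, -ERROR_TRANSIENT, original_error_code))
    result_file.flush()
```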
@@ -424,6 +431,7 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
  random.seed()

  sz_results = []
+ transient_error_type = None
  # 1. push sources and destinations to the queue
  # We only push if we don't inject a failure; otherwise, we already set the sz_results
  # appropriately with the result of the injected failure.
@@ -478,13 +486,19 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
  # Read the output file if all went well
  with open(out_path, "r") as out_file:
  for line in out_file:
- line_split = line.split(" ")
- sz_results[int(line_split[0])] = int(line_split[1])
+ line_split = line.split(" ", 2)
+ idx = int(line_split[0])
+ size = int(line_split[1])
+ sz_results[idx] = size
+
+ # For transient errors, store the transient error type (should be the same for all)
+ if size == -ERROR_TRANSIENT and len(line_split) > 2:
+ transient_error_type = line_split[2].strip()
  else:
  # Put this process back in the processes to check
  new_procs[proc] = out_path
  procs = new_procs
- return sz_results
+ return sz_results, transient_error_type


  def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
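`start_workers` now reads that third field back out of the worker result files and returns it alongside `sz_results`. A self-contained sketch of the parsing, with made-up result lines and a placeholder `ERROR_TRANSIENT` value:

```python
ERROR_TRANSIENT = 10  # placeholder; s3op.py defines the real constant

# Hypothetical result-file contents; the third field is only present for
# transient errors and names the underlying failure.
lines = [
    "0 2048\n",          # idx 0 succeeded, 2048 bytes transferred
    "1 -10 SlowDown\n",  # idx 1 hit a transient S3 "SlowDown" error
]

sz_results = [None, None]
transient_error_type = None
for line in lines:
    line_split = line.split(" ", 2)  # at most 3 fields: idx, size, error type
    idx, size = int(line_split[0]), int(line_split[1])
    sz_results[idx] = size
    if size == -ERROR_TRANSIENT and len(line_split) > 2:
        transient_error_type = line_split[2].strip()

print(sz_results, transient_error_type)  # [2048, -10] 'SlowDown'
```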
@@ -493,7 +507,9 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
  print("%sing %d files.." % (mode.capitalize(), len(urls)), file=sys.stderr)

  start = time.time()
- sz_results = start_workers(mode, urls, num_workers, inject_failure, s3config)
+ sz_results, transient_error_type = start_workers(
+ mode, urls, num_workers, inject_failure, s3config
+ )
  end = time.time()

  if verbose:
@@ -510,7 +526,7 @@ def process_urls(mode, urls, verbose, inject_failure, num_workers, s3config):
  ),
  file=sys.stderr,
  )
- return sz_results
+ return sz_results, transient_error_type


  # Utility functions
@@ -719,9 +735,21 @@ def generate_local_path(url, range="whole", suffix=None):
  quoted = url_quote(url)
  fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_")
  sha = sha1(quoted).hexdigest()
+
+ # Truncate fname to ensure the final filename doesn't exceed filesystem limits.
+ # Most filesystems have a 255 character limit. The structure is:
+ # <40-char-sha>-<fname>-<range>[-<suffix>]
+ # We need to leave room for: sha (40) + hyphens (2-3) + range (~10) + suffix (~10)
+ # This leaves roughly 190 characters for fname. We use 150 to be safe.
+ fname_decoded = fname.decode("utf-8")
+ max_fname_len = 150
+ if len(fname_decoded) > max_fname_len:
+ # Truncate and add an ellipsis to indicate truncation
+ fname_decoded = fname_decoded[:max_fname_len] + "..."
+
  if suffix:
- return "-".join((sha, fname.decode("utf-8"), range, suffix))
- return "-".join((sha, fname.decode("utf-8"), range))
+ return "-".join((sha, fname_decoded, range, suffix))
+ return "-".join((sha, fname_decoded, range))


  def parallel_op(op, lst, num_workers):
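The truncation added to `generate_local_path` keeps cached filenames under common 255-character filesystem limits. The sketch below reproduces the idea with the standard library's `quote` standing in for s3op.py's `url_quote` helper; the function name `local_name` is invented for illustration:

```python
from hashlib import sha1
from urllib.parse import quote

def local_name(url, range="whole", suffix=None, max_fname_len=150):
    # Simplified stand-in for generate_local_path(): hash the full URL, then
    # build <sha>-<fname>-<range>[-<suffix>], truncating very long key names
    # so the result stays under typical 255-character filename limits.
    quoted = quote(url).encode("utf-8")
    fname = quoted.split(b"/")[-1].replace(b".", b"_").replace(b"-", b"_").decode("utf-8")
    sha = sha1(quoted).hexdigest()
    if len(fname) > max_fname_len:
        fname = fname[:max_fname_len] + "..."  # mark that truncation happened
    parts = [sha, fname, range] + ([suffix] if suffix else [])
    return "-".join(parts)

very_long_key = "s3://bucket/" + "x" * 400 + ".csv"
print(len(local_name(very_long_key)))  # comfortably under 255
```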
@@ -858,7 +886,7 @@ def lst(
  urllist = []
  to_iterate, _ = _populate_prefixes(prefixes, inputs)
  for _, prefix, url, _ in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
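The same `urlparse(..., allow_fragments=False)` change is repeated in `put`, `get`, and `info` below, presumably so that S3 keys containing `#` are no longer truncated at what the default parser treats as a URL fragment. A quick illustration with the standard library (the bucket and key are made up):

```python
from urllib.parse import urlparse

url = "s3://my-bucket/prefix/file#v2.csv"

# Default behavior: everything after '#' is parsed as a fragment and lost from the path.
print(urlparse(url).path)                         # '/prefix/file'

# With allow_fragments=False the '#' stays part of the key.
print(urlparse(url, allow_fragments=False).path)  # '/prefix/file#v2.csv'
```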
@@ -964,7 +992,7 @@ def put(
  yield input_line_idx, local, url, content_type, metadata, encryption

  def _make_url(idx, local, user_url, content_type, metadata, encryption):
- src = urlparse(user_url)
+ src = urlparse(user_url, allow_fragments=False)
  url = S3Url(
  url=user_url,
  bucket=src.netloc,
@@ -992,7 +1020,7 @@ def put(
  ul_op = "upload"
  if not overwrite:
  ul_op = "info_upload"
- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  ul_op, urls, verbose, inject_failure, num_workers, s3config
  )
  retry_lines = []
@@ -1010,19 +1038,17 @@ def put(
  elif listing and sz == 0:
  out_lines.append(format_result_line(url.idx, url.url) + "\n")
  elif sz == -ERROR_TRANSIENT:
- retry_lines.append(
- json.dumps(
- {
- "idx": url.idx,
- "url": url.url,
- "local": url.local,
- "content_type": url.content_type,
- "metadata": url.metadata,
- "encryption": url.encryption,
- }
- )
- + "\n"
- )
+ retry_data = {
+ "idx": url.idx,
+ "url": url.url,
+ "local": url.local,
+ "content_type": url.content_type,
+ "metadata": url.metadata,
+ "encryption": url.encryption,
+ }
+ if transient_error_type:
+ retry_data["transient_error_type"] = transient_error_type
+ retry_lines.append(json.dumps(retry_data) + "\n")
  # Output something to get a total count the first time around
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1060,22 +1086,21 @@ def _populate_prefixes(prefixes, inputs):
  for idx, l in enumerate(f, start=len(prefixes)):
  s = l.split(b" ")
  if len(s) == 1:
+ # User input format: <url>
  url = url_unquote(s[0].strip())
  prefixes.append((idx, url, url, None))
  elif len(s) == 2:
+ # User input format: <url> <range>
  url = url_unquote(s[0].strip())
  prefixes.append((idx, url, url, url_unquote(s[1].strip())))
- else:
+ elif len(s) in (4, 5):
+ # Retry format: <idx> <prefix> <url> <range> [<transient_error_type>]
+ # The transient_error_type (5th field) is optional and only used for logging.
+ # Lines with other field counts (e.g., 3) are silently ignored as invalid.
  is_transient_retry = True
- if len(s) == 3:
- prefix = url = url_unquote(s[1].strip())
- range_info = url_unquote(s[2].strip())
- else:
- # Special case when we have both prefix and URL -- this is
- # used in recursive gets for example
- prefix = url_unquote(s[1].strip())
- url = url_unquote(s[2].strip())
- range_info = url_unquote(s[3].strip())
+ prefix = url_unquote(s[1].strip())
+ url = url_unquote(s[2].strip())
+ range_info = url_unquote(s[3].strip())
  if range_info == "<norange>":
  range_info = None
  prefixes.append(
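`_populate_prefixes` now accepts exactly three input-line shapes. The sketch below parses hypothetical examples of each; byte handling is simplified (plain `decode()` instead of `url_unquote`), and reusing the index carried on the retry line is an assumption, since the tuple actually appended lies outside the hunk:

```python
# Hypothetical stdin lines fed to _populate_prefixes-style parsing.
lines = [
    b"s3://bucket/key1",               # <url>
    b"s3://bucket/key2 bytes=0-1023",  # <url> <range>
    b"3 s3://bucket/pre s3://bucket/pre/key3 <norange> SlowDown",
    # ^ retry line: <idx> <prefix> <url> <range> [<transient_error_type>]
]

prefixes = []
is_transient_retry = False
for idx, l in enumerate(lines):
    s = l.split(b" ")
    if len(s) == 1:
        url = s[0].strip().decode()
        prefixes.append((idx, url, url, None))
    elif len(s) == 2:
        url = s[0].strip().decode()
        prefixes.append((idx, url, url, s[1].strip().decode()))
    elif len(s) in (4, 5):
        # The optional 5th field is accepted but not used by the parser itself.
        is_transient_retry = True
        prefix, url = s[1].strip().decode(), s[2].strip().decode()
        range_info = s[3].strip().decode()
        if range_info == "<norange>":
            range_info = None
        prefixes.append((int(s[0].decode()), prefix, url, range_info))

print(is_transient_retry, prefixes[-1])
# True (3, 's3://bucket/pre', 's3://bucket/pre/key3', None)
```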
@@ -1139,7 +1164,7 @@ def get(
  urllist = []
  to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
  for idx, prefix, url, r in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
@@ -1186,7 +1211,7 @@ def get(

  # exclude the non-existent files from loading
  to_load = [url for url, size in urls if size is not None]
- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  dl_op, to_load, verbose, inject_failure, num_workers, s3config
  )
  # We check if there is any access denied
@@ -1222,21 +1247,19 @@ def get(
  break
  out_lines.append(format_result_line(url.idx, url.url) + "\n")
  elif sz == -ERROR_TRANSIENT:
- retry_lines.append(
- " ".join(
- [
- str(url.idx),
- url_quote(url.prefix).decode(encoding="utf-8"),
- url_quote(url.url).decode(encoding="utf-8"),
- (
- url_quote(url.range).decode(encoding="utf-8")
- if url.range
- else "<norange>"
- ),
- ]
- )
- + "\n"
- )
+ retry_line_parts = [
+ str(url.idx),
+ url_quote(url.prefix).decode(encoding="utf-8"),
+ url_quote(url.url).decode(encoding="utf-8"),
+ (
+ url_quote(url.range).decode(encoding="utf-8")
+ if url.range
+ else "<norange>"
+ ),
+ ]
+ if transient_error_type:
+ retry_line_parts.append(transient_error_type)
+ retry_lines.append(" ".join(retry_line_parts) + "\n")
  # First time around, we output something to indicate the total length
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))
@@ -1288,7 +1311,7 @@ def info(
  urllist = []
  to_iterate, is_transient_retry = _populate_prefixes(prefixes, inputs)
  for idx, prefix, url, _ in to_iterate:
- src = urlparse(url)
+ src = urlparse(url, allow_fragments=False)
  url = S3Url(
  url=url,
  bucket=src.netloc,
@@ -1302,7 +1325,7 @@ def info(
  exit(ERROR_INVALID_URL, url)
  urllist.append(url)

- sz_results = process_urls(
+ sz_results, transient_error_type = process_urls(
  "info", urllist, verbose, inject_failure, num_workers, s3config
  )

@@ -1315,10 +1338,15 @@ def info(
  format_result_line(url.idx, url.prefix, url.url, url.local) + "\n"
  )
  else:
- retry_lines.append(
- "%d %s <norange>\n"
- % (url.idx, url_quote(url.url).decode(encoding="utf-8"))
- )
+ retry_line_parts = [
+ str(url.idx),
+ url_quote(url.prefix).decode(encoding="utf-8"),
+ url_quote(url.url).decode(encoding="utf-8"),
+ "<norange>",
+ ]
+ if transient_error_type:
+ retry_line_parts.append(transient_error_type)
+ retry_lines.append(" ".join(retry_line_parts) + "\n")
  if not is_transient_retry:
  out_lines.append("%d %s\n" % (url.idx, TRANSIENT_RETRY_LINE_CONTENT))