metaflow 2.15.6__py2.py3-none-any.whl → 2.15.8__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. metaflow/cli.py +8 -0
  2. metaflow/cli_components/run_cmds.py +2 -2
  3. metaflow/cmd/main_cli.py +1 -1
  4. metaflow/metadata_provider/metadata.py +35 -0
  5. metaflow/metaflow_config.py +6 -0
  6. metaflow/metaflow_environment.py +6 -1
  7. metaflow/metaflow_git.py +115 -0
  8. metaflow/metaflow_version.py +2 -2
  9. metaflow/plugins/__init__.py +1 -0
  10. metaflow/plugins/argo/argo_workflows.py +66 -17
  11. metaflow/plugins/argo/argo_workflows_cli.py +11 -0
  12. metaflow/plugins/argo/argo_workflows_deployer_objects.py +7 -6
  13. metaflow/plugins/aws/aws_client.py +4 -3
  14. metaflow/plugins/datatools/s3/s3.py +46 -44
  15. metaflow/plugins/datatools/s3/s3op.py +133 -63
  16. metaflow/plugins/uv/__init__.py +0 -0
  17. metaflow/plugins/uv/bootstrap.py +100 -0
  18. metaflow/plugins/uv/uv_environment.py +70 -0
  19. metaflow/version.py +1 -1
  20. {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Makefile +4 -2
  21. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/METADATA +4 -3
  22. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/RECORD +28 -24
  23. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/WHEEL +1 -1
  24. {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  25. {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  26. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/entry_points.txt +0 -0
  27. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info/licenses}/LICENSE +0 -0
  28. {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/top_level.txt +0 -0
metaflow/plugins/datatools/s3/s3.py CHANGED
@@ -18,6 +18,7 @@ from metaflow.metaflow_config import (
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
     S3_SERVER_SIDE_ENCRYPTION,
+    S3_WORKER_COUNT,
     TEMPDIR,
 )
 from metaflow.util import (
@@ -1390,9 +1391,31 @@ class S3(object):
        )

    # add some jitter to make sure retries are not synchronized
-   def _jitter_sleep(self, trynum, multiplier=2):
-       interval = multiplier**trynum + random.randint(0, 10)
-       time.sleep(interval)
+   def _jitter_sleep(
+       self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
+   ) -> None:
+       """
+       Sleep for an exponentially increasing interval with added jitter.
+
+       Parameters
+       ----------
+       trynum: The current retry attempt number.
+       base: The base multiplier for the exponential backoff.
+       cap: The maximum interval to sleep.
+       jitter: The maximum jitter percentage to add to the interval.
+       """
+       # Calculate the exponential backoff interval
+       interval = min(cap, base**trynum)
+
+       # Add random jitter
+       jitter_value = interval * jitter * random.uniform(-1, 1)
+       interval_with_jitter = interval + jitter_value
+
+       # Ensure the interval is not negative
+       interval_with_jitter = max(0, interval_with_jitter)
+
+       # Sleep for the calculated interval
+       time.sleep(interval_with_jitter)

    # NOTE: re: _read_many_files and _put_many_files
    # All file IO is through binary files - we write bytes, we read
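Note: the rewritten backoff caps the sleep at `cap` seconds and spreads concurrent clients by up to ±`jitter` of the interval. As a quick illustration, here is the same arithmetic as a standalone sketch (only the formula from the hunk above, not Metaflow's class):

    import random

    def backoff_interval(trynum, base=2, cap=360, jitter=0.1):
        # capped exponential growth: 2, 4, 8, ... seconds, never more than `cap`
        interval = min(cap, base**trynum)
        # +/- 10% jitter so that parallel clients do not retry in lockstep
        return max(0, interval + interval * jitter * random.uniform(-1, 1))

    # attempts 1..9 give roughly 2, 4, 8, 16, 32, 64, 128, 256, 360 seconds (+/- 10%)
    print([round(backoff_interval(n), 1) for n in range(1, 10)])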
@@ -1480,20 +1503,17 @@ class S3(object):
        # - a known transient failure (SlowDown for example) in which case we will
        #   retry *only* the inputs that have this transient failure.
        # - an unknown failure (something went wrong but we cannot say if it was
-       #   a known permanent failure or something else). In this case, we retry
-       #   the operation completely.
-       #
-       # There are therefore two retry counts:
-       #  - the transient failure retry count: how many times do we try on known
-       #    transient errors
-       #  - the top-level retry count: how many times do we try on unknown failures
+       #   a known permanent failure or something else). In this case, we assume
+       #   it's a transient failure and retry only those inputs (same as above).
        #
-       # Note that, if the operation runs out of transient failure retries, it will
-       # count as an "unknown" failure (ie: it will be retried according to the
-       # outer top-level retry count). In other words, you can potentially have
-       # transient_retry_count * retry_count tries).
-       # Finally, if on transient failures, we make NO progress (ie: no input is
-       # successfully processed), that counts as an "unknown" failure.
+       # NOTES(npow): 2025-05-13
+       # Previously, this code would also retry the fatal failures, including no_progress
+       # and unknown failures, from the beginning. This is not ideal because:
+       # 1. Fatal errors are not supposed to be retried.
+       # 2. Retrying from the beginning does not improve the situation, and is
+       #    wasteful since we have already uploaded some files.
+       # 3. The number of transient errors is far more than fatal errors, so we
+       #    can be optimistic and assume the unknown errors are transient.
        cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
        recursive_get = False
        for key, value in options.items():
@@ -1528,7 +1548,6 @@ class S3(object):
            # Otherwise, we cap the failure rate at 90%
            return min(90, self._s3_inject_failures)

-       retry_count = 0  # Number of retries (excluding transient failures)
        transient_retry_count = 0  # Number of transient retries (per top-level retry)
        inject_failures = _inject_failure_rate()
        out_lines = []  # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1614,12 @@ class S3(object):
                # things, this will shrink more and more until we are doing a
                # single operation at a time. If things start going better, it
                # will increase by 20% every round.
-               max_count = min(int(last_ok_count * 1.2), len(pending_retries))
+               #
+               # If we made no progress (last_ok_count == 0) we retry at most
+               # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
+               max_count = min(
+                   int(last_ok_count * 1.2), len(pending_retries)
+               ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
                tmp_input.writelines(pending_retries[:max_count])
                tmp_input.flush()
                debug.s3client_exec(
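Evaluating the new sizing rule on a few made-up values shows the behaviour described in the comment (the worker count of 64 below is only an illustrative stand-in for S3_WORKER_COUNT):

    def retry_batch_size(last_ok_count, pending, s3_worker_count=64):
        # grow the batch 20% per round while progress is being made; with no
        # progress, fall back to retrying at most 2 * S3_WORKER_COUNT inputs
        return min(int(last_ok_count * 1.2), pending) or min(2 * s3_worker_count, pending)

    print(retry_batch_size(1000, 5000))  # 1200 -> growing again after a good round
    print(retry_batch_size(3, 5000))     # 3    -> small batches when little succeeded last round
    print(retry_batch_size(0, 5000))     # 128  -> no progress: retry 2 * worker count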
@@ -1712,38 +1736,16 @@ class S3(object):
            _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
            return 0, 0, inject_failures, err_out

-       while retry_count <= S3_RETRY_COUNT:
+       while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
            (
                last_ok_count,
                last_retry_count,
                inject_failures,
                err_out,
            ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
-           if err_out or (
-               last_retry_count != 0
-               and (
-                   last_ok_count == 0
-                   or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
-               )
-           ):
-               # We had a fatal failure (err_out is not None)
-               # or we made no progress (last_ok_count is 0)
-               # or we are out of transient retries
-               # so we will restart from scratch (being very conservative)
-               retry_count += 1
-               err_msg = err_out
-               if err_msg is None and last_ok_count == 0:
-                   err_msg = "No progress"
-               if err_msg is None:
-                   err_msg = "Too many transient errors"
-               print(
-                   "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
-               )
-               _reset()
-               if retry_count <= S3_RETRY_COUNT:
-                   self._jitter_sleep(retry_count)
-               continue
-           elif last_retry_count != 0:
+           if err_out:
+               break
+           if last_retry_count != 0:
                # During our last try, we did not manage to process everything we wanted
                # due to a transient failure so we try again.
                transient_retry_count += 1
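The net effect is a much simpler control flow: fatal errors stop the operation immediately, and only the inputs that failed transiently are re-queued, at most S3_TRANSIENT_RETRY_COUNT times. A compact sketch of that shape (names mirror the hunk above; try_s3_op here is a stand-in returning ok/retriable counts and a fatal error, if any):

    def run_with_transient_retries(try_s3_op, transient_retry_budget):
        transient_retry_count = 0
        while transient_retry_count <= transient_retry_budget:
            ok_count, retriable_count, fatal_err = try_s3_op()
            if fatal_err:
                return fatal_err          # surface fatal errors, never restart from scratch
            if retriable_count == 0:
                return None               # everything processed
            transient_retry_count += 1    # only the failed inputs go into the next round
        return "ran out of transient retries"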
metaflow/plugins/datatools/s3/s3op.py CHANGED
@@ -15,7 +15,10 @@ from tempfile import NamedTemporaryFile
 from multiprocessing import Process, Queue
 from itertools import starmap, chain, islice
 
+from boto3.exceptions import RetriesExceededError, S3UploadFailedError
 from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+from botocore.exceptions import ClientError, SSLError
 
 try:
     # python2
@@ -46,13 +49,21 @@ from metaflow.plugins.datatools.s3.s3util import (
 import metaflow.tracing as tracing
 from metaflow.metaflow_config import (
     S3_WORKER_COUNT,
+    S3_CLIENT_RETRY_CONFIG,
 )
 
 DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
 DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1
 
+DEFAULT_S3_CLIENT_PARAMS = {"config": Config(retries=S3_CLIENT_RETRY_CONFIG)}
 RANGE_MATCH = re.compile(r"bytes (?P<start>[0-9]+)-(?P<end>[0-9]+)/(?P<total>[0-9]+)")
 
+# from botocore ClientError MSG_TEMPLATE:
+# https://github.com/boto/botocore/blob/68ca78f3097906c9231840a49931ef4382c41eea/botocore/exceptions.py#L521
+BOTOCORE_MSG_TEMPLATE_MATCH = re.compile(
+    r"An error occurred \((\w+)\) when calling the (\w+) operation.*: (.+)"
+)
+
 S3Config = namedtuple("S3Config", "role session_vars client_params")
 
 
@@ -147,6 +158,7 @@ def normalize_client_error(err):
        "LimitExceededException",
        "RequestThrottled",
        "EC2ThrottledException",
+       "InternalError",
    ):
        return 503
    return error_code
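Treating "InternalError" (S3's occasional 500-class hiccup) as a 503 means the workers record it as a retriable result instead of failing hard. A small sketch of the mapping, using only the codes visible in this hunk (not the full metaflow function):

    def normalize(error_code):
        if error_code in (
            "LimitExceededException",
            "RequestThrottled",
            "EC2ThrottledException",
            "InternalError",  # new in 2.15.8: retried like any other throttle
        ):
            return 503
        return error_code

    assert normalize("InternalError") == 503  # recorded as -ERROR_TRANSIENT downstream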
@@ -221,54 +233,57 @@ def worker(result_file_name, queue, mode, s3config):
            elif mode == "download":
                tmp = NamedTemporaryFile(dir=".", mode="wb", delete=False)
                try:
-                   if url.range:
-                       resp = s3.get_object(
-                           Bucket=url.bucket, Key=url.path, Range=url.range
-                       )
-                       range_result = resp["ContentRange"]
-                       range_result_match = RANGE_MATCH.match(range_result)
-                       if range_result_match is None:
-                           raise RuntimeError(
-                               "Wrong format for ContentRange: %s"
-                               % str(range_result)
+                   try:
+                       if url.range:
+                           resp = s3.get_object(
+                               Bucket=url.bucket, Key=url.path, Range=url.range
                            )
-                       range_result = {
-                           x: int(range_result_match.group(x))
-                           for x in ["total", "start", "end"]
-                       }
-                   else:
-                       resp = s3.get_object(Bucket=url.bucket, Key=url.path)
-                       range_result = None
-                   sz = resp["ContentLength"]
-                   if range_result is None:
-                       range_result = {"total": sz, "start": 0, "end": sz - 1}
-                   if not url.range and sz > DOWNLOAD_FILE_THRESHOLD:
-                       # In this case, it is more efficient to use download_file as it
-                       # will download multiple parts in parallel (it does it after
-                       # multipart_threshold)
-                       s3.download_file(url.bucket, url.path, tmp.name)
-                   else:
-                       read_in_chunks(tmp, resp["Body"], sz, DOWNLOAD_MAX_CHUNK)
-                   tmp.close()
-                   os.rename(tmp.name, url.local)
-               except client_error as err:
-                   tmp.close()
-                   os.unlink(tmp.name)
-                   error_code = normalize_client_error(err)
-                   if error_code == 404:
-                       result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
-                       continue
-                   elif error_code == 403:
-                       result_file.write(
-                           "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
-                       )
+                           range_result = resp["ContentRange"]
+                           range_result_match = RANGE_MATCH.match(range_result)
+                           if range_result_match is None:
+                               raise RuntimeError(
+                                   "Wrong format for ContentRange: %s"
+                                   % str(range_result)
+                               )
+                           range_result = {
+                               x: int(range_result_match.group(x))
+                               for x in ["total", "start", "end"]
+                           }
+                       else:
+                           resp = s3.get_object(Bucket=url.bucket, Key=url.path)
+                           range_result = None
+                       sz = resp["ContentLength"]
+                       if range_result is None:
+                           range_result = {"total": sz, "start": 0, "end": sz - 1}
+                       if not url.range and sz > DOWNLOAD_FILE_THRESHOLD:
+                           # In this case, it is more efficient to use download_file as it
+                           # will download multiple parts in parallel (it does it after
+                           # multipart_threshold)
+                           s3.download_file(url.bucket, url.path, tmp.name)
+                       else:
+                           read_in_chunks(
+                               tmp, resp["Body"], sz, DOWNLOAD_MAX_CHUNK
+                           )
+                       tmp.close()
+                       os.rename(tmp.name, url.local)
+                   except client_error as err:
+                       tmp.close()
+                       os.unlink(tmp.name)
+                       handle_client_error(err, idx, result_file)
                        continue
-                   elif error_code == 503:
-                       result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                   except RetriesExceededError as e:
+                       tmp.close()
+                       os.unlink(tmp.name)
+                       err = convert_to_client_error(e)
+                       handle_client_error(err, idx, result_file)
                        continue
-                   else:
-                       raise
-                   # TODO specific error message for out of disk space
+                   except (SSLError, Exception) as e:
+                       tmp.close()
+                       os.unlink(tmp.name)
+                       # assume anything else is transient
+                       result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                       result_file.flush()
+                       continue
                # If we need the metadata, get it and write it out
                if pre_op_info:
                    with open("%s_meta" % url.local, mode="w") as f:
@@ -316,28 +331,67 @@ def worker(result_file_name, queue, mode, s3config):
                if url.encryption is not None:
                    extra["ServerSideEncryption"] = url.encryption
                try:
-                   s3.upload_file(
-                       url.local, url.bucket, url.path, ExtraArgs=extra
-                   )
-                   # We indicate that the file was uploaded
-                   result_file.write("%d %d\n" % (idx, 0))
-               except client_error as err:
-                   error_code = normalize_client_error(err)
-                   if error_code == 403:
-                       result_file.write(
-                           "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
+                   try:
+                       s3.upload_file(
+                           url.local, url.bucket, url.path, ExtraArgs=extra
                        )
+                       # We indicate that the file was uploaded
+                       result_file.write("%d %d\n" % (idx, 0))
+                   except client_error as err:
+                       # Shouldn't get here, but just in case.
+                       # Internally, botocore catches ClientError and returns a S3UploadFailedError.
+                       # See https://github.com/boto/boto3/blob/develop/boto3/s3/transfer.py#L377
+                       handle_client_error(err, idx, result_file)
                        continue
-                   elif error_code == 503:
-                       result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                   except S3UploadFailedError as e:
+                       err = convert_to_client_error(e)
+                       handle_client_error(err, idx, result_file)
                        continue
-                   else:
-                       raise
+                   except (SSLError, Exception) as e:
+                       # assume anything else is transient
+                       result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                       result_file.flush()
+                       continue
        except:
            traceback.print_exc()
+           result_file.flush()
            sys.exit(ERROR_WORKER_EXCEPTION)


+def convert_to_client_error(e):
+    match = BOTOCORE_MSG_TEMPLATE_MATCH.search(str(e))
+    if not match:
+        raise e
+    error_code = match.group(1)
+    operation_name = match.group(2)
+    error_message = match.group(3)
+    response = {
+        "Error": {
+            "Code": error_code,
+            "Message": error_message,
+        }
+    }
+    return ClientError(response, operation_name)
+
+
+def handle_client_error(err, idx, result_file):
+    error_code = normalize_client_error(err)
+    if error_code == 404:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
+        result_file.flush()
+    elif error_code == 403:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
+        result_file.flush()
+    elif error_code == 503:
+        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.flush()
+    else:
+        # optimistically assume it is a transient error
+        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.flush()
+    # TODO specific error message for out of disk space
+
+
 def start_workers(mode, urls, num_workers, inject_failure, s3config):
    # We start the minimum of len(urls) or num_workers to avoid starting
    # workers that will definitely do nothing
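boto3's transfer layer wraps the underlying ClientError in S3UploadFailedError (uploads) or RetriesExceededError (downloads), so the new convert_to_client_error recovers the original error code by parsing botocore's standard message template. A rough illustration of that parsing; the sample message below is invented for the example:

    import re

    BOTOCORE_MSG_TEMPLATE_MATCH = re.compile(
        r"An error occurred \((\w+)\) when calling the (\w+) operation.*: (.+)"
    )

    # hypothetical text of an S3UploadFailedError wrapping a throttling response
    sample = (
        "Failed to upload ./part.bin to my-bucket/key: An error occurred (SlowDown) "
        "when calling the PutObject operation (reached max retries: 4): Please reduce "
        "your request rate."
    )
    code, operation, message = BOTOCORE_MSG_TEMPLATE_MATCH.search(sample).groups()
    print(code, operation)  # SlowDown PutObject -> normalized to 503 and retried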
@@ -381,6 +435,22 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
            if proc.exitcode is not None:
                if proc.exitcode != 0:
                    msg = "Worker process failed (exit code %d)" % proc.exitcode
+
+                   # IMPORTANT: if this process has put items on a queue, then it will not terminate
+                   # until all buffered items have been flushed to the pipe, causing a deadlock.
+                   # `cancel_join_thread()` allows it to exit without flushing the queue.
+                   # Without this line, the parent process would hang indefinitely when a subprocess
+                   # did not exit cleanly in the case of unhandled exceptions.
+                   #
+                   # The error situation is:
+                   # 1. this process puts stuff in queue
+                   # 2. subprocess dies so doesn't consume its end-of-queue marker (the None)
+                   # 3. other subprocesses consume all useful bits AND their end-of-queue marker
+                   # 4. one marker is left and not consumed
+                   # 5. this process cannot shut down until the queue is empty.
+                   # 6. it will never be empty because all subprocesses (workers) have died.
+                   queue.cancel_join_thread()
+
                    exit(msg, proc.exitcode)
            # Read the output file if all went well
            with open(out_path, "r") as out_file:
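The pattern being guarded against is a parent blocking at shutdown on a Queue whose consumers are gone. A minimal sketch of the fix (not Metaflow's code; in the real failure mode enough buffered items remain that the queue's feeder thread can never drain them, so the parent would hang at exit without cancel_join_thread):

    from multiprocessing import Process, Queue

    def unlucky_worker(q):
        raise RuntimeError("dies before consuming its end-of-queue marker")

    if __name__ == "__main__":
        q = Queue()
        p = Process(target=unlucky_worker, args=(q,))
        p.start()
        q.put(None)  # end-of-queue marker that will never be consumed
        p.join()
        if p.exitcode != 0:
            # let the parent exit without waiting for the queue it fed to drain
            q.cancel_join_thread()
        print("parent exits with worker exit code", p.exitcode)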
@@ -745,7 +815,7 @@ def lst(
    s3config = S3Config(
        s3role,
        json.loads(s3sessionvars) if s3sessionvars else None,
-       json.loads(s3clientparams) if s3clientparams else None,
+       json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
    )

    urllist = []
@@ -878,7 +948,7 @@ def put(
    s3config = S3Config(
        s3role,
        json.loads(s3sessionvars) if s3sessionvars else None,
-       json.loads(s3clientparams) if s3clientparams else None,
+       json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
    )

    urls = list(starmap(_make_url, _files()))
@@ -1025,7 +1095,7 @@ def get(
    s3config = S3Config(
        s3role,
        json.loads(s3sessionvars) if s3sessionvars else None,
-       json.loads(s3clientparams) if s3clientparams else None,
+       json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
    )

    # Construct a list of URL (prefix) objects
@@ -1172,7 +1242,7 @@ def info(
    s3config = S3Config(
        s3role,
        json.loads(s3sessionvars) if s3sessionvars else None,
-       json.loads(s3clientparams) if s3clientparams else None,
+       json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
    )

    # Construct a list of URL (prefix) objects
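When no explicit s3clientparams value reaches these subcommands, the client is now built with a botocore retry policy instead of library defaults. Roughly, DEFAULT_S3_CLIENT_PARAMS expands to something like the following sketch (the retry values here are placeholders; the real ones come from S3_CLIENT_RETRY_CONFIG in metaflow_config.py):

    import boto3
    from botocore.config import Config

    # placeholder standing in for metaflow_config.S3_CLIENT_RETRY_CONFIG
    retry_config = {"max_attempts": 10, "mode": "adaptive"}

    # same shape as DEFAULT_S3_CLIENT_PARAMS, splatted into the client factory
    client_params = {"config": Config(retries=retry_config)}
    s3 = boto3.client("s3", region_name="us-east-1", **client_params)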
metaflow/plugins/uv/__init__.py ADDED (empty file)
metaflow/plugins/uv/bootstrap.py ADDED
@@ -0,0 +1,100 @@
+import os
+import subprocess
+import sys
+import time
+
+from metaflow.util import which
+from metaflow.metaflow_config import get_pinned_conda_libs
+from urllib.request import Request, urlopen
+from urllib.error import URLError
+
+# TODO: support version/platform/architecture selection.
+UV_URL = "https://github.com/astral-sh/uv/releases/download/0.6.11/uv-x86_64-unknown-linux-gnu.tar.gz"
+
+if __name__ == "__main__":
+
+    def run_cmd(cmd, stdin_str=None):
+        result = subprocess.run(
+            cmd,
+            shell=True,
+            input=stdin_str,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode != 0:
+            print(f"Bootstrap failed while executing: {cmd}")
+            print("Stdout:", result.stdout)
+            print("Stderr:", result.stderr)
+            sys.exit(1)
+
+    def install_uv():
+        import tarfile
+
+        uv_install_path = os.path.join(os.getcwd(), "uv_install")
+        if which("uv"):
+            return
+
+        print("Installing uv...")
+
+        # Prepare directory once
+        os.makedirs(uv_install_path, exist_ok=True)
+
+        # Download and decompress in one go
+        headers = {
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "User-Agent": "python-urllib",
+        }
+
+        def _tar_filter(member: tarfile.TarInfo, path):
+            if os.path.basename(member.name) != "uv":
+                return None  # skip
+            member.path = os.path.basename(member.path)
+            return member
+
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                req = Request(UV_URL, headers=headers)
+                with urlopen(req) as response:
+                    with tarfile.open(fileobj=response, mode="r:gz") as tar:
+                        tar.extractall(uv_install_path, filter=_tar_filter)
+                break
+            except (URLError, IOError) as e:
+                if attempt == max_retries - 1:
+                    raise Exception(
+                        f"Failed to download UV after {max_retries} attempts: {e}"
+                    )
+                time.sleep(2**attempt)
+
+        # Update PATH only once at the end
+        os.environ["PATH"] += os.pathsep + uv_install_path
+
+    def get_dependencies(datastore_type):
+        # return required dependencies for Metaflow that must be added to the UV environment.
+        pinned = get_pinned_conda_libs(None, datastore_type)
+
+        # return only dependency names instead of pinned versions
+        return pinned.keys()
+
+    def sync_uv_project(datastore_type):
+        print("Syncing uv project...")
+        dependencies = " ".join(get_dependencies(datastore_type))
+        cmd = f"""set -e;
+            uv sync --frozen --no-install-package metaflow;
+            uv pip install {dependencies} --strict
+        """
+        run_cmd(cmd)
+
+    if len(sys.argv) != 2:
+        print("Usage: bootstrap.py <datastore_type>")
+        sys.exit(1)
+
+    try:
+        datastore_type = sys.argv[1]
+        install_uv()
+        sync_uv_project(datastore_type)
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
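For reference, the remote bootstrap (see uv_environment.py below) runs this module with the datastore type as its only argument. A local equivalent of that invocation would look like the following sketch, assuming a project with uv.lock and pyproject.toml in the working directory and "s3" as an example datastore type:

    import os
    import subprocess
    import sys

    # tracing is disabled because its dependencies are not installed yet at bootstrap time
    env = dict(os.environ, DISABLE_TRACING="True")
    subprocess.run(
        [sys.executable, "-m", "metaflow.plugins.uv.bootstrap", "s3"],
        env=env,
        check=True,
    )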
metaflow/plugins/uv/uv_environment.py ADDED
@@ -0,0 +1,70 @@
+import os
+
+from metaflow.exception import MetaflowException
+from metaflow.metaflow_environment import MetaflowEnvironment
+
+
+class UVException(MetaflowException):
+    headline = "uv error"
+
+
+class UVEnvironment(MetaflowEnvironment):
+    TYPE = "uv"
+
+    def __init__(self, flow):
+        self.flow = flow
+
+    def validate_environment(self, logger, datastore_type):
+        self.datastore_type = datastore_type
+        self.logger = logger
+
+    def init_environment(self, echo, only_steps=None):
+        self.logger("Bootstrapping uv...")
+
+    def executable(self, step_name, default=None):
+        return "uv run python"
+
+    def add_to_package(self):
+        # NOTE: We treat uv.lock and pyproject.toml as regular project assets and ship these along user code as part of the code package
+        # These are the minimal required files to reproduce the UV environment on the remote platform.
+        def _find(filename):
+            current_dir = os.getcwd()
+            while True:
+                file_path = os.path.join(current_dir, filename)
+                if os.path.isfile(file_path):
+                    return file_path
+                parent_dir = os.path.dirname(current_dir)
+                if parent_dir == current_dir:  # Reached root
+                    raise UVException(
+                        f"Could not find {filename} in current directory or any parent directory"
+                    )
+                current_dir = parent_dir
+
+        pyproject_path = _find("pyproject.toml")
+        uv_lock_path = _find("uv.lock")
+        files = [
+            (uv_lock_path, "uv.lock"),
+            (pyproject_path, "pyproject.toml"),
+        ]
+        return files
+
+    def pylint_config(self):
+        config = super().pylint_config()
+        # Disable (import-error) in pylint
+        config.append("--disable=F0401")
+        return config
+
+    def bootstrap_commands(self, step_name, datastore_type):
+        return [
+            "echo 'Bootstrapping uv project...'",
+            "flush_mflogs",
+            # We have to prevent the tracing module from loading, as the bootstrapping process
+            # uses the internal S3 client which would fail to import tracing due to the required
+            # dependencies being bundled into the conda environment, which is yet to be
+            # initialized at this point.
+            'DISABLE_TRACING=True python -m metaflow.plugins.uv.bootstrap "%s"'
+            % datastore_type,
+            "echo 'uv project bootstrapped.'",
+            "flush_mflogs",
+            "export PATH=$PATH:$(pwd)/uv_install",
+        ]
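add_to_package walks upward from the working directory until it finds uv.lock and pyproject.toml and ships just those two files with the code package. The same upward search, as a standalone sketch outside the class:

    import os

    def find_upwards(filename, start=None):
        # climb parent directories until the file is found or the filesystem root is hit
        current_dir = os.path.abspath(start or os.getcwd())
        while True:
            candidate = os.path.join(current_dir, filename)
            if os.path.isfile(candidate):
                return candidate
            parent_dir = os.path.dirname(current_dir)
            if parent_dir == current_dir:
                return None  # reached the root without finding it
            current_dir = parent_dir

    print(find_upwards("pyproject.toml"), find_upwards("uv.lock"))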
metaflow/version.py CHANGED
@@ -1 +1 @@
-metaflow_version = "2.15.6"
+metaflow_version = "2.15.8"
{metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Makefile CHANGED
@@ -75,7 +75,7 @@ check-docker:
 	@if [ "$(shell uname)" = "Darwin" ]; then \
 		open -a Docker || (echo "❌ Please start Docker Desktop" && exit 1); \
 	else \
-		systemctl is-active --quiet docker || (echo "❌ Docker daemon is not running. Start with 'sudo systemctl start docker'" && exit 1); \
+		docker info >/dev/null 2>&1 || (echo "❌ Docker daemon is not running." && exit 1); \
 	fi
 	@echo "✅ Docker is running"
 
@@ -260,6 +260,7 @@ shell: setup-tilt
 		env METAFLOW_HOME="$(DEVTOOLS_DIR)" \
 			METAFLOW_PROFILE=local \
 			AWS_CONFIG_FILE="$(DEVTOOLS_DIR)/aws_config" \
+			AWS_SHARED_CREDENTIALS_FILE= \
 			"$$user_shell" -i; \
 	else \
 		env METAFLOW_HOME="$(DEVTOOLS_DIR)" \
@@ -301,6 +302,7 @@ create-dev-shell: setup-tilt
 	echo "  env METAFLOW_HOME=\"$(DEVTOOLS_DIR)\" \\" >> $$SHELL_PATH && \
 	echo "    METAFLOW_PROFILE=local \\" >> $$SHELL_PATH && \
 	echo "    AWS_CONFIG_FILE=\"$(DEVTOOLS_DIR)/aws_config\" \\" >> $$SHELL_PATH && \
+	echo "    AWS_SHARED_CREDENTIALS_FILE= \\" >> $$SHELL_PATH && \
 	echo "    \"\$$user_shell\" -i" >> $$SHELL_PATH && \
 	echo "else" >> $$SHELL_PATH && \
 	echo "  env METAFLOW_HOME=\"$(DEVTOOLS_DIR)\" \\" >> $$SHELL_PATH && \
@@ -339,4 +341,4 @@ ui: setup-tilt
 
 .PHONY: install-helm setup-minikube setup-tilt teardown-minikube tunnel up down check-docker install-curl install-gum install-brew up down dashboard shell ui all-up help
 
-.DEFAULT_GOAL := help
+.DEFAULT_GOAL := help
{metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: metaflow
-Version: 2.15.6
+Version: 2.15.8
 Summary: Metaflow: More AI and ML, Less Engineering
 Author: Metaflow Developers
 Author-email: help@metaflow.org
@@ -26,13 +26,14 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: boto3
 Provides-Extra: stubs
-Requires-Dist: metaflow-stubs==2.15.6; extra == "stubs"
+Requires-Dist: metaflow-stubs==2.15.8; extra == "stubs"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: project-url
 Dynamic: provides-extra
 Dynamic: requires-dist