metaflow 2.15.6__py2.py3-none-any.whl → 2.15.8__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/cli.py +8 -0
- metaflow/cli_components/run_cmds.py +2 -2
- metaflow/cmd/main_cli.py +1 -1
- metaflow/metadata_provider/metadata.py +35 -0
- metaflow/metaflow_config.py +6 -0
- metaflow/metaflow_environment.py +6 -1
- metaflow/metaflow_git.py +115 -0
- metaflow/metaflow_version.py +2 -2
- metaflow/plugins/__init__.py +1 -0
- metaflow/plugins/argo/argo_workflows.py +66 -17
- metaflow/plugins/argo/argo_workflows_cli.py +11 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +7 -6
- metaflow/plugins/aws/aws_client.py +4 -3
- metaflow/plugins/datatools/s3/s3.py +46 -44
- metaflow/plugins/datatools/s3/s3op.py +133 -63
- metaflow/plugins/uv/__init__.py +0 -0
- metaflow/plugins/uv/bootstrap.py +100 -0
- metaflow/plugins/uv/uv_environment.py +70 -0
- metaflow/version.py +1 -1
- {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Makefile +4 -2
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/METADATA +4 -3
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/RECORD +28 -24
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/WHEEL +1 -1
- {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Tiltfile +0 -0
- {metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/entry_points.txt +0 -0
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info/licenses}/LICENSE +0 -0
- {metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/top_level.txt +0 -0
metaflow/plugins/datatools/s3/s3.py CHANGED

@@ -18,6 +18,7 @@ from metaflow.metaflow_config import (
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
     S3_SERVER_SIDE_ENCRYPTION,
+    S3_WORKER_COUNT,
     TEMPDIR,
 )
 from metaflow.util import (
@@ -1390,9 +1391,31 @@ class S3(object):
         )

     # add some jitter to make sure retries are not synchronized
-    def _jitter_sleep(
+    def _jitter_sleep(
+        self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
+    ) -> None:
+        """
+        Sleep for an exponentially increasing interval with added jitter.
+
+        Parameters
+        ----------
+        trynum: The current retry attempt number.
+        base: The base multiplier for the exponential backoff.
+        cap: The maximum interval to sleep.
+        jitter: The maximum jitter percentage to add to the interval.
+        """
+        # Calculate the exponential backoff interval
+        interval = min(cap, base**trynum)
+
+        # Add random jitter
+        jitter_value = interval * jitter * random.uniform(-1, 1)
+        interval_with_jitter = interval + jitter_value
+
+        # Ensure the interval is not negative
+        interval_with_jitter = max(0, interval_with_jitter)
+
+        # Sleep for the calculated interval
+        time.sleep(interval_with_jitter)

     # NOTE: re: _read_many_files and _put_many_files
     # All file IO is through binary files - we write bytes, we read
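Note: for readers skimming the diff, the following is a minimal standalone sketch (not taken from the package) of how a jittered exponential backoff like _jitter_sleep is typically driven by a retry loop; the flaky_operation callable and the retry limit are hypothetical.

    import random
    import time


    def jitter_sleep(trynum, base=2, cap=360, jitter=0.1):
        # Same idea as the new S3._jitter_sleep: exponential interval, capped,
        # with +/- 10% random jitter so concurrent clients do not retry in lockstep.
        interval = min(cap, base**trynum)
        interval += interval * jitter * random.uniform(-1, 1)
        time.sleep(max(0, interval))


    def with_retries(flaky_operation, max_tries=5):
        # Hypothetical driver: retry a callable, sleeping longer after each failure.
        for trynum in range(max_tries):
            try:
                return flaky_operation()
            except Exception:
                if trynum == max_tries - 1:
                    raise
                jitter_sleep(trynum)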
@@ -1480,20 +1503,17 @@ class S3(object):
         # - a known transient failure (SlowDown for example) in which case we will
         #   retry *only* the inputs that have this transient failure.
         # - an unknown failure (something went wrong but we cannot say if it was
-        #   a known permanent failure or something else). In this case, we
-        #
-        #
-        # There are therefore two retry counts:
-        #  - the transient failure retry count: how many times do we try on known
-        #    transient errors
-        #  - the top-level retry count: how many times do we try on unknown failures
+        #   a known permanent failure or something else). In this case, we assume
+        #   it's a transient failure and retry only those inputs (same as above).
         #
-        #
-        #
-        #
-        #
-        #
-        #
+        # NOTES(npow): 2025-05-13
+        # Previously, this code would also retry the fatal failures, including no_progress
+        # and unknown failures, from the beginning. This is not ideal because:
+        # 1. Fatal errors are not supposed to be retried.
+        # 2. Retrying from the beginning does not improve the situation, and is
+        #    wasteful since we have already uploaded some files.
+        # 3. The number of transient errors is far more than fatal errors, so we
+        #    can be optimistic and assume the unknown errors are transient.
         cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
         recursive_get = False
         for key, value in options.items():
@@ -1528,7 +1548,6 @@ class S3(object):
             # Otherwise, we cap the failure rate at 90%
            return min(90, self._s3_inject_failures)

-        retry_count = 0  # Number of retries (excluding transient failures)
         transient_retry_count = 0  # Number of transient retries (per top-level retry)
         inject_failures = _inject_failure_rate()
         out_lines = []  # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1614,12 @@ class S3(object):
                 # things, this will shrink more and more until we are doing a
                 # single operation at a time. If things start going better, it
                 # will increase by 20% every round.
+                #
+                # If we made no progress (last_ok_count == 0) we retry at most
+                # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
+                max_count = min(
+                    int(last_ok_count * 1.2), len(pending_retries)
+                ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
                 tmp_input.writelines(pending_retries[:max_count])
                 tmp_input.flush()
                 debug.s3client_exec(
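Note: the "or" fallback in the new max_count expression is easy to misread, so here is a small illustrative calculation. The values are made up; S3_WORKER_COUNT is assumed to be 64 only for this example.

    S3_WORKER_COUNT = 64  # assumed value, for illustration only


    def next_batch_size(last_ok_count, pending):
        # Mirrors the new max_count expression: grow the batch by 20% of what
        # succeeded last round, but never beyond what is still pending; if nothing
        # succeeded (the first min(...) is 0), fall back to 2 * S3_WORKER_COUNT.
        return min(int(last_ok_count * 1.2), len(pending)) or min(
            2 * S3_WORKER_COUNT, len(pending)
        )


    print(next_batch_size(100, ["x"] * 1000))  # 120: 20% growth over last round
    print(next_batch_size(0, ["x"] * 1000))    # 128: no progress, 2 * worker count
    print(next_batch_size(0, ["x"] * 50))      # 50: bounded by what is left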
@@ -1712,38 +1736,16 @@ class S3(object):
            _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
            return 0, 0, inject_failures, err_out

-        while
+        while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
            (
                last_ok_count,
                last_retry_count,
                inject_failures,
                err_out,
            ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
-            if err_out
-                last_ok_count == 0
-                or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
-                )
-            ):
-                # We had a fatal failure (err_out is not None)
-                # or we made no progress (last_ok_count is 0)
-                # or we are out of transient retries
-                # so we will restart from scratch (being very conservative)
-                retry_count += 1
-                err_msg = err_out
-                if err_msg is None and last_ok_count == 0:
-                    err_msg = "No progress"
-                if err_msg is None:
-                    err_msg = "Too many transient errors"
-                print(
-                    "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
-                )
-                _reset()
-                if retry_count <= S3_RETRY_COUNT:
-                    self._jitter_sleep(retry_count)
-                continue
-            elif last_retry_count != 0:
+            if err_out:
+                break
+            if last_retry_count != 0:
                # During our last try, we did not manage to process everything we wanted
                # due to a transient failure so we try again.
                transient_retry_count += 1
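Note: taken together with the comment change earlier in this file, the new control flow can be summarized by the simplified, self-contained sketch below. The operation and its return values are stand-ins, not the real try_s3_op, and the retry limit is an assumed example; the actual bookkeeping in s3.py is more involved.

    S3_TRANSIENT_RETRY_COUNT = 20  # assumed limit, for illustration


    def run_with_transient_retries(try_op):
        # try_op() returns (ok_count, retry_count, err_out): how many inputs
        # succeeded, how many hit transient errors, and a fatal error (or None).
        transient_retry_count = 0
        while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
            ok_count, retry_count, err_out = try_op()
            if err_out:
                break  # fatal errors are no longer retried from scratch
            if retry_count == 0:
                return True  # everything processed
            # Unknown/transient failures: retry only the inputs that failed.
            transient_retry_count += 1
        return False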
metaflow/plugins/datatools/s3/s3op.py CHANGED

@@ -15,7 +15,10 @@ from tempfile import NamedTemporaryFile
 from multiprocessing import Process, Queue
 from itertools import starmap, chain, islice

+from boto3.exceptions import RetriesExceededError, S3UploadFailedError
 from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+from botocore.exceptions import ClientError, SSLError

 try:
     # python2
@@ -46,13 +49,21 @@ from metaflow.plugins.datatools.s3.s3util import (
 import metaflow.tracing as tracing
 from metaflow.metaflow_config import (
     S3_WORKER_COUNT,
+    S3_CLIENT_RETRY_CONFIG,
 )

 DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
 DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1

+DEFAULT_S3_CLIENT_PARAMS = {"config": Config(retries=S3_CLIENT_RETRY_CONFIG)}
 RANGE_MATCH = re.compile(r"bytes (?P<start>[0-9]+)-(?P<end>[0-9]+)/(?P<total>[0-9]+)")

+# from botocore ClientError MSG_TEMPLATE:
+# https://github.com/boto/botocore/blob/68ca78f3097906c9231840a49931ef4382c41eea/botocore/exceptions.py#L521
+BOTOCORE_MSG_TEMPLATE_MATCH = re.compile(
+    r"An error occurred \((\w+)\) when calling the (\w+) operation.*: (.+)"
+)
+
 S3Config = namedtuple("S3Config", "role session_vars client_params")

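Note on DEFAULT_S3_CLIENT_PARAMS: botocore.config.Config(retries=...) takes a dictionary. The actual value of S3_CLIENT_RETRY_CONFIG is defined in metaflow_config.py (presumably among the 6 lines added to that file in this release) and is not visible in this diff, so the dictionary below is only an assumed example.

    from botocore.config import Config

    # Assumed example value; the real default comes from S3_CLIENT_RETRY_CONFIG.
    retry_config = {"max_attempts": 10, "mode": "adaptive"}

    client_params = {"config": Config(retries=retry_config)}
    # Client params like these are typically passed through to boto3, e.g.
    # boto3.session.Session().client("s3", **client_params)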
@@ -147,6 +158,7 @@ def normalize_client_error(err):
         "LimitExceededException",
         "RequestThrottled",
         "EC2ThrottledException",
+        "InternalError",
     ):
         return 503
     return error_code
@@ -221,54 +233,57 @@ def worker(result_file_name, queue, mode, s3config):
             elif mode == "download":
                 tmp = NamedTemporaryFile(dir=".", mode="wb", delete=False)
                 try:
-                    range_result = resp["ContentRange"]
-                    range_result_match = RANGE_MATCH.match(range_result)
-                    if range_result_match is None:
-                        raise RuntimeError(
-                            "Wrong format for ContentRange: %s"
-                            % str(range_result)
+                    try:
+                        if url.range:
+                            resp = s3.get_object(
+                                Bucket=url.bucket, Key=url.path, Range=url.range
                             )
+                            range_result = resp["ContentRange"]
+                            range_result_match = RANGE_MATCH.match(range_result)
+                            if range_result_match is None:
+                                raise RuntimeError(
+                                    "Wrong format for ContentRange: %s"
+                                    % str(range_result)
+                                )
+                            range_result = {
+                                x: int(range_result_match.group(x))
+                                for x in ["total", "start", "end"]
+                            }
+                        else:
+                            resp = s3.get_object(Bucket=url.bucket, Key=url.path)
+                            range_result = None
+                        sz = resp["ContentLength"]
+                        if range_result is None:
+                            range_result = {"total": sz, "start": 0, "end": sz - 1}
+                        if not url.range and sz > DOWNLOAD_FILE_THRESHOLD:
+                            # In this case, it is more efficient to use download_file as it
+                            # will download multiple parts in parallel (it does it after
+                            # multipart_threshold)
+                            s3.download_file(url.bucket, url.path, tmp.name)
+                        else:
+                            read_in_chunks(
+                                tmp, resp["Body"], sz, DOWNLOAD_MAX_CHUNK
+                            )
+                        tmp.close()
+                        os.rename(tmp.name, url.local)
+                    except client_error as err:
+                        tmp.close()
+                        os.unlink(tmp.name)
+                        handle_client_error(err, idx, result_file)
                        continue
+                    except RetriesExceededError as e:
+                        tmp.close()
+                        os.unlink(tmp.name)
+                        err = convert_to_client_error(e)
+                        handle_client_error(err, idx, result_file)
                        continue
+                    except (SSLError, Exception) as e:
+                        tmp.close()
+                        os.unlink(tmp.name)
+                        # assume anything else is transient
+                        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                        result_file.flush()
+                        continue
                 # If we need the metadata, get it and write it out
                 if pre_op_info:
                     with open("%s_meta" % url.local, mode="w") as f:
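Note on the DOWNLOAD_FILE_THRESHOLD branch used above: boto3's TransferConfig defaults multipart_threshold to 8 MB, so the threshold works out to roughly 16 MB. This is an assumption about boto3's defaults, which could change between releases, so verify locally.

    from boto3.s3.transfer import TransferConfig

    # Doubling the multipart threshold means download_file (parallel, multipart)
    # is only used for objects well past the point where multipart kicks in.
    threshold = 2 * TransferConfig().multipart_threshold
    print(threshold)  # typically 16777216 bytes (16 MB) with default boto3 settings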
@@ -316,28 +331,67 @@ def worker(result_file_name, queue, mode, s3config):
                 if url.encryption is not None:
                     extra["ServerSideEncryption"] = url.encryption
                 try:
-                    # We indicate that the file was uploaded
-                    result_file.write("%d %d\n" % (idx, 0))
-                except client_error as err:
-                    error_code = normalize_client_error(err)
-                    if error_code == 403:
-                        result_file.write(
-                            "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
+                    try:
+                        s3.upload_file(
+                            url.local, url.bucket, url.path, ExtraArgs=extra
                        )
+                        # We indicate that the file was uploaded
+                        result_file.write("%d %d\n" % (idx, 0))
+                    except client_error as err:
+                        # Shouldn't get here, but just in case.
+                        # Internally, botocore catches ClientError and returns a S3UploadFailedError.
+                        # See https://github.com/boto/boto3/blob/develop/boto3/s3/transfer.py#L377
+                        handle_client_error(err, idx, result_file)
                        continue
+                    except S3UploadFailedError as e:
+                        err = convert_to_client_error(e)
+                        handle_client_error(err, idx, result_file)
                        continue
+                    except (SSLError, Exception) as e:
+                        # assume anything else is transient
+                        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                        result_file.flush()
+                        continue
        except:
            traceback.print_exc()
+            result_file.flush()
            sys.exit(ERROR_WORKER_EXCEPTION)


+def convert_to_client_error(e):
+    match = BOTOCORE_MSG_TEMPLATE_MATCH.search(str(e))
+    if not match:
+        raise e
+    error_code = match.group(1)
+    operation_name = match.group(2)
+    error_message = match.group(3)
+    response = {
+        "Error": {
+            "Code": error_code,
+            "Message": error_message,
+        }
+    }
+    return ClientError(response, operation_name)
+
+
+def handle_client_error(err, idx, result_file):
+    error_code = normalize_client_error(err)
+    if error_code == 404:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
+        result_file.flush()
+    elif error_code == 403:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
+        result_file.flush()
+    elif error_code == 503:
+        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.flush()
+    else:
+        # optimistically assume it is a transient error
+        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.flush()
+    # TODO specific error message for out of disk space
+
+
 def start_workers(mode, urls, num_workers, inject_failure, s3config):
     # We start the minimum of len(urls) or num_workers to avoid starting
     # workers that will definitely do nothing
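Note: to see what BOTOCORE_MSG_TEMPLATE_MATCH and convert_to_client_error recover, here is a small sketch run against a hand-written error string in botocore's message format. The message itself is fabricated for illustration.

    import re

    from botocore.exceptions import ClientError

    BOTOCORE_MSG_TEMPLATE_MATCH = re.compile(
        r"An error occurred \((\w+)\) when calling the (\w+) operation.*: (.+)"
    )

    # Example message in the botocore ClientError format (made up for this sketch).
    msg = (
        "Failed to upload f.txt to bucket/key: An error occurred (SlowDown) when "
        "calling the PutObject operation (reached max retries: 4): Please reduce your request rate."
    )

    match = BOTOCORE_MSG_TEMPLATE_MATCH.search(msg)
    if match:
        code, operation, message = match.groups()
        err = ClientError({"Error": {"Code": code, "Message": message}}, operation)
        print(err.response["Error"]["Code"])  # SlowDown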
@@ -381,6 +435,22 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
            if proc.exitcode is not None:
                if proc.exitcode != 0:
                    msg = "Worker process failed (exit code %d)" % proc.exitcode
+
+                    # IMPORTANT: if this process has put items on a queue, then it will not terminate
+                    # until all buffered items have been flushed to the pipe, causing a deadlock.
+                    # `cancel_join_thread()` allows it to exit without flushing the queue.
+                    # Without this line, the parent process would hang indefinitely when a subprocess
+                    # did not exit cleanly in the case of unhandled exceptions.
+                    #
+                    # The error situation is:
+                    # 1. this process puts stuff in queue
+                    # 2. subprocess dies so doesn't consume its end-of-queue marker (the None)
+                    # 3. other subprocesses consume all useful bits AND their end-of-queue marker
+                    # 4. one marker is left and not consumed
+                    # 5. this process cannot shut down until the queue is empty.
+                    # 6. it will never be empty because all subprocesses (workers) have died.
+                    queue.cancel_join_thread()
+
                    exit(msg, proc.exitcode)
                # Read the output file if all went well
                with open(out_path, "r") as out_file:
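Note: the deadlock described in the comment above is easy to reproduce outside Metaflow. This toy sketch, independent of the package, shows the pattern the fix relies on: if a consumer dies before draining the queue, the parent calls cancel_join_thread() so its own exit is not blocked on unflushed queue items.

    from multiprocessing import Process, Queue


    def consumer(q):
        q.get()               # consume one item...
        raise SystemExit(1)   # ...then die without draining the rest


    if __name__ == "__main__":
        q = Queue()
        for i in range(10000):
            q.put(i)          # plenty of buffered items left unflushed

        p = Process(target=consumer, args=(q,))
        p.start()
        p.join()

        if p.exitcode != 0:
            # Without this call, interpreter shutdown would wait for the queue's
            # feeder thread to flush the remaining items into a pipe nobody reads.
            q.cancel_join_thread()
        print("parent exits cleanly")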
@@ -745,7 +815,7 @@ def lst(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     urllist = []

@@ -878,7 +948,7 @@ def put(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     urls = list(starmap(_make_url, _files()))

@@ -1025,7 +1095,7 @@ def get(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     # Construct a list of URL (prefix) objects

@@ -1172,7 +1242,7 @@ def info(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     # Construct a list of URL (prefix) objects
metaflow/plugins/uv/__init__.py — file without changes
metaflow/plugins/uv/bootstrap.py ADDED

@@ -0,0 +1,100 @@
+import os
+import subprocess
+import sys
+import time
+
+from metaflow.util import which
+from metaflow.metaflow_config import get_pinned_conda_libs
+from urllib.request import Request, urlopen
+from urllib.error import URLError
+
+# TODO: support version/platform/architecture selection.
+UV_URL = "https://github.com/astral-sh/uv/releases/download/0.6.11/uv-x86_64-unknown-linux-gnu.tar.gz"
+
+if __name__ == "__main__":
+
+    def run_cmd(cmd, stdin_str=None):
+        result = subprocess.run(
+            cmd,
+            shell=True,
+            input=stdin_str,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode != 0:
+            print(f"Bootstrap failed while executing: {cmd}")
+            print("Stdout:", result.stdout)
+            print("Stderr:", result.stderr)
+            sys.exit(1)
+
+    def install_uv():
+        import tarfile
+
+        uv_install_path = os.path.join(os.getcwd(), "uv_install")
+        if which("uv"):
+            return
+
+        print("Installing uv...")
+
+        # Prepare directory once
+        os.makedirs(uv_install_path, exist_ok=True)
+
+        # Download and decompress in one go
+        headers = {
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "User-Agent": "python-urllib",
+        }
+
+        def _tar_filter(member: tarfile.TarInfo, path):
+            if os.path.basename(member.name) != "uv":
+                return None  # skip
+            member.path = os.path.basename(member.path)
+            return member
+
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                req = Request(UV_URL, headers=headers)
+                with urlopen(req) as response:
+                    with tarfile.open(fileobj=response, mode="r:gz") as tar:
+                        tar.extractall(uv_install_path, filter=_tar_filter)
+                break
+            except (URLError, IOError) as e:
+                if attempt == max_retries - 1:
+                    raise Exception(
+                        f"Failed to download UV after {max_retries} attempts: {e}"
+                    )
+                time.sleep(2**attempt)
+
+        # Update PATH only once at the end
+        os.environ["PATH"] += os.pathsep + uv_install_path
+
+    def get_dependencies(datastore_type):
+        # return required dependencies for Metaflow that must be added to the UV environment.
+        pinned = get_pinned_conda_libs(None, datastore_type)
+
+        # return only dependency names instead of pinned versions
+        return pinned.keys()
+
+    def sync_uv_project(datastore_type):
+        print("Syncing uv project...")
+        dependencies = " ".join(get_dependencies(datastore_type))
+        cmd = f"""set -e;
+            uv sync --frozen --no-install-package metaflow;
+            uv pip install {dependencies} --strict
+        """
+        run_cmd(cmd)
+
+    if len(sys.argv) != 2:
+        print("Usage: bootstrap.py <datastore_type>")
+        sys.exit(1)
+
+    try:
+        datastore_type = sys.argv[1]
+        install_uv()
+        sync_uv_project(datastore_type)
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
metaflow/plugins/uv/uv_environment.py ADDED

@@ -0,0 +1,70 @@
+import os
+
+from metaflow.exception import MetaflowException
+from metaflow.metaflow_environment import MetaflowEnvironment
+
+
+class UVException(MetaflowException):
+    headline = "uv error"
+
+
+class UVEnvironment(MetaflowEnvironment):
+    TYPE = "uv"
+
+    def __init__(self, flow):
+        self.flow = flow
+
+    def validate_environment(self, logger, datastore_type):
+        self.datastore_type = datastore_type
+        self.logger = logger
+
+    def init_environment(self, echo, only_steps=None):
+        self.logger("Bootstrapping uv...")
+
+    def executable(self, step_name, default=None):
+        return "uv run python"
+
+    def add_to_package(self):
+        # NOTE: We treat uv.lock and pyproject.toml as regular project assets and ship these along user code as part of the code package
+        # These are the minimal required files to reproduce the UV environment on the remote platform.
+        def _find(filename):
+            current_dir = os.getcwd()
+            while True:
+                file_path = os.path.join(current_dir, filename)
+                if os.path.isfile(file_path):
+                    return file_path
+                parent_dir = os.path.dirname(current_dir)
+                if parent_dir == current_dir:  # Reached root
+                    raise UVException(
+                        f"Could not find {filename} in current directory or any parent directory"
+                    )
+                current_dir = parent_dir
+
+        pyproject_path = _find("pyproject.toml")
+        uv_lock_path = _find("uv.lock")
+        files = [
+            (uv_lock_path, "uv.lock"),
+            (pyproject_path, "pyproject.toml"),
+        ]
+        return files
+
+    def pylint_config(self):
+        config = super().pylint_config()
+        # Disable (import-error) in pylint
+        config.append("--disable=F0401")
+        return config
+
+    def bootstrap_commands(self, step_name, datastore_type):
+        return [
+            "echo 'Bootstrapping uv project...'",
+            "flush_mflogs",
+            # We have to prevent the tracing module from loading, as the bootstrapping process
+            # uses the internal S3 client which would fail to import tracing due to the required
+            # dependencies being bundled into the conda environment, which is yet to be
+            # initialized at this point.
+            'DISABLE_TRACING=True python -m metaflow.plugins.uv.bootstrap "%s"'
+            % datastore_type,
+            "echo 'uv project bootstrapped.'",
+            "flush_mflogs",
+            "export PATH=$PATH:$(pwd)/uv_install",
+        ]
metaflow/version.py CHANGED

@@ -1 +1 @@
-metaflow_version = "2.15.6"
+metaflow_version = "2.15.8"
{metaflow-2.15.6.data → metaflow-2.15.8.data}/data/share/metaflow/devtools/Makefile CHANGED

@@ -75,7 +75,7 @@ check-docker:
        @if [ "$(shell uname)" = "Darwin" ]; then \
                open -a Docker || (echo "❌ Please start Docker Desktop" && exit 1); \
        else \
+               docker info >/dev/null 2>&1 || (echo "❌ Docker daemon is not running." && exit 1); \
        fi
        @echo "✅ Docker is running"

@@ -260,6 +260,7 @@ shell: setup-tilt
                env METAFLOW_HOME="$(DEVTOOLS_DIR)" \
                        METAFLOW_PROFILE=local \
                        AWS_CONFIG_FILE="$(DEVTOOLS_DIR)/aws_config" \
+                       AWS_SHARED_CREDENTIALS_FILE= \
                        "$$user_shell" -i; \
        else \
                env METAFLOW_HOME="$(DEVTOOLS_DIR)" \

@@ -301,6 +302,7 @@ create-dev-shell: setup-tilt
        echo " env METAFLOW_HOME=\"$(DEVTOOLS_DIR)\" \\" >> $$SHELL_PATH && \
        echo " METAFLOW_PROFILE=local \\" >> $$SHELL_PATH && \
        echo " AWS_CONFIG_FILE=\"$(DEVTOOLS_DIR)/aws_config\" \\" >> $$SHELL_PATH && \
+       echo " AWS_SHARED_CREDENTIALS_FILE= \\" >> $$SHELL_PATH && \
        echo " \"\$$user_shell\" -i" >> $$SHELL_PATH && \
        echo "else" >> $$SHELL_PATH && \
        echo " env METAFLOW_HOME=\"$(DEVTOOLS_DIR)\" \\" >> $$SHELL_PATH && \

@@ -339,4 +341,4 @@ ui: setup-tilt

 .PHONY: install-helm setup-minikube setup-tilt teardown-minikube tunnel up down check-docker install-curl install-gum install-brew up down dashboard shell ui all-up help

-.DEFAULT_GOAL := help
+.DEFAULT_GOAL := help
{metaflow-2.15.6.dist-info → metaflow-2.15.8.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: metaflow
-Version: 2.15.6
+Version: 2.15.8
 Summary: Metaflow: More AI and ML, Less Engineering
 Author: Metaflow Developers
 Author-email: help@metaflow.org

@@ -26,13 +26,14 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: boto3
 Provides-Extra: stubs
-Requires-Dist: metaflow-stubs==2.15.6; extra == "stubs"
+Requires-Dist: metaflow-stubs==2.15.8; extra == "stubs"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: project-url
 Dynamic: provides-extra
 Dynamic: requires-dist