metaflow 2.18.13__py2.py3-none-any.whl → 2.19.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +1 -0
- metaflow/cli.py +78 -13
- metaflow/cli_components/run_cmds.py +182 -39
- metaflow/cli_components/step_cmd.py +160 -4
- metaflow/client/__init__.py +1 -0
- metaflow/client/core.py +162 -99
- metaflow/client/filecache.py +59 -32
- metaflow/cmd/code/__init__.py +2 -1
- metaflow/datastore/__init__.py +1 -0
- metaflow/datastore/content_addressed_store.py +40 -9
- metaflow/datastore/datastore_set.py +10 -1
- metaflow/datastore/flow_datastore.py +123 -4
- metaflow/datastore/spin_datastore.py +91 -0
- metaflow/datastore/task_datastore.py +86 -2
- metaflow/decorators.py +75 -6
- metaflow/extension_support/__init__.py +372 -305
- metaflow/flowspec.py +3 -2
- metaflow/metaflow_config.py +41 -0
- metaflow/metaflow_profile.py +18 -0
- metaflow/packaging_sys/utils.py +2 -39
- metaflow/packaging_sys/v1.py +63 -16
- metaflow/plugins/__init__.py +2 -0
- metaflow/plugins/cards/card_datastore.py +9 -3
- metaflow/plugins/cards/card_decorator.py +1 -0
- metaflow/plugins/cards/card_modules/basic.py +9 -3
- metaflow/plugins/datastores/local_storage.py +12 -6
- metaflow/plugins/datastores/spin_storage.py +12 -0
- metaflow/plugins/datatools/s3/s3.py +29 -10
- metaflow/plugins/datatools/s3/s3op.py +90 -62
- metaflow/plugins/metadata_providers/local.py +76 -82
- metaflow/plugins/metadata_providers/spin.py +16 -0
- metaflow/runner/metaflow_runner.py +210 -19
- metaflow/runtime.py +348 -21
- metaflow/task.py +61 -12
- metaflow/user_configs/config_parameters.py +2 -4
- metaflow/user_decorators/mutable_flow.py +1 -1
- metaflow/user_decorators/user_step_decorator.py +10 -1
- metaflow/util.py +191 -1
- metaflow/version.py +1 -1
- {metaflow-2.18.13.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Makefile +10 -0
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/METADATA +2 -4
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/RECORD +48 -45
- {metaflow-2.18.13.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/Tiltfile +0 -0
- {metaflow-2.18.13.data → metaflow-2.19.0.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/WHEEL +0 -0
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/entry_points.txt +0 -0
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/licenses/LICENSE +0 -0
- {metaflow-2.18.13.dist-info → metaflow-2.19.0.dist-info}/top_level.txt +0 -0
metaflow/client/core.py
CHANGED
@@ -207,6 +207,20 @@ def default_namespace() -> str:
     return get_namespace()
 
 
+def inspect_spin(datastore_root: str = "."):
+    """
+    Set metadata provider to spin metadata so that users can inspect spin
+    steps, tasks, and artifacts.
+
+    Parameters
+    ----------
+    datastore_root : str, default "."
+        The root path to the spin datastore.
+    """
+    metadata_str = f"spin@{datastore_root}"
+    metadata(metadata_str)
+
+
 MetaflowArtifacts = NamedTuple
 
 
@@ -277,6 +291,7 @@ class MetaflowObject(object):
         self._attempt = attempt
         self._current_namespace = _current_namespace or get_namespace()
         self._namespace_check = _namespace_check
+
         # If the current namespace is False, we disable checking for namespace for this
         # and all children objects. Not setting namespace_check to False has the consequence
         # of preventing access to children objects after the namespace changes
@@ -1189,149 +1204,197 @@ class Task(MetaflowObject):
     _PARENT_CLASS = "step"
     _CHILD_CLASS = "artifact"
 
-    def __init__(self, *args, **kwargs):
-        super(Task, self).__init__(*args, **kwargs)
 
     def _iter_filter(self, x):
         # exclude private data artifacts
         return x.id[0] != "_"
 
-    def
+    def _get_matching_pathspecs(self, steps, metadata_key, metadata_pattern):
         """
-        Yield tasks from specified steps
+        Yield pathspecs of tasks from specified steps that match a given metadata pattern.
 
         Parameters
         ----------
         steps : List[str]
-            List of
+            List of Step objects to search for tasks.
+        metadata_key : str
+            Metadata key to filter tasks on (e.g., 'foreach-execution-path').
+        metadata_pattern : str
+            Regular expression pattern to match against the metadata value.
 
+        Yields
+        ------
+        str
+            Pathspec of each task whose metadata value for the specified key matches the pattern.
         """
         flow_id, run_id, _, _ = self.path_components
         for step in steps:
             task_pathspecs = self._metaflow.metadata.filter_tasks_by_metadata(
-                flow_id, run_id, step
+                flow_id, run_id, step, metadata_key, metadata_pattern
             )
             for task_pathspec in task_pathspecs:
-                yield
+                yield task_pathspec
+
+    @staticmethod
+    def _get_previous_steps(graph_info, step_name):
+        # Get the parent steps
+        steps = []
+        for node_name, attributes in graph_info["steps"].items():
+            if step_name in attributes["next"]:
+                steps.append(node_name)
+        return steps
 
     @property
-    def
+    def parent_task_pathspecs(self) -> Iterator[str]:
         """
-        Yields all parent tasks of the current task
+        Yields pathspecs of all parent tasks of the current task.
 
         Yields
         ------
+        str
+            Pathspec of the parent task of the current task
         """
+        _, _, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
 
-        current_path =
+        # Get the parent steps
+        steps = self._get_previous_steps(graph_info, step_name)
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static join - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Parent task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            if current_depth < target_depth:
-                # Foreach join
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
-                # Pattern: "A:10,B:13,.*"
-                pattern = f"{current_path},.*"
-            else:
-                # Foreach split or linear step
-                # Option 1:
-                # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
-                # Option 2:
-                # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
-                # Pattern: "A:10,B:13"
-                pattern = ",".join(current_path.split(",")[:target_depth])
+            if not steps:
+                return  # No parent steps, yield nothing
 
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
+            else:
+                current_depth = len(current_path.split(","))
+                if node_type == "join":
+                    # Foreach join
+                    # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach split or linear step
+                    # Pattern: "A:10,B:13"
+                    parent_step_type = graph_info["steps"][steps[0]]["type"]
+                    target_depth = current_depth
+                    if (
+                        parent_step_type == "split-foreach"
+                        or parent_step_type == "split-parallel"
+                    ) and current_depth == 1:
+                        # (Current task, "A:10") and (Parent task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Parent task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Parent task, "A:10,B:13")
+                        if (
+                            parent_step_type == "split-foreach"
+                            or parent_step_type == "split-parallel"
+                        ):
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
 
     @property
-    def
+    def child_task_pathspecs(self) -> Iterator[str]:
         """
+        Yields pathspecs of all child tasks of the current task.
 
         Yields
         ------
+        str
+            Pathspec of the child task of the current task
         """
-        flow_id, run_id,
+        flow_id, run_id, step_name, _ = self.path_components
+        metadata_dict = self.metadata_dict
+        graph_info = self["_graph_info"].data
+
+        # Get the child steps
+        steps = graph_info["steps"][step_name]["next"]
 
+        node_type = graph_info["steps"][step_name]["type"]
+        metadata_key = "foreach-execution-path"
+        current_path = metadata_dict.get(metadata_key)
 
         if len(steps) > 1:
             # Static split - use exact path matching
             pattern = current_path or ".*"
-            yield from self._iter_matching_tasks(
-                steps, "foreach-execution-path", pattern
-            )
-            return
-
-        # Handle single step case
-        target_task = Step(
-            f"{flow_id}/{run_id}/{steps[0].id}", _namespace_check=False
-        ).task
-        target_path = target_task.metadata_dict.get("foreach-execution-path")
-
-        if not target_path or not current_path:
-            # (Current task, "A:10") and (Child task, "")
-            # Pattern: ".*"
-            pattern = ".*"
         else:
-            if
-                #
-                #
-                pattern = f"{current_path},.*"
+            if not steps:
+                return  # No child steps, yield nothing
+
+            if not current_path:
+                # Current task is not part of a foreach
+                # Pattern: ".*"
+                pattern = ".*"
             else:
+                current_depth = len(current_path.split(","))
+                if node_type == "split-foreach" or node_type == "split-parallel":
+                    # Foreach split
+                    # (Current task, "A:10,B:13") and (Child task, "A:10,B:13,C:21")
+                    # Pattern: "A:10,B:13,.*"
+                    pattern = f"{current_path},.*"
+                else:
+                    # Foreach join or linear step
+                    # Pattern: "A:10,B:13"
+                    child_step_type = graph_info["steps"][steps[0]]["type"]
+
+                    # We need to know if the child step is a foreach join or a static join
+                    child_step_prev_steps = self._get_previous_steps(
+                        graph_info, steps[0]
+                    )
+                    if len(child_step_prev_steps) > 1:
+                        child_step_type = "static-join"
+                    target_depth = current_depth
+                    if child_step_type == "join" and current_depth == 1:
+                        # (Current task, "A:10") and (Child task, "")
+                        pattern = ".*"
+                    else:
+                        # (Current task, "A:10,B:13,C:21") and (Child task, "A:10,B:13")
+                        # (Current task, "A:10,B:13") and (Child task, "A:10,B:13")
+                        if child_step_type == "join":
+                            target_depth = current_depth - 1
+                        pattern = ",".join(current_path.split(",")[:target_depth])
+
+        for pathspec in self._get_matching_pathspecs(steps, metadata_key, pattern):
+            yield pathspec
+
+    @property
+    def parent_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all parent tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Parent task of the current task
+        """
+        parent_task_pathspecs = self.parent_task_pathspecs
+        for pathspec in parent_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
+
+    @property
+    def child_tasks(self) -> Iterator["Task"]:
+        """
+        Yields all child tasks of the current task if one exists.
+
+        Yields
+        ------
+        Task
+            Child task of the current task
+        """
+        for pathspec in self.child_task_pathspecs:
+            yield Task(pathspec=pathspec, _namespace_check=False)
 
     @property
     def metadata(self) -> List[Metadata]:
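The new `parent_tasks` / `child_tasks` properties and their `*_pathspecs` counterparts are plain client-side APIs, so they can be exercised from an interactive session. A minimal sketch of how they might be used, assuming a flow named `LineageFlow` with run `123` exists in your metadata provider; the pathspec below is hypothetical:

```python
from metaflow import Task, namespace

# Disable namespace checks for ad-hoc inspection of someone else's run.
namespace(None)

# Hypothetical pathspec of the form "FlowName/run_id/step_name/task_id".
task = Task("LineageFlow/123/join_step/456")

# The new properties yield Task objects for graph-adjacent tasks...
for parent in task.parent_tasks:
    print("parent:", parent.pathspec)
for child in task.child_tasks:
    print("child:", child.pathspec)

# ...while the *_pathspecs variants yield the raw pathspec strings that the
# metadata provider matched on the "foreach-execution-path" metadata key.
print(list(task.parent_task_pathspecs))
```

The `inspect_spin` helper added in the same file follows the existing `metadata()` convention: it points the metadata provider at `spin@<datastore_root>` so spun steps can be inspected with the same client objects.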
metaflow/client/filecache.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import print_function
 from collections import OrderedDict
+import json
 import os
 import sys
 import time
@@ -10,13 +11,14 @@ from urllib.parse import urlparse
 
 from metaflow.datastore import FlowDataStore
 from metaflow.datastore.content_addressed_store import BlobCache
+from metaflow.datastore.flow_datastore import MetadataCache
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import (
     CLIENT_CACHE_PATH,
     CLIENT_CACHE_MAX_SIZE,
     CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT,
-    CLIENT_CACHE_MAX_TASKDATASTORE_COUNT,
 )
+from metaflow.metaflow_profile import from_start
 
 from metaflow.plugins import DATASTORES
 
@@ -63,8 +65,8 @@ class FileCache(object):
     # when querying for sizes of artifacts. Once we have queried for the size
     # of one artifact in a TaskDatastore, caching this means that any
     # queries on that same TaskDatastore will be quick (since we already
-    # have all the metadata)
+    # have all the metadata). We keep track of this in a file so it persists
+    # across processes.
 
     @property
     def cache_dir(self):
@@ -87,7 +89,7 @@ class FileCache(object):
     ):
         ds_cls = self._get_datastore_storage_impl(ds_type)
         ds_root = ds_cls.path_join(*ds_cls.path_split(location)[:-5])
-        cache_id = self.
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
 
         token = (
             "%s.cached"
@@ -311,13 +313,13 @@ class FileCache(object):
         self._objects = sorted(objects, reverse=False)
 
     @staticmethod
-    def
+    def flow_ds_id(ds_type, ds_root, flow_name):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join([ds_type, sanitized_root, flow_name])
 
     @staticmethod
-    def
+    def task_ds_id(ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt):
         p = urlparse(ds_root)
         sanitized_root = (p.netloc + p.path).replace("/", "_")
         return ".".join(
@@ -365,7 +367,7 @@ class FileCache(object):
         return storage_impl[0]
 
     def _get_flow_datastore(self, ds_type, ds_root, flow_name):
-        cache_id = self.
+        cache_id = self.flow_ds_id(ds_type, ds_root, flow_name)
         cached_flow_datastore = self._store_caches.get(cache_id)
 
         if cached_flow_datastore:
@@ -380,9 +382,14 @@ class FileCache(object):
                 ds_root=ds_root,
             )
             blob_cache = self._blob_caches.setdefault(
-                cache_id,
+                cache_id,
+                (
+                    FileBlobCache(self, cache_id),
+                    TaskMetadataCache(self, ds_type, ds_root, flow_name),
+                ),
             )
-            cached_flow_datastore.ca_store.set_blob_cache(blob_cache)
+            cached_flow_datastore.ca_store.set_blob_cache(blob_cache[0])
+            cached_flow_datastore.set_metadata_cache(blob_cache[1])
             self._store_caches[cache_id] = cached_flow_datastore
             if len(self._store_caches) > CLIENT_CACHE_MAX_FLOWDATASTORE_COUNT:
                 cache_id_to_remove, _ = self._store_caches.popitem(last=False)
@@ -393,32 +400,52 @@ class FileCache(object):
         self, ds_type, ds_root, flow_name, run_id, step_name, task_id, attempt
     ):
         flow_ds = self._get_flow_datastore(ds_type, ds_root, flow_name)
+
+        return flow_ds.get_task_datastore(run_id, step_name, task_id, attempt=attempt)
+
+
+class TaskMetadataCache(MetadataCache):
+    def __init__(self, filecache, ds_type, ds_root, flow_name):
+        self._filecache = filecache
+        self._ds_type = ds_type
+        self._ds_root = ds_root
+        self._flow_name = flow_name
+
+    def _path(self, run_id, step_name, task_id, attempt):
+        if attempt is None:
+            raise MetaflowException(
+                "Attempt number must be specified to use task metadata cache. Raise an issue "
+                "on Metaflow GitHub if you see this message.",
             )
-                data_metadata=cached_metadata,
-            )
-        # If we are here, we either have attempt=None or nothing in the cache
-        task_ds = flow_ds.get_task_datastore(
-            run_id, step_name, task_id, attempt=attempt
+        cache_id = self._filecache.task_ds_id(
+            self._ds_type,
+            self._ds_root,
+            self._flow_name,
+            run_id,
+            step_name,
+            task_id,
+            attempt,
         )
+        token = (
+            "%s.cached"
+            % sha1(
+                os.path.join(
+                    run_id, step_name, task_id, str(attempt), "metadata"
+                ).encode("utf-8")
+            ).hexdigest()
+        )
+        return os.path.join(self._filecache.cache_dir, cache_id, token[:2], token)
+
+    def load_metadata(self, run_id, step_name, task_id, attempt):
+        d = self._filecache.read_file(self._path(run_id, step_name, task_id, attempt))
+        if d:
+            return json.loads(d)
+
+    def store_metadata(self, run_id, step_name, task_id, attempt, metadata_dict):
+        self._filecache.create_file(
+            self._path(run_id, step_name, task_id, attempt),
+            json.dumps(metadata_dict).encode("utf-8"),
         )
-        self._task_metadata_caches[cache_id] = task_ds.ds_metadata
-        if len(self._task_metadata_caches) > CLIENT_CACHE_MAX_TASKDATASTORE_COUNT:
-            self._task_metadata_caches.popitem(last=False)
-        return task_ds
 
 
 class FileBlobCache(BlobCache):
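`TaskMetadataCache` persists a task's artifact metadata as JSON under a sha1-derived path inside the client cache directory, so repeated size queries survive across processes. A simplified, self-contained sketch of that on-disk layout; the `cache_dir` argument and helper names here are illustrative, not Metaflow APIs:

```python
import json
import os
from hashlib import sha1


def metadata_cache_path(cache_dir, cache_id, run_id, step_name, task_id, attempt):
    # Mirrors the _path() logic above: hash the task coordinates into a token
    # and shard files by the first two hex characters of the token.
    token = "%s.cached" % sha1(
        os.path.join(run_id, step_name, task_id, str(attempt), "metadata").encode("utf-8")
    ).hexdigest()
    return os.path.join(cache_dir, cache_id, token[:2], token)


def store_metadata(path, metadata_dict):
    # Write the metadata dictionary as UTF-8 encoded JSON.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(json.dumps(metadata_dict).encode("utf-8"))


def load_metadata(path):
    # Return the cached dictionary, or None on a cache miss.
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        return json.loads(f.read())
```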
metaflow/cmd/code/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from tempfile import TemporaryDirectory
 from typing import Any, Callable, List, Mapping, Optional, cast
 
 from metaflow import Run
+from metaflow.util import walk_without_cycles
 from metaflow._vendor import click
 from metaflow.cli import echo_always
 
@@ -51,7 +52,7 @@ def perform_diff(
         target_dir = os.getcwd()
 
     diffs = []
-    for dirpath,
+    for dirpath, _, filenames in walk_without_cycles(source_dir):
        for fname in filenames:
            # NOTE: the paths below need to be set up carefully
            # for the `patch` command to work. Better not to touch
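`walk_without_cycles` (added to `metaflow/util.py`; its body is not shown in this diff) replaces a plain `os.walk` here so that symlink loops in the package directory cannot cause an endless traversal. A minimal sketch of one way such a helper can work, assuming it only needs to guard against revisiting the same directory:

```python
import os


def walk_without_cycles(root):
    # Illustrative only: track the (device, inode) pair of each visited
    # directory and prune any directory seen before, so symlink cycles end.
    seen = set()
    for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
        st = os.stat(dirpath)
        key = (st.st_dev, st.st_ino)
        if key in seen:
            dirnames[:] = []  # do not descend into the repeated subtree
            continue
        seen.add(key)
        yield dirpath, dirnames, filenames
```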
metaflow/datastore/content_addressed_store.py
CHANGED
@@ -38,7 +38,7 @@ class ContentAddressedStore(object):
     def set_blob_cache(self, blob_cache):
         self._blob_cache = blob_cache
 
-    def save_blobs(self, blob_iter, raw=False, len_hint=0):
+    def save_blobs(self, blob_iter, raw=False, len_hint=0, is_transfer=False):
         """
         Saves blobs of data to the datastore
 
@@ -60,11 +60,16 @@ class ContentAddressedStore(object):
 
         Parameters
         ----------
-        blob_iter : Iterator
+        blob_iter : Iterator
+            Iterator over bytes objects to save
+        raw : bool, default False
             Whether to save the bytes directly or process them, by default False
-        len_hint :
+        len_hint : int, default 0
+            Hint of the number of blobs that will be produced by the
             iterator, by default 0
+        is_transfer : bool, default False
+            If True, this indicates we are saving blobs directly from the output of another
+            content addressed store's
 
         Returns
         -------
@@ -76,6 +81,20 @@ class ContentAddressedStore(object):
 
         def packing_iter():
             for blob in blob_iter:
+                if is_transfer:
+                    key, blob_data, meta = blob
+                    path = self._storage_impl.path_join(self._prefix, key[:2], key)
+                    # Transfer data is always raw/decompressed, so mark it as such
+                    meta_corrected = {"cas_raw": True, "cas_version": 1}
+
+                    results.append(
+                        self.save_blobs_result(
+                            uri=self._storage_impl.full_uri(path),
+                            key=key,
+                        )
+                    )
+                    yield path, (BytesIO(blob_data), meta_corrected)
+                    continue
                 sha = sha1(blob).hexdigest()
                 path = self._storage_impl.path_join(self._prefix, sha[:2], sha)
                 results.append(
@@ -100,7 +119,7 @@ class ContentAddressedStore(object):
         self._storage_impl.save_bytes(packing_iter(), overwrite=True, len_hint=len_hint)
         return results
 
-    def load_blobs(self, keys, force_raw=False):
+    def load_blobs(self, keys, force_raw=False, is_transfer=False):
         """
         Mirror function of save_blobs
 
@@ -111,15 +130,20 @@ class ContentAddressedStore(object):
         ----------
         keys : List of string
             Key describing the object to load
-        force_raw : bool,
+        force_raw : bool, default False
             Support for backward compatibility with previous datastores. If
             True, this will force the key to be loaded as is (raw). By default,
             False
+        is_transfer : bool, default False
+            If True, this indicates we are loading blobs to transfer them directly
+            to another datastore. We will, in this case, also transfer the metadata
+            and do minimal processing. This is for internal use only.
 
         Returns
         -------
         Returns an iterator of (string, bytes) tuples; the iterator may return keys
-        in a different order than were passed in.
+        in a different order than were passed in. If is_transfer is True, the tuple
+        has three elements with the third one being the metadata.
         """
         load_paths = []
         for key in keys:
@@ -127,7 +151,11 @@ class ContentAddressedStore(object):
             if self._blob_cache:
                 blob = self._blob_cache.load_key(key)
                 if blob is not None:
+                    if is_transfer:
+                        # Cached blobs are decompressed/processed bytes regardless of original format
+                        yield key, blob, {"cas_raw": False, "cas_version": 1}
+                    else:
+                        yield key, blob
                 else:
                     path = self._storage_impl.path_join(self._prefix, key[:2], key)
                     load_paths.append((key, path))
@@ -169,7 +197,10 @@ class ContentAddressedStore(object):
             if self._blob_cache:
                 self._blob_cache.store_key(key, blob)
 
+            if is_transfer:
+                yield key, blob, meta  # Preserve exact original metadata from storage
+            else:
+                yield key, blob
 
     def _unpack_backward_compatible(self, blob):
         # This is the backward compatible unpack

metaflow/datastore/datastore_set.py
CHANGED

@@ -21,9 +21,18 @@ class TaskDataStoreSet(object):
         pathspecs=None,
         prefetch_data_artifacts=None,
         allow_not_done=False,
+        join_type=None,
+        orig_flow_datastore=None,
+        spin_artifacts=None,
     ):
         self.task_datastores = flow_datastore.get_task_datastores(
-            run_id,
+            run_id,
+            steps=steps,
+            pathspecs=pathspecs,
+            allow_not_done=allow_not_done,
+            join_type=join_type,
+            orig_flow_datastore=orig_flow_datastore,
+            spin_artifacts=spin_artifacts,
        )
 
        if prefetch_data_artifacts:
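The `is_transfer` flag on `load_blobs` / `save_blobs` changes the shape of the items flowing between two content-addressed stores: loads yield `(key, bytes, metadata)` triples and saves accept those triples verbatim, reusing the existing keys instead of re-hashing. A sketch of the round trip this enables, assuming `src_store` and `dst_store` are `ContentAddressedStore` instances and `keys` already exist in the source store; the `transfer_blobs` helper itself is illustrative:

```python
def transfer_blobs(src_store, dst_store, keys):
    # With is_transfer=True, load_blobs yields (key, blob_bytes, metadata)
    # triples instead of the usual (key, blob_bytes) pairs.
    triples = src_store.load_blobs(keys, is_transfer=True)

    # save_blobs consumes those triples directly, writing each blob under the
    # key it already had in the source store and skipping re-hashing.
    return dst_store.save_blobs(triples, is_transfer=True, len_hint=len(keys))
```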