metaflow 2.15.21__py2.py3-none-any.whl → 2.16.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +7 -1
- metaflow/cli.py +16 -1
- metaflow/cli_components/init_cmd.py +1 -0
- metaflow/cli_components/run_cmds.py +6 -2
- metaflow/client/core.py +22 -30
- metaflow/datastore/task_datastore.py +0 -1
- metaflow/debug.py +5 -0
- metaflow/decorators.py +230 -70
- metaflow/extension_support/__init__.py +15 -8
- metaflow/extension_support/_empty_file.py +2 -2
- metaflow/flowspec.py +80 -53
- metaflow/graph.py +24 -2
- metaflow/meta_files.py +13 -0
- metaflow/metadata_provider/metadata.py +7 -1
- metaflow/metaflow_config.py +5 -0
- metaflow/metaflow_environment.py +82 -25
- metaflow/metaflow_version.py +1 -1
- metaflow/package/__init__.py +664 -0
- metaflow/packaging_sys/__init__.py +870 -0
- metaflow/packaging_sys/backend.py +113 -0
- metaflow/packaging_sys/distribution_support.py +153 -0
- metaflow/packaging_sys/tar_backend.py +86 -0
- metaflow/packaging_sys/utils.py +91 -0
- metaflow/packaging_sys/v1.py +476 -0
- metaflow/plugins/airflow/airflow.py +5 -1
- metaflow/plugins/airflow/airflow_cli.py +15 -4
- metaflow/plugins/argo/argo_workflows.py +15 -4
- metaflow/plugins/argo/argo_workflows_cli.py +16 -4
- metaflow/plugins/aws/batch/batch.py +22 -3
- metaflow/plugins/aws/batch/batch_cli.py +3 -0
- metaflow/plugins/aws/batch/batch_decorator.py +13 -5
- metaflow/plugins/aws/step_functions/step_functions.py +4 -1
- metaflow/plugins/aws/step_functions/step_functions_cli.py +15 -4
- metaflow/plugins/cards/card_decorator.py +0 -5
- metaflow/plugins/kubernetes/kubernetes.py +8 -1
- metaflow/plugins/kubernetes/kubernetes_cli.py +3 -0
- metaflow/plugins/kubernetes/kubernetes_decorator.py +13 -5
- metaflow/plugins/package_cli.py +25 -23
- metaflow/plugins/parallel_decorator.py +4 -2
- metaflow/plugins/pypi/bootstrap.py +8 -2
- metaflow/plugins/pypi/conda_decorator.py +39 -82
- metaflow/plugins/pypi/conda_environment.py +6 -2
- metaflow/plugins/pypi/pypi_decorator.py +4 -4
- metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
- metaflow/plugins/timeout_decorator.py +0 -1
- metaflow/plugins/uv/bootstrap.py +11 -0
- metaflow/plugins/uv/uv_environment.py +4 -2
- metaflow/pylint_wrapper.py +5 -1
- metaflow/runner/click_api.py +5 -4
- metaflow/runner/subprocess_manager.py +14 -2
- metaflow/runtime.py +37 -11
- metaflow/task.py +91 -7
- metaflow/user_configs/config_options.py +13 -8
- metaflow/user_configs/config_parameters.py +0 -4
- metaflow/user_decorators/__init__.py +0 -0
- metaflow/user_decorators/common.py +144 -0
- metaflow/user_decorators/mutable_flow.py +499 -0
- metaflow/user_decorators/mutable_step.py +424 -0
- metaflow/user_decorators/user_flow_decorator.py +263 -0
- metaflow/user_decorators/user_step_decorator.py +712 -0
- metaflow/util.py +4 -1
- metaflow/version.py +1 -1
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/METADATA +2 -2
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/RECORD +71 -60
- metaflow/info_file.py +0 -25
- metaflow/package.py +0 -203
- metaflow/user_configs/config_decorators.py +0 -568
- {metaflow-2.15.21.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/Makefile +0 -0
- {metaflow-2.15.21.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/Tiltfile +0 -0
- {metaflow-2.15.21.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/WHEEL +0 -0
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/entry_points.txt +0 -0
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/licenses/LICENSE +0 -0
- {metaflow-2.15.21.dist-info → metaflow-2.16.0.dist-info}/top_level.txt +0 -0
metaflow/package/__init__.py (new file):

@@ -0,0 +1,664 @@

import json
import os
import sys
import threading
import time

from io import BytesIO
from types import ModuleType
from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Type, cast

from ..debug import debug
from ..packaging_sys import ContentType, MetaflowCodeContent
from ..packaging_sys.backend import PackagingBackend
from ..packaging_sys.tar_backend import TarPackagingBackend
from ..packaging_sys.v1 import MetaflowCodeContentV1
from ..packaging_sys.utils import suffix_filter, walk
from ..metaflow_config import DEFAULT_PACKAGE_SUFFIXES
from ..exception import MetaflowException
from ..user_configs.config_parameters import dump_config_values
from ..util import get_metaflow_root
from .. import R

DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")


if TYPE_CHECKING:
    import metaflow.datastore


class NonUniqueFileNameToFilePathMappingException(MetaflowException):
    headline = "Non-unique file path for a file name included in code package"

    def __init__(self, filename, file_paths, lineno=None):
        msg = (
            "Filename %s included in the code package includes multiple different "
            "paths for the same name : %s.\n"
            "The `filename` in the `add_to_package` decorator hook requires a unique "
            "`file_path` to `file_name` mapping" % (filename, ", ".join(file_paths))
        )
        super().__init__(msg=msg, lineno=lineno)


class MetaflowPackage(object):
    def __init__(
        self,
        flow,
        environment,
        echo,
        suffixes: Optional[List[str]] = DEFAULT_SUFFIXES_LIST,
        user_code_filter: Optional[Callable[[str], bool]] = None,
        flow_datastore: Optional["metaflow.datastore.FlowDataStore"] = None,
        mfcontent: Optional[MetaflowCodeContent] = None,
        exclude_tl_dirs=None,
        backend: Type[PackagingBackend] = TarPackagingBackend,
    ):
        self._environment = environment
        self._environment.init_environment(echo)

        self._echo = echo
        self._flow = flow
        self._flow_datastore = flow_datastore
        self._backend = backend

        # Info about the package
        self._name = None
        self._create_time = time.time()
        self._user_flow_dir = None

        # Content of the package (and settings on how to create it)
        if suffixes is not None:
            self._suffixes = list(set().union(suffixes, DEFAULT_SUFFIXES_LIST))
        else:
            self._suffixes = None

        def _module_selector(m) -> bool:
            from ..user_decorators.user_flow_decorator import FlowMutatorMeta
            from ..user_decorators.user_step_decorator import UserStepDecoratorMeta

            if (
                m.__name__ in FlowMutatorMeta._import_modules
                or m.__name__ in UserStepDecoratorMeta._import_modules
                or hasattr(m, "METAFLOW_PACKAGE")
            ):
                return True

        if mfcontent is None:
            self._mfcontent = MetaflowCodeContentV1(criteria=_module_selector)

        else:
            self._mfcontent = mfcontent
        # We exclude the environment when packaging as this will be packaged separately.
        # This comes into play primarily if packaging from a node already running packaged
        # code.
        # These directories are only excluded at the top-level (ie: not further down
        # in sub-directories)
        # "_escape_trampolines" is a special directory where trampoline escape hatch
        # files are stored (used by Netflix Extension's Conda implementation).
        self._exclude_tl_dirs = (
            self._mfcontent.get_excluded_tl_entries()
            + ["_escape_trampolines"]
            + (exclude_tl_dirs or [])
        )

        if self._suffixes is not None and user_code_filter is not None:
            self._user_code_filter = lambda x, f1=suffix_filter(
                self._suffixes
            ), f2=user_code_filter: f1(x) and f2(x)
            self._filter_type = "suffixes and user filter"
        elif self._suffixes is not None:
            self._user_code_filter = suffix_filter(self._suffixes)
            self._filter_type = "suffixes"
        elif user_code_filter is not None:
            self._user_code_filter = user_code_filter
            self._filter_type = "user filter"
        else:
            self._user_code_filter = lambda x: True
            self._filter_type = "no filter"

        # Info about the package creation (it happens async)
        self._is_package_available = None
        self._blob_sha = None
        self._blob_url = None
        self._blob = None

        # We launch a thread to create the package asynchronously and upload
        # it opportunistically
        self._create_thread = threading.Thread(
            target=self._package_and_upload,
            daemon=True,
        )
        self._create_thread.start()
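The constructor above composes the suffix filter and any user-supplied filter with a logical AND: a file is packaged only if both predicates accept it. A minimal standalone sketch of that composition (the `suffix_filter` body here mirrors the assumed behavior of `packaging_sys.utils.suffix_filter`; the user filter is hypothetical):

```python
# Standalone sketch of the filter composition in __init__ (hypothetical inputs).
def suffix_filter(suffixes):
    # Assumed behavior of packaging_sys.utils.suffix_filter: accept a path
    # if it ends with one of the given suffixes.
    return lambda path: any(path.endswith(s) for s in suffixes)

user_filter = lambda path: "tests/" not in path  # hypothetical user filter

f1, f2 = suffix_filter([".py", ".txt"]), user_filter
combined = lambda x: f1(x) and f2(x)  # same AND composition as above

assert combined("flows/train.py")           # matches suffix, not excluded
assert not combined("tests/test_train.py")  # rejected by the user filter
```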
    # HORRIBLE HACK SO THAT CURRENT COMPUTE IMPLEMENTATIONS CAN STILL
    # DO pkg.blob. Ideally, this goes away and blob_with_timeout becomes
    # the main method (called blob).
    @property
    def blob(self) -> BytesIO:
        return self.blob_with_timeout()

    def blob_with_timeout(self, timeout: Optional[float] = None) -> BytesIO:
        if self._blob is None:
            self._create_thread.join(timeout)
            if self._is_package_available is not None:
                # We have our result now
                if self._is_package_available:
                    return self._blob
                else:
                    raise self._packaging_exception
        return self._blob
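Because packaging happens on the background thread started in the constructor, `blob_with_timeout` returns `None` when the timeout expires before the thread finishes, and re-raises the stored packaging exception on failure. A hedged polling sketch (`pkg` is assumed to be an already-constructed `MetaflowPackage`):

```python
# Hedged sketch: wait for the package blob without blocking forever.
blob = None
while blob is None:
    try:
        # None means the packaging thread was still running after 5 seconds.
        blob = pkg.blob_with_timeout(timeout=5.0)
    except Exception as e:
        print(f"packaging failed: {e}")
        break
    if blob is None:
        print("still packaging...")
```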
    def package_sha(self, timeout: Optional[float] = None) -> Optional[str]:
        if self._blob_sha is None:
            self._create_thread.join(timeout)
            if self._is_package_available is not None:
                # We have our result now
                if self._is_package_available:
                    return self._blob_sha
                else:
                    raise self._packaging_exception
        return self._blob_sha

    def package_url(self, timeout: Optional[float] = None) -> Optional[str]:
        if self._blob_url is None:
            self._create_thread.join(timeout)
            if self._is_package_available is not None:
                # We have our result now
                if self._is_package_available:
                    return self._blob_url
                else:
                    raise self._packaging_exception
        return self._blob_url

    @property
    def package_metadata(self):
        return json.dumps(
            {
                "version": 0,
                "archive_format": self._backend.backend_type(),
                "mfcontent_version": self._mfcontent.get_package_version(),
            }
        )
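For reference, the metadata produced by `package_metadata` is a small JSON document; with the default tar backend it would look roughly like the sketch below ("tgz" matches the fallback used by `get_backend` further down; the `mfcontent_version` value is illustrative):

```python
import json

# Approximate shape of package_metadata (values are illustrative).
example = json.dumps(
    {"version": 0, "archive_format": "tgz", "mfcontent_version": 1}
)
print(example)  # {"version": 0, "archive_format": "tgz", "mfcontent_version": 1}
```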
    @classmethod
    def get_backend(cls, pkg_metadata: str) -> PackagingBackend:
        """
        Method to get the backend type from the package metadata.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.

        Returns
        -------
        PackagingBackend
            The backend type that can be used to extract the package.
        """
        backend_type = json.loads(pkg_metadata).get("archive_format", "tgz")
        return PackagingBackend.get_backend(backend_type)

    @classmethod
    def get_extract_commands(
        cls, pkg_metadata: str, archive_path: str, dest_dir: str = "."
    ) -> List[str]:
        """
        Method to get the commands needed to extract the package into
        the directory dest_dir. Note that this will return a list of commands
        that can be passed to subprocess.run for example.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        archive_path : str
            The path to the archive to extract.
        dest_dir : str, default "."
            The directory to extract the package into.

        Returns
        -------
        List[str]
            The commands needed to extract the package into the directory dest_dir.
        """
        backend_type = json.loads(pkg_metadata).get("archive_format", "tgz")
        # We now ask the backend type how to extract itself
        backend = PackagingBackend.get_backend(backend_type)
        cmds = backend.get_extract_commands(archive_path, dest_dir)
        debug.package_exec(f"Command to extract {archive_path} into {dest_dir}: {cmds}")
        return cmds
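As the docstring notes, the returned list can be handed to `subprocess.run`. A hedged consumer sketch, assuming the backend returns a single argv-style command list (the paths and metadata below are hypothetical):

```python
import subprocess

from metaflow.package import MetaflowPackage

pkg_metadata = '{"version": 0, "archive_format": "tgz", "mfcontent_version": 1}'  # illustrative
cmds = MetaflowPackage.get_extract_commands(
    pkg_metadata, "/tmp/mf/code.tgz", dest_dir="/tmp/mf/code"
)
subprocess.run(cmds, check=True)  # assumes an argv-style command list
```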
    @classmethod
    def get_post_extract_env_vars(
        cls, pkg_metadata: str, dest_dir: str = "."
    ) -> Dict[str, str]:
        """
        Method to get the environment variables needed to access the content
        that has been extracted into the directory dest_dir. This will
        typically involve setting PYTHONPATH.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        dest_dir : str, default "."
            The directory where the content has been extracted to.

        Returns
        -------
        Dict[str, str]
            The post-extract environment variables that are needed to access the content
            that has been extracted into dest_dir.
        """
        mfcontent_version = json.loads(pkg_metadata).get("mfcontent_version", 0)
        env_vars = MetaflowCodeContent.get_post_extract_env_vars(
            mfcontent_version, dest_dir
        )
        debug.package_exec(
            f"Environment variables to access content extracted into {dest_dir}: {env_vars}"
        )
        return env_vars
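A typical consumer would merge these variables into the environment of the process that runs the extracted code. A hedged sketch (paths, metadata, and the launched command are hypothetical):

```python
import os
import subprocess

from metaflow.package import MetaflowPackage

pkg_metadata = '{"version": 0, "archive_format": "tgz", "mfcontent_version": 1}'  # illustrative
env = dict(os.environ)
# Typically contributes PYTHONPATH entries pointing into the extracted package.
env.update(
    MetaflowPackage.get_post_extract_env_vars(pkg_metadata, dest_dir="/tmp/mf/code")
)
subprocess.run(["python", "flow.py", "--help"], env=env)  # hypothetical command
```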
    @classmethod
    def cls_get_content(
        cls, pkg_metadata, archive: BytesIO, name: str
    ) -> Optional[bytes]:
        """
        Method to get the content of a member in the package archive.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        archive : BytesIO
            The archive to extract the member from.
        name : str
            The name of the member to extract.

        Returns
        -------
        Optional[bytes]
            The content of the member if it exists, None otherwise.
        """
        backend = cls.get_backend(pkg_metadata)
        with backend.cls_open(archive) as opened_archive:
            return backend.cls_get_member(opened_archive, name)

    @classmethod
    def cls_get_info(cls, pkg_metadata, archive: BytesIO) -> Optional[Dict[str, str]]:
        """
        Method to get the info of the package from the archive.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        archive : BytesIO
            The archive to extract the info from.

        Returns
        -------
        Optional[Dict[str, str]]
            The info of the package if it exists, None otherwise.
        """
        backend = cls.get_backend(pkg_metadata)
        with backend.cls_open(archive) as opened_archive:
            return MetaflowCodeContent.get_archive_info(opened_archive, backend)

    @classmethod
    def cls_get_config(
        cls, pkg_metadata: str, archive: BytesIO
    ) -> Optional[Dict[str, str]]:
        """
        Method to get the config of the package from the archive.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        archive : BytesIO
            The archive to extract the config from.

        Returns
        -------
        Optional[Dict[str, str]]
            The config of the package if it exists, None otherwise.
        """
        backend = cls.get_backend(pkg_metadata)
        with backend.cls_open(archive) as opened_archive:
            return MetaflowCodeContent.get_archive_config(opened_archive, backend)

    @classmethod
    def cls_extract_into(
        cls,
        pkg_metadata: str,
        archive: BytesIO,
        dest_dir: str = ".",
        content_types: int = ContentType.ALL_CONTENT.value,
    ):
        """
        Method to extract the package archive into a directory.

        Parameters
        ----------
        pkg_metadata : str
            The metadata of the package to extract.
        archive : BytesIO
            The archive to extract.
        dest_dir : str, default "."
            The directory to extract the package into.
        content_types : int, default ALL_CONTENT
            The types of content to extract. This is a bitmask of ContentType values.
        """
        backend = cls.get_backend(pkg_metadata)
        with backend.cls_open(archive) as opened_archive:
            include_names = MetaflowCodeContent.get_archive_content_names(
                opened_archive, content_types, backend
            )
            backend.extract_members(include_names, dest_dir)
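Taken together, these classmethods let a consumer inspect and unpack a downloaded code package without constructing a `MetaflowPackage`. A hedged round-trip sketch (the archive bytes would normally come from the datastore; the local file here is hypothetical, and the `seek(0)` rewinds assume each call reads the stream from the start):

```python
from io import BytesIO

from metaflow.package import MetaflowPackage

pkg_metadata = '{"version": 0, "archive_format": "tgz", "mfcontent_version": 1}'  # illustrative
with open("/tmp/mf/code.tgz", "rb") as f:  # hypothetical local copy of the package
    archive = BytesIO(f.read())

info = MetaflowPackage.cls_get_info(pkg_metadata, archive)
archive.seek(0)  # rewind between reads
config = MetaflowPackage.cls_get_config(pkg_metadata, archive)
archive.seek(0)
MetaflowPackage.cls_extract_into(pkg_metadata, archive, dest_dir="/tmp/mf/code")
```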
    def user_tuples(self, timeout: Optional[float] = None):
        # Wait for at least the blob to be formed
        _ = self.blob_with_timeout(timeout=timeout)
        for path, arcname in self._cached_user_members:
            yield path, arcname

    def path_tuples(self, timeout: Optional[float] = None):
        # Wait for at least the blob to be formed
        _ = self.blob_with_timeout(timeout=timeout)
        # Files included in the environment
        yield from self._mfcontent.content_names()

        # Files included in the user code
        yield from self.user_tuples()

    def show(self, timeout: Optional[float] = None) -> str:
        # Human-readable content of the package
        blob = self.blob_with_timeout(timeout=timeout)  # Ensure the package is created
        lines = [
            f"Package size: {self._format_size(len(blob))}",
            f"Number of files: {sum(1 for _ in self.path_tuples())}",
            self._mfcontent.show(),
        ]

        if self._flow:
            lines.append(f"\nUser code in flow {self._name}:")
            lines.append(f" - Packaged from directory {self._user_flow_dir}")
            if self._filter_type != "no filter":
                if self._suffixes:
                    lines.append(
                        f" - Filtered by suffixes: {', '.join(self._suffixes)}"
                    )
                else:
                    lines.append(f" - Filtered by {self._filter_type}")
            else:
                lines.append(" - No user code filter applied")
            if self._exclude_tl_dirs:
                lines.append(
                    f" - Excluded directories: {', '.join(self._exclude_tl_dirs)}"
                )
        return "\n".join(lines)

    def get_content(
        self, name: str, content_type: ContentType, timeout: Optional[float] = None
    ) -> Optional[bytes]:
        """
        Method to get the content of a file within the package. This method
        should be used for one-off access to small-ish files. If more files are
        needed, use extract_into to extract the package into a directory and
        then access the files from there.

        Parameters
        ----------
        name : str
            The name of the file to get the content of. Note that this
            is not necessarily the name in the archive but is the name
            that was passed in when creating the archive (in the archive,
            it may be prefixed by some directory structure).
        content_type : ContentType
            The type of file to get the content of.

        Returns
        -------
        Optional[bytes]
            The content of the file. If the file is not found, None is returned.
        """
        # Wait for at least the blob to be formed
        _ = self.blob_with_timeout(timeout=timeout)
        if content_type == ContentType.USER_CONTENT:
            for path, arcname in self.user_tuples():
                if name == arcname:
                    return open(path, "rb").read()
            return None
        elif content_type in (
            ContentType.CODE_CONTENT,
            ContentType.MODULE_CONTENT,
            ContentType.OTHER_CONTENT,
        ):
            mangled_name = self._mfcontent.get_archive_filename(name, content_type)
            for path_or_bytes, arcname in self._mfcontent.contents(content_type):
                if mangled_name == arcname:
                    if isinstance(path_or_bytes, bytes):
                        # In case this is generated content like an INFO file
                        return path_or_bytes
                    # Otherwise, it is a file path
                    return open(path_or_bytes, "rb").read()
            return None
        raise ValueError(f"Unknown content type: {content_type}")
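As its docstring says, `get_content` is meant for one-off reads of small files. A hedged sketch fetching a user file by the name it was added under (`pkg` and the file name are hypothetical):

```python
from metaflow.packaging_sys import ContentType

# `pkg` is assumed to be an already-constructed MetaflowPackage.
data = pkg.get_content("flow.py", ContentType.USER_CONTENT, timeout=60.0)
if data is not None:
    print(data.decode("utf-8")[:200])  # peek at the first 200 characters
```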
    def extract_into(
        self,
        dest_dir: str = ".",
        content_types: int = ContentType.ALL_CONTENT.value,
        timeout: Optional[float] = None,
    ):
        """
        Method to extract the package (or some of the files) into a directory.

        Parameters
        ----------
        dest_dir : str, default "."
            The directory to extract the package into.
        content_types : int, default ALL_CONTENT
            The types of content to extract.
        """
        _ = self.blob_with_timeout(timeout=timeout)  # Ensure the package is created
        member_list = []
        if content_types & ContentType.USER_CONTENT.value:
            member_list.extend(
                [(m[0], os.path.join(dest_dir, m[1])) for m in self.user_tuples()]
            )
        if content_types & (
            ContentType.CODE_CONTENT.value | ContentType.MODULE_CONTENT.value
        ):
            # We need to get the name of the files in the content archive to extract
            member_list.extend(
                [
                    (m[0], os.path.join(dest_dir, m[1]))
                    for m in self._mfcontent.content_names(
                        content_types & ~ContentType.OTHER_CONTENT.value
                    )
                ]
            )
        for orig_path, new_path in member_list:
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            # TODO: In case there are duplicate files -- that should not be the case
            # but there is a bug currently with internal Netflix code.
            if not os.path.exists(new_path):
                os.symlink(orig_path, new_path)
                # Could copy files as well if we want to split them out.
                # shutil.copy(orig_path, new_path)
        # OTHER_CONTENT requires special handling because sometimes the file isn't a file
        # but generated content
        member_list = []
        if content_types & ContentType.OTHER_CONTENT.value:
            member_list.extend(
                [
                    (m[0], os.path.join(dest_dir, m[1]))
                    for m in self._mfcontent.contents(ContentType.OTHER_CONTENT)
                ]
            )
        for path_or_content, new_path in member_list:
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            if not os.path.exists(new_path):
                if isinstance(path_or_content, bytes):
                    with open(new_path, "wb") as f:
                        f.write(path_or_content)
                else:
                    os.symlink(path_or_content, new_path)

    @staticmethod
    def _format_size(size_in_bytes):
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if size_in_bytes < 1024.0:
                return f"{size_in_bytes:.2f} {unit}"
            size_in_bytes /= 1024.0
        return f"{size_in_bytes:.2f} PB"

    def _package_and_upload(self):
        try:
            # Can be called without a flow (Function)
            if self._flow:
                for step in self._flow:
                    for deco in step.decorators:
                        deco.package_init(self._flow, step.__name__, self._environment)
                self._name = f"flow {self._flow.name}"
            else:
                self._name = "<generic code package>"

            # Add metacontent
            self._mfcontent.add_info(
                self._environment.get_environment_info(include_ext_info=True)
            )

            self._mfcontent.add_config(dump_config_values(self._flow))

            # Add user files (from decorators and environment)
            if self._flow:
                self._add_addl_files()
            self._cached_user_members = list(self._user_code_tuples())
            debug.package_exec(
                f"User files to package: {self._cached_user_members}"
            )

            self._blob = self._make()
            if self._flow_datastore:
                if len(self._blob) > 100 * 1024 * 1024:
                    self._echo(
                        f"Warning: The code package for {self._flow.name} is larger than "
                        f"100MB (found it to be {self._format_size(len(self._blob))}) "
                        "This may lead to slower upload times for remote runs and no "
                        "uploads for local runs. Consider reducing the package size. "
                        "Use `<myflow.py> package info` or `<myflow.py> package list` "
                        "to get more information about what is included in the package."
                    )
                self._blob_url, self._blob_sha = self._flow_datastore.save_data(
                    [self._blob], len_hint=1
                )[0]
            else:
                self._blob_url = self._blob_sha = ""
            self._is_package_available = True
        except Exception as e:
            self._packaging_exception = e
            self._echo(f"Package creation/upload failed for {self._flow.name}: {e}")
            self._is_package_available = False

    def _add_addl_files(self):
        # Look at all decorators that provide additional files
        deco_module_paths = {}
        addl_modules = set()

        def _check_tuple(path_tuple):
            if len(path_tuple) == 2:
                path_tuple = (
                    path_tuple[0],
                    path_tuple[1],
                    ContentType.CODE_CONTENT,
                )
            file_path, file_name, file_type = path_tuple
            if file_type == ContentType.MODULE_CONTENT:
                if file_path in addl_modules:
                    return None  # Module was already added -- we don't add twice
                addl_modules.add(file_path)
            elif file_type in (
                ContentType.OTHER_CONTENT,
                ContentType.CODE_CONTENT,
            ):
                path_tuple = (os.path.realpath(path_tuple[0]), path_tuple[1], file_type)
                # These are files
                # Check if the path is not duplicated as
                # many steps can have the same packages being imported
                if file_name not in deco_module_paths:
                    deco_module_paths[file_name] = file_path
                elif deco_module_paths[file_name] != file_path:
                    raise NonUniqueFileNameToFilePathMappingException(
                        file_name, [deco_module_paths[file_name], file_path]
                    )
            else:
                raise ValueError(f"Unknown file type: {file_type}")
            return path_tuple

        def _add_tuple(path_tuple):
            file_path, file_name, file_type = path_tuple
            if file_type == ContentType.MODULE_CONTENT:
                # file_path is actually a module
                self._mfcontent.add_module(cast(ModuleType, file_path))
            elif file_type == ContentType.CODE_CONTENT:
                self._mfcontent.add_code_file(file_path, file_name)
            elif file_type == ContentType.OTHER_CONTENT:
                self._mfcontent.add_other_file(file_path, file_name)

        for step in self._flow:
            for deco in step.decorators:
                for path_tuple in deco.add_to_package():
                    path_tuple = _check_tuple(path_tuple)
                    if path_tuple is None:
                        continue
                    _add_tuple(path_tuple)

        # the package folders for environment
        for path_tuple in self._environment.add_to_package():
            path_tuple = _check_tuple(path_tuple)
            if path_tuple is None:
                continue
            _add_tuple(path_tuple)

    def _user_code_tuples(self):
        if R.use_r():
            # the R working directory
            self._user_flow_dir = R.working_dir()
            for path_tuple in walk(
                "%s/" % R.working_dir(), file_filter=self._user_code_filter
            ):
                yield path_tuple
            # the R package
            for path_tuple in R.package_paths():
                yield path_tuple
        else:
            # the user's working directory
            flowdir = os.path.dirname(os.path.abspath(sys.argv[0])) + "/"
            self._user_flow_dir = flowdir
            for path_tuple in walk(
                flowdir,
                file_filter=self._user_code_filter,
                exclude_tl_dirs=self._exclude_tl_dirs,
            ):
                # TODO: This is where we will check if the file is already included
                # in the mfcontent portion
                yield path_tuple

    def _make(self):
        backend = self._backend()
        with backend.create() as archive:
            # Package the environment
            for path_or_bytes, arcname in self._mfcontent.contents():
                if isinstance(path_or_bytes, str):
                    archive.add_file(path_or_bytes, arcname=arcname)
                else:
                    archive.add_data(BytesIO(path_or_bytes), arcname=arcname)

            # Package the user code
            for path, arcname in self._cached_user_members:
                archive.add_file(path, arcname=arcname)
        return backend.get_blob()

    def __str__(self):
        return f"<code package for {self._name} (created @ {self._create_time})>"