metaflow 2.15.20__py2.py3-none-any.whl → 2.16.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74):
  1. metaflow/__init__.py +7 -1
  2. metaflow/cli.py +16 -1
  3. metaflow/cli_components/init_cmd.py +1 -0
  4. metaflow/cli_components/run_cmds.py +6 -2
  5. metaflow/client/core.py +22 -30
  6. metaflow/datastore/task_datastore.py +0 -1
  7. metaflow/debug.py +5 -0
  8. metaflow/decorators.py +230 -70
  9. metaflow/extension_support/__init__.py +15 -8
  10. metaflow/extension_support/_empty_file.py +2 -2
  11. metaflow/flowspec.py +80 -53
  12. metaflow/graph.py +24 -2
  13. metaflow/meta_files.py +13 -0
  14. metaflow/metadata_provider/metadata.py +7 -1
  15. metaflow/metaflow_config.py +5 -0
  16. metaflow/metaflow_environment.py +82 -25
  17. metaflow/metaflow_version.py +1 -1
  18. metaflow/package/__init__.py +664 -0
  19. metaflow/packaging_sys/__init__.py +870 -0
  20. metaflow/packaging_sys/backend.py +113 -0
  21. metaflow/packaging_sys/distribution_support.py +153 -0
  22. metaflow/packaging_sys/tar_backend.py +86 -0
  23. metaflow/packaging_sys/utils.py +91 -0
  24. metaflow/packaging_sys/v1.py +476 -0
  25. metaflow/plugins/airflow/airflow.py +5 -1
  26. metaflow/plugins/airflow/airflow_cli.py +15 -4
  27. metaflow/plugins/argo/argo_workflows.py +23 -17
  28. metaflow/plugins/argo/argo_workflows_cli.py +16 -4
  29. metaflow/plugins/aws/batch/batch.py +22 -3
  30. metaflow/plugins/aws/batch/batch_cli.py +3 -0
  31. metaflow/plugins/aws/batch/batch_decorator.py +13 -5
  32. metaflow/plugins/aws/step_functions/step_functions.py +4 -1
  33. metaflow/plugins/aws/step_functions/step_functions_cli.py +15 -4
  34. metaflow/plugins/cards/card_decorator.py +0 -5
  35. metaflow/plugins/kubernetes/kubernetes.py +8 -1
  36. metaflow/plugins/kubernetes/kubernetes_cli.py +3 -0
  37. metaflow/plugins/kubernetes/kubernetes_decorator.py +13 -5
  38. metaflow/plugins/package_cli.py +25 -23
  39. metaflow/plugins/parallel_decorator.py +4 -2
  40. metaflow/plugins/pypi/bootstrap.py +8 -2
  41. metaflow/plugins/pypi/conda_decorator.py +39 -82
  42. metaflow/plugins/pypi/conda_environment.py +6 -2
  43. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  44. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  45. metaflow/plugins/timeout_decorator.py +0 -1
  46. metaflow/plugins/uv/bootstrap.py +11 -0
  47. metaflow/plugins/uv/uv_environment.py +4 -2
  48. metaflow/pylint_wrapper.py +5 -1
  49. metaflow/runner/click_api.py +5 -4
  50. metaflow/runner/subprocess_manager.py +14 -2
  51. metaflow/runtime.py +37 -11
  52. metaflow/task.py +91 -7
  53. metaflow/user_configs/config_options.py +13 -8
  54. metaflow/user_configs/config_parameters.py +0 -4
  55. metaflow/user_decorators/__init__.py +0 -0
  56. metaflow/user_decorators/common.py +144 -0
  57. metaflow/user_decorators/mutable_flow.py +499 -0
  58. metaflow/user_decorators/mutable_step.py +424 -0
  59. metaflow/user_decorators/user_flow_decorator.py +263 -0
  60. metaflow/user_decorators/user_step_decorator.py +712 -0
  61. metaflow/util.py +4 -1
  62. metaflow/version.py +1 -1
  63. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/METADATA +2 -2
  64. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/RECORD +71 -60
  65. metaflow/info_file.py +0 -25
  66. metaflow/package.py +0 -203
  67. metaflow/user_configs/config_decorators.py +0 -568
  68. {metaflow-2.15.20.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/Makefile +0 -0
  69. {metaflow-2.15.20.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/Tiltfile +0 -0
  70. {metaflow-2.15.20.data → metaflow-2.16.0.data}/data/share/metaflow/devtools/pick_services.sh +0 -0
  71. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/WHEEL +0 -0
  72. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/entry_points.txt +0 -0
  73. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/licenses/LICENSE +0 -0
  74. {metaflow-2.15.20.dist-info → metaflow-2.16.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,664 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import threading
5
+ import time
6
+
7
+ from io import BytesIO
8
+ from types import ModuleType
9
+ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Type, cast
10
+
11
+ from ..debug import debug
12
+ from ..packaging_sys import ContentType, MetaflowCodeContent
13
+ from ..packaging_sys.backend import PackagingBackend
14
+ from ..packaging_sys.tar_backend import TarPackagingBackend
15
+ from ..packaging_sys.v1 import MetaflowCodeContentV1
16
+ from ..packaging_sys.utils import suffix_filter, walk
17
+ from ..metaflow_config import DEFAULT_PACKAGE_SUFFIXES
18
+ from ..exception import MetaflowException
19
+ from ..user_configs.config_parameters import dump_config_values
20
+ from ..util import get_metaflow_root
21
+ from .. import R
22
+
23
+ DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")
24
+
25
+
26
+ if TYPE_CHECKING:
27
+ import metaflow.datastore
28
+
29
+
30
class NonUniqueFileNameToFilePathMappingException(MetaflowException):
    """
    Raised when two different on-disk file paths are mapped to the same
    archive file name through the `add_to_package` decorator hook.
    """

    headline = "Non-unique file path for a file name included in code package"

    def __init__(self, filename, file_paths, lineno=None):
        # filename: the conflicting archive-level name
        # file_paths: the distinct on-disk paths that all claimed that name
        msg = (
            "Filename %s included in the code package includes multiple different "
            "paths for the same name : %s.\n"
            "The `filename` in the `add_to_package` decorator hook requires a unique "
            "`file_path` to `file_name` mapping" % (filename, ", ".join(file_paths))
        )
        super().__init__(msg=msg, lineno=lineno)
41
+
42
+
43
class MetaflowPackage(object):
    """
    Builds (and opportunistically uploads) the code package for a flow or for
    a generic piece of code. Packaging runs asynchronously on a daemon thread;
    use `blob_with_timeout`, `package_sha` or `package_url` to wait for (and
    retrieve) the result.
    """

    def __init__(
        self,
        flow,
        environment,
        echo,
        suffixes: Optional[List[str]] = DEFAULT_SUFFIXES_LIST,
        user_code_filter: Optional[Callable[[str], bool]] = None,
        flow_datastore: Optional["metaflow.datastore.FlowDataStore"] = None,
        mfcontent: Optional[MetaflowCodeContent] = None,
        exclude_tl_dirs=None,
        backend: Type[PackagingBackend] = TarPackagingBackend,
    ):
        # flow: flow object being packaged (may be None for a generic package)
        # environment: execution environment; initialized here with `echo`
        # suffixes: file suffixes to include (always unioned with defaults)
        # user_code_filter: extra predicate applied to user code file paths
        # flow_datastore: if provided, the blob is uploaded once created
        # mfcontent: pre-built Metaflow content; if None, a V1 content is used
        # exclude_tl_dirs: additional top-level directories to skip
        # backend: archive backend class (tar by default)
        self._environment = environment
        self._environment.init_environment(echo)

        self._echo = echo
        self._flow = flow
        self._flow_datastore = flow_datastore
        self._backend = backend

        # Info about the package
        self._name = None
        self._create_time = time.time()
        self._user_flow_dir = None

        # Content of the package (and settings on how to create it)
        if suffixes is not None:
            # Deduplicate while always keeping the default suffix list
            self._suffixes = list(set().union(suffixes, DEFAULT_SUFFIXES_LIST))
        else:
            self._suffixes = None

        def _module_selector(m) -> bool:
            # Criteria handed to MetaflowCodeContentV1: package a loaded module
            # if it is referenced by user flow/step decorators or explicitly
            # opts in via a METAFLOW_PACKAGE attribute.
            from ..user_decorators.user_flow_decorator import FlowMutatorMeta
            from ..user_decorators.user_step_decorator import UserStepDecoratorMeta

            if (
                m.__name__ in FlowMutatorMeta._import_modules
                or m.__name__ in UserStepDecoratorMeta._import_modules
                or hasattr(m, "METAFLOW_PACKAGE")
            ):
                return True
            # NOTE: falls through and implicitly returns None (falsy) for
            # non-matching modules.

        if mfcontent is None:
            self._mfcontent = MetaflowCodeContentV1(criteria=_module_selector)

        else:
            self._mfcontent = mfcontent
        # We exclude the environment when packaging as this will be packaged separately.
        # This comes into play primarily if packaging from a node already running packaged
        # code.
        # These directories are only excluded at the top-level (ie: not further down
        # in sub-directories)
        # "_escape_trampolines" is a special directory where trampoline escape hatch
        # files are stored (used by Netflix Extension's Conda implementation).
        self._exclude_tl_dirs = (
            self._mfcontent.get_excluded_tl_entries()
            + ["_escape_trampolines"]
            + (exclude_tl_dirs or [])
        )

        if self._suffixes is not None and user_code_filter is not None:
            # f1/f2 are bound as default arguments so the lambda captures the
            # current filter values rather than closing over mutable names.
            self._user_code_filter = lambda x, f1=suffix_filter(
                self._suffixes
            ), f2=user_code_filter: f1(x) and f2(x)
            self._filter_type = "suffixes and user filter"
        elif self._suffixes is not None:
            self._user_code_filter = suffix_filter(self._suffixes)
            self._filter_type = "suffixes"
        elif user_code_filter is not None:
            self._user_code_filter = user_code_filter
            self._filter_type = "user filter"
        else:
            self._user_code_filter = lambda x: True
            self._filter_type = "no filter"

        # Info about the package creation (it happens async)
        self._is_package_available = None
        self._blob_sha = None
        self._blob_url = None
        self._blob = None

        # We launch a thread to create the package asynchronously and upload
        # it opportunistically
        self._create_thread = threading.Thread(
            target=self._package_and_upload,
            daemon=True,
        )
        self._create_thread.start()
132
+
133
    # HORRIBLE HACK SO THAT CURRENT COMPUTE IMPLEMENTATIONS CAN STILL
    # DO pkg.blob. Ideally, this goes away and blob_with_timeout becomes
    # the main method (called blob).
    @property
    def blob(self) -> BytesIO:
        # Blocks without a timeout until the packaging thread has produced
        # the blob (or raises the recorded packaging exception).
        return self.blob_with_timeout()
139
+
140
+ def blob_with_timeout(self, timeout: Optional[float] = None) -> BytesIO:
141
+ if self._blob is None:
142
+ self._create_thread.join(timeout)
143
+ if self._is_package_available is not None:
144
+ # We have our result now
145
+ if self._is_package_available:
146
+ return self._blob
147
+ else:
148
+ raise self._packaging_exception
149
+ return self._blob
150
+
151
+ def package_sha(self, timeout: Optional[float] = None) -> Optional[str]:
152
+ if self._blob_sha is None:
153
+ self._create_thread.join(timeout)
154
+ if self._is_package_available is not None:
155
+ # We have our result now
156
+ if self._is_package_available:
157
+ return self._blob_sha
158
+ else:
159
+ raise self._packaging_exception
160
+ return self._blob_sha
161
+
162
+ def package_url(self, timeout: Optional[float] = None) -> Optional[str]:
163
+ if self._blob_url is None:
164
+ self._create_thread.join(timeout)
165
+ if self._is_package_available is not None:
166
+ # We have our result now
167
+ if self._is_package_available:
168
+ return self._blob_url
169
+ else:
170
+ raise self._packaging_exception
171
+ return self._blob_url
172
+
173
+ @property
174
+ def package_metadata(self):
175
+ return json.dumps(
176
+ {
177
+ "version": 0,
178
+ "archive_format": self._backend.backend_type(),
179
+ "mfcontent_version": self._mfcontent.get_package_version(),
180
+ }
181
+ )
182
+
183
+ @classmethod
184
+ def get_backend(cls, pkg_metadata: str) -> PackagingBackend:
185
+ """
186
+ Method to get the backend type from the package metadata.
187
+
188
+ Parameters
189
+ ----------
190
+ pkg_metadata : str
191
+ The metadata of the package to extract.
192
+
193
+ Returns
194
+ -------
195
+ PackagingBackend
196
+ The backend type that can be used to extract the package.
197
+ """
198
+ backend_type = json.loads(pkg_metadata).get("archive_format", "tgz")
199
+ return PackagingBackend.get_backend(backend_type)
200
+
201
+ @classmethod
202
+ def get_extract_commands(
203
+ cls, pkg_metadata: str, archive_path: str, dest_dir: str = "."
204
+ ) -> List[str]:
205
+ """
206
+ Method to get the commands needed to extract the package into
207
+ the directory dest_dir. Note that this will return a list of commands
208
+ that can be passed to subprocess.run for example.
209
+
210
+ Parameters
211
+ ----------
212
+ pkg_metadata : str
213
+ The metadata of the package to extract.
214
+ archive_path : str
215
+ The path to the archive to extract.
216
+ dest_dir : str, default "."
217
+ The directory to extract the package into.
218
+
219
+ Returns
220
+ -------
221
+ List[str]
222
+ The commands needed to extract the package into the directory dest_dir.
223
+ """
224
+ backend_type = json.loads(pkg_metadata).get("archive_format", "tgz")
225
+ # We now ask the backend type how to extract itself
226
+ backend = PackagingBackend.get_backend(backend_type)
227
+ cmds = backend.get_extract_commands(archive_path, dest_dir)
228
+ debug.package_exec(f"Command to extract {archive_path} into {dest_dir}: {cmds}")
229
+ return cmds
230
+
231
+ @classmethod
232
+ def get_post_extract_env_vars(
233
+ cls, pkg_metadata: str, dest_dir: str = "."
234
+ ) -> Dict[str, str]:
235
+ """
236
+ Method to get the environment variables needed to access the content
237
+ that has been extracted into the directory dest_dir. This will
238
+ typically involve setting PYTHONPATH
239
+
240
+ Parameters
241
+ ----------
242
+ pkg_metadata : str
243
+ The metadata of the package to extract.
244
+ dest_dir : str, default "."
245
+ The directory where the content has been extracted to.
246
+
247
+ Returns
248
+ -------
249
+ Dict[str, str]
250
+ The post-extract environment variables that are needed to access the content
251
+ that has been extracted into dest_dir.
252
+ """
253
+ mfcontent_version = json.loads(pkg_metadata).get("mfcontent_version", 0)
254
+ env_vars = MetaflowCodeContent.get_post_extract_env_vars(
255
+ mfcontent_version, dest_dir
256
+ )
257
+ debug.package_exec(
258
+ f"Environment variables to access content extracted into {dest_dir}: {env_vars}"
259
+ )
260
+ return env_vars
261
+
262
+ @classmethod
263
+ def cls_get_content(
264
+ cls, pkg_metadata, archive: BytesIO, name: str
265
+ ) -> Optional[bytes]:
266
+ """
267
+ Method to get the content of a member in the package archive.
268
+
269
+ Parameters
270
+ ----------
271
+ pkg_metadata : str
272
+ The metadata of the package to extract.
273
+ archive : BytesIO
274
+ The archive to extract the member from.
275
+ name : str
276
+ The name of the member to extract.
277
+
278
+ Returns
279
+ -------
280
+ Optional[bytes]
281
+ The content of the member if it exists, None otherwise.
282
+ """
283
+ backend = cls.get_backend(pkg_metadata)
284
+ with backend.cls_open(archive) as opened_archive:
285
+ return backend.cls_get_member(opened_archive, name)
286
+
287
+ @classmethod
288
+ def cls_get_info(cls, pkg_metadata, archive: BytesIO) -> Optional[Dict[str, str]]:
289
+ """
290
+ Method to get the info of the package from the archive.
291
+ Parameters
292
+ ----------
293
+ pkg_metadata : str
294
+ The metadata of the package to extract.
295
+ archive : BytesIO
296
+ The archive to extract the info from.
297
+ Returns
298
+ -------
299
+ Optional[Dict[str, str]]
300
+ The info of the package if it exists, None otherwise.
301
+ """
302
+ backend = cls.get_backend(pkg_metadata)
303
+ with backend.cls_open(archive) as opened_archive:
304
+ return MetaflowCodeContent.get_archive_info(opened_archive, backend)
305
+
306
+ @classmethod
307
+ def cls_get_config(
308
+ cls, pkg_metadata: str, archive: BytesIO
309
+ ) -> Optional[Dict[str, str]]:
310
+ """
311
+ Method to get the config of the package from the archive.
312
+
313
+ Parameters
314
+ ----------
315
+ pkg_metadata : str
316
+ The metadata of the package to extract.
317
+ archive : BytesIO
318
+ The archive to extract the config from.
319
+
320
+ Returns
321
+ -------
322
+ Optional[Dict[str, str]]
323
+ The config of the package if it exists, None otherwise.
324
+ """
325
+ backend = cls.get_backend(pkg_metadata)
326
+ with backend.cls_open(archive) as opened_archive:
327
+ return MetaflowCodeContent.get_archive_config(opened_archive, backend)
328
+
329
+ @classmethod
330
+ def cls_extract_into(
331
+ cls,
332
+ pkg_metadata: str,
333
+ archive: BytesIO,
334
+ dest_dir: str = ".",
335
+ content_types: int = ContentType.ALL_CONTENT.value,
336
+ ):
337
+ """
338
+ Method to extract the package archive into a directory.
339
+
340
+ Parameters
341
+ ----------
342
+ pkg_metadata : str
343
+ The metadata of the package to extract.
344
+ archive : BytesIO
345
+ The archive to extract.
346
+ dest_dir : str, default "."
347
+ The directory to extract the package into.
348
+ content_types : int, default ALL_CONTENT
349
+ The types of content to extract. This is a bitmask of ContentType values.
350
+ """
351
+ backend = cls.get_backend(pkg_metadata)
352
+ with backend.cls_open(archive) as opened_archive:
353
+ include_names = MetaflowCodeContent.get_archive_content_names(
354
+ opened_archive, content_types, backend
355
+ )
356
+ backend.extract_members(include_names, dest_dir)
357
+
358
+ def user_tuples(self, timeout: Optional[float] = None):
359
+ # Wait for at least the blob to be formed
360
+ _ = self.blob_with_timeout(timeout=timeout)
361
+ for path, arcname in self._cached_user_members:
362
+ yield path, arcname
363
+
364
+ def path_tuples(self, timeout: Optional[float] = None):
365
+ # Wait for at least the blob to be formed
366
+ _ = self.blob_with_timeout(timeout=timeout)
367
+ # Files included in the environment
368
+ yield from self._mfcontent.content_names()
369
+
370
+ # Files included in the user code
371
+ yield from self.user_tuples()
372
+
373
    def show(self, timeout: Optional[float] = None) -> str:
        """Return a human-readable, multi-line summary of the package
        (size, file count, content breakdown and user-code filters)."""
        # Human-readable content of the package
        blob = self.blob_with_timeout(timeout=timeout)  # Ensure the package is created
        lines = [
            f"Package size: {self._format_size(len(blob))}",
            f"Number of files: {sum(1 for _ in self.path_tuples())}",
            self._mfcontent.show(),
        ]

        if self._flow:
            lines.append(f"\nUser code in flow {self._name}:")
            lines.append(f" - Packaged from directory {self._user_flow_dir}")
            if self._filter_type != "no filter":
                if self._suffixes:
                    lines.append(
                        f" - Filtered by suffixes: {', '.join(self._suffixes)}"
                    )
                else:
                    lines.append(f" - Filtered by {self._filter_type}")
            else:
                lines.append(" - No user code filter applied")
            if self._exclude_tl_dirs:
                lines.append(
                    f" - Excluded directories: {', '.join(self._exclude_tl_dirs)}"
                )
        return "\n".join(lines)
399
+
400
+ def get_content(
401
+ self, name: str, content_type: ContentType, timeout: Optional[float] = None
402
+ ) -> Optional[bytes]:
403
+ """
404
+ Method to get the content of a file within the package. This method
405
+ should be used for one-off access to small-ish files. If more files are
406
+ needed, use extract_into to extract the package into a directory and
407
+ then access the files from there.
408
+
409
+ Parameters
410
+ ----------
411
+ name : str
412
+ The name of the file to get the content of. Note that this
413
+ is not necessarily the name in the archive but is the name
414
+ that was passed in when creating the archive (in the archive,
415
+ it may be prefixed by some directory structure).
416
+ content_type : ContentType
417
+ The type of file to get the content of.
418
+
419
+ Returns
420
+ -------
421
+ Optional[bytes]
422
+ The content of the file. If the file is not found, None is returned.
423
+ """
424
+ # Wait for at least the blob to be formed
425
+ _ = self.blob_with_timeout(timeout=timeout)
426
+ if content_type == ContentType.USER_CONTENT:
427
+ for path, arcname in self.user_tuples():
428
+ if name == arcname:
429
+ return open(path, "rb").read()
430
+ return None
431
+ elif content_type in (
432
+ ContentType.CODE_CONTENT,
433
+ ContentType.MODULE_CONTENT,
434
+ ContentType.OTHER_CONTENT,
435
+ ):
436
+ mangled_name = self._mfcontent.get_archive_filename(name, content_type)
437
+ for path_or_bytes, arcname in self._mfcontent.contents(content_type):
438
+ if mangled_name == arcname:
439
+ if isinstance(path_or_bytes, bytes):
440
+ # In case this is generated content like an INFO file
441
+ return path_or_bytes
442
+ # Otherwise, it is a file path
443
+ return open(path_or_bytes, "rb").read()
444
+ return None
445
+ raise ValueError(f"Unknown content type: {content_type}")
446
+
447
    def extract_into(
        self,
        dest_dir: str = ".",
        content_types: int = ContentType.ALL_CONTENT.value,
        timeout: Optional[float] = None,
    ):
        """
        Method to extract the package (or some of the files) into a directory.

        Parameters
        ----------
        dest_dir : str, default "."
            The directory to extract the package into.
        content_types : int, default ALL_CONTENT
            The types of content to extract.
        timeout : Optional[float], default None
            How long to wait for the asynchronous packaging to complete.
        """
        _ = self.blob_with_timeout(timeout=timeout)  # Ensure the package is created
        # First pass: file-backed content (user code + code/module content);
        # these are materialized as symlinks to the original files.
        member_list = []
        if content_types & ContentType.USER_CONTENT.value:
            member_list.extend(
                [(m[0], os.path.join(dest_dir, m[1])) for m in self.user_tuples()]
            )
        if content_types & (
            ContentType.CODE_CONTENT.value | ContentType.MODULE_CONTENT.value
        ):
            # We need to get the name of the files in the content archive to extract
            member_list.extend(
                [
                    (m[0], os.path.join(dest_dir, m[1]))
                    for m in self._mfcontent.content_names(
                        content_types & ~ContentType.OTHER_CONTENT.value
                    )
                ]
            )
        for orig_path, new_path in member_list:
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            # TODO: In case there are duplicate files -- that should not be the case
            # but there is a bug currently with internal Netflix code.
            if not os.path.exists(new_path):
                os.symlink(orig_path, new_path)
            # Could copy files as well if we want to split them out.
            # shutil.copy(orig_path, new_path)
        # OTHER_CONTENT requires special handling because sometimes the file isn't a file
        # but generated content
        member_list = []
        if content_types & ContentType.OTHER_CONTENT.value:
            member_list.extend(
                [
                    (m[0], os.path.join(dest_dir, m[1]))
                    for m in self._mfcontent.contents(ContentType.OTHER_CONTENT)
                ]
            )
        for path_or_content, new_path in member_list:
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            if not os.path.exists(new_path):
                if isinstance(path_or_content, bytes):
                    # In-memory generated content: write it out directly
                    with open(new_path, "wb") as f:
                        f.write(path_or_content)
                else:
                    # On-disk file: symlink like the first pass above
                    os.symlink(path_or_content, new_path)
507
+
508
+ @staticmethod
509
+ def _format_size(size_in_bytes):
510
+ for unit in ["B", "KB", "MB", "GB", "TB"]:
511
+ if size_in_bytes < 1024.0:
512
+ return f"{size_in_bytes:.2f} {unit}"
513
+ size_in_bytes /= 1024.0
514
+ return f"{size_in_bytes:.2f} PB"
515
+
516
+ def _package_and_upload(self):
517
+ try:
518
+ # Can be called without a flow (Function)
519
+ if self._flow:
520
+ for step in self._flow:
521
+ for deco in step.decorators:
522
+ deco.package_init(self._flow, step.__name__, self._environment)
523
+ self._name = f"flow {self._flow.name}"
524
+ else:
525
+ self._name = "<generic code package>"
526
+
527
+ # Add metacontent
528
+ self._mfcontent.add_info(
529
+ self._environment.get_environment_info(include_ext_info=True)
530
+ )
531
+
532
+ self._mfcontent.add_config(dump_config_values(self._flow))
533
+
534
+ # Add user files (from decorators and environment)
535
+ if self._flow:
536
+ self._add_addl_files()
537
+ self._cached_user_members = list(self._user_code_tuples())
538
+ debug.package_exec(
539
+ f"User files to package: {self._cached_user_members}"
540
+ )
541
+
542
+ self._blob = self._make()
543
+ if self._flow_datastore:
544
+ if len(self._blob) > 100 * 1024 * 1024:
545
+ self._echo(
546
+ f"Warning: The code package for {self._flow.name} is larger than "
547
+ f"100MB (found it to be {self._format_size(len(self._blob))}) "
548
+ "This may lead to slower upload times for remote runs and no "
549
+ "uploads for local runs. Consider reducing the package size. "
550
+ "Use `<myflow.py> package info` or `<myflow.py> package list` "
551
+ "to get more information about what is included in the package."
552
+ )
553
+ self._blob_url, self._blob_sha = self._flow_datastore.save_data(
554
+ [self._blob], len_hint=1
555
+ )[0]
556
+ else:
557
+ self._blob_url = self._blob_sha = ""
558
+ self._is_package_available = True
559
+ except Exception as e:
560
+ self._packaging_exception = e
561
+ self._echo(f"Package creation/upload failed for {self._flow.name}: {e}")
562
+ self._is_package_available = False
563
+
564
    def _add_addl_files(self):
        """Collect additional files contributed by step decorators and by the
        environment, deduplicate them, and add them to the package content."""
        # Look at all decorators that provide additional files
        deco_module_paths = {}  # archive file name -> source path (dedup/validation)
        addl_modules = set()  # modules already added (never added twice)

        def _check_tuple(path_tuple):
            # Normalize a legacy 2-tuple (path, name) into a 3-tuple with a
            # default CODE_CONTENT type, then dedup/validate. Returns None when
            # the entry should be skipped (already-added module).
            if len(path_tuple) == 2:
                path_tuple = (
                    path_tuple[0],
                    path_tuple[1],
                    ContentType.CODE_CONTENT,
                )
            file_path, file_name, file_type = path_tuple
            if file_type == ContentType.MODULE_CONTENT:
                if file_path in addl_modules:
                    return None  # Module was already added -- we don't add twice
                addl_modules.add(file_path)
            elif file_type in (
                ContentType.OTHER_CONTENT,
                ContentType.CODE_CONTENT,
            ):
                path_tuple = (os.path.realpath(path_tuple[0]), path_tuple[1], file_type)
                # These are files
                # Check if the path is not duplicated as
                # many steps can have the same packages being imported
                # NOTE(review): the dedup map records the pre-realpath
                # `file_path` while the returned tuple carries the realpath --
                # confirm this asymmetry is intended.
                if file_name not in deco_module_paths:
                    deco_module_paths[file_name] = file_path
                elif deco_module_paths[file_name] != file_path:
                    raise NonUniqueFileNameToFilePathMappingException(
                        file_name, [deco_module_paths[file_name], file_path]
                    )
            else:
                raise ValueError(f"Unknown file type: {file_type}")
            return path_tuple

        def _add_tuple(path_tuple):
            # Dispatch a validated tuple to the matching mfcontent adder.
            file_path, file_name, file_type = path_tuple
            if file_type == ContentType.MODULE_CONTENT:
                # file_path is actually a module
                self._mfcontent.add_module(cast(ModuleType, file_path))
            elif file_type == ContentType.CODE_CONTENT:
                self._mfcontent.add_code_file(file_path, file_name)
            elif file_type == ContentType.OTHER_CONTENT:
                self._mfcontent.add_other_file(file_path, file_name)

        for step in self._flow:
            for deco in step.decorators:
                for path_tuple in deco.add_to_package():
                    path_tuple = _check_tuple(path_tuple)
                    if path_tuple is None:
                        continue
                    _add_tuple(path_tuple)

        # the package folders for environment
        for path_tuple in self._environment.add_to_package():
            path_tuple = _check_tuple(path_tuple)
            if path_tuple is None:
                continue
            _add_tuple(path_tuple)
623
+
624
+ def _user_code_tuples(self):
625
+ if R.use_r():
626
+ # the R working directory
627
+ self._user_flow_dir = R.working_dir()
628
+ for path_tuple in walk(
629
+ "%s/" % R.working_dir(), file_filter=self._user_code_filter
630
+ ):
631
+ yield path_tuple
632
+ # the R package
633
+ for path_tuple in R.package_paths():
634
+ yield path_tuple
635
+ else:
636
+ # the user's working directory
637
+ flowdir = os.path.dirname(os.path.abspath(sys.argv[0])) + "/"
638
+ self._user_flow_dir = flowdir
639
+ for path_tuple in walk(
640
+ flowdir,
641
+ file_filter=self._user_code_filter,
642
+ exclude_tl_dirs=self._exclude_tl_dirs,
643
+ ):
644
+ # TODO: This is where we will check if the file is already included
645
+ # in the mfcontent portion
646
+ yield path_tuple
647
+
648
    def _make(self):
        """Assemble the archive: environment/Metaflow content first, then
        user code; returns the finished blob from the backend."""
        backend = self._backend()
        with backend.create() as archive:
            # Package the environment
            for path_or_bytes, arcname in self._mfcontent.contents():
                if isinstance(path_or_bytes, str):
                    archive.add_file(path_or_bytes, arcname=arcname)
                else:
                    # Generated in-memory content (e.g. INFO/config data)
                    archive.add_data(BytesIO(path_or_bytes), arcname=arcname)

            # Package the user code
            for path, arcname in self._cached_user_members:
                archive.add_file(path, arcname=arcname)
        # The archive context must be closed before the blob is final.
        return backend.get_blob()
662
+
663
+ def __str__(self):
664
+ return f"<code package for {self._name} (created @ {self._create_time})>"