ob-metaflow 2.15.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. metaflow/__init__.py +10 -3
  2. metaflow/_vendor/imghdr/__init__.py +186 -0
  3. metaflow/_vendor/yaml/__init__.py +427 -0
  4. metaflow/_vendor/yaml/composer.py +139 -0
  5. metaflow/_vendor/yaml/constructor.py +748 -0
  6. metaflow/_vendor/yaml/cyaml.py +101 -0
  7. metaflow/_vendor/yaml/dumper.py +62 -0
  8. metaflow/_vendor/yaml/emitter.py +1137 -0
  9. metaflow/_vendor/yaml/error.py +75 -0
  10. metaflow/_vendor/yaml/events.py +86 -0
  11. metaflow/_vendor/yaml/loader.py +63 -0
  12. metaflow/_vendor/yaml/nodes.py +49 -0
  13. metaflow/_vendor/yaml/parser.py +589 -0
  14. metaflow/_vendor/yaml/reader.py +185 -0
  15. metaflow/_vendor/yaml/representer.py +389 -0
  16. metaflow/_vendor/yaml/resolver.py +227 -0
  17. metaflow/_vendor/yaml/scanner.py +1435 -0
  18. metaflow/_vendor/yaml/serializer.py +111 -0
  19. metaflow/_vendor/yaml/tokens.py +104 -0
  20. metaflow/cards.py +4 -0
  21. metaflow/cli.py +125 -21
  22. metaflow/cli_components/init_cmd.py +1 -0
  23. metaflow/cli_components/run_cmds.py +204 -40
  24. metaflow/cli_components/step_cmd.py +160 -4
  25. metaflow/client/__init__.py +1 -0
  26. metaflow/client/core.py +198 -130
  27. metaflow/client/filecache.py +59 -32
  28. metaflow/cmd/code/__init__.py +2 -1
  29. metaflow/cmd/develop/stub_generator.py +49 -18
  30. metaflow/cmd/develop/stubs.py +9 -27
  31. metaflow/cmd/make_wrapper.py +30 -0
  32. metaflow/datastore/__init__.py +1 -0
  33. metaflow/datastore/content_addressed_store.py +40 -9
  34. metaflow/datastore/datastore_set.py +10 -1
  35. metaflow/datastore/flow_datastore.py +124 -4
  36. metaflow/datastore/spin_datastore.py +91 -0
  37. metaflow/datastore/task_datastore.py +92 -6
  38. metaflow/debug.py +5 -0
  39. metaflow/decorators.py +331 -82
  40. metaflow/extension_support/__init__.py +414 -356
  41. metaflow/extension_support/_empty_file.py +2 -2
  42. metaflow/flowspec.py +322 -82
  43. metaflow/graph.py +178 -15
  44. metaflow/includefile.py +25 -3
  45. metaflow/lint.py +94 -3
  46. metaflow/meta_files.py +13 -0
  47. metaflow/metadata_provider/metadata.py +13 -2
  48. metaflow/metaflow_config.py +66 -4
  49. metaflow/metaflow_environment.py +91 -25
  50. metaflow/metaflow_profile.py +18 -0
  51. metaflow/metaflow_version.py +16 -1
  52. metaflow/package/__init__.py +673 -0
  53. metaflow/packaging_sys/__init__.py +880 -0
  54. metaflow/packaging_sys/backend.py +128 -0
  55. metaflow/packaging_sys/distribution_support.py +153 -0
  56. metaflow/packaging_sys/tar_backend.py +99 -0
  57. metaflow/packaging_sys/utils.py +54 -0
  58. metaflow/packaging_sys/v1.py +527 -0
  59. metaflow/parameters.py +6 -2
  60. metaflow/plugins/__init__.py +6 -0
  61. metaflow/plugins/airflow/airflow.py +11 -1
  62. metaflow/plugins/airflow/airflow_cli.py +16 -5
  63. metaflow/plugins/argo/argo_client.py +42 -20
  64. metaflow/plugins/argo/argo_events.py +6 -6
  65. metaflow/plugins/argo/argo_workflows.py +1023 -344
  66. metaflow/plugins/argo/argo_workflows_cli.py +396 -94
  67. metaflow/plugins/argo/argo_workflows_decorator.py +9 -0
  68. metaflow/plugins/argo/argo_workflows_deployer_objects.py +75 -49
  69. metaflow/plugins/argo/capture_error.py +5 -2
  70. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  71. metaflow/plugins/argo/exit_hooks.py +209 -0
  72. metaflow/plugins/argo/param_val.py +19 -0
  73. metaflow/plugins/aws/aws_client.py +6 -0
  74. metaflow/plugins/aws/aws_utils.py +33 -1
  75. metaflow/plugins/aws/batch/batch.py +72 -5
  76. metaflow/plugins/aws/batch/batch_cli.py +24 -3
  77. metaflow/plugins/aws/batch/batch_decorator.py +57 -6
  78. metaflow/plugins/aws/step_functions/step_functions.py +28 -3
  79. metaflow/plugins/aws/step_functions/step_functions_cli.py +49 -4
  80. metaflow/plugins/aws/step_functions/step_functions_deployer.py +3 -0
  81. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +30 -0
  82. metaflow/plugins/cards/card_cli.py +20 -1
  83. metaflow/plugins/cards/card_creator.py +24 -1
  84. metaflow/plugins/cards/card_datastore.py +21 -49
  85. metaflow/plugins/cards/card_decorator.py +58 -6
  86. metaflow/plugins/cards/card_modules/basic.py +38 -9
  87. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  88. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  89. metaflow/plugins/cards/card_modules/components.py +592 -3
  90. metaflow/plugins/cards/card_modules/convert_to_native_type.py +34 -5
  91. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  92. metaflow/plugins/cards/card_modules/main.css +1 -0
  93. metaflow/plugins/cards/card_modules/main.js +56 -41
  94. metaflow/plugins/cards/card_modules/test_cards.py +22 -6
  95. metaflow/plugins/cards/component_serializer.py +1 -8
  96. metaflow/plugins/cards/metadata.py +22 -0
  97. metaflow/plugins/catch_decorator.py +9 -0
  98. metaflow/plugins/datastores/local_storage.py +12 -6
  99. metaflow/plugins/datastores/spin_storage.py +12 -0
  100. metaflow/plugins/datatools/s3/s3.py +49 -17
  101. metaflow/plugins/datatools/s3/s3op.py +113 -66
  102. metaflow/plugins/env_escape/client_modules.py +102 -72
  103. metaflow/plugins/events_decorator.py +127 -121
  104. metaflow/plugins/exit_hook/__init__.py +0 -0
  105. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  106. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  107. metaflow/plugins/kubernetes/kubernetes.py +12 -1
  108. metaflow/plugins/kubernetes/kubernetes_cli.py +11 -0
  109. metaflow/plugins/kubernetes/kubernetes_decorator.py +25 -6
  110. metaflow/plugins/kubernetes/kubernetes_job.py +12 -4
  111. metaflow/plugins/kubernetes/kubernetes_jobsets.py +31 -30
  112. metaflow/plugins/metadata_providers/local.py +76 -82
  113. metaflow/plugins/metadata_providers/service.py +13 -9
  114. metaflow/plugins/metadata_providers/spin.py +16 -0
  115. metaflow/plugins/package_cli.py +36 -24
  116. metaflow/plugins/parallel_decorator.py +11 -2
  117. metaflow/plugins/parsers.py +16 -0
  118. metaflow/plugins/pypi/bootstrap.py +7 -1
  119. metaflow/plugins/pypi/conda_decorator.py +41 -82
  120. metaflow/plugins/pypi/conda_environment.py +14 -6
  121. metaflow/plugins/pypi/micromamba.py +9 -1
  122. metaflow/plugins/pypi/pip.py +41 -5
  123. metaflow/plugins/pypi/pypi_decorator.py +4 -4
  124. metaflow/plugins/pypi/utils.py +22 -0
  125. metaflow/plugins/secrets/__init__.py +3 -0
  126. metaflow/plugins/secrets/secrets_decorator.py +14 -178
  127. metaflow/plugins/secrets/secrets_func.py +49 -0
  128. metaflow/plugins/secrets/secrets_spec.py +101 -0
  129. metaflow/plugins/secrets/utils.py +74 -0
  130. metaflow/plugins/test_unbounded_foreach_decorator.py +2 -2
  131. metaflow/plugins/timeout_decorator.py +0 -1
  132. metaflow/plugins/uv/bootstrap.py +29 -1
  133. metaflow/plugins/uv/uv_environment.py +5 -3
  134. metaflow/pylint_wrapper.py +5 -1
  135. metaflow/runner/click_api.py +79 -26
  136. metaflow/runner/deployer.py +208 -6
  137. metaflow/runner/deployer_impl.py +32 -12
  138. metaflow/runner/metaflow_runner.py +266 -33
  139. metaflow/runner/subprocess_manager.py +21 -1
  140. metaflow/runner/utils.py +27 -16
  141. metaflow/runtime.py +660 -66
  142. metaflow/task.py +255 -26
  143. metaflow/user_configs/config_options.py +33 -21
  144. metaflow/user_configs/config_parameters.py +220 -58
  145. metaflow/user_decorators/__init__.py +0 -0
  146. metaflow/user_decorators/common.py +144 -0
  147. metaflow/user_decorators/mutable_flow.py +512 -0
  148. metaflow/user_decorators/mutable_step.py +424 -0
  149. metaflow/user_decorators/user_flow_decorator.py +264 -0
  150. metaflow/user_decorators/user_step_decorator.py +749 -0
  151. metaflow/util.py +197 -7
  152. metaflow/vendor.py +23 -7
  153. metaflow/version.py +1 -1
  154. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Makefile +13 -2
  155. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/Tiltfile +107 -7
  156. {ob_metaflow-2.15.13.1.data → ob_metaflow-2.19.7.1rc0.data}/data/share/metaflow/devtools/pick_services.sh +1 -0
  157. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/METADATA +2 -3
  158. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/RECORD +162 -121
  159. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  160. metaflow/_vendor/v3_5/__init__.py +0 -1
  161. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  162. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  163. metaflow/_vendor/v3_5/zipp.py +0 -329
  164. metaflow/info_file.py +0 -25
  165. metaflow/package.py +0 -203
  166. metaflow/user_configs/config_decorators.py +0 -568
  167. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +0 -0
  168. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/licenses/LICENSE +0 -0
  169. {ob_metaflow-2.15.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,673 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import threading
5
+ import time
6
+
7
+ from io import BytesIO
8
+ from types import ModuleType
9
+ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Type, cast
10
+
11
+ from ..debug import debug
12
+ from ..packaging_sys import ContentType, MetaflowCodeContent
13
+ from ..packaging_sys.backend import PackagingBackend
14
+ from ..packaging_sys.tar_backend import TarPackagingBackend
15
+ from ..packaging_sys.v1 import MetaflowCodeContentV1
16
+ from ..packaging_sys.utils import suffix_filter, walk
17
+ from ..metaflow_config import DEFAULT_PACKAGE_SUFFIXES
18
+ from ..exception import MetaflowException
19
+ from ..user_configs.config_parameters import dump_config_values
20
+ from .. import R
21
+
22
+ DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ import metaflow.datastore
27
+
28
+
29
class NonUniqueFileNameToFilePathMappingException(MetaflowException):
    """
    Raised when one file name in the code package maps to several file paths.

    Parameters
    ----------
    filename : str
        The file name that was registered more than once.
    file_paths : Iterable[str]
        The conflicting paths registered under `filename`.
    lineno : int, optional, default None
        Forwarded unchanged to the base exception.
    """

    headline = "Non-unique file path for a file name included in code package"

    def __init__(self, filename, file_paths, lineno=None):
        # Render the conflicting paths once, then fill in the template.
        joined_paths = ", ".join(file_paths)
        template = (
            "Filename %s included in the code package includes multiple different "
            "paths for the same name : %s.\n"
            "The `filename` in the `add_to_package` decorator hook requires a unique "
            "`file_path` to `file_name` mapping"
        )
        super().__init__(msg=template % (filename, joined_paths), lineno=lineno)
40
+
41
+
42
+ class MetaflowPackage(object):
43
def __init__(
    self,
    flow,
    environment,
    echo,
    suffixes: Optional[List[str]] = DEFAULT_SUFFIXES_LIST,
    user_code_filter: Optional[Callable[[str], bool]] = None,
    flow_datastore: Optional["metaflow.datastore.FlowDataStore"] = None,
    mfcontent: Optional[MetaflowCodeContent] = None,
    exclude_tl_dirs=None,
    backend: Type[PackagingBackend] = TarPackagingBackend,
):
    """
    Record the packaging settings and start building the package in a
    background thread.

    Parameters
    ----------
    flow
        The flow to package. May be falsy, in which case a generic code
        package is produced (see `_package_and_upload`).
    environment
        Environment object; `init_environment(echo)` is called on it here.
    echo : Callable
        Function used to print messages.
    suffixes : List[str], optional
        File suffixes to include. They are merged with the default suffixes.
        Pass None to disable suffix-based filtering.
    user_code_filter : Callable[[str], bool], optional, default None
        Additional predicate applied to user code files.
    flow_datastore : metaflow.datastore.FlowDataStore, optional, default None
        If provided, the package is uploaded to it once it has been built.
    mfcontent : MetaflowCodeContent, optional, default None
        Content handler; a `MetaflowCodeContentV1` is created when None.
    exclude_tl_dirs : List[str], optional, default None
        Extra top-level directories to exclude.
    backend : Type[PackagingBackend], default TarPackagingBackend
        Class used to build the archive.
    """
    self._environment = environment
    self._environment.init_environment(echo)

    self._echo = echo
    self._flow = flow
    self._flow_datastore = flow_datastore
    self._backend = backend

    # Info about the package
    self._name = None
    self._create_time = time.time()
    self._user_flow_dir = None

    # Content of the package (and settings on how to create it)
    if suffixes is not None:
        self._suffixes = list(set().union(suffixes, DEFAULT_SUFFIXES_LIST))
    else:
        self._suffixes = None

    def _module_selector(m) -> bool:
        from ..user_decorators.user_flow_decorator import FlowMutatorMeta
        from ..user_decorators.user_step_decorator import UserStepDecoratorMeta

        # Be very defensive here to filter modules in case there are
        # some badly behaved modules that have weird values for
        # METAFLOW_PACKAGE_POLICY for example.
        try:
            if (
                m.__name__ in FlowMutatorMeta._import_modules
                or m.__name__ in UserStepDecoratorMeta._import_modules
                or (
                    hasattr(m, "METAFLOW_PACKAGE_POLICY")
                    and m.METAFLOW_PACKAGE_POLICY == "include"
                )
            ):
                return True
            return False
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare `except:` does.
            return False

    if mfcontent is None:
        self._mfcontent = MetaflowCodeContentV1(criteria=_module_selector)
    else:
        self._mfcontent = mfcontent

    # We exclude the environment when packaging as this will be packaged separately.
    # This comes into play primarily if packaging from a node already running packaged
    # code.
    # These directories are only excluded at the top-level (ie: not further down
    # in sub-directories)
    # "_escape_trampolines" is a special directory where trampoline escape hatch
    # files are stored (used by Netflix Extension's Conda implementation).
    self._exclude_tl_dirs = (
        self._mfcontent.get_excluded_tl_entries()
        + ["_escape_trampolines"]
        + (exclude_tl_dirs or [])
    )

    if self._suffixes is not None and user_code_filter is not None:
        # Both filters must accept a file; they are bound as defaults so the
        # lambda does not depend on later changes to the enclosing names.
        self._user_code_filter = lambda x, f1=suffix_filter(
            self._suffixes
        ), f2=user_code_filter: f1(x) and f2(x)
        self._filter_type = "suffixes and user filter"
    elif self._suffixes is not None:
        self._user_code_filter = suffix_filter(self._suffixes)
        self._filter_type = "suffixes"
    elif user_code_filter is not None:
        self._user_code_filter = user_code_filter
        self._filter_type = "user filter"
    else:
        self._user_code_filter = lambda x: True
        self._filter_type = "no filter"

    # Info about the package creation (it happens async)
    self._is_package_available = None
    self._blob_sha = None
    self._blob_url = None
    self._blob = None
    # Define these before the worker starts so readers never hit an
    # AttributeError, whatever path the worker takes.
    self._packaging_exception = None
    self._cached_user_members = []

    # We launch a thread to create the package asynchronously and upload
    # it opportunistically
    self._create_thread = threading.Thread(
        target=self._package_and_upload,
        daemon=True,
    )
    self._create_thread.start()
141
+
142
+ # HORRIBLE HACK SO THAT CURRENT COMPUTE IMPLEMENTATIONS CAN STILL
143
+ # DO pkg.blob. Ideally, this goes away and blob_with_timeout becomes
144
+ # the main method (called blob).
145
+ @property
146
+ def blob(self) -> BytesIO:
147
+ return self.blob_with_timeout()
148
+
149
+ def blob_with_timeout(self, timeout: Optional[float] = None) -> BytesIO:
150
+ if self._blob is None:
151
+ self._create_thread.join(timeout)
152
+ if self._is_package_available is not None:
153
+ # We have our result now
154
+ if self._is_package_available:
155
+ return self._blob
156
+ else:
157
+ raise self._packaging_exception
158
+ return self._blob
159
+
160
+ def package_sha(self, timeout: Optional[float] = None) -> Optional[str]:
161
+ if self._blob_sha is None:
162
+ self._create_thread.join(timeout)
163
+ if self._is_package_available is not None:
164
+ # We have our result now
165
+ if self._is_package_available:
166
+ return self._blob_sha
167
+ else:
168
+ raise self._packaging_exception
169
+ return self._blob_sha
170
+
171
+ def package_url(self, timeout: Optional[float] = None) -> Optional[str]:
172
+ if self._blob_url is None:
173
+ self._create_thread.join(timeout)
174
+ if self._is_package_available is not None:
175
+ # We have our result now
176
+ if self._is_package_available:
177
+ return self._blob_url
178
+ else:
179
+ raise self._packaging_exception
180
+ return self._blob_url
181
+
182
+ @property
183
+ def package_metadata(self):
184
+ return json.dumps(
185
+ {
186
+ "version": 0,
187
+ "archive_format": self._backend.backend_type(),
188
+ "mfcontent_version": self._mfcontent.get_package_version(),
189
+ }
190
+ )
191
+
192
@classmethod
def get_backend(cls, pkg_metadata: str) -> PackagingBackend:
    """
    Resolve the packaging backend named in the package metadata.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package. Its "archive_format" entry selects the
        backend; "tgz" is used when the entry is missing.

    Returns
    -------
    PackagingBackend
        The backend that can be used to extract the package.
    """
    parsed_metadata = json.loads(pkg_metadata)
    archive_format = parsed_metadata.get("archive_format", "tgz")
    return PackagingBackend.get_backend(archive_format)
209
+
210
@classmethod
def get_extract_commands(
    cls, pkg_metadata: str, archive_path: str, dest_dir: str = "."
) -> List[str]:
    """
    Build the commands that extract the package into `dest_dir`.

    The result is a list of commands that can, for example, be passed to
    subprocess.run.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package to extract.
    archive_path : str
        Path to the archive to extract.
    dest_dir : str, default "."
        Directory to extract the package into.

    Returns
    -------
    List[str]
        The commands needed to extract the package into `dest_dir`.
    """
    archive_format = json.loads(pkg_metadata).get("archive_format", "tgz")
    # The backend for this format knows how to extract its own archives.
    extract_cmds = PackagingBackend.get_backend(archive_format).get_extract_commands(
        archive_path, dest_dir
    )
    debug.package_exec(
        f"Command to extract {archive_path} into {dest_dir}: {extract_cmds}"
    )
    return extract_cmds
239
+
240
@classmethod
def get_post_extract_env_vars(
    cls, pkg_metadata: str, dest_dir: str = "."
) -> Dict[str, str]:
    """
    Return the environment variables needed to use content extracted into
    `dest_dir` (this typically involves setting PYTHONPATH).

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package. Its "mfcontent_version" entry is used;
        0 is assumed when the entry is missing.
    dest_dir : str, default "."
        Directory the content has been extracted to.

    Returns
    -------
    Dict[str, str]
        The post-extract environment variables.
    """
    parsed_metadata = json.loads(pkg_metadata)
    content_version = parsed_metadata.get("mfcontent_version", 0)
    post_extract_vars = MetaflowCodeContent.get_post_extract_env_vars(
        content_version, dest_dir
    )
    debug.package_exec(
        f"Environment variables to access content extracted into {dest_dir}: {post_extract_vars}"
    )
    return post_extract_vars
270
+
271
@classmethod
def cls_get_content(
    cls, pkg_metadata, archive: BytesIO, name: str
) -> Optional[bytes]:
    """
    Read one member out of a package archive.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package; it selects the backend.
    archive : BytesIO
        The archive to read from.
    name : str
        Name of the member to read.

    Returns
    -------
    Optional[bytes]
        The content of the member if it exists, None otherwise.
    """
    archive_backend = cls.get_backend(pkg_metadata)
    with archive_backend.cls_open(archive) as handle:
        member_content = archive_backend.cls_get_member(handle, name)
        return member_content
295
+
296
@classmethod
def cls_get_info(cls, pkg_metadata, archive: BytesIO) -> Optional[Dict[str, str]]:
    """
    Read the package info out of a package archive.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package; it selects the backend.
    archive : BytesIO
        The archive to read the info from.

    Returns
    -------
    Optional[Dict[str, str]]
        The info of the package if it exists, None otherwise.
    """
    archive_backend = cls.get_backend(pkg_metadata)
    with archive_backend.cls_open(archive) as handle:
        package_info = MetaflowCodeContent.get_archive_info(handle, archive_backend)
        return package_info
314
+
315
@classmethod
def cls_get_config(
    cls, pkg_metadata: str, archive: BytesIO
) -> Optional[Dict[str, str]]:
    """
    Read the package config out of a package archive.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package; it selects the backend.
    archive : BytesIO
        The archive to read the config from.

    Returns
    -------
    Optional[Dict[str, str]]
        The config of the package if it exists, None otherwise.
    """
    archive_backend = cls.get_backend(pkg_metadata)
    with archive_backend.cls_open(archive) as handle:
        package_config = MetaflowCodeContent.get_archive_config(
            handle, archive_backend
        )
        return package_config
337
+
338
@classmethod
def cls_extract_into(
    cls,
    pkg_metadata: str,
    archive: BytesIO,
    dest_dir: str = ".",
    content_types: int = ContentType.ALL_CONTENT.value,
):
    """
    Extract a package archive into a directory.

    Parameters
    ----------
    pkg_metadata : str
        JSON metadata of the package; it selects the backend.
    archive : BytesIO
        The archive to extract.
    dest_dir : str, default "."
        Directory to extract the package into.
    content_types : int, default ALL_CONTENT
        Bitmask of ContentType values selecting what to extract.
    """
    archive_backend = cls.get_backend(pkg_metadata)
    with archive_backend.cls_open(archive) as handle:
        # Ask the content layer which members match the requested types,
        # then let the backend extract exactly those.
        selected_members = MetaflowCodeContent.get_archive_content_members(
            handle, content_types, archive_backend
        )
        archive_backend.cls_extract_members(handle, selected_members, dest_dir)
366
+
367
+ def user_tuples(self, timeout: Optional[float] = None):
368
+ # Wait for at least the blob to be formed
369
+ _ = self.blob_with_timeout(timeout=timeout)
370
+ for path, arcname in self._cached_user_members:
371
+ yield path, arcname
372
+
373
+ def path_tuples(self, timeout: Optional[float] = None):
374
+ # Wait for at least the blob to be formed
375
+ _ = self.blob_with_timeout(timeout=timeout)
376
+ # Files included in the environment
377
+ yield from self._mfcontent.content_names()
378
+
379
+ # Files included in the user code
380
+ yield from self.user_tuples()
381
+
382
+ def show(self, timeout: Optional[float] = None) -> str:
383
+ # Human-readable content of the package
384
+ blob = self.blob_with_timeout(timeout=timeout) # Ensure the package is created
385
+ lines = [
386
+ f"Package size: {self._format_size(len(blob))}",
387
+ f"Number of files: {sum(1 for _ in self.path_tuples())}",
388
+ self._mfcontent.show(),
389
+ ]
390
+
391
+ if self._flow:
392
+ lines.append(f"\nUser code in flow {self._name}:")
393
+ lines.append(f" - Packaged from directory {self._user_flow_dir}")
394
+ if self._filter_type != "no filter":
395
+ if self._suffixes:
396
+ lines.append(
397
+ f" - Filtered by suffixes: {', '.join(self._suffixes)}"
398
+ )
399
+ else:
400
+ lines.append(f" - Filtered by {self._filter_type}")
401
+ else:
402
+ lines.append(" - No user code filter applied")
403
+ if self._exclude_tl_dirs:
404
+ lines.append(
405
+ f" - Excluded directories: {', '.join(self._exclude_tl_dirs)}"
406
+ )
407
+ return "\n".join(lines)
408
+
409
def get_content(
    self, name: str, content_type: ContentType, timeout: Optional[float] = None
) -> Optional[bytes]:
    """
    Method to get the content of a file within the package. This method
    should be used for one-off access to small-ish files. If more files are
    needed, use extract_into to extract the package into a directory and
    then access the files from there.

    Parameters
    ----------
    name : str
        The name of the file to get the content of. Note that this
        is not necessarily the name in the archive but is the name
        that was passed in when creating the archive (in the archive,
        it may be prefixed by some directory structure).
    content_type : ContentType
        The type of file to get the content of.
    timeout : float, optional, default None
        Maximum time to wait for the package to be created.

    Returns
    -------
    Optional[bytes]
        The content of the file. If the file is not found, None is returned.

    Raises
    ------
    ValueError
        If `content_type` is not one of the supported content types.
    """
    # Wait for at least the blob to be formed
    _ = self.blob_with_timeout(timeout=timeout)
    if content_type == ContentType.USER_CONTENT:
        for path, arcname in self.user_tuples():
            if name == arcname:
                # Use a context manager so the file handle is closed promptly.
                with open(path, "rb") as f:
                    return f.read()
        return None
    elif content_type in (
        ContentType.CODE_CONTENT,
        ContentType.MODULE_CONTENT,
        ContentType.OTHER_CONTENT,
    ):
        mangled_name = self._mfcontent.get_archive_filename(name, content_type)
        for path_or_bytes, arcname in self._mfcontent.contents(content_type):
            if mangled_name == arcname:
                if isinstance(path_or_bytes, bytes):
                    # In case this is generated content like an INFO file
                    return path_or_bytes
                # Otherwise, it is a file path
                with open(path_or_bytes, "rb") as f:
                    return f.read()
        return None
    raise ValueError(f"Unknown content type: {content_type}")
455
+
456
def extract_into(
    self,
    dest_dir: str = ".",
    content_types: int = ContentType.ALL_CONTENT.value,
    timeout: Optional[float] = None,
):
    """
    Materialize the package (or a subset of it) under a directory.

    Files are symlinked to their original location; generated content
    (bytes) is written out as regular files. Destinations that already
    exist are left untouched.

    Parameters
    ----------
    dest_dir : str, default "."
        Directory to extract the package into.
    content_types : int, default ALL_CONTENT
        Bitmask of ContentType values selecting what to extract.
    timeout : float, optional, default None
        Maximum time to wait for the package to be created.
    """
    self.blob_with_timeout(timeout=timeout)  # Ensure the package is created

    user_mask = ContentType.USER_CONTENT.value
    code_or_module_mask = (
        ContentType.CODE_CONTENT.value | ContentType.MODULE_CONTENT.value
    )
    other_mask = ContentType.OTHER_CONTENT.value

    # Collect all (source, destination) pairs before touching the filesystem.
    link_pairs = []
    if content_types & user_mask:
        user_pairs = [
            (entry[0], os.path.join(dest_dir, entry[1]))
            for entry in self.user_tuples()
        ]
        link_pairs.extend(user_pairs)
    if content_types & code_or_module_mask:
        # We need the names of the files in the content archive to extract
        requested = content_types & ~other_mask
        content_pairs = [
            (entry[0], os.path.join(dest_dir, entry[1]))
            for entry in self._mfcontent.content_names(requested)
        ]
        link_pairs.extend(content_pairs)

    for source_path, target_path in link_pairs:
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        # TODO: In case there are duplicate files -- that should not be the case
        # but there is a bug currently with internal Netflix code.
        if os.path.exists(target_path):
            continue
        os.symlink(source_path, target_path)
        # Could copy files as well if we want to split them out.
        # shutil.copy(source_path, target_path)

    # OTHER_CONTENT requires special handling because sometimes the entry is
    # not a file but generated content
    other_pairs = []
    if content_types & other_mask:
        other_pairs = [
            (entry[0], os.path.join(dest_dir, entry[1]))
            for entry in self._mfcontent.contents(ContentType.OTHER_CONTENT)
        ]
    for path_or_content, target_path in other_pairs:
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        if os.path.exists(target_path):
            continue
        if isinstance(path_or_content, bytes):
            with open(target_path, "wb") as out_file:
                out_file.write(path_or_content)
        else:
            os.symlink(path_or_content, target_path)
516
+
517
+ @staticmethod
518
+ def _format_size(size_in_bytes):
519
+ for unit in ["B", "KB", "MB", "GB", "TB"]:
520
+ if size_in_bytes < 1024.0:
521
+ return f"{size_in_bytes:.2f} {unit}"
522
+ size_in_bytes /= 1024.0
523
+ return f"{size_in_bytes:.2f} PB"
524
+
525
+ def _package_and_upload(self):
526
+ try:
527
+ # Can be called without a flow (Function)
528
+ if self._flow:
529
+ for step in self._flow:
530
+ for deco in step.decorators:
531
+ deco.package_init(self._flow, step.__name__, self._environment)
532
+ self._name = f"flow {self._flow.name}"
533
+ else:
534
+ self._name = "<generic code package>"
535
+
536
+ # Add metacontent
537
+ self._mfcontent.add_info(
538
+ self._environment.get_environment_info(include_ext_info=True)
539
+ )
540
+
541
+ self._mfcontent.add_config(dump_config_values(self._flow))
542
+
543
+ # Add user files (from decorators and environment)
544
+ if self._flow:
545
+ self._add_addl_files()
546
+ self._cached_user_members = list(self._user_code_tuples())
547
+ debug.package_exec(
548
+ f"User files to package: {self._cached_user_members}"
549
+ )
550
+
551
+ self._blob = self._make()
552
+ if self._flow_datastore:
553
+ if len(self._blob) > 100 * 1024 * 1024:
554
+ self._echo(
555
+ f"Warning: The code package for {self._flow.name} is larger than "
556
+ f"100MB (found it to be {self._format_size(len(self._blob))}) "
557
+ "This may lead to slower upload times for remote runs and no "
558
+ "uploads for local runs. Consider reducing the package size. "
559
+ "Use `<myflow.py> package info` or `<myflow.py> package list` "
560
+ "to get more information about what is included in the package."
561
+ )
562
+ self._blob_url, self._blob_sha = self._flow_datastore.save_data(
563
+ [self._blob], len_hint=1
564
+ )[0]
565
+ else:
566
+ self._blob_url = self._blob_sha = ""
567
+ self._is_package_available = True
568
+ except Exception as e:
569
+ self._packaging_exception = e
570
+ self._echo(f"Package creation/upload failed for {self._flow.name}: {e}")
571
+ self._is_package_available = False
572
+
573
def _add_addl_files(self):
    """
    Register the extra files and modules that decorators and the environment
    request through their `add_to_package` hooks.

    Raises
    ------
    NonUniqueFileNameToFilePathMappingException
        If one file name is requested with two different file paths.
    ValueError
        If an entry carries an unknown content type.
    """
    # file name -> file path, as first registered
    seen_file_paths = {}
    seen_modules = set()
    file_types = (ContentType.OTHER_CONTENT, ContentType.CODE_CONTENT)

    def _normalize(entry):
        # Two-element entries are treated as code files.
        if len(entry) == 2:
            entry = (entry[0], entry[1], ContentType.CODE_CONTENT)
        file_path, file_name, file_type = entry

        if file_type == ContentType.MODULE_CONTENT:
            if file_path in seen_modules:
                return None  # Module was already added -- we don't add twice
            seen_modules.add(file_path)
            return entry

        if file_type not in file_types:
            raise ValueError(f"Unknown file type: {file_type}")

        # These are files. Many steps can import the same packages, so the
        # same name may legitimately show up again with the same path.
        # NOTE(review): the returned entry carries the realpath, while the
        # uniqueness check compares the path as given -- confirm intended.
        entry = (os.path.realpath(entry[0]), entry[1], file_type)
        if file_name not in seen_file_paths:
            seen_file_paths[file_name] = file_path
        elif seen_file_paths[file_name] != file_path:
            raise NonUniqueFileNameToFilePathMappingException(
                file_name, [seen_file_paths[file_name], file_path]
            )
        return entry

    def _register(entry):
        file_path, file_name, file_type = entry
        if file_type == ContentType.MODULE_CONTENT:
            # file_path is actually a module
            self._mfcontent.add_module(cast(ModuleType, file_path))
        elif file_type == ContentType.CODE_CONTENT:
            self._mfcontent.add_code_file(file_path, file_name)
        elif file_type == ContentType.OTHER_CONTENT:
            self._mfcontent.add_other_file(file_path, file_name)

    def _ingest(entries):
        for raw_entry in entries:
            checked = _normalize(raw_entry)
            if checked is not None:
                _register(checked)

    # Look at all decorators that provide additional files
    for step in self._flow:
        for deco in step.decorators:
            _ingest(deco.add_to_package())

    # the package folders for environment
    _ingest(self._environment.add_to_package())
632
+
633
def _user_code_tuples(self):
    """
    Yield the user-code entries to package and record the directory they
    come from in `_user_flow_dir`.

    For R, this walks the R working directory and then yields the R package
    paths. Otherwise it walks the directory containing `sys.argv[0]`,
    skipping the excluded top-level directories.
    """
    if not R.use_r():
        # the user's working directory
        script_path = os.path.abspath(sys.argv[0])
        flowdir = os.path.dirname(script_path) + "/"
        self._user_flow_dir = flowdir
        python_entries = walk(
            flowdir,
            file_filter=self._user_code_filter,
            exclude_tl_dirs=self._exclude_tl_dirs,
        )
        for entry in python_entries:
            # TODO: This is where we will check if the file is already included
            # in the mfcontent portion
            yield entry
        return

    # the R working directory
    self._user_flow_dir = R.working_dir()
    r_entries = walk("%s/" % R.working_dir(), file_filter=self._user_code_filter)
    for entry in r_entries:
        yield entry
    # the R package
    for entry in R.package_paths():
        yield entry
656
+
657
+ def _make(self):
658
+ backend = self._backend()
659
+ with backend.create() as archive:
660
+ # Package the environment
661
+ for path_or_bytes, arcname in self._mfcontent.contents():
662
+ if isinstance(path_or_bytes, str):
663
+ archive.add_file(path_or_bytes, arcname=arcname)
664
+ else:
665
+ archive.add_data(BytesIO(path_or_bytes), arcname=arcname)
666
+
667
+ # Package the user code
668
+ for path, arcname in self._cached_user_members:
669
+ archive.add_file(path, arcname=arcname)
670
+ return backend.get_blob()
671
+
672
+ def __str__(self):
673
+ return f"<code package for {self._name} (created @ {self._create_time})>"