flyte 2.0.0b32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of flyte might be problematic.

Files changed (204)
  1. flyte/__init__.py +108 -0
  2. flyte/_bin/__init__.py +0 -0
  3. flyte/_bin/debug.py +38 -0
  4. flyte/_bin/runtime.py +195 -0
  5. flyte/_bin/serve.py +178 -0
  6. flyte/_build.py +26 -0
  7. flyte/_cache/__init__.py +12 -0
  8. flyte/_cache/cache.py +147 -0
  9. flyte/_cache/defaults.py +9 -0
  10. flyte/_cache/local_cache.py +216 -0
  11. flyte/_cache/policy_function_body.py +42 -0
  12. flyte/_code_bundle/__init__.py +8 -0
  13. flyte/_code_bundle/_ignore.py +121 -0
  14. flyte/_code_bundle/_packaging.py +218 -0
  15. flyte/_code_bundle/_utils.py +347 -0
  16. flyte/_code_bundle/bundle.py +266 -0
  17. flyte/_constants.py +1 -0
  18. flyte/_context.py +155 -0
  19. flyte/_custom_context.py +73 -0
  20. flyte/_debug/__init__.py +0 -0
  21. flyte/_debug/constants.py +38 -0
  22. flyte/_debug/utils.py +17 -0
  23. flyte/_debug/vscode.py +307 -0
  24. flyte/_deploy.py +408 -0
  25. flyte/_deployer.py +109 -0
  26. flyte/_doc.py +29 -0
  27. flyte/_docstring.py +32 -0
  28. flyte/_environment.py +122 -0
  29. flyte/_excepthook.py +37 -0
  30. flyte/_group.py +32 -0
  31. flyte/_hash.py +8 -0
  32. flyte/_image.py +1055 -0
  33. flyte/_initialize.py +628 -0
  34. flyte/_interface.py +119 -0
  35. flyte/_internal/__init__.py +3 -0
  36. flyte/_internal/controllers/__init__.py +129 -0
  37. flyte/_internal/controllers/_local_controller.py +239 -0
  38. flyte/_internal/controllers/_trace.py +48 -0
  39. flyte/_internal/controllers/remote/__init__.py +58 -0
  40. flyte/_internal/controllers/remote/_action.py +211 -0
  41. flyte/_internal/controllers/remote/_client.py +47 -0
  42. flyte/_internal/controllers/remote/_controller.py +583 -0
  43. flyte/_internal/controllers/remote/_core.py +465 -0
  44. flyte/_internal/controllers/remote/_informer.py +381 -0
  45. flyte/_internal/controllers/remote/_service_protocol.py +50 -0
  46. flyte/_internal/imagebuild/__init__.py +3 -0
  47. flyte/_internal/imagebuild/docker_builder.py +706 -0
  48. flyte/_internal/imagebuild/image_builder.py +277 -0
  49. flyte/_internal/imagebuild/remote_builder.py +386 -0
  50. flyte/_internal/imagebuild/utils.py +78 -0
  51. flyte/_internal/resolvers/__init__.py +0 -0
  52. flyte/_internal/resolvers/_task_module.py +21 -0
  53. flyte/_internal/resolvers/common.py +31 -0
  54. flyte/_internal/resolvers/default.py +28 -0
  55. flyte/_internal/runtime/__init__.py +0 -0
  56. flyte/_internal/runtime/convert.py +486 -0
  57. flyte/_internal/runtime/entrypoints.py +204 -0
  58. flyte/_internal/runtime/io.py +188 -0
  59. flyte/_internal/runtime/resources_serde.py +152 -0
  60. flyte/_internal/runtime/reuse.py +125 -0
  61. flyte/_internal/runtime/rusty.py +193 -0
  62. flyte/_internal/runtime/task_serde.py +362 -0
  63. flyte/_internal/runtime/taskrunner.py +209 -0
  64. flyte/_internal/runtime/trigger_serde.py +160 -0
  65. flyte/_internal/runtime/types_serde.py +54 -0
  66. flyte/_keyring/__init__.py +0 -0
  67. flyte/_keyring/file.py +115 -0
  68. flyte/_logging.py +300 -0
  69. flyte/_map.py +312 -0
  70. flyte/_module.py +72 -0
  71. flyte/_pod.py +30 -0
  72. flyte/_resources.py +473 -0
  73. flyte/_retry.py +32 -0
  74. flyte/_reusable_environment.py +102 -0
  75. flyte/_run.py +724 -0
  76. flyte/_secret.py +96 -0
  77. flyte/_task.py +550 -0
  78. flyte/_task_environment.py +316 -0
  79. flyte/_task_plugins.py +47 -0
  80. flyte/_timeout.py +47 -0
  81. flyte/_tools.py +27 -0
  82. flyte/_trace.py +119 -0
  83. flyte/_trigger.py +1000 -0
  84. flyte/_utils/__init__.py +30 -0
  85. flyte/_utils/asyn.py +121 -0
  86. flyte/_utils/async_cache.py +139 -0
  87. flyte/_utils/coro_management.py +27 -0
  88. flyte/_utils/docker_credentials.py +173 -0
  89. flyte/_utils/file_handling.py +72 -0
  90. flyte/_utils/helpers.py +134 -0
  91. flyte/_utils/lazy_module.py +54 -0
  92. flyte/_utils/module_loader.py +104 -0
  93. flyte/_utils/org_discovery.py +57 -0
  94. flyte/_utils/uv_script_parser.py +49 -0
  95. flyte/_version.py +34 -0
  96. flyte/app/__init__.py +22 -0
  97. flyte/app/_app_environment.py +157 -0
  98. flyte/app/_deploy.py +125 -0
  99. flyte/app/_input.py +160 -0
  100. flyte/app/_runtime/__init__.py +3 -0
  101. flyte/app/_runtime/app_serde.py +347 -0
  102. flyte/app/_types.py +101 -0
  103. flyte/app/extras/__init__.py +3 -0
  104. flyte/app/extras/_fastapi.py +151 -0
  105. flyte/cli/__init__.py +12 -0
  106. flyte/cli/_abort.py +28 -0
  107. flyte/cli/_build.py +114 -0
  108. flyte/cli/_common.py +468 -0
  109. flyte/cli/_create.py +371 -0
  110. flyte/cli/_delete.py +45 -0
  111. flyte/cli/_deploy.py +293 -0
  112. flyte/cli/_gen.py +176 -0
  113. flyte/cli/_get.py +370 -0
  114. flyte/cli/_option.py +33 -0
  115. flyte/cli/_params.py +554 -0
  116. flyte/cli/_plugins.py +209 -0
  117. flyte/cli/_run.py +597 -0
  118. flyte/cli/_serve.py +64 -0
  119. flyte/cli/_update.py +37 -0
  120. flyte/cli/_user.py +17 -0
  121. flyte/cli/main.py +221 -0
  122. flyte/config/__init__.py +3 -0
  123. flyte/config/_config.py +248 -0
  124. flyte/config/_internal.py +73 -0
  125. flyte/config/_reader.py +225 -0
  126. flyte/connectors/__init__.py +11 -0
  127. flyte/connectors/_connector.py +270 -0
  128. flyte/connectors/_server.py +197 -0
  129. flyte/connectors/utils.py +135 -0
  130. flyte/errors.py +243 -0
  131. flyte/extend.py +19 -0
  132. flyte/extras/__init__.py +5 -0
  133. flyte/extras/_container.py +286 -0
  134. flyte/git/__init__.py +3 -0
  135. flyte/git/_config.py +21 -0
  136. flyte/io/__init__.py +29 -0
  137. flyte/io/_dataframe/__init__.py +131 -0
  138. flyte/io/_dataframe/basic_dfs.py +223 -0
  139. flyte/io/_dataframe/dataframe.py +1026 -0
  140. flyte/io/_dir.py +910 -0
  141. flyte/io/_file.py +914 -0
  142. flyte/io/_hashing_io.py +342 -0
  143. flyte/models.py +479 -0
  144. flyte/py.typed +0 -0
  145. flyte/remote/__init__.py +35 -0
  146. flyte/remote/_action.py +738 -0
  147. flyte/remote/_app.py +57 -0
  148. flyte/remote/_client/__init__.py +0 -0
  149. flyte/remote/_client/_protocols.py +189 -0
  150. flyte/remote/_client/auth/__init__.py +12 -0
  151. flyte/remote/_client/auth/_auth_utils.py +14 -0
  152. flyte/remote/_client/auth/_authenticators/__init__.py +0 -0
  153. flyte/remote/_client/auth/_authenticators/base.py +403 -0
  154. flyte/remote/_client/auth/_authenticators/client_credentials.py +73 -0
  155. flyte/remote/_client/auth/_authenticators/device_code.py +117 -0
  156. flyte/remote/_client/auth/_authenticators/external_command.py +79 -0
  157. flyte/remote/_client/auth/_authenticators/factory.py +200 -0
  158. flyte/remote/_client/auth/_authenticators/pkce.py +516 -0
  159. flyte/remote/_client/auth/_channel.py +213 -0
  160. flyte/remote/_client/auth/_client_config.py +85 -0
  161. flyte/remote/_client/auth/_default_html.py +32 -0
  162. flyte/remote/_client/auth/_grpc_utils/__init__.py +0 -0
  163. flyte/remote/_client/auth/_grpc_utils/auth_interceptor.py +288 -0
  164. flyte/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +151 -0
  165. flyte/remote/_client/auth/_keyring.py +152 -0
  166. flyte/remote/_client/auth/_token_client.py +260 -0
  167. flyte/remote/_client/auth/errors.py +16 -0
  168. flyte/remote/_client/controlplane.py +128 -0
  169. flyte/remote/_common.py +30 -0
  170. flyte/remote/_console.py +19 -0
  171. flyte/remote/_data.py +161 -0
  172. flyte/remote/_logs.py +185 -0
  173. flyte/remote/_project.py +88 -0
  174. flyte/remote/_run.py +386 -0
  175. flyte/remote/_secret.py +142 -0
  176. flyte/remote/_task.py +527 -0
  177. flyte/remote/_trigger.py +306 -0
  178. flyte/remote/_user.py +33 -0
  179. flyte/report/__init__.py +3 -0
  180. flyte/report/_report.py +182 -0
  181. flyte/report/_template.html +124 -0
  182. flyte/storage/__init__.py +36 -0
  183. flyte/storage/_config.py +237 -0
  184. flyte/storage/_parallel_reader.py +274 -0
  185. flyte/storage/_remote_fs.py +34 -0
  186. flyte/storage/_storage.py +456 -0
  187. flyte/storage/_utils.py +5 -0
  188. flyte/syncify/__init__.py +56 -0
  189. flyte/syncify/_api.py +375 -0
  190. flyte/types/__init__.py +52 -0
  191. flyte/types/_interface.py +40 -0
  192. flyte/types/_pickle.py +145 -0
  193. flyte/types/_renderer.py +162 -0
  194. flyte/types/_string_literals.py +119 -0
  195. flyte/types/_type_engine.py +2254 -0
  196. flyte/types/_utils.py +80 -0
  197. flyte-2.0.0b32.data/scripts/debug.py +38 -0
  198. flyte-2.0.0b32.data/scripts/runtime.py +195 -0
  199. flyte-2.0.0b32.dist-info/METADATA +351 -0
  200. flyte-2.0.0b32.dist-info/RECORD +204 -0
  201. flyte-2.0.0b32.dist-info/WHEEL +5 -0
  202. flyte-2.0.0b32.dist-info/entry_points.txt +7 -0
  203. flyte-2.0.0b32.dist-info/licenses/LICENSE +201 -0
  204. flyte-2.0.0b32.dist-info/top_level.txt +1 -0
flyte/extras/_container.py ADDED
@@ -0,0 +1,286 @@
+ import os
+ import pathlib
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
+
+ from flyteidl2.core import tasks_pb2
+
+ from flyte import Image, storage
+ from flyte._logging import logger
+ from flyte._task import TaskTemplate
+ from flyte.io import Dir, File
+ from flyte.models import NativeInterface, SerializationContext
+
+
+ def _extract_command_key(cmd: str, **kwargs) -> List[Any] | None:
+     """
+     Extract the key from the command using regex.
+     """
+     import re
+
+     input_regex = r"\{\{\.inputs\.([a-zA-Z0-9_]+)\}\}"
+     return re.findall(input_regex, cmd)
+
+
+ def _extract_path_command_key(cmd: str, input_data_dir: Optional[str]) -> Optional[str]:
+     """
+     Extract the key from the path-like command using regex.
+     """
+     import re
+
+     input_data_dir = input_data_dir or ""
+     input_regex = rf"{re.escape(input_data_dir)}/([\w\-.]+)"  # captures file or dir names
+
+     match = re.search(input_regex, cmd)
+     if match:
+         return match.group(1)
+     return None
+
+
+ class ContainerTask(TaskTemplate):
+     """
+     This is an intermediate class that represents Flyte Tasks that run a container at execution time. This is the
+     vast majority of tasks - the typical ``@task``-decorated tasks, for instance, all run a container. An example of
+     something that doesn't run a container would be something like the Athena SQL task.
+
+     :param name: Name of the task.
+     :param image: The container image to use for the task. This can be a string or an Image object.
+     :param command: The command to run in the container. This is a list of strings.
+     :param inputs: The inputs to the task. This is a dictionary of input names to types.
+     :param arguments: The arguments to pass to the command. This is a list of strings.
+     :param outputs: The outputs of the task. This is a dictionary of output names to types.
+     :param input_data_dir: The directory where the input data is stored. This is a string or a Path object.
+     :param output_data_dir: The directory where the output data is stored. This is a string or a Path object.
+     :param metadata_format: The format of the output file. This can be "JSON", "YAML", or "PROTO".
+     :param local_logs: If True, logs will be printed to the console during local execution.
+     """
+
+     MetadataFormat = Literal["JSON", "YAML", "PROTO"]
+
+     def __init__(
+         self,
+         name: str,
+         image: Union[str, Image],
+         command: List[str],
+         inputs: Optional[Dict[str, Type]] = None,
+         arguments: Optional[List[str]] = None,
+         outputs: Optional[Dict[str, Type]] = None,
+         input_data_dir: str | pathlib.Path = "/var/inputs",
+         output_data_dir: str | pathlib.Path = "/var/outputs",
+         metadata_format: MetadataFormat = "JSON",
+         local_logs: bool = True,
+         **kwargs,
+     ):
+         super().__init__(
+             task_type="raw-container",
+             name=name,
+             image=image,
+             interface=NativeInterface({k: (v, None) for k, v in inputs.items()} if inputs else {}, outputs or {}),
+             **kwargs,
+         )
+         self._image = image
+         if isinstance(image, str):
+             if image == "auto":
+                 self._image = Image.from_debian_base()
+             else:
+                 self._image = Image.from_base(image)
+
+         if command and any(not isinstance(c, str) for c in command):
+             raise ValueError("All elements in the command list must be strings.")
+         if arguments and any(not isinstance(a, str) for a in arguments):
+             raise ValueError("All elements in the arguments list must be strings.")
+         self._cmd = command
+         self._args = arguments
+         self._input_data_dir = input_data_dir
+         if isinstance(input_data_dir, str):
+             self._input_data_dir = pathlib.Path(input_data_dir)
+         self._output_data_dir = output_data_dir
+         if isinstance(output_data_dir, str):
+             self._output_data_dir = pathlib.Path(output_data_dir)
+         self._metadata_format = metadata_format
+         self._inputs = inputs
+         self._outputs = outputs
+         self.local_logs = local_logs
+
+     def _render_command_and_volume_binding(self, cmd: str, **kwargs) -> Tuple[str, Dict[str, Dict[str, str]]]:
+         """
+         We support template-style references to inputs, e.g., "{{.inputs.infile}}".
+
+         For File and Dir commands, e.g., "/var/inputs/infile", we extract the key from strings that
+         begin with the specified `input_data_dir`.
+         """
+         from flyte.io import Dir, File
+
+         volume_binding: Dict[str, Dict[str, str]] = {}
+         path_k = _extract_path_command_key(cmd, str(self._input_data_dir))
+         keys = [path_k] if path_k else _extract_command_key(cmd)
+
+         command = cmd
+
+         if keys:
+             for k in keys:
+                 input_val = kwargs.get(k)
+                 # TODO: Add support for file and directory transformers first
+                 if input_val and type(input_val) in [File, Dir]:
+                     if not path_k:
+                         raise AssertionError(
+                             "File and Directory commands should not use the template syntax "
+                             "like this: {{.inputs.infile}}\n"
+                             "Please use a path-like syntax, such as: /var/inputs/infile.\n"
+                             "This requirement is due to how Flyte Propeller processes template syntax inputs."
+                         )
+                     local_flyte_file_or_dir_path = input_val.path
+                     remote_flyte_file_or_dir_path = os.path.join(self._input_data_dir, k)  # type: ignore
+                     volume_binding[local_flyte_file_or_dir_path] = {
+                         "bind": remote_flyte_file_or_dir_path,
+                         "mode": "rw",
+                     }
+                 else:
+                     command = command.replace(f"{{{{.inputs.{k}}}}}", str(input_val))
+         else:
+             command = cmd
+
+         return command, volume_binding
+
+     def _prepare_command_and_volumes(
+         self, cmd_and_args: List[str], **kwargs
+     ) -> Tuple[List[str], Dict[str, Dict[str, str]]]:
+         """
+         Prepares the command and volume bindings for the container based on input arguments and command templates.
+
+         Parameters:
+         - cmd_and_args (List[str]): The command and arguments to prepare.
+         - **kwargs: Keyword arguments representing task inputs.
+
+         Returns:
+         - Tuple[List[str], Dict[str, Dict[str, str]]]: A tuple containing the prepared commands and volume bindings.
+         """
+
+         commands = []
+         volume_bindings = {}
+
+         for cmd in cmd_and_args:
+             command, volume_binding = self._render_command_and_volume_binding(cmd, **kwargs)
+             commands.append(command)
+             volume_bindings.update(volume_binding)
+
+         return commands, volume_bindings
+
+     def _pull_image_if_not_exists(self, client, image: str):
+         try:
+             if not client.images.list(filters={"reference": image}):
+                 logger.info(f"Pulling image: {image} for container task: {self.name}")
+                 client.images.pull(image)
+         except Exception as e:
+             logger.error(f"Failed to pull image {image}: {e!s}")
+             raise
+
+     def _string_to_timedelta(self, s: str):
+         import datetime
+         import re
+
+         regex = r"(?:(\d+) days?, )?(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?"
+         parts = re.match(regex, s)
+         if not parts:
+             raise ValueError("Invalid timedelta string format")
+
+         days = int(parts.group(1)) if parts.group(1) else 0
+         hours = int(parts.group(2)) if parts.group(2) else 0
+         minutes = int(parts.group(3)) if parts.group(3) else 0
+         seconds = int(parts.group(4)) if parts.group(4) else 0
+         microseconds = int(parts.group(5)) if parts.group(5) else 0
+
+         return datetime.timedelta(
+             days=days,
+             hours=hours,
+             minutes=minutes,
+             seconds=seconds,
+             microseconds=microseconds,
+         )
+
+     async def _convert_output_val_to_correct_type(
+         self, output_path: pathlib.Path, output_val: Any, output_type: Type
+     ) -> Any:
+         import datetime
+
+         if issubclass(output_type, bool):
+             return output_val.lower() != "false"
+         elif issubclass(output_type, datetime.datetime):
+             return datetime.datetime.fromisoformat(output_val)
+         elif issubclass(output_type, datetime.timedelta):
+             return self._string_to_timedelta(output_val)
+         elif issubclass(output_type, File):
+             return await File.from_local(output_path)
+         elif issubclass(output_type, Dir):
+             return await Dir.from_local(output_path)
+         else:
+             return output_type(output_val)
+
+     async def _get_output(self, output_directory: pathlib.Path) -> Tuple[Any]:
+         output_items = []
+         if self._outputs:
+             for k, output_type in self._outputs.items():
+                 output_path = output_directory / k
+                 if os.path.isfile(output_path):
+                     with output_path.open("r") as f:
+                         output_val = f.read()
+                 else:
+                     output_val = None
+                 parsed = await self._convert_output_val_to_correct_type(output_path, output_val, output_type)
+                 output_items.append(parsed)
+         # Return a tuple so that each element is treated as a separate output.
+         # This allows flyte to map the user-defined output types (dict) to individual values.
+         # If we returned a list instead, it would be treated as a single output.
+         return tuple(output_items)
+
+     async def execute(self, **kwargs) -> Any:
+         try:
+             import docker
+         except ImportError:
+             raise ImportError("Docker is not installed. Please install Docker by running `pip install docker`.")
+
+         # Normalize the input and output directories
+         self._input_data_dir = os.path.normpath(self._input_data_dir) if self._input_data_dir else ""
+         self._output_data_dir = os.path.normpath(self._output_data_dir) if self._output_data_dir else ""
+
+         output_directory = storage.get_random_local_directory()
+         cmd_and_args = (self._cmd or []) + (self._args or [])
+         commands, volume_bindings = self._prepare_command_and_volumes(cmd_and_args, **kwargs)
+         volume_bindings[str(output_directory)] = {"bind": self._output_data_dir, "mode": "rw"}
+
+         client = docker.from_env()
+         if isinstance(self._image, str):
+             raise AssertionError(f"Only Image objects are supported, not strings. Got {self._image} instead.")
+         uri = self._image.uri
+         self._pull_image_if_not_exists(client, uri)
+         print(f"Command: {commands!r}")
+
+         container = client.containers.run(uri, command=commands, remove=True, volumes=volume_bindings, detach=True)
+
+         # Wait for the container to finish the task.
+         # TODO: Add a 'timeout' parameter to control the max wait time for the container to finish the task.
+
+         if self.local_logs:
+             for log in container.logs(stream=True):
+                 print(f"[Local Container] {log.strip()!r}")
+
+         container.wait()
+
+         output = await self._get_output(output_directory)
+         return output
+
+     def data_loading_config(self, sctx: SerializationContext) -> tasks_pb2.DataLoadingConfig:
+         literal_to_protobuf = {
+             "JSON": tasks_pb2.DataLoadingConfig.JSON,
+             "YAML": tasks_pb2.DataLoadingConfig.YAML,
+             "PROTO": tasks_pb2.DataLoadingConfig.PROTO,
+         }
+
+         return tasks_pb2.DataLoadingConfig(
+             input_path=str(self._input_data_dir) if self._input_data_dir else None,
+             output_path=str(self._output_data_dir) if self._output_data_dir else None,
+             enabled=True,
+             format=literal_to_protobuf.get(self._metadata_format, "JSON"),
+         )
+
+     def container_args(self, sctx: SerializationContext) -> List[str]:
+         return self._cmd + (self._args if self._args else [])
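
As a usage sketch (not from the package itself): given the constructor above, a raw-container task could be declared as follows. The `flyte.extras` import path for `ContainerTask` is an assumption based on this file's location; outputs are read back from files named after each output key under `output_data_dir`, per `_get_output` above.

    # Hypothetical usage sketch; import path assumed from flyte/extras/_container.py.
    from flyte.extras import ContainerTask

    greet = ContainerTask(
        name="greet",
        image="alpine:3.19",  # plain strings are wrapped via Image.from_base; "auto" uses Image.from_debian_base
        inputs={"name": str},
        outputs={"greeting": str},
        command=[
            "/bin/sh",
            "-c",
            # "{{.inputs.name}}" is rendered by _render_command_and_volume_binding;
            # writing to /var/outputs/<key> is how _get_output collects results.
            "echo Hello {{.inputs.name}} > /var/outputs/greeting",
        ],
    )
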
flyte/git/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from ._config import config_from_root
+
+ __all__ = ["config_from_root"]
flyte/git/_config.py ADDED
@@ -0,0 +1,21 @@
+ import pathlib
+ import subprocess
+
+ import flyte.config
+
+
+ def config_from_root(path: pathlib.Path | str = ".flyte/config.yaml") -> flyte.config.Config | None:
+     """Get the config file from the git root directory.
+
+     By default, the config file is expected to be in `.flyte/config.yaml` in the git root directory.
+     """
+     try:
+         result = subprocess.run(["git", "rev-parse", "--show-toplevel"], check=False, capture_output=True, text=True)
+         if result.returncode != 0:
+             return None
+         root = pathlib.Path(result.stdout.strip())
+         if not (root / path).exists():
+             return None
+         return flyte.config.auto(root / path)
+     except Exception:
+         return None
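
A minimal usage sketch for `config_from_root` (names taken from the file above): it shells out to `git rev-parse --show-toplevel` and returns `None` when not inside a git checkout or when the config file is absent.

    import flyte.git

    cfg = flyte.git.config_from_root()  # defaults to <git root>/.flyte/config.yaml
    if cfg is None:
        print("no git root or no .flyte/config.yaml; fall back to other config discovery")
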
flyte/io/__init__.py ADDED
@@ -0,0 +1,29 @@
+ """
+ ## IO data types
+
+ This package contains additional data types beyond the primitive data types in Python to abstract data flow
+ of large datasets in Union.
+
+ """
+
+ __all__ = [
+     "PARQUET",
+     "DataFrame",
+     "DataFrameDecoder",
+     "DataFrameEncoder",
+     "DataFrameTransformerEngine",
+     "Dir",
+     "File",
+     "lazy_import_dataframe_handler",
+ ]
+
+ from ._dataframe import (
+     PARQUET,
+     DataFrame,
+     DataFrameDecoder,
+     DataFrameEncoder,
+     DataFrameTransformerEngine,
+     lazy_import_dataframe_handler,
+ )
+ from ._dir import Dir
+ from ._file import File
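
As quick orientation (a sketch, not package documentation): everything listed in `__all__` above is importable directly from `flyte.io`, e.g.:

    from flyte.io import PARQUET, DataFrame, Dir, File  # public re-exports per __all__ above
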
flyte/io/_dataframe/__init__.py ADDED
@@ -0,0 +1,131 @@
+ """
+ Flytekit DataFrame
+ ==========================================================
+ .. currentmodule:: flyte.io._dataframe
+
+ .. autosummary::
+    :template: custom.rst
+    :toctree: generated/
+
+    DataFrame
+    DataFrameDecoder
+    DataFrameEncoder
+ """
+
+ import functools
+
+ from flyte._logging import logger
+ from flyte._utils.lazy_module import is_imported
+
+ from .dataframe import (
+     PARQUET,
+     DataFrame,
+     DataFrameDecoder,
+     DataFrameEncoder,
+     DataFrameTransformerEngine,
+     DuplicateHandlerError,
+ )
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_csv_handlers():
+     from .basic_dfs import CSVToPandasDecodingHandler, PandasToCSVEncodingHandler
+
+     DataFrameTransformerEngine.register(PandasToCSVEncodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(CSVToPandasDecodingHandler(), default_format_for_type=True)
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_pandas_handlers():
+     import pandas as pd
+
+     from flyte.types._renderer import TopFrameRenderer
+
+     from .basic_dfs import PandasToParquetEncodingHandler, ParquetToPandasDecodingHandler
+
+     DataFrameTransformerEngine.register(PandasToParquetEncodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(ParquetToPandasDecodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register_renderer(pd.DataFrame, TopFrameRenderer())
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_arrow_handlers():
+     import pyarrow as pa
+
+     from flyte.types._renderer import ArrowRenderer
+
+     from .basic_dfs import ArrowToParquetEncodingHandler, ParquetToArrowDecodingHandler
+
+     DataFrameTransformerEngine.register(ArrowToParquetEncodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register(ParquetToArrowDecodingHandler(), default_format_for_type=True)
+     DataFrameTransformerEngine.register_renderer(pa.Table, ArrowRenderer())
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_bigquery_handlers():
+     try:
+         from .bigquery import (
+             ArrowToBQEncodingHandlers,
+             BQToArrowDecodingHandler,
+             BQToPandasDecodingHandler,
+             PandasToBQEncodingHandlers,
+         )
+
+         DataFrameTransformerEngine.register(PandasToBQEncodingHandlers())
+         DataFrameTransformerEngine.register(BQToPandasDecodingHandler())
+         DataFrameTransformerEngine.register(ArrowToBQEncodingHandlers())
+         DataFrameTransformerEngine.register(BQToArrowDecodingHandler())
+     except ImportError:
+         logger.info(
+             "We won't register bigquery handler for structured dataset because "
+             "we can't find the packages google-cloud-bigquery-storage and google-cloud-bigquery"
+         )
+
+
+ @functools.lru_cache(maxsize=None)
+ def register_snowflake_handlers():
+     try:
+         from .snowflake import PandasToSnowflakeEncodingHandlers, SnowflakeToPandasDecodingHandler
+
+         DataFrameTransformerEngine.register(SnowflakeToPandasDecodingHandler())
+         DataFrameTransformerEngine.register(PandasToSnowflakeEncodingHandlers())
+
+     except ImportError:
+         logger.info(
+             "We won't register snowflake handler for structured dataset because "
+             "we can't find package snowflake-connector-python"
+         )
+
+
+ def lazy_import_dataframe_handler():
+     if is_imported("pandas"):
+         try:
+             register_pandas_handlers()
+             register_csv_handlers()
+         except DuplicateHandlerError:
+             logger.debug("Transformer for pandas is already registered.")
+     if is_imported("pyarrow"):
+         try:
+             register_arrow_handlers()
+         except DuplicateHandlerError:
+             logger.debug("Transformer for arrow is already registered.")
+     if is_imported("google.cloud.bigquery"):
+         try:
+             register_bigquery_handlers()
+         except DuplicateHandlerError:
+             logger.debug("Transformer for bigquery is already registered.")
+     if is_imported("snowflake.connector"):
+         try:
+             register_snowflake_handlers()
+         except DuplicateHandlerError:
+             logger.debug("Transformer for snowflake is already registered.")
+
+
+ __all__ = [
+     "PARQUET",
+     "DataFrame",
+     "DataFrameDecoder",
+     "DataFrameEncoder",
+     "DataFrameTransformerEngine",
+     "lazy_import_dataframe_handler",
+ ]
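
A hedged sketch of how this lazy registration behaves: handlers are gated on `is_imported(...)`, so the corresponding library must already be imported by user code. Each `register_*` function is `lru_cache`d and `DuplicateHandlerError` is downgraded to a debug log, so repeated calls are safe.

    import pandas as pd  # importing pandas first is what makes is_imported("pandas") true

    from flyte.io import lazy_import_dataframe_handler

    lazy_import_dataframe_handler()  # registers pandas Parquet/CSV handlers (and pyarrow ones if imported)
    lazy_import_dataframe_handler()  # effectively a no-op: lru_cache + DuplicateHandlerError handling
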
flyte/io/_dataframe/basic_dfs.py ADDED
@@ -0,0 +1,223 @@
+ import os
+ import typing
+ from pathlib import Path
+ from typing import TypeVar
+
+ from flyteidl2.core import literals_pb2, types_pb2
+ from fsspec.core import split_protocol, strip_protocol
+
+ import flyte.storage as storage
+ from flyte._logging import logger
+ from flyte._utils import lazy_module
+ from flyte.io._dataframe.dataframe import (
+     CSV,
+     PARQUET,
+     DataFrame,
+     DataFrameDecoder,
+     DataFrameEncoder,
+ )
+
+ if typing.TYPE_CHECKING:
+     import pandas as pd
+     import pyarrow as pa
+ else:
+     pd = lazy_module("pandas")
+     pa = lazy_module("pyarrow")
+
+ T = TypeVar("T")
+
+
+ def get_pandas_storage_options(uri: str, anonymous: bool = False) -> typing.Optional[typing.Dict]:
+     from pandas.io.common import is_fsspec_url  # type: ignore
+
+     if is_fsspec_url(uri):
+         if uri.startswith("s3"):
+             return storage.get_configured_fsspec_kwargs("s3", anonymous=anonymous)
+         return {}
+
+     # Pandas does not allow storage_options for non-fsspec paths e.g. local.
+     return None
+
+
+ class PandasToCSVEncodingHandler(DataFrameEncoder):
+     def __init__(self):
+         super().__init__(pd.DataFrame, None, CSV)
+
+     async def encode(
+         self,
+         dataframe: DataFrame,
+         structured_dataset_type: types_pb2.StructuredDatasetType,
+     ) -> literals_pb2.StructuredDataset:
+         if not dataframe.uri:
+             from flyte._context import internal_ctx
+
+             ctx = internal_ctx()
+             uri = ctx.raw_data.get_random_remote_path()
+         else:
+             uri = typing.cast(str, dataframe.uri)
+
+         if not storage.is_remote(uri):
+             Path(uri).mkdir(parents=True, exist_ok=True)
+         csv_file = storage.join(uri, "data.csv")
+         df = typing.cast(pd.DataFrame, dataframe.val)
+         df.to_csv(
+             csv_file,
+             index=False,
+             storage_options=get_pandas_storage_options(uri=csv_file),
+         )
+         structured_dataset_type.format = CSV
+         return literals_pb2.StructuredDataset(
+             uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
+         )
+
+
+ class CSVToPandasDecodingHandler(DataFrameDecoder):
+     def __init__(self):
+         super().__init__(pd.DataFrame, None, CSV)
+
+     async def decode(
+         self,
+         proto_value: literals_pb2.StructuredDataset,
+         current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+     ) -> "pd.DataFrame":
+         uri = proto_value.uri
+         columns = None
+         kwargs = get_pandas_storage_options(uri=uri)
+         csv_file = storage.join(uri, "data.csv")
+         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+         try:
+             import io
+
+             # The pattern used here is a bit wonky because of obstore issues with csv, getting early eof error.
+             buf = io.BytesIO()
+             async for chunk in storage.get_stream(csv_file):
+                 buf.write(chunk)
+             buf.seek(0)
+             df = pd.read_csv(buf)
+             return df
+
+         except Exception as exc:
+             if exc.__class__.__name__ == "NoCredentialsError":
+                 logger.debug("S3 source detected, attempting anonymous S3 access")
+                 kwargs = get_pandas_storage_options(uri=uri, anonymous=True)
+                 return pd.read_csv(csv_file, usecols=columns, storage_options=kwargs)
+             else:
+                 raise
+
+
+ class PandasToParquetEncodingHandler(DataFrameEncoder):
+     def __init__(self):
+         super().__init__(pd.DataFrame, None, PARQUET)
+
+     async def encode(
+         self,
+         dataframe: DataFrame,
+         structured_dataset_type: types_pb2.StructuredDatasetType,
+     ) -> literals_pb2.StructuredDataset:
+         if not dataframe.uri:
+             from flyte._context import internal_ctx
+
+             ctx = internal_ctx()
+             uri = str(ctx.raw_data.get_random_remote_path())
+         else:
+             uri = typing.cast(str, dataframe.uri)
+
+         if not storage.is_remote(uri):
+             Path(uri).mkdir(parents=True, exist_ok=True)
+         path = os.path.join(uri, f"{0:05}")
+         df = typing.cast(pd.DataFrame, dataframe.val)
+         df.to_parquet(
+             path,
+             coerce_timestamps="us",
+             allow_truncated_timestamps=False,
+             storage_options=get_pandas_storage_options(uri=path),
+         )
+         structured_dataset_type.format = PARQUET
+         return literals_pb2.StructuredDataset(
+             uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type)
+         )
+
+
+ class ParquetToPandasDecodingHandler(DataFrameDecoder):
+     def __init__(self):
+         super().__init__(pd.DataFrame, None, PARQUET)
+
+     async def decode(
+         self,
+         flyte_value: literals_pb2.StructuredDataset,
+         current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+     ) -> "pd.DataFrame":
+         uri = flyte_value.uri
+         columns = None
+         kwargs = get_pandas_storage_options(uri=uri)
+         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+         try:
+             return pd.read_parquet(uri, columns=columns, storage_options=kwargs)
+         except Exception as exc:
+             if exc.__class__.__name__ == "NoCredentialsError":
+                 logger.debug("S3 source detected, attempting anonymous S3 access")
+                 kwargs = get_pandas_storage_options(uri=uri, anonymous=True)
+                 return pd.read_parquet(uri, columns=columns, storage_options=kwargs)
+             else:
+                 raise
+
+
+ class ArrowToParquetEncodingHandler(DataFrameEncoder):
+     def __init__(self):
+         super().__init__(pa.Table, None, PARQUET)
+
+     async def encode(
+         self,
+         dataframe: DataFrame,
+         dataframe_type: types_pb2.StructuredDatasetType,
+     ) -> literals_pb2.StructuredDataset:
+         import pyarrow.parquet as pq
+
+         if not dataframe.uri:
+             from flyte._context import internal_ctx
+
+             ctx = internal_ctx()
+             uri = ctx.raw_data.get_random_remote_path()
+         else:
+             uri = typing.cast(str, dataframe.uri)
+
+         if not storage.is_remote(uri):
+             Path(uri).mkdir(parents=True, exist_ok=True)
+         path = os.path.join(uri, f"{0:05}")
+         filesystem = storage.get_underlying_filesystem(path=path)
+         pq.write_table(dataframe.val, strip_protocol(path), filesystem=filesystem)
+         return literals_pb2.StructuredDataset(uri=uri, metadata=literals_pb2.StructuredDatasetMetadata(dataframe_type))
+
+
+ class ParquetToArrowDecodingHandler(DataFrameDecoder):
+     def __init__(self):
+         super().__init__(pa.Table, None, PARQUET)
+
+     async def decode(
+         self,
+         proto_value: literals_pb2.StructuredDataset,
+         current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+     ) -> "pa.Table":
+         import pyarrow.parquet as pq
+
+         uri = proto_value.uri
+         if not storage.is_remote(uri):
+             Path(uri).parent.mkdir(parents=True, exist_ok=True)
+         _, path = split_protocol(uri)
+
+         columns = None
+         if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+             columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+         try:
+             return pq.read_table(path, columns=columns)
+         except Exception as exc:
+             if exc.__class__.__name__ == "NoCredentialsError":
+                 logger.debug("S3 source detected, attempting anonymous S3 access")
+                 fs = storage.get_underlying_filesystem(path=uri, anonymous=True)
+                 if fs is not None:
+                     return pq.read_table(path, filesystem=fs, columns=columns)
+                 return None
+             else:
+                 raise
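
To illustrate `get_pandas_storage_options` above (a sketch; `gs://` here is just an arbitrary non-s3 fsspec URL): it returns configured s3 kwargs only for s3 URIs, an empty dict for other fsspec URLs, and `None` for local paths, since pandas rejects `storage_options` for non-fsspec paths.

    get_pandas_storage_options("s3://bucket/data")       # -> storage.get_configured_fsspec_kwargs("s3", ...)
    get_pandas_storage_options("gs://bucket/data")       # -> {}
    get_pandas_storage_options("/tmp/data.parquet")      # -> None (local, non-fsspec path)
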