flyte 2.0.0b22__py3-none-any.whl → 2.0.0b30__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (197)
  1. flyte/__init__.py +18 -2
  2. flyte/_bin/runtime.py +43 -5
  3. flyte/_cache/cache.py +4 -2
  4. flyte/_cache/local_cache.py +216 -0
  5. flyte/_code_bundle/_ignore.py +1 -1
  6. flyte/_code_bundle/_packaging.py +4 -4
  7. flyte/_code_bundle/_utils.py +14 -8
  8. flyte/_code_bundle/bundle.py +13 -5
  9. flyte/_constants.py +1 -0
  10. flyte/_context.py +4 -1
  11. flyte/_custom_context.py +73 -0
  12. flyte/_debug/constants.py +0 -1
  13. flyte/_debug/vscode.py +6 -1
  14. flyte/_deploy.py +223 -59
  15. flyte/_environment.py +5 -0
  16. flyte/_excepthook.py +1 -1
  17. flyte/_image.py +144 -82
  18. flyte/_initialize.py +95 -12
  19. flyte/_interface.py +2 -0
  20. flyte/_internal/controllers/_local_controller.py +65 -24
  21. flyte/_internal/controllers/_trace.py +1 -1
  22. flyte/_internal/controllers/remote/_action.py +13 -11
  23. flyte/_internal/controllers/remote/_client.py +1 -1
  24. flyte/_internal/controllers/remote/_controller.py +9 -4
  25. flyte/_internal/controllers/remote/_core.py +16 -16
  26. flyte/_internal/controllers/remote/_informer.py +4 -4
  27. flyte/_internal/controllers/remote/_service_protocol.py +7 -7
  28. flyte/_internal/imagebuild/docker_builder.py +139 -84
  29. flyte/_internal/imagebuild/image_builder.py +7 -13
  30. flyte/_internal/imagebuild/remote_builder.py +65 -13
  31. flyte/_internal/imagebuild/utils.py +51 -3
  32. flyte/_internal/resolvers/_task_module.py +5 -38
  33. flyte/_internal/resolvers/default.py +2 -2
  34. flyte/_internal/runtime/convert.py +42 -20
  35. flyte/_internal/runtime/entrypoints.py +24 -1
  36. flyte/_internal/runtime/io.py +21 -8
  37. flyte/_internal/runtime/resources_serde.py +20 -6
  38. flyte/_internal/runtime/reuse.py +1 -1
  39. flyte/_internal/runtime/rusty.py +20 -5
  40. flyte/_internal/runtime/task_serde.py +33 -27
  41. flyte/_internal/runtime/taskrunner.py +10 -1
  42. flyte/_internal/runtime/trigger_serde.py +160 -0
  43. flyte/_internal/runtime/types_serde.py +1 -1
  44. flyte/_keyring/file.py +39 -9
  45. flyte/_logging.py +79 -12
  46. flyte/_map.py +31 -12
  47. flyte/_module.py +70 -0
  48. flyte/_pod.py +2 -2
  49. flyte/_resources.py +213 -31
  50. flyte/_run.py +107 -41
  51. flyte/_task.py +66 -10
  52. flyte/_task_environment.py +96 -24
  53. flyte/_task_plugins.py +4 -2
  54. flyte/_trigger.py +1000 -0
  55. flyte/_utils/__init__.py +2 -1
  56. flyte/_utils/asyn.py +3 -1
  57. flyte/_utils/docker_credentials.py +173 -0
  58. flyte/_utils/module_loader.py +17 -2
  59. flyte/_version.py +3 -3
  60. flyte/cli/_abort.py +3 -3
  61. flyte/cli/_build.py +1 -3
  62. flyte/cli/_common.py +78 -7
  63. flyte/cli/_create.py +178 -3
  64. flyte/cli/_delete.py +23 -1
  65. flyte/cli/_deploy.py +49 -11
  66. flyte/cli/_get.py +79 -34
  67. flyte/cli/_params.py +8 -6
  68. flyte/cli/_plugins.py +209 -0
  69. flyte/cli/_run.py +127 -11
  70. flyte/cli/_serve.py +64 -0
  71. flyte/cli/_update.py +37 -0
  72. flyte/cli/_user.py +17 -0
  73. flyte/cli/main.py +30 -4
  74. flyte/config/_config.py +2 -0
  75. flyte/config/_internal.py +1 -0
  76. flyte/config/_reader.py +3 -3
  77. flyte/connectors/__init__.py +11 -0
  78. flyte/connectors/_connector.py +270 -0
  79. flyte/connectors/_server.py +197 -0
  80. flyte/connectors/utils.py +135 -0
  81. flyte/errors.py +10 -1
  82. flyte/extend.py +8 -1
  83. flyte/extras/_container.py +6 -1
  84. flyte/git/_config.py +11 -9
  85. flyte/io/__init__.py +2 -0
  86. flyte/io/_dataframe/__init__.py +2 -0
  87. flyte/io/_dataframe/basic_dfs.py +1 -1
  88. flyte/io/_dataframe/dataframe.py +12 -8
  89. flyte/io/_dir.py +551 -120
  90. flyte/io/_file.py +538 -141
  91. flyte/models.py +57 -12
  92. flyte/remote/__init__.py +6 -1
  93. flyte/remote/_action.py +18 -16
  94. flyte/remote/_client/_protocols.py +39 -4
  95. flyte/remote/_client/auth/_channel.py +10 -6
  96. flyte/remote/_client/controlplane.py +17 -5
  97. flyte/remote/_console.py +3 -2
  98. flyte/remote/_data.py +4 -3
  99. flyte/remote/_logs.py +3 -3
  100. flyte/remote/_run.py +47 -7
  101. flyte/remote/_secret.py +26 -17
  102. flyte/remote/_task.py +21 -9
  103. flyte/remote/_trigger.py +306 -0
  104. flyte/remote/_user.py +33 -0
  105. flyte/storage/__init__.py +6 -1
  106. flyte/storage/_parallel_reader.py +274 -0
  107. flyte/storage/_storage.py +185 -103
  108. flyte/types/__init__.py +16 -0
  109. flyte/types/_interface.py +2 -2
  110. flyte/types/_pickle.py +17 -4
  111. flyte/types/_string_literals.py +8 -9
  112. flyte/types/_type_engine.py +26 -19
  113. flyte/types/_utils.py +1 -1
  114. {flyte-2.0.0b22.data → flyte-2.0.0b30.data}/scripts/runtime.py +43 -5
  115. {flyte-2.0.0b22.dist-info → flyte-2.0.0b30.dist-info}/METADATA +8 -1
  116. flyte-2.0.0b30.dist-info/RECORD +192 -0
  117. flyte/_protos/__init__.py +0 -0
  118. flyte/_protos/common/authorization_pb2.py +0 -66
  119. flyte/_protos/common/authorization_pb2.pyi +0 -108
  120. flyte/_protos/common/authorization_pb2_grpc.py +0 -4
  121. flyte/_protos/common/identifier_pb2.py +0 -99
  122. flyte/_protos/common/identifier_pb2.pyi +0 -120
  123. flyte/_protos/common/identifier_pb2_grpc.py +0 -4
  124. flyte/_protos/common/identity_pb2.py +0 -48
  125. flyte/_protos/common/identity_pb2.pyi +0 -72
  126. flyte/_protos/common/identity_pb2_grpc.py +0 -4
  127. flyte/_protos/common/list_pb2.py +0 -36
  128. flyte/_protos/common/list_pb2.pyi +0 -71
  129. flyte/_protos/common/list_pb2_grpc.py +0 -4
  130. flyte/_protos/common/policy_pb2.py +0 -37
  131. flyte/_protos/common/policy_pb2.pyi +0 -27
  132. flyte/_protos/common/policy_pb2_grpc.py +0 -4
  133. flyte/_protos/common/role_pb2.py +0 -37
  134. flyte/_protos/common/role_pb2.pyi +0 -53
  135. flyte/_protos/common/role_pb2_grpc.py +0 -4
  136. flyte/_protos/common/runtime_version_pb2.py +0 -28
  137. flyte/_protos/common/runtime_version_pb2.pyi +0 -24
  138. flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
  139. flyte/_protos/imagebuilder/definition_pb2.py +0 -60
  140. flyte/_protos/imagebuilder/definition_pb2.pyi +0 -153
  141. flyte/_protos/imagebuilder/definition_pb2_grpc.py +0 -4
  142. flyte/_protos/imagebuilder/payload_pb2.py +0 -32
  143. flyte/_protos/imagebuilder/payload_pb2.pyi +0 -21
  144. flyte/_protos/imagebuilder/payload_pb2_grpc.py +0 -4
  145. flyte/_protos/imagebuilder/service_pb2.py +0 -29
  146. flyte/_protos/imagebuilder/service_pb2.pyi +0 -5
  147. flyte/_protos/imagebuilder/service_pb2_grpc.py +0 -66
  148. flyte/_protos/logs/dataplane/payload_pb2.py +0 -100
  149. flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -177
  150. flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
  151. flyte/_protos/secret/definition_pb2.py +0 -49
  152. flyte/_protos/secret/definition_pb2.pyi +0 -93
  153. flyte/_protos/secret/definition_pb2_grpc.py +0 -4
  154. flyte/_protos/secret/payload_pb2.py +0 -62
  155. flyte/_protos/secret/payload_pb2.pyi +0 -94
  156. flyte/_protos/secret/payload_pb2_grpc.py +0 -4
  157. flyte/_protos/secret/secret_pb2.py +0 -38
  158. flyte/_protos/secret/secret_pb2.pyi +0 -6
  159. flyte/_protos/secret/secret_pb2_grpc.py +0 -198
  160. flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
  161. flyte/_protos/validate/validate/validate_pb2.py +0 -76
  162. flyte/_protos/workflow/common_pb2.py +0 -27
  163. flyte/_protos/workflow/common_pb2.pyi +0 -14
  164. flyte/_protos/workflow/common_pb2_grpc.py +0 -4
  165. flyte/_protos/workflow/environment_pb2.py +0 -29
  166. flyte/_protos/workflow/environment_pb2.pyi +0 -12
  167. flyte/_protos/workflow/environment_pb2_grpc.py +0 -4
  168. flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
  169. flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
  170. flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
  171. flyte/_protos/workflow/queue_service_pb2.py +0 -111
  172. flyte/_protos/workflow/queue_service_pb2.pyi +0 -168
  173. flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
  174. flyte/_protos/workflow/run_definition_pb2.py +0 -123
  175. flyte/_protos/workflow/run_definition_pb2.pyi +0 -352
  176. flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
  177. flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
  178. flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
  179. flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
  180. flyte/_protos/workflow/run_service_pb2.py +0 -137
  181. flyte/_protos/workflow/run_service_pb2.pyi +0 -185
  182. flyte/_protos/workflow/run_service_pb2_grpc.py +0 -446
  183. flyte/_protos/workflow/state_service_pb2.py +0 -67
  184. flyte/_protos/workflow/state_service_pb2.pyi +0 -76
  185. flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
  186. flyte/_protos/workflow/task_definition_pb2.py +0 -82
  187. flyte/_protos/workflow/task_definition_pb2.pyi +0 -88
  188. flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
  189. flyte/_protos/workflow/task_service_pb2.py +0 -60
  190. flyte/_protos/workflow/task_service_pb2.pyi +0 -59
  191. flyte/_protos/workflow/task_service_pb2_grpc.py +0 -138
  192. flyte-2.0.0b22.dist-info/RECORD +0 -250
  193. {flyte-2.0.0b22.data → flyte-2.0.0b30.data}/scripts/debug.py +0 -0
  194. {flyte-2.0.0b22.dist-info → flyte-2.0.0b30.dist-info}/WHEEL +0 -0
  195. {flyte-2.0.0b22.dist-info → flyte-2.0.0b30.dist-info}/entry_points.txt +0 -0
  196. {flyte-2.0.0b22.dist-info → flyte-2.0.0b30.dist-info}/licenses/LICENSE +0 -0
  197. {flyte-2.0.0b22.dist-info → flyte-2.0.0b30.dist-info}/top_level.txt +0 -0
flyte/io/_file.py CHANGED
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import inspect
3
4
  import os
5
+ import typing
4
6
  from contextlib import asynccontextmanager, contextmanager
5
7
  from pathlib import Path
6
8
  from typing import (
@@ -18,20 +20,25 @@ from typing import (
18
20
  )
19
21
 
20
22
  import aiofiles
21
- from flyteidl.core import literals_pb2, types_pb2
22
- from fsspec.asyn import AsyncFileSystem
23
+ from flyteidl2.core import literals_pb2, types_pb2
23
24
  from fsspec.utils import get_protocol
24
25
  from mashumaro.types import SerializableType
25
26
  from pydantic import BaseModel, Field, model_validator
26
27
  from pydantic.json_schema import SkipJsonSchema
27
28
 
29
+ import flyte.errors
28
30
  import flyte.storage as storage
29
31
  from flyte._context import internal_ctx
30
32
  from flyte._initialize import requires_initialization
31
- from flyte._logging import logger
32
33
  from flyte.io._hashing_io import AsyncHashingReader, HashingWriter, HashMethod, PrecomputedValue
33
34
  from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
34
35
 
36
+ if typing.TYPE_CHECKING:
37
+ from obstore import AsyncReadableFile, AsyncWritableFile
38
+
39
+ if typing.TYPE_CHECKING:
40
+ from obstore import AsyncReadableFile, AsyncWritableFile
41
+
35
42
  # Type variable for the file format
36
43
  T = TypeVar("T")
37
44
 
@@ -39,63 +46,139 @@ T = TypeVar("T")
39
46
  class File(BaseModel, Generic[T], SerializableType):
40
47
  """
41
48
  A generic file class representing a file with a specified format.
42
- Provides both async and sync interfaces for file operations.
43
- Users must handle all I/O operations themselves by instantiating this class with the appropriate class methods.
49
+ Provides both async and sync interfaces for file operations. All methods without _sync suffix are async.
50
+
51
+ The class should be instantiated using one of the class methods. The constructor should be used only to
52
+ instantiate references to existing remote objects.
44
53
 
45
54
  The generic type T represents the format of the file.
46
55
 
47
- Example:
48
- ```python
49
- # Async usage
50
- from pandas import DataFrame
51
- csv_file = File[DataFrame](path="s3://my-bucket/data.csv")
56
+ Important methods:
57
+ - `from_existing_remote`: Create a File object from an existing remote file.
58
+ - `new_remote`: Create a new File reference for a remote file that will be written to.
52
59
 
53
- async with csv_file.open() as f:
54
- content = await f.read()
60
+ **Asynchronous methods**:
61
+ - `open`: Asynchronously open the file and return a file-like object.
62
+ - `download`: Asynchronously download the file to a local path.
63
+ - `from_local`: Asynchronously create a File object from a local file, uploading it to remote storage.
64
+ - `exists`: Asynchronously check if the file exists.
55
65
 
56
- # Sync alternative
57
- with csv_file.open_sync() as f:
58
- content = f.read()
59
- ```
66
+ **Synchronous methods** (suffixed with `_sync`):
67
+ - `open_sync`: Synchronously open the file and return a file-like object.
68
+ - `download_sync`: Synchronously download the file to a local path.
69
+ - `from_local_sync`: Synchronously create a File object from a local file, uploading it to remote storage.
70
+ - `exists_sync`: Synchronously check if the file exists.
60
71
 
61
- Example: Read a file input in a Task.
62
- ```
72
+ Example: Read a file input in a Task (Async).
73
+
74
+ ```python
63
75
  @env.task
64
- async def my_task(file: File[DataFrame]):
65
- async with file.open() as f:
66
- df = pd.read_csv(f)
76
+ async def read_file(file: File) -> str:
77
+ async with file.open("rb") as f:
78
+ content = bytes(await f.read())
79
+ return content.decode("utf-8")
67
80
  ```
68
81
 
69
- Example: Write a file by streaming it directly to blob storage
82
+ Example: Read a file input in a Task (Sync).
83
+
84
+ ```python
85
+ @env.task
86
+ def read_file_sync(file: File) -> str:
87
+ with file.open_sync("rb") as f:
88
+ content = f.read()
89
+ return content.decode("utf-8")
70
90
  ```
91
+
92
+ Example: Write a file by streaming it directly to blob storage (Async).
93
+
94
+ ```python
71
95
  @env.task
72
- async def my_task() -> File[DataFrame]:
73
- df = pd.DataFrame(...)
96
+ async def write_file() -> File:
74
97
  file = File.new_remote()
75
98
  async with file.open("wb") as f:
76
- df.to_csv(f)
77
- # No additional uploading will be done here.
99
+ await f.write(b"Hello, World!")
78
100
  return file
79
101
  ```
80
- Example: Write a file by writing it locally first, and then uploading it.
102
+
103
+ Example: Upload a local file to remote storage (Async).
104
+
105
+ ```python
106
+ @env.task
107
+ async def upload_file() -> File:
108
+ # Write to local file first
109
+ with open("/tmp/data.csv", "w") as f:
110
+ f.write("col1,col2\\n1,2\\n3,4\\n")
111
+ # Upload to remote storage
112
+ return await File.from_local("/tmp/data.csv")
81
113
  ```
114
+
115
+ Example: Upload a local file to remote storage (Sync).
116
+
117
+ ```python
82
118
  @env.task
83
- async def my_task() -> File[DataFrame]:
84
- # write to /tmp/data.csv
85
- return File.from_local("/tmp/data.csv", optional="s3://my-bucket/data.csv")
119
+ def upload_file_sync() -> File:
120
+ # Write to local file first
121
+ with open("/tmp/data.csv", "w") as f:
122
+ f.write("col1,col2\\n1,2\\n3,4\\n")
123
+ # Upload to remote storage
124
+ return File.from_local_sync("/tmp/data.csv")
86
125
  ```
87
126
 
88
- Example: From an existing remote file
127
+ Example: Download a file to local storage (Async).
128
+
129
+ ```python
130
+ @env.task
131
+ async def download_file(file: File) -> str:
132
+ local_path = await file.download()
133
+ # Process the local file
134
+ with open(local_path, "r") as f:
135
+ return f.read()
89
136
  ```
137
+
138
+ Example: Download a file to local storage (Sync).
139
+
140
+ ```python
90
141
  @env.task
91
- async def my_task() -> File[DataFrame]:
92
- return File.from_existing_remote("s3://my-bucket/data.csv")
142
+ def download_file_sync(file: File) -> str:
143
+ local_path = file.download_sync()
144
+ # Process the local file
145
+ with open(local_path, "r") as f:
146
+ return f.read()
93
147
  ```
94
148
 
95
- Example: Take a remote file as input and return the same one, should not do any copy
149
+ Example: Reference an existing remote file.
150
+
151
+ ```python
152
+ @env.task
153
+ async def process_existing_file() -> str:
154
+ file = File.from_existing_remote("s3://my-bucket/data.csv")
155
+ async with file.open("rb") as f:
156
+ content = await f.read()
157
+ return content.decode("utf-8")
96
158
  ```
159
+
160
+ Example: Check if a file exists (Async).
161
+
162
+ ```python
97
163
  @env.task
98
- async def my_task(file: File[DataFrame]) -> File[DataFrame]:
164
+ async def check_file(file: File) -> bool:
165
+ return await file.exists()
166
+ ```
167
+
168
+ Example: Check if a file exists (Sync).
169
+
170
+ ```python
171
+ @env.task
172
+ def check_file_sync(file: File) -> bool:
173
+ return file.exists_sync()
174
+ ```
175
+
176
+ Example: Pass through a file without copying.
177
+
178
+ ```python
179
+ @env.task
180
+ async def pass_through(file: File) -> File:
181
+ # No copy occurs - just passes the reference
99
182
  return file
100
183
  ```
101
184
 
@@ -116,20 +199,24 @@ class File(BaseModel, Generic[T], SerializableType):
116
199
  @model_validator(mode="before")
117
200
  @classmethod
118
201
  def pre_init(cls, data):
202
+ """Internal: Pydantic validator to set default name from path. Not intended for direct use."""
119
203
  if data.get("name") is None:
120
204
  data["name"] = Path(data["path"]).name
121
205
  return data
122
206
 
123
207
  def _serialize(self) -> Dict[str, Optional[str]]:
208
+ """Internal: Serialize File to dictionary. Not intended for direct use."""
124
209
  pyd_dump = self.model_dump()
125
210
  return pyd_dump
126
211
 
127
212
  @classmethod
128
213
  def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> File:
214
+ """Internal: Deserialize File from dictionary. Not intended for direct use."""
129
215
  return File.model_validate(file_dump)
130
216
 
131
217
  @classmethod
132
218
  def schema_match(cls, incoming: dict):
219
+ """Internal: Check if incoming schema matches File schema. Not intended for direct use."""
133
220
  this_schema = cls.model_json_schema()
134
221
  current_required = this_schema.get("required")
135
222
  incoming_required = incoming.get("required")
@@ -148,16 +235,27 @@ class File(BaseModel, Generic[T], SerializableType):
148
235
  """
149
236
  Create a new File reference for a remote file that will be written to.
150
237
 
151
- Example:
152
- ```
238
+ Use this when you want to create a new file and write to it directly without creating a local file first.
239
+
240
+ Example (Async):
241
+
242
+ ```python
153
243
  @env.task
154
- async def my_task() -> File[DataFrame]:
155
- df = pd.DataFrame(...)
244
+ async def create_csv() -> File:
245
+ df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
156
246
  file = File.new_remote()
157
247
  async with file.open("wb") as f:
158
248
  df.to_csv(f)
159
249
  return file
160
250
  ```
251
+
252
+ Args:
253
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
254
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will be used
255
+ to compute the hash as data is written.
256
+
257
+ Returns:
258
+ A new File instance with a generated remote path
161
259
  """
162
260
  ctx = internal_ctx()
163
261
  known_cache_key = hash_method if isinstance(hash_method, str) else None
@@ -170,17 +268,26 @@ class File(BaseModel, Generic[T], SerializableType):
170
268
  """
171
269
  Create a File reference from an existing remote file.
172
270
 
271
+ Use this when you want to reference a file that already exists in remote storage without uploading it.
272
+
173
273
  Example:
274
+
174
275
  ```python
175
276
  @env.task
176
- async def my_task() -> File[DataFrame]:
177
- return File.from_existing_remote("s3://my-bucket/data.csv")
277
+ async def process_existing_file() -> str:
278
+ file = File.from_existing_remote("s3://my-bucket/data.csv")
279
+ async with file.open("rb") as f:
280
+ content = await f.read()
281
+ return content.decode("utf-8")
178
282
  ```
179
283
 
180
284
  Args:
181
285
  remote_path: The remote path to the existing file
182
- file_cache_key: Optional hash value to use for discovery purposes. If not specified, the value of this
183
- File object will be hashed (basically the path, not the contents).
286
+ file_cache_key: Optional hash value to use for cache key computation. If not specified, the cache key
287
+ will be computed based on the file's attributes (path, name, format).
288
+
289
+ Returns:
290
+ A new File instance pointing to the existing remote file
184
291
  """
185
292
  return cls(path=remote_path, hash=file_cache_key)
186
293
 
@@ -193,92 +300,129 @@ class File(BaseModel, Generic[T], SerializableType):
193
300
  cache_options: Optional[dict] = None,
194
301
  compression: Optional[str] = None,
195
302
  **kwargs,
196
- ) -> AsyncGenerator[Union[IO[Any], "HashingWriter"], None]:
303
+ ) -> AsyncGenerator[Union[AsyncWritableFile, AsyncReadableFile, "HashingWriter"], None]:
197
304
  """
198
305
  Asynchronously open the file and return a file-like object.
199
306
 
307
+ Use this method in async tasks to read from or write to files directly.
308
+
309
+ Example (Async Read):
310
+
311
+ ```python
312
+ @env.task
313
+ async def read_file(f: File) -> str:
314
+ async with f.open("rb") as fh:
315
+ content = bytes(await fh.read())
316
+ return content.decode("utf-8")
317
+ ```
318
+
319
+ Example (Async Write):
320
+
321
+ ```python
322
+ @env.task
323
+ async def write_file() -> File:
324
+ f = File.new_remote()
325
+ async with f.open("wb") as fh:
326
+ await fh.write(b"Hello, World!")
327
+ return f
328
+ ```
329
+
330
+ Example (Streaming Read):
331
+
332
+ ```python
333
+ @env.task
334
+ async def stream_read(f: File) -> str:
335
+ content_parts = []
336
+ async with f.open("rb", block_size=1024) as fh:
337
+ while True:
338
+ chunk = await fh.read()
339
+ if not chunk:
340
+ break
341
+ content_parts.append(chunk)
342
+ return b"".join(content_parts).decode("utf-8")
343
+ ```
344
+
200
345
  Args:
201
- mode: The mode to open the file in (default: 'rb')
202
- block_size: Size of blocks for reading (bytes)
346
+ mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
347
+ 'wb' (write binary), 'rt' (read text), 'wt' (write text)
348
+ block_size: Size of blocks for reading in bytes. Useful for streaming large files.
203
349
  cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
204
350
  cache_options: Dictionary of options for the cache
205
351
  compression: Compression format or None for auto-detection
206
352
  **kwargs: Additional arguments passed to fsspec's open method
207
353
 
208
354
  Returns:
209
- An async file-like object
210
-
211
- Example:
212
- ```python
213
- async with file.open('rb') as f:
214
- data = await f.read()
215
- ```
355
+ An async file-like object that can be used with async read/write operations
216
356
  """
217
- fs = storage.get_underlying_filesystem(path=self.path)
218
-
219
- # Set up cache options if provided
220
- if cache_options is None:
221
- cache_options = {}
222
-
223
- # Configure the open parameters
224
- open_kwargs = {"mode": mode, **kwargs}
225
- if compression:
226
- open_kwargs["compression"] = compression
227
-
228
- if block_size:
229
- open_kwargs["block_size"] = block_size
230
-
231
- # Apply caching strategy
232
- if cache_type != "none":
233
- open_kwargs["cache_type"] = cache_type
234
- open_kwargs["cache_options"] = cache_options
235
-
236
- # Use aiofiles for local files
237
- if fs.protocol == "file":
238
- async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
239
- yield f
240
- else:
241
- # This code is broadly similar to what storage.get_stream does, but without actually reading from the stream
242
- file_handle = None
357
+ # Check if we should use obstore bypass
358
+ try:
359
+ fh = await storage.open(
360
+ self.path,
361
+ mode=mode,
362
+ cache_type=cache_type,
363
+ cache_options=cache_options,
364
+ compression=compression,
365
+ block_size=block_size,
366
+ **kwargs,
367
+ )
243
368
  try:
244
- if "b" not in mode:
245
- raise ValueError("Mode must include 'b' for binary access, when using remote files.")
246
- if isinstance(fs, AsyncFileSystem):
247
- file_handle = await fs.open_async(self.path, mode)
248
- yield file_handle
249
- return
250
- except NotImplementedError:
251
- logger.debug(f"{fs} doesn't implement 'open_async', falling back to sync")
369
+ yield fh
370
+ return
252
371
  finally:
253
- if file_handle is not None:
254
- file_handle.close()
255
-
256
- with fs.open(self.path, mode) as file_handle:
257
- if self.hash_method and self.hash is None:
258
- logger.debug(f"Wrapping file handle with hashing writer using {self.hash_method}")
259
- fh = HashingWriter(file_handle, accumulator=self.hash_method)
260
- yield fh
261
- self.hash = fh.result()
262
- fh.close()
372
+ if inspect.iscoroutinefunction(fh.close):
373
+ await fh.close()
263
374
  else:
264
- yield file_handle
265
- file_handle.close()
375
+ fh.close()
376
+ except flyte.errors.OnlyAsyncIOSupportedError:
377
+ # Fall back to aiofiles
378
+ fs = storage.get_underlying_filesystem(path=self.path)
379
+ if "file" in fs.protocol:
380
+ async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
381
+ yield f
382
+ return
383
+ raise
384
+
385
+ async def exists(self) -> bool:
386
+ """
387
+ Asynchronously check if the file exists.
388
+
389
+ Example (Async):
390
+
391
+ ```python
392
+ @env.task
393
+ async def check_file(f: File) -> bool:
394
+ if await f.exists():
395
+ print("File exists!")
396
+ return True
397
+ return False
398
+ ```
399
+
400
+ Returns:
401
+ True if the file exists, False otherwise
402
+ """
403
+ return await storage.exists(self.path)
266
404
 
267
405
  def exists_sync(self) -> bool:
268
406
  """
269
407
  Synchronously check if the file exists.
270
408
 
409
+ Use this in non-async tasks or when you need synchronous file existence checking.
410
+
411
+ Example (Sync):
412
+
413
+ ```python
414
+ @env.task
415
+ def check_file_sync(f: File) -> bool:
416
+ if f.exists_sync():
417
+ print("File exists!")
418
+ return True
419
+ return False
420
+ ```
421
+
271
422
  Returns:
272
423
  True if the file exists, False otherwise
273
-
274
- Example:
275
- ```python
276
- if file.exists_sync():
277
- # Process the file
278
- ```
279
424
  """
280
- fs = storage.get_underlying_filesystem(path=self.path)
281
- return fs.exists(self.path)
425
+ return storage.exists_sync(self.path)
282
426
 
283
427
  @contextmanager
284
428
  def open_sync(
@@ -289,26 +433,44 @@ class File(BaseModel, Generic[T], SerializableType):
289
433
  cache_options: Optional[dict] = None,
290
434
  compression: Optional[str] = None,
291
435
  **kwargs,
292
- ) -> Generator[IO[Any]]:
436
+ ) -> Generator[IO[Any], None, None]:
293
437
  """
294
438
  Synchronously open the file and return a file-like object.
295
439
 
440
+ Use this method in non-async tasks to read from or write to files directly.
441
+
442
+ Example (Sync Read):
443
+
444
+ ```python
445
+ @env.task
446
+ def read_file_sync(f: File) -> str:
447
+ with f.open_sync("rb") as fh:
448
+ content = fh.read()
449
+ return content.decode("utf-8")
450
+ ```
451
+
452
+ Example (Sync Write):
453
+
454
+ ```python
455
+ @env.task
456
+ def write_file_sync() -> File:
457
+ f = File.new_remote()
458
+ with f.open_sync("wb") as fh:
459
+ fh.write(b"Hello, World!")
460
+ return f
461
+ ```
462
+
296
463
  Args:
297
- mode: The mode to open the file in (default: 'rb')
298
- block_size: Size of blocks for reading (bytes)
464
+ mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
465
+ 'wb' (write binary), 'rt' (read text), 'wt' (write text)
466
+ block_size: Size of blocks for reading in bytes. Useful for streaming large files.
299
467
  cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
300
468
  cache_options: Dictionary of options for the cache
301
469
  compression: Compression format or None for auto-detection
302
470
  **kwargs: Additional arguments passed to fsspec's open method
303
471
 
304
472
  Returns:
305
- A file-like object
306
-
307
- Example:
308
- ```python
309
- with file.open_sync('rb') as f:
310
- data = f.read()
311
- ```
473
+ A file-like object that can be used with standard read/write operations
312
474
  """
313
475
  fs = storage.get_underlying_filesystem(path=self.path)
314
476
 
@@ -335,63 +497,188 @@ class File(BaseModel, Generic[T], SerializableType):
335
497
  """
336
498
  Asynchronously download the file to a local path.
337
499
 
500
+ Use this when you need to download a remote file to your local filesystem for processing.
501
+
502
+ Example (Async):
503
+
504
+ ```python
505
+ @env.task
506
+ async def download_and_process(f: File) -> str:
507
+ local_path = await f.download()
508
+ # Now process the local file
509
+ with open(local_path, "r") as fh:
510
+ return fh.read()
511
+ ```
512
+
513
+ Example (Download to specific path):
514
+
515
+ ```python
516
+ @env.task
517
+ async def download_to_path(f: File) -> str:
518
+ local_path = await f.download("/tmp/myfile.csv")
519
+ return local_path
520
+ ```
521
+
338
522
  Args:
339
523
  local_path: The local path to download the file to. If None, a temporary
340
- directory will be used.
524
+ directory will be used and a path will be generated.
341
525
 
342
526
  Returns:
343
- The path to the downloaded file
344
-
345
- Example:
346
- ```python
347
- local_file = await file.download('/tmp/myfile.csv')
348
- ```
527
+ The absolute path to the downloaded file
349
528
  """
350
529
  if local_path is None:
351
- local_path = storage.get_random_local_path(file_path_or_file_name=local_path)
530
+ local_path = storage.get_random_local_path(file_path_or_file_name=self.path)
352
531
  else:
532
+ # Preserve trailing separator if present (Path.absolute() strips it)
533
+ local_path_str = str(local_path)
534
+ has_trailing_sep = local_path_str.endswith(os.sep)
353
535
  local_path = str(Path(local_path).absolute())
536
+ if has_trailing_sep:
537
+ local_path = local_path + os.sep
354
538
 
355
539
  fs = storage.get_underlying_filesystem(path=self.path)
356
540
 
357
541
  # If it's already a local file, just copy it
358
542
  if "file" in fs.protocol:
543
+ # Apply directory logic for local-to-local copies
544
+ local_path_for_copy = local_path
545
+ if isinstance(local_path, str):
546
+ local_path_obj = Path(local_path)
547
+ # Check if it's a directory or ends with separator
548
+ if local_path.endswith(os.sep) or (local_path_obj.exists() and local_path_obj.is_dir()):
549
+ remote_filename = Path(self.path).name
550
+ local_path_for_copy = str(local_path_obj / remote_filename)
551
+
552
+ # Ensure parent directory exists
553
+ Path(local_path_for_copy).parent.mkdir(parents=True, exist_ok=True)
554
+
359
555
  # Use aiofiles for async copy
360
556
  async with aiofiles.open(self.path, "rb") as src:
361
- async with aiofiles.open(local_path, "wb") as dst:
557
+ async with aiofiles.open(local_path_for_copy, "wb") as dst:
362
558
  await dst.write(await src.read())
363
- return str(local_path)
559
+ return str(local_path_for_copy)
364
560
 
365
561
  # Otherwise download from remote using async functionality
366
- await storage.get(self.path, str(local_path))
562
+ result_path = await storage.get(self.path, str(local_path))
563
+ return result_path
564
+
565
+ def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
566
+ """
567
+ Synchronously download the file to a local path.
568
+
569
+ Use this in non-async tasks when you need to download a remote file to your local filesystem.
570
+
571
+ Example (Sync):
572
+
573
+ ```python
574
+ @env.task
575
+ def download_and_process_sync(f: File) -> str:
576
+ local_path = f.download_sync()
577
+ # Now process the local file
578
+ with open(local_path, "r") as fh:
579
+ return fh.read()
580
+ ```
581
+
582
+ Example (Download to specific path):
583
+
584
+ ```python
585
+ @env.task
586
+ def download_to_path_sync(f: File) -> str:
587
+ local_path = f.download_sync("/tmp/myfile.csv")
588
+ return local_path
589
+ ```
590
+
591
+ Args:
592
+ local_path: The local path to download the file to. If None, a temporary
593
+ directory will be used and a path will be generated.
594
+
595
+ Returns:
596
+ The absolute path to the downloaded file
597
+ """
598
+ if local_path is None:
599
+ local_path = storage.get_random_local_path(file_path_or_file_name=self.path)
600
+ else:
601
+ # Preserve trailing separator if present (Path.absolute() strips it)
602
+ local_path_str = str(local_path)
603
+ has_trailing_sep = local_path_str.endswith(os.sep)
604
+ local_path = str(Path(local_path).absolute())
605
+ if has_trailing_sep:
606
+ local_path = local_path + os.sep
607
+
608
+ fs = storage.get_underlying_filesystem(path=self.path)
609
+
610
+ # If it's already a local file, just copy it
611
+ if "file" in fs.protocol:
612
+ # Apply directory logic for local-to-local copies
613
+ local_path_for_copy = local_path
614
+ if isinstance(local_path, str):
615
+ local_path_obj = Path(local_path)
616
+ # Check if it's a directory or ends with separator
617
+ if local_path.endswith(os.sep) or (local_path_obj.exists() and local_path_obj.is_dir()):
618
+ remote_filename = Path(self.path).name
619
+ local_path_for_copy = str(local_path_obj / remote_filename)
620
+
621
+ # Ensure parent directory exists
622
+ Path(local_path_for_copy).parent.mkdir(parents=True, exist_ok=True)
623
+
624
+ # Use standard file operations for sync copy
625
+ import shutil
626
+
627
+ shutil.copy2(self.path, local_path_for_copy)
628
+ return str(local_path_for_copy)
629
+
630
+ # Otherwise download from remote using sync functionality
631
+ # Use the sync version of storage operations
632
+ with fs.open(self.path, "rb") as src:
633
+ with open(local_path, "wb") as dst:
634
+ dst.write(src.read())
367
635
  return str(local_path)
368
636
 
369
637
  @classmethod
370
638
  @requires_initialization
371
- async def from_local(
639
+ def from_local_sync(
372
640
  cls,
373
641
  local_path: Union[str, Path],
374
642
  remote_destination: Optional[str] = None,
375
643
  hash_method: Optional[HashMethod | str] = None,
376
644
  ) -> File[T]:
377
645
  """
378
- Create a new File object from a local file that will be uploaded to the configured remote store.
646
+ Synchronously create a new File object from a local file by uploading it to remote storage.
647
+
648
+ Use this in non-async tasks when you have a local file that needs to be uploaded to remote storage.
649
+
650
+ Example (Sync):
651
+
652
+ ```python
653
+ @env.task
654
+ def upload_local_file_sync() -> File:
655
+ # Create a local file
656
+ with open("/tmp/data.csv", "w") as f:
657
+ f.write("col1,col2\n1,2\n3,4\n")
658
+
659
+ # Upload to remote storage
660
+ remote_file = File.from_local_sync("/tmp/data.csv")
661
+ return remote_file
662
+ ```
663
+
664
+ Example (With specific destination):
665
+
666
+ ```python
667
+ @env.task
668
+ def upload_to_specific_path() -> File:
669
+ remote_file = File.from_local_sync("/tmp/data.csv", "s3://my-bucket/data.csv")
670
+ return remote_file
671
+ ```
379
672
 
380
673
  Args:
381
674
  local_path: Path to the local file
382
- remote_destination: Optional path to store the file remotely. If None, a path will be generated.
383
- hash_method: Pass this argument either as a set string or a HashMethod to use for
384
- determining a task's cache key if this File object is used as an input to said task. If not specified,
385
- the cache key will just be computed based on this object's attributes (i.e. path, name, format, etc.).
386
- If there is a set value you want to use, please pass an instance of the PrecomputedValue HashMethod.
675
+ remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
676
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
677
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
678
+ the hash during upload. If not specified, the cache key will be based on file attributes.
387
679
 
388
680
  Returns:
389
- A new File instance pointing to the uploaded file
390
-
391
- Example:
392
- ```python
393
- remote_file = await File[DataFrame].from_local('/tmp/data.csv', 's3://bucket/data.csv')
394
- ```
681
+ A new File instance pointing to the uploaded remote file
395
682
  """
396
683
  if not os.path.exists(local_path):
397
684
  raise ValueError(f"File not found: {local_path}")
@@ -400,6 +687,116 @@ class File(BaseModel, Generic[T], SerializableType):
400
687
  protocol = get_protocol(remote_path)
401
688
  filename = Path(local_path).name
402
689
 
690
+ # If remote_destination was not set by the user, and the configured raw data path is also local,
691
+ # then let's optimize by not uploading.
692
+ hash_value = hash_method if isinstance(hash_method, str) else None
693
+ hash_method_obj = hash_method if isinstance(hash_method, HashMethod) else None
694
+
695
+ if "file" in protocol:
696
+ if remote_destination is None:
697
+ path = str(Path(local_path).absolute())
698
+ else:
699
+ # Otherwise, actually make a copy of the file
700
+ import shutil
701
+
702
+ if hash_method_obj:
703
+ # For hash computation, we need to read and write manually
704
+ with open(local_path, "rb") as src:
705
+ with open(remote_path, "wb") as dst:
706
+ dst_wrapper = HashingWriter(dst, accumulator=hash_method_obj)
707
+ dst_wrapper.write(src.read())
708
+ hash_value = dst_wrapper.result()
709
+ dst_wrapper.close()
710
+ else:
711
+ shutil.copy2(local_path, remote_path)
712
+ path = str(Path(remote_path).absolute())
713
+ else:
714
+ # Otherwise upload to remote using sync storage layer
715
+ fs = storage.get_underlying_filesystem(path=remote_path)
716
+
717
+ if hash_method_obj:
718
+ # We can skip the wrapper if the hash method is just a precomputed value
719
+ if not isinstance(hash_method_obj, PrecomputedValue):
720
+ with open(local_path, "rb") as src:
721
+ # For sync operations, we need to compute hash manually
722
+ data = src.read()
723
+ hash_method_obj.update(memoryview(data))
724
+ hash_value = hash_method_obj.result()
725
+
726
+ # Now write the data to remote
727
+ with fs.open(remote_path, "wb") as dst:
728
+ dst.write(data)
729
+ path = remote_path
730
+ else:
731
+ # Use sync file operations
732
+ with open(local_path, "rb") as src:
733
+ with fs.open(remote_path, "wb") as dst:
734
+ dst.write(src.read())
735
+ path = remote_path
736
+ hash_value = hash_method_obj.result()
737
+ else:
738
+ # Simple sync copy
739
+ with open(local_path, "rb") as src:
740
+ with fs.open(remote_path, "wb") as dst:
741
+ dst.write(src.read())
742
+ path = remote_path
743
+
744
+ f = cls(path=path, name=filename, hash_method=hash_method_obj, hash=hash_value)
745
+ return f
746
+
747
+ @classmethod
748
+ @requires_initialization
749
+ async def from_local(
750
+ cls,
751
+ local_path: Union[str, Path],
752
+ remote_destination: Optional[str] = None,
753
+ hash_method: Optional[HashMethod | str] = None,
754
+ ) -> File[T]:
755
+ """
756
+ Asynchronously create a new File object from a local file by uploading it to remote storage.
757
+
758
+ Use this in async tasks when you have a local file that needs to be uploaded to remote storage.
759
+
760
+ Example (Async):
761
+
762
+ ```python
763
+ @env.task
764
+ async def upload_local_file() -> File:
765
+ # Create a local file
766
+ async with aiofiles.open("/tmp/data.csv", "w") as f:
767
+ await f.write("col1,col2\n1,2\n3,4\n")
768
+
769
+ # Upload to remote storage
770
+ remote_file = await File.from_local("/tmp/data.csv")
771
+ return remote_file
772
+ ```
773
+
774
+ Example (With specific destination):
775
+
776
+ ```python
777
+ @env.task
778
+ async def upload_to_specific_path() -> File:
779
+ remote_file = await File.from_local("/tmp/data.csv", "s3://my-bucket/data.csv")
780
+ return remote_file
781
+ ```
782
+
783
+ Args:
784
+ local_path: Path to the local file
785
+ remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
786
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
787
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
788
+ the hash during upload. If not specified, the cache key will be based on file attributes.
789
+
790
+ Returns:
791
+ A new File instance pointing to the uploaded remote file
792
+ """
793
+ if not os.path.exists(local_path):
794
+ raise ValueError(f"File not found: {local_path}")
795
+
796
+ filename = Path(local_path).name
797
+ remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(filename)
798
+ protocol = get_protocol(remote_path)
799
+
403
800
  # If remote_destination was not set by the user, and the configured raw data path is also local,
404
801
  # then let's optimize by not uploading.
405
802
  hash_value = hash_method if isinstance(hash_method, str) else None