flyte 2.0.0b22__py3-none-any.whl → 2.0.0b23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of flyte might be problematic. Details are available on the package's registry page.
- flyte/__init__.py +5 -0
- flyte/_bin/runtime.py +35 -5
- flyte/_cache/cache.py +4 -2
- flyte/_cache/local_cache.py +215 -0
- flyte/_code_bundle/bundle.py +1 -0
- flyte/_debug/constants.py +0 -1
- flyte/_debug/vscode.py +6 -1
- flyte/_deploy.py +193 -52
- flyte/_environment.py +5 -0
- flyte/_excepthook.py +1 -1
- flyte/_image.py +101 -72
- flyte/_initialize.py +23 -0
- flyte/_internal/controllers/_local_controller.py +64 -24
- flyte/_internal/controllers/remote/_action.py +4 -1
- flyte/_internal/controllers/remote/_controller.py +5 -2
- flyte/_internal/controllers/remote/_core.py +6 -3
- flyte/_internal/controllers/remote/_informer.py +1 -1
- flyte/_internal/imagebuild/docker_builder.py +92 -28
- flyte/_internal/imagebuild/image_builder.py +7 -13
- flyte/_internal/imagebuild/remote_builder.py +6 -1
- flyte/_internal/runtime/io.py +13 -1
- flyte/_internal/runtime/rusty.py +17 -2
- flyte/_internal/runtime/task_serde.py +14 -20
- flyte/_internal/runtime/taskrunner.py +1 -1
- flyte/_internal/runtime/trigger_serde.py +153 -0
- flyte/_logging.py +1 -1
- flyte/_protos/common/identifier_pb2.py +19 -1
- flyte/_protos/common/identifier_pb2.pyi +22 -0
- flyte/_protos/workflow/common_pb2.py +14 -3
- flyte/_protos/workflow/common_pb2.pyi +49 -0
- flyte/_protos/workflow/queue_service_pb2.py +41 -35
- flyte/_protos/workflow/queue_service_pb2.pyi +26 -12
- flyte/_protos/workflow/queue_service_pb2_grpc.py +34 -0
- flyte/_protos/workflow/run_definition_pb2.py +38 -38
- flyte/_protos/workflow/run_definition_pb2.pyi +4 -2
- flyte/_protos/workflow/run_service_pb2.py +60 -50
- flyte/_protos/workflow/run_service_pb2.pyi +24 -6
- flyte/_protos/workflow/run_service_pb2_grpc.py +34 -0
- flyte/_protos/workflow/task_definition_pb2.py +15 -11
- flyte/_protos/workflow/task_definition_pb2.pyi +19 -2
- flyte/_protos/workflow/task_service_pb2.py +18 -17
- flyte/_protos/workflow/task_service_pb2.pyi +5 -2
- flyte/_protos/workflow/trigger_definition_pb2.py +66 -0
- flyte/_protos/workflow/trigger_definition_pb2.pyi +117 -0
- flyte/_protos/workflow/trigger_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/trigger_service_pb2.py +96 -0
- flyte/_protos/workflow/trigger_service_pb2.pyi +110 -0
- flyte/_protos/workflow/trigger_service_pb2_grpc.py +281 -0
- flyte/_run.py +42 -15
- flyte/_task.py +35 -4
- flyte/_task_environment.py +60 -15
- flyte/_trigger.py +382 -0
- flyte/_version.py +3 -3
- flyte/cli/_abort.py +3 -3
- flyte/cli/_build.py +1 -3
- flyte/cli/_common.py +15 -2
- flyte/cli/_create.py +74 -0
- flyte/cli/_delete.py +23 -1
- flyte/cli/_deploy.py +5 -9
- flyte/cli/_get.py +75 -34
- flyte/cli/_params.py +4 -2
- flyte/cli/_run.py +12 -3
- flyte/cli/_update.py +36 -0
- flyte/cli/_user.py +17 -0
- flyte/cli/main.py +9 -1
- flyte/errors.py +9 -0
- flyte/io/_dir.py +513 -115
- flyte/io/_file.py +495 -135
- flyte/models.py +32 -0
- flyte/remote/__init__.py +6 -1
- flyte/remote/_client/_protocols.py +36 -2
- flyte/remote/_client/controlplane.py +19 -3
- flyte/remote/_run.py +42 -2
- flyte/remote/_task.py +14 -1
- flyte/remote/_trigger.py +308 -0
- flyte/remote/_user.py +33 -0
- flyte/storage/__init__.py +6 -1
- flyte/storage/_storage.py +119 -101
- flyte/types/_pickle.py +16 -3
- {flyte-2.0.0b22.data → flyte-2.0.0b23.data}/scripts/runtime.py +35 -5
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/METADATA +3 -1
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/RECORD +87 -75
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- {flyte-2.0.0b22.data → flyte-2.0.0b23.data}/scripts/debug.py +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/WHEEL +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/entry_points.txt +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/licenses/LICENSE +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/top_level.txt +0 -0
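Judging from the file list, the headline changes in b23 are a new trigger subsystem (flyte/_trigger.py, the trigger_* protos, flyte/remote/_trigger.py), a local cache (flyte/_cache/local_cache.py), and a much larger sync/async surface on flyte.io's File and Dir. The full diff of flyte/io/_file.py follows; as a quick orientation, here is a minimal sketch of the new synchronous File helpers that diff documents. This is illustrative only, based on the docstrings below rather than code shipped in the release, and it assumes the flyte.TaskEnvironment and flyte.io.File exports.

```python
import flyte
from flyte.io import File

env = flyte.TaskEnvironment(name="file-demo")


@env.task
def roundtrip_sync() -> str:
    # Write a local file, upload it, check it, and download it back,
    # all via the *_sync methods added in this release.
    with open("/tmp/demo.csv", "w") as f:
        f.write("col1,col2\n1,2\n")
    remote = File.from_local_sync("/tmp/demo.csv")
    assert remote.exists_sync()
    local_path = remote.download_sync()
    with open(local_path) as f:
        return f.read()
```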
flyte/io/_file.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import inspect
 import os
+import typing
 from contextlib import asynccontextmanager, contextmanager
 from pathlib import Path
 from typing import (
@@ -19,19 +21,21 @@ from typing import (
 
 import aiofiles
 from flyteidl.core import literals_pb2, types_pb2
-from fsspec.asyn import AsyncFileSystem
 from fsspec.utils import get_protocol
 from mashumaro.types import SerializableType
 from pydantic import BaseModel, Field, model_validator
 from pydantic.json_schema import SkipJsonSchema
 
+import flyte.errors
 import flyte.storage as storage
 from flyte._context import internal_ctx
 from flyte._initialize import requires_initialization
-from flyte._logging import logger
 from flyte.io._hashing_io import AsyncHashingReader, HashingWriter, HashMethod, PrecomputedValue
 from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
 
+if typing.TYPE_CHECKING:
+    from obstore import AsyncReadableFile, AsyncWritableFile
+
 # Type variable for the file format
 T = TypeVar("T")
 
@@ -39,63 +43,139 @@ T = TypeVar("T")
 class File(BaseModel, Generic[T], SerializableType):
     """
     A generic file class representing a file with a specified format.
-    Provides both async and sync interfaces for file operations.
-
+    Provides both async and sync interfaces for file operations. All methods without _sync suffix are async.
+
+    The class should be instantiated using one of the class methods. The constructor should be used only to
+    instantiate references to existing remote objects.
 
     The generic type T represents the format of the file.
 
-
-
-
-    from pandas import DataFrame
-    csv_file = File[DataFrame](path="s3://my-bucket/data.csv")
+    Important methods:
+    - `from_existing_remote`: Create a File object from an existing remote file.
+    - `new_remote`: Create a new File reference for a remote file that will be written to.
 
-
-
+    **Asynchronous methods**:
+    - `open`: Asynchronously open the file and return a file-like object.
+    - `download`: Asynchronously download the file to a local path.
+    - `from_local`: Asynchronously create a File object from a local file, uploading it to remote storage.
+    - `exists`: Asynchronously check if the file exists.
 
-
-
-
-
+    **Synchronous methods** (suffixed with `_sync`):
+    - `open_sync`: Synchronously open the file and return a file-like object.
+    - `download_sync`: Synchronously download the file to a local path.
+    - `from_local_sync`: Synchronously create a File object from a local file, uploading it to remote storage.
+    - `exists_sync`: Synchronously check if the file exists.
 
-    Example: Read a file input in a Task.
-
+    Example: Read a file input in a Task (Async).
+
+    ```python
     @env.task
-    async def
-    async with file.open() as f:
-
+    async def read_file(file: File) -> str:
+        async with file.open("rb") as f:
+            content = bytes(await f.read())
+            return content.decode("utf-8")
     ```
 
-    Example:
+    Example: Read a file input in a Task (Sync).
+
+    ```python
+    @env.task
+    def read_file_sync(file: File) -> str:
+        with file.open_sync("rb") as f:
+            content = f.read()
+            return content.decode("utf-8")
     ```
+
+    Example: Write a file by streaming it directly to blob storage (Async).
+
+    ```python
     @env.task
-    async def
-    df = pd.DataFrame(...)
+    async def write_file() -> File:
         file = File.new_remote()
         async with file.open("wb") as f:
-
-            # No additional uploading will be done here.
+            await f.write(b"Hello, World!")
         return file
     ```
-
+
+    Example: Upload a local file to remote storage (Async).
+
+    ```python
+    @env.task
+    async def upload_file() -> File:
+        # Write to local file first
+        with open("/tmp/data.csv", "w") as f:
+            f.write("col1,col2\\n1,2\\n3,4\\n")
+        # Upload to remote storage
+        return await File.from_local("/tmp/data.csv")
     ```
+
+    Example: Upload a local file to remote storage (Sync).
+
+    ```python
     @env.task
-
-    #
-
+    def upload_file_sync() -> File:
+        # Write to local file first
+        with open("/tmp/data.csv", "w") as f:
+            f.write("col1,col2\\n1,2\\n3,4\\n")
+        # Upload to remote storage
+        return File.from_local_sync("/tmp/data.csv")
     ```
 
-    Example:
+    Example: Download a file to local storage (Async).
+
+    ```python
+    @env.task
+    async def download_file(file: File) -> str:
+        local_path = await file.download()
+        # Process the local file
+        with open(local_path, "r") as f:
+            return f.read()
     ```
+
+    Example: Download a file to local storage (Sync).
+
+    ```python
     @env.task
-
-
+    def download_file_sync(file: File) -> str:
+        local_path = file.download_sync()
+        # Process the local file
+        with open(local_path, "r") as f:
+            return f.read()
     ```
 
-    Example:
+    Example: Reference an existing remote file.
+
+    ```python
+    @env.task
+    async def process_existing_file() -> str:
+        file = File.from_existing_remote("s3://my-bucket/data.csv")
+        async with file.open("rb") as f:
+            content = await f.read()
+        return content.decode("utf-8")
     ```
+
+    Example: Check if a file exists (Async).
+
+    ```python
     @env.task
-    async def
+    async def check_file(file: File) -> bool:
+        return await file.exists()
+    ```
+
+    Example: Check if a file exists (Sync).
+
+    ```python
+    @env.task
+    def check_file_sync(file: File) -> bool:
+        return file.exists_sync()
+    ```
+
+    Example: Pass through a file without copying.
+
+    ```python
+    @env.task
+    async def pass_through(file: File) -> File:
+        # No copy occurs - just passes the reference
         return file
     ```
 
@@ -116,20 +196,24 @@ class File(BaseModel, Generic[T], SerializableType):
     @model_validator(mode="before")
     @classmethod
     def pre_init(cls, data):
+        """Internal: Pydantic validator to set default name from path. Not intended for direct use."""
         if data.get("name") is None:
             data["name"] = Path(data["path"]).name
         return data
 
     def _serialize(self) -> Dict[str, Optional[str]]:
+        """Internal: Serialize File to dictionary. Not intended for direct use."""
         pyd_dump = self.model_dump()
         return pyd_dump
 
     @classmethod
     def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> File:
+        """Internal: Deserialize File from dictionary. Not intended for direct use."""
         return File.model_validate(file_dump)
 
     @classmethod
     def schema_match(cls, incoming: dict):
+        """Internal: Check if incoming schema matches File schema. Not intended for direct use."""
         this_schema = cls.model_json_schema()
         current_required = this_schema.get("required")
         incoming_required = incoming.get("required")
@@ -148,16 +232,27 @@ class File(BaseModel, Generic[T], SerializableType):
         """
         Create a new File reference for a remote file that will be written to.
 
-
-
+        Use this when you want to create a new file and write to it directly without creating a local file first.
+
+        Example (Async):
+
+        ```python
         @env.task
-        async def
-        df = pd.DataFrame(
+        async def create_csv() -> File:
+            df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
            file = File.new_remote()
            async with file.open("wb") as f:
                df.to_csv(f)
            return file
         ```
+
+        Args:
+            hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
+                it will be used as a precomputed cache key. If a HashMethod is provided, it will be used
+                to compute the hash as data is written.
+
+        Returns:
+            A new File instance with a generated remote path
         """
         ctx = internal_ctx()
         known_cache_key = hash_method if isinstance(hash_method, str) else None
@@ -170,17 +265,26 @@ class File(BaseModel, Generic[T], SerializableType):
         """
         Create a File reference from an existing remote file.
 
+        Use this when you want to reference a file that already exists in remote storage without uploading it.
+
         Example:
+
         ```python
         @env.task
-        async def
-
+        async def process_existing_file() -> str:
+            file = File.from_existing_remote("s3://my-bucket/data.csv")
+            async with file.open("rb") as f:
+                content = await f.read()
+            return content.decode("utf-8")
         ```
 
         Args:
             remote_path: The remote path to the existing file
-            file_cache_key: Optional hash value to use for
-
+            file_cache_key: Optional hash value to use for cache key computation. If not specified, the cache key
+                will be computed based on the file's attributes (path, name, format).
+
+        Returns:
+            A new File instance pointing to the existing remote file
         """
         return cls(path=remote_path, hash=file_cache_key)
 
@@ -193,92 +297,129 @@ class File(BaseModel, Generic[T], SerializableType):
         cache_options: Optional[dict] = None,
         compression: Optional[str] = None,
         **kwargs,
-    ) -> AsyncGenerator[Union[
+    ) -> AsyncGenerator[Union[AsyncWritableFile, AsyncReadableFile, "HashingWriter"], None]:
         """
         Asynchronously open the file and return a file-like object.
 
+        Use this method in async tasks to read from or write to files directly.
+
+        Example (Async Read):
+
+        ```python
+        @env.task
+        async def read_file(f: File) -> str:
+            async with f.open("rb") as fh:
+                content = bytes(await fh.read())
+            return content.decode("utf-8")
+        ```
+
+        Example (Async Write):
+
+        ```python
+        @env.task
+        async def write_file() -> File:
+            f = File.new_remote()
+            async with f.open("wb") as fh:
+                await fh.write(b"Hello, World!")
+            return f
+        ```
+
+        Example (Streaming Read):
+
+        ```python
+        @env.task
+        async def stream_read(f: File) -> str:
+            content_parts = []
+            async with f.open("rb", block_size=1024) as fh:
+                while True:
+                    chunk = await fh.read()
+                    if not chunk:
+                        break
+                    content_parts.append(chunk)
+            return b"".join(content_parts).decode("utf-8")
+        ```
+
         Args:
-            mode: The mode to open the file in (default: 'rb')
-
+            mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
+                'wb' (write binary), 'rt' (read text), 'wt' (write text)
+            block_size: Size of blocks for reading in bytes. Useful for streaming large files.
             cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
             cache_options: Dictionary of options for the cache
             compression: Compression format or None for auto-detection
             **kwargs: Additional arguments passed to fsspec's open method
 
         Returns:
-            An async file-like object
-
-        Example:
-        ```python
-        async with file.open('rb') as f:
-            data = await f.read()
-        ```
+            An async file-like object that can be used with async read/write operations
         """
-
-
-
-
-
-
-
-
-
-
-
-        if block_size:
-            open_kwargs["block_size"] = block_size
-
-        # Apply caching strategy
-        if cache_type != "none":
-            open_kwargs["cache_type"] = cache_type
-            open_kwargs["cache_options"] = cache_options
-
-        # Use aiofiles for local files
-        if fs.protocol == "file":
-            async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
-                yield f
-        else:
-            # This code is broadly similar to what storage.get_stream does, but without actually reading from the stream
-            file_handle = None
+        # Check if we should use obstore bypass
+        try:
+            fh = await storage.open(
+                self.path,
+                mode=mode,
+                cache_type=cache_type,
+                cache_options=cache_options,
+                compression=compression,
+                block_size=block_size,
+                **kwargs,
+            )
            try:
-
-
-                if isinstance(fs, AsyncFileSystem):
-                    file_handle = await fs.open_async(self.path, mode)
-                    yield file_handle
-                    return
-            except NotImplementedError:
-                logger.debug(f"{fs} doesn't implement 'open_async', falling back to sync")
+                yield fh
+                return
            finally:
-                if
-
-
-            with fs.open(self.path, mode) as file_handle:
-                if self.hash_method and self.hash is None:
-                    logger.debug(f"Wrapping file handle with hashing writer using {self.hash_method}")
-                    fh = HashingWriter(file_handle, accumulator=self.hash_method)
-                    yield fh
-                    self.hash = fh.result()
-                    fh.close()
+                if inspect.iscoroutinefunction(fh.close):
+                    await fh.close()
                else:
-
-
+                    fh.close()
+        except flyte.errors.OnlyAsyncIOSupportedError:
+            # Fall back to aiofiles
+            fs = storage.get_underlying_filesystem(path=self.path)
+            if "file" in fs.protocol:
+                async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
+                    yield f
+                return
+            raise
+
+    async def exists(self) -> bool:
+        """
+        Asynchronously check if the file exists.
+
+        Example (Async):
+
+        ```python
+        @env.task
+        async def check_file(f: File) -> bool:
+            if await f.exists():
+                print("File exists!")
+                return True
+            return False
+        ```
+
+        Returns:
+            True if the file exists, False otherwise
+        """
+        return await storage.exists(self.path)
 
     def exists_sync(self) -> bool:
         """
         Synchronously check if the file exists.
 
+        Use this in non-async tasks or when you need synchronous file existence checking.
+
+        Example (Sync):
+
+        ```python
+        @env.task
+        def check_file_sync(f: File) -> bool:
+            if f.exists_sync():
+                print("File exists!")
+                return True
+            return False
+        ```
+
         Returns:
             True if the file exists, False otherwise
-
-        Example:
-        ```python
-        if file.exists_sync():
-            # Process the file
-        ```
         """
-
-        return fs.exists(self.path)
+        return storage.exists_sync(self.path)
 
     @contextmanager
     def open_sync(
@@ -289,26 +430,44 @@ class File(BaseModel, Generic[T], SerializableType):
         cache_options: Optional[dict] = None,
         compression: Optional[str] = None,
         **kwargs,
-    ) -> Generator[IO[Any]]:
+    ) -> Generator[IO[Any], None, None]:
         """
         Synchronously open the file and return a file-like object.
 
+        Use this method in non-async tasks to read from or write to files directly.
+
+        Example (Sync Read):
+
+        ```python
+        @env.task
+        def read_file_sync(f: File) -> str:
+            with f.open_sync("rb") as fh:
+                content = fh.read()
+            return content.decode("utf-8")
+        ```
+
+        Example (Sync Write):
+
+        ```python
+        @env.task
+        def write_file_sync() -> File:
+            f = File.new_remote()
+            with f.open_sync("wb") as fh:
+                fh.write(b"Hello, World!")
+            return f
+        ```
+
         Args:
-            mode: The mode to open the file in (default: 'rb')
-
+            mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
+                'wb' (write binary), 'rt' (read text), 'wt' (write text)
+            block_size: Size of blocks for reading in bytes. Useful for streaming large files.
             cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
             cache_options: Dictionary of options for the cache
             compression: Compression format or None for auto-detection
             **kwargs: Additional arguments passed to fsspec's open method
 
         Returns:
-            A file-like object
-
-        Example:
-        ```python
-        with file.open_sync('rb') as f:
-            data = f.read()
-        ```
+            A file-like object that can be used with standard read/write operations
         """
         fs = storage.get_underlying_filesystem(path=self.path)
 
@@ -335,17 +494,34 @@ class File(BaseModel, Generic[T], SerializableType):
         """
         Asynchronously download the file to a local path.
 
+        Use this when you need to download a remote file to your local filesystem for processing.
+
+        Example (Async):
+
+        ```python
+        @env.task
+        async def download_and_process(f: File) -> str:
+            local_path = await f.download()
+            # Now process the local file
+            with open(local_path, "r") as fh:
+                return fh.read()
+        ```
+
+        Example (Download to specific path):
+
+        ```python
+        @env.task
+        async def download_to_path(f: File) -> str:
+            local_path = await f.download("/tmp/myfile.csv")
+            return local_path
+        ```
+
         Args:
             local_path: The local path to download the file to. If None, a temporary
-                directory will be used.
+                directory will be used and a path will be generated.
 
         Returns:
-            The path to the downloaded file
-
-        Example:
-        ```python
-        local_file = await file.download('/tmp/myfile.csv')
-        ```
+            The absolute path to the downloaded file
         """
         if local_path is None:
             local_path = storage.get_random_local_path(file_path_or_file_name=local_path)
@@ -366,32 +542,216 @@ class File(BaseModel, Generic[T], SerializableType):
         await storage.get(self.path, str(local_path))
         return str(local_path)
 
+    def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
+        """
+        Synchronously download the file to a local path.
+
+        Use this in non-async tasks when you need to download a remote file to your local filesystem.
+
+        Example (Sync):
+
+        ```python
+        @env.task
+        def download_and_process_sync(f: File) -> str:
+            local_path = f.download_sync()
+            # Now process the local file
+            with open(local_path, "r") as fh:
+                return fh.read()
+        ```
+
+        Example (Download to specific path):
+
+        ```python
+        @env.task
+        def download_to_path_sync(f: File) -> str:
+            local_path = f.download_sync("/tmp/myfile.csv")
+            return local_path
+        ```
+
+        Args:
+            local_path: The local path to download the file to. If None, a temporary
+                directory will be used and a path will be generated.
+
+        Returns:
+            The absolute path to the downloaded file
+        """
+        if local_path is None:
+            local_path = storage.get_random_local_path(file_path_or_file_name=local_path)
+        else:
+            local_path = str(Path(local_path).absolute())
+
+        fs = storage.get_underlying_filesystem(path=self.path)
+
+        # If it's already a local file, just copy it
+        if "file" in fs.protocol:
+            # Use standard file operations for sync copy
+            import shutil
+
+            shutil.copy2(self.path, local_path)
+            return str(local_path)
+
+        # Otherwise download from remote using sync functionality
+        # Use the sync version of storage operations
+        with fs.open(self.path, "rb") as src:
+            with open(local_path, "wb") as dst:
+                dst.write(src.read())
+        return str(local_path)
+
     @classmethod
     @requires_initialization
-
+    def from_local_sync(
         cls,
         local_path: Union[str, Path],
         remote_destination: Optional[str] = None,
         hash_method: Optional[HashMethod | str] = None,
     ) -> File[T]:
         """
-
+        Synchronously create a new File object from a local file by uploading it to remote storage.
+
+        Use this in non-async tasks when you have a local file that needs to be uploaded to remote storage.
+
+        Example (Sync):
+
+        ```python
+        @env.task
+        def upload_local_file_sync() -> File:
+            # Create a local file
+            with open("/tmp/data.csv", "w") as f:
+                f.write("col1,col2\n1,2\n3,4\n")
+
+            # Upload to remote storage
+            remote_file = File.from_local_sync("/tmp/data.csv")
+            return remote_file
+        ```
+
+        Example (With specific destination):
+
+        ```python
+        @env.task
+        def upload_to_specific_path() -> File:
+            remote_file = File.from_local_sync("/tmp/data.csv", "s3://my-bucket/data.csv")
+            return remote_file
+        ```
 
         Args:
             local_path: Path to the local file
-            remote_destination: Optional path to store the file
-            hash_method:
-
-
-                If there is a set value you want to use, please pass an instance of the PrecomputedValue HashMethod.
+            remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
+            hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
+                it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
+                the hash during upload. If not specified, the cache key will be based on file attributes.
 
         Returns:
-            A new File instance pointing to the uploaded file
+            A new File instance pointing to the uploaded remote file
+        """
+        if not os.path.exists(local_path):
+            raise ValueError(f"File not found: {local_path}")
 
-
-
-
-
+        remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path()
+        protocol = get_protocol(remote_path)
+        filename = Path(local_path).name
+
+        # If remote_destination was not set by the user, and the configured raw data path is also local,
+        # then let's optimize by not uploading.
+        hash_value = hash_method if isinstance(hash_method, str) else None
+        hash_method_obj = hash_method if isinstance(hash_method, HashMethod) else None
+
+        if "file" in protocol:
+            if remote_destination is None:
+                path = str(Path(local_path).absolute())
+            else:
+                # Otherwise, actually make a copy of the file
+                import shutil
+
+                if hash_method_obj:
+                    # For hash computation, we need to read and write manually
+                    with open(local_path, "rb") as src:
+                        with open(remote_path, "wb") as dst:
+                            dst_wrapper = HashingWriter(dst, accumulator=hash_method_obj)
+                            dst_wrapper.write(src.read())
+                            hash_value = dst_wrapper.result()
+                            dst_wrapper.close()
+                else:
+                    shutil.copy2(local_path, remote_path)
+                path = str(Path(remote_path).absolute())
+        else:
+            # Otherwise upload to remote using sync storage layer
+            fs = storage.get_underlying_filesystem(path=remote_path)
+
+            if hash_method_obj:
+                # We can skip the wrapper if the hash method is just a precomputed value
+                if not isinstance(hash_method_obj, PrecomputedValue):
+                    with open(local_path, "rb") as src:
+                        # For sync operations, we need to compute hash manually
+                        data = src.read()
+                        hash_method_obj.update(memoryview(data))
+                        hash_value = hash_method_obj.result()
+
+                    # Now write the data to remote
+                    with fs.open(remote_path, "wb") as dst:
+                        dst.write(data)
+                    path = remote_path
+                else:
+                    # Use sync file operations
+                    with open(local_path, "rb") as src:
+                        with fs.open(remote_path, "wb") as dst:
+                            dst.write(src.read())
+                    path = remote_path
+                    hash_value = hash_method_obj.result()
+            else:
+                # Simple sync copy
+                with open(local_path, "rb") as src:
+                    with fs.open(remote_path, "wb") as dst:
+                        dst.write(src.read())
+                path = remote_path
+
+        f = cls(path=path, name=filename, hash_method=hash_method_obj, hash=hash_value)
+        return f
+
+    @classmethod
+    @requires_initialization
+    async def from_local(
+        cls,
+        local_path: Union[str, Path],
+        remote_destination: Optional[str] = None,
+        hash_method: Optional[HashMethod | str] = None,
+    ) -> File[T]:
+        """
+        Asynchronously create a new File object from a local file by uploading it to remote storage.
+
+        Use this in async tasks when you have a local file that needs to be uploaded to remote storage.
+
+        Example (Async):
+
+        ```python
+        @env.task
+        async def upload_local_file() -> File:
+            # Create a local file
+            async with aiofiles.open("/tmp/data.csv", "w") as f:
+                await f.write("col1,col2\n1,2\n3,4\n")
+
+            # Upload to remote storage
+            remote_file = await File.from_local("/tmp/data.csv")
+            return remote_file
+        ```
+
+        Example (With specific destination):
+
+        ```python
+        @env.task
+        async def upload_to_specific_path() -> File:
+            remote_file = await File.from_local("/tmp/data.csv", "s3://my-bucket/data.csv")
+            return remote_file
+        ```
+
+        Args:
+            local_path: Path to the local file
+            remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
+            hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
+                it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
+                the hash during upload. If not specified, the cache key will be based on file attributes.
+
+        Returns:
+            A new File instance pointing to the uploaded remote file
         """
         if not os.path.exists(local_path):
             raise ValueError(f"File not found: {local_path}")