flyte 2.0.0b32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic. Click here for more details.
- flyte/__init__.py +108 -0
- flyte/_bin/__init__.py +0 -0
- flyte/_bin/debug.py +38 -0
- flyte/_bin/runtime.py +195 -0
- flyte/_bin/serve.py +178 -0
- flyte/_build.py +26 -0
- flyte/_cache/__init__.py +12 -0
- flyte/_cache/cache.py +147 -0
- flyte/_cache/defaults.py +9 -0
- flyte/_cache/local_cache.py +216 -0
- flyte/_cache/policy_function_body.py +42 -0
- flyte/_code_bundle/__init__.py +8 -0
- flyte/_code_bundle/_ignore.py +121 -0
- flyte/_code_bundle/_packaging.py +218 -0
- flyte/_code_bundle/_utils.py +347 -0
- flyte/_code_bundle/bundle.py +266 -0
- flyte/_constants.py +1 -0
- flyte/_context.py +155 -0
- flyte/_custom_context.py +73 -0
- flyte/_debug/__init__.py +0 -0
- flyte/_debug/constants.py +38 -0
- flyte/_debug/utils.py +17 -0
- flyte/_debug/vscode.py +307 -0
- flyte/_deploy.py +408 -0
- flyte/_deployer.py +109 -0
- flyte/_doc.py +29 -0
- flyte/_docstring.py +32 -0
- flyte/_environment.py +122 -0
- flyte/_excepthook.py +37 -0
- flyte/_group.py +32 -0
- flyte/_hash.py +8 -0
- flyte/_image.py +1055 -0
- flyte/_initialize.py +628 -0
- flyte/_interface.py +119 -0
- flyte/_internal/__init__.py +3 -0
- flyte/_internal/controllers/__init__.py +129 -0
- flyte/_internal/controllers/_local_controller.py +239 -0
- flyte/_internal/controllers/_trace.py +48 -0
- flyte/_internal/controllers/remote/__init__.py +58 -0
- flyte/_internal/controllers/remote/_action.py +211 -0
- flyte/_internal/controllers/remote/_client.py +47 -0
- flyte/_internal/controllers/remote/_controller.py +583 -0
- flyte/_internal/controllers/remote/_core.py +465 -0
- flyte/_internal/controllers/remote/_informer.py +381 -0
- flyte/_internal/controllers/remote/_service_protocol.py +50 -0
- flyte/_internal/imagebuild/__init__.py +3 -0
- flyte/_internal/imagebuild/docker_builder.py +706 -0
- flyte/_internal/imagebuild/image_builder.py +277 -0
- flyte/_internal/imagebuild/remote_builder.py +386 -0
- flyte/_internal/imagebuild/utils.py +78 -0
- flyte/_internal/resolvers/__init__.py +0 -0
- flyte/_internal/resolvers/_task_module.py +21 -0
- flyte/_internal/resolvers/common.py +31 -0
- flyte/_internal/resolvers/default.py +28 -0
- flyte/_internal/runtime/__init__.py +0 -0
- flyte/_internal/runtime/convert.py +486 -0
- flyte/_internal/runtime/entrypoints.py +204 -0
- flyte/_internal/runtime/io.py +188 -0
- flyte/_internal/runtime/resources_serde.py +152 -0
- flyte/_internal/runtime/reuse.py +125 -0
- flyte/_internal/runtime/rusty.py +193 -0
- flyte/_internal/runtime/task_serde.py +362 -0
- flyte/_internal/runtime/taskrunner.py +209 -0
- flyte/_internal/runtime/trigger_serde.py +160 -0
- flyte/_internal/runtime/types_serde.py +54 -0
- flyte/_keyring/__init__.py +0 -0
- flyte/_keyring/file.py +115 -0
- flyte/_logging.py +300 -0
- flyte/_map.py +312 -0
- flyte/_module.py +72 -0
- flyte/_pod.py +30 -0
- flyte/_resources.py +473 -0
- flyte/_retry.py +32 -0
- flyte/_reusable_environment.py +102 -0
- flyte/_run.py +724 -0
- flyte/_secret.py +96 -0
- flyte/_task.py +550 -0
- flyte/_task_environment.py +316 -0
- flyte/_task_plugins.py +47 -0
- flyte/_timeout.py +47 -0
- flyte/_tools.py +27 -0
- flyte/_trace.py +119 -0
- flyte/_trigger.py +1000 -0
- flyte/_utils/__init__.py +30 -0
- flyte/_utils/asyn.py +121 -0
- flyte/_utils/async_cache.py +139 -0
- flyte/_utils/coro_management.py +27 -0
- flyte/_utils/docker_credentials.py +173 -0
- flyte/_utils/file_handling.py +72 -0
- flyte/_utils/helpers.py +134 -0
- flyte/_utils/lazy_module.py +54 -0
- flyte/_utils/module_loader.py +104 -0
- flyte/_utils/org_discovery.py +57 -0
- flyte/_utils/uv_script_parser.py +49 -0
- flyte/_version.py +34 -0
- flyte/app/__init__.py +22 -0
- flyte/app/_app_environment.py +157 -0
- flyte/app/_deploy.py +125 -0
- flyte/app/_input.py +160 -0
- flyte/app/_runtime/__init__.py +3 -0
- flyte/app/_runtime/app_serde.py +347 -0
- flyte/app/_types.py +101 -0
- flyte/app/extras/__init__.py +3 -0
- flyte/app/extras/_fastapi.py +151 -0
- flyte/cli/__init__.py +12 -0
- flyte/cli/_abort.py +28 -0
- flyte/cli/_build.py +114 -0
- flyte/cli/_common.py +468 -0
- flyte/cli/_create.py +371 -0
- flyte/cli/_delete.py +45 -0
- flyte/cli/_deploy.py +293 -0
- flyte/cli/_gen.py +176 -0
- flyte/cli/_get.py +370 -0
- flyte/cli/_option.py +33 -0
- flyte/cli/_params.py +554 -0
- flyte/cli/_plugins.py +209 -0
- flyte/cli/_run.py +597 -0
- flyte/cli/_serve.py +64 -0
- flyte/cli/_update.py +37 -0
- flyte/cli/_user.py +17 -0
- flyte/cli/main.py +221 -0
- flyte/config/__init__.py +3 -0
- flyte/config/_config.py +248 -0
- flyte/config/_internal.py +73 -0
- flyte/config/_reader.py +225 -0
- flyte/connectors/__init__.py +11 -0
- flyte/connectors/_connector.py +270 -0
- flyte/connectors/_server.py +197 -0
- flyte/connectors/utils.py +135 -0
- flyte/errors.py +243 -0
- flyte/extend.py +19 -0
- flyte/extras/__init__.py +5 -0
- flyte/extras/_container.py +286 -0
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +21 -0
- flyte/io/__init__.py +29 -0
- flyte/io/_dataframe/__init__.py +131 -0
- flyte/io/_dataframe/basic_dfs.py +223 -0
- flyte/io/_dataframe/dataframe.py +1026 -0
- flyte/io/_dir.py +910 -0
- flyte/io/_file.py +914 -0
- flyte/io/_hashing_io.py +342 -0
- flyte/models.py +479 -0
- flyte/py.typed +0 -0
- flyte/remote/__init__.py +35 -0
- flyte/remote/_action.py +738 -0
- flyte/remote/_app.py +57 -0
- flyte/remote/_client/__init__.py +0 -0
- flyte/remote/_client/_protocols.py +189 -0
- flyte/remote/_client/auth/__init__.py +12 -0
- flyte/remote/_client/auth/_auth_utils.py +14 -0
- flyte/remote/_client/auth/_authenticators/__init__.py +0 -0
- flyte/remote/_client/auth/_authenticators/base.py +403 -0
- flyte/remote/_client/auth/_authenticators/client_credentials.py +73 -0
- flyte/remote/_client/auth/_authenticators/device_code.py +117 -0
- flyte/remote/_client/auth/_authenticators/external_command.py +79 -0
- flyte/remote/_client/auth/_authenticators/factory.py +200 -0
- flyte/remote/_client/auth/_authenticators/pkce.py +516 -0
- flyte/remote/_client/auth/_channel.py +213 -0
- flyte/remote/_client/auth/_client_config.py +85 -0
- flyte/remote/_client/auth/_default_html.py +32 -0
- flyte/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- flyte/remote/_client/auth/_grpc_utils/auth_interceptor.py +288 -0
- flyte/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +151 -0
- flyte/remote/_client/auth/_keyring.py +152 -0
- flyte/remote/_client/auth/_token_client.py +260 -0
- flyte/remote/_client/auth/errors.py +16 -0
- flyte/remote/_client/controlplane.py +128 -0
- flyte/remote/_common.py +30 -0
- flyte/remote/_console.py +19 -0
- flyte/remote/_data.py +161 -0
- flyte/remote/_logs.py +185 -0
- flyte/remote/_project.py +88 -0
- flyte/remote/_run.py +386 -0
- flyte/remote/_secret.py +142 -0
- flyte/remote/_task.py +527 -0
- flyte/remote/_trigger.py +306 -0
- flyte/remote/_user.py +33 -0
- flyte/report/__init__.py +3 -0
- flyte/report/_report.py +182 -0
- flyte/report/_template.html +124 -0
- flyte/storage/__init__.py +36 -0
- flyte/storage/_config.py +237 -0
- flyte/storage/_parallel_reader.py +274 -0
- flyte/storage/_remote_fs.py +34 -0
- flyte/storage/_storage.py +456 -0
- flyte/storage/_utils.py +5 -0
- flyte/syncify/__init__.py +56 -0
- flyte/syncify/_api.py +375 -0
- flyte/types/__init__.py +52 -0
- flyte/types/_interface.py +40 -0
- flyte/types/_pickle.py +145 -0
- flyte/types/_renderer.py +162 -0
- flyte/types/_string_literals.py +119 -0
- flyte/types/_type_engine.py +2254 -0
- flyte/types/_utils.py +80 -0
- flyte-2.0.0b32.data/scripts/debug.py +38 -0
- flyte-2.0.0b32.data/scripts/runtime.py +195 -0
- flyte-2.0.0b32.dist-info/METADATA +351 -0
- flyte-2.0.0b32.dist-info/RECORD +204 -0
- flyte-2.0.0b32.dist-info/WHEEL +5 -0
- flyte-2.0.0b32.dist-info/entry_points.txt +7 -0
- flyte-2.0.0b32.dist-info/licenses/LICENSE +201 -0
- flyte-2.0.0b32.dist-info/top_level.txt +1 -0
flyte/io/_file.py
ADDED
|
@@ -0,0 +1,914 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import os
|
|
5
|
+
import typing
|
|
6
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import (
|
|
9
|
+
IO,
|
|
10
|
+
Annotated,
|
|
11
|
+
Any,
|
|
12
|
+
AsyncGenerator,
|
|
13
|
+
Dict,
|
|
14
|
+
Generator,
|
|
15
|
+
Generic,
|
|
16
|
+
Optional,
|
|
17
|
+
Type,
|
|
18
|
+
TypeVar,
|
|
19
|
+
Union,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
import aiofiles
|
|
23
|
+
from flyteidl2.core import literals_pb2, types_pb2
|
|
24
|
+
from fsspec.utils import get_protocol
|
|
25
|
+
from mashumaro.types import SerializableType
|
|
26
|
+
from pydantic import BaseModel, Field, model_validator
|
|
27
|
+
from pydantic.json_schema import SkipJsonSchema
|
|
28
|
+
|
|
29
|
+
import flyte.errors
|
|
30
|
+
import flyte.storage as storage
|
|
31
|
+
from flyte._context import internal_ctx
|
|
32
|
+
from flyte._initialize import requires_initialization
|
|
33
|
+
from flyte.io._hashing_io import AsyncHashingReader, HashingWriter, HashMethod, PrecomputedValue
|
|
34
|
+
from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
|
|
35
|
+
|
|
36
|
+
if typing.TYPE_CHECKING:
|
|
37
|
+
from obstore import AsyncReadableFile, AsyncWritableFile
|
|
38
|
+
|
|
39
|
+
if typing.TYPE_CHECKING:
|
|
40
|
+
from obstore import AsyncReadableFile, AsyncWritableFile
|
|
41
|
+
|
|
42
|
+
# Type variable for the file format
|
|
43
|
+
T = TypeVar("T")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class File(BaseModel, Generic[T], SerializableType):
|
|
47
|
+
"""
|
|
48
|
+
A generic file class representing a file with a specified format.
|
|
49
|
+
Provides both async and sync interfaces for file operations. All methods without _sync suffix are async.
|
|
50
|
+
|
|
51
|
+
The class should be instantiated using one of the class methods. The constructor should be used only to
|
|
52
|
+
instantiate references to existing remote objects.
|
|
53
|
+
|
|
54
|
+
The generic type T represents the format of the file.
|
|
55
|
+
|
|
56
|
+
Important methods:
|
|
57
|
+
- `from_existing_remote`: Create a File object from an existing remote file.
|
|
58
|
+
- `new_remote`: Create a new File reference for a remote file that will be written to.
|
|
59
|
+
|
|
60
|
+
**Asynchronous methods**:
|
|
61
|
+
- `open`: Asynchronously open the file and return a file-like object.
|
|
62
|
+
- `download`: Asynchronously download the file to a local path.
|
|
63
|
+
- `from_local`: Asynchronously create a File object from a local file, uploading it to remote storage.
|
|
64
|
+
- `exists`: Asynchronously check if the file exists.
|
|
65
|
+
|
|
66
|
+
**Synchronous methods** (suffixed with `_sync`):
|
|
67
|
+
- `open_sync`: Synchronously open the file and return a file-like object.
|
|
68
|
+
- `download_sync`: Synchronously download the file to a local path.
|
|
69
|
+
- `from_local_sync`: Synchronously create a File object from a local file, uploading it to remote storage.
|
|
70
|
+
- `exists_sync`: Synchronously check if the file exists.
|
|
71
|
+
|
|
72
|
+
Example: Read a file input in a Task (Async).
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
@env.task
|
|
76
|
+
async def read_file(file: File) -> str:
|
|
77
|
+
async with file.open("rb") as f:
|
|
78
|
+
content = bytes(await f.read())
|
|
79
|
+
return content.decode("utf-8")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Example: Read a file input in a Task (Sync).
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
@env.task
|
|
86
|
+
def read_file_sync(file: File) -> str:
|
|
87
|
+
with file.open_sync("rb") as f:
|
|
88
|
+
content = f.read()
|
|
89
|
+
return content.decode("utf-8")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Example: Write a file by streaming it directly to blob storage (Async).
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
@env.task
|
|
96
|
+
async def write_file() -> File:
|
|
97
|
+
file = File.new_remote()
|
|
98
|
+
async with file.open("wb") as f:
|
|
99
|
+
await f.write(b"Hello, World!")
|
|
100
|
+
return file
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Example: Upload a local file to remote storage (Async).
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
@env.task
|
|
107
|
+
async def upload_file() -> File:
|
|
108
|
+
# Write to local file first
|
|
109
|
+
with open("/tmp/data.csv", "w") as f:
|
|
110
|
+
f.write("col1,col2\\n1,2\\n3,4\\n")
|
|
111
|
+
# Upload to remote storage
|
|
112
|
+
return await File.from_local("/tmp/data.csv")
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Example: Upload a local file to remote storage (Sync).
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
@env.task
|
|
119
|
+
def upload_file_sync() -> File:
|
|
120
|
+
# Write to local file first
|
|
121
|
+
with open("/tmp/data.csv", "w") as f:
|
|
122
|
+
f.write("col1,col2\\n1,2\\n3,4\\n")
|
|
123
|
+
# Upload to remote storage
|
|
124
|
+
return File.from_local_sync("/tmp/data.csv")
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Example: Download a file to local storage (Async).
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
@env.task
|
|
131
|
+
async def download_file(file: File) -> str:
|
|
132
|
+
local_path = await file.download()
|
|
133
|
+
# Process the local file
|
|
134
|
+
with open(local_path, "r") as f:
|
|
135
|
+
return f.read()
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Example: Download a file to local storage (Sync).
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
@env.task
|
|
142
|
+
def download_file_sync(file: File) -> str:
|
|
143
|
+
local_path = file.download_sync()
|
|
144
|
+
# Process the local file
|
|
145
|
+
with open(local_path, "r") as f:
|
|
146
|
+
return f.read()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Example: Reference an existing remote file.
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
@env.task
|
|
153
|
+
async def process_existing_file() -> str:
|
|
154
|
+
file = File.from_existing_remote("s3://my-bucket/data.csv")
|
|
155
|
+
async with file.open("rb") as f:
|
|
156
|
+
content = await f.read()
|
|
157
|
+
return content.decode("utf-8")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Example: Check if a file exists (Async).
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
@env.task
|
|
164
|
+
async def check_file(file: File) -> bool:
|
|
165
|
+
return await file.exists()
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Example: Check if a file exists (Sync).
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
@env.task
|
|
172
|
+
def check_file_sync(file: File) -> bool:
|
|
173
|
+
return file.exists_sync()
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Example: Pass through a file without copying.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
@env.task
|
|
180
|
+
async def pass_through(file: File) -> File:
|
|
181
|
+
# No copy occurs - just passes the reference
|
|
182
|
+
return file
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
path: The path to the file (can be local or remote)
|
|
187
|
+
name: Optional name for the file (defaults to basename of path)
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
path: str  # URI of the file; may be a local filesystem path or a remote object-store URI (e.g. s3://bucket/key).
name: Optional[str] = None  # Display name; defaulted to the basename of `path` by the pre_init validator.
format: str = ""  # File format identifier; empty string means unspecified.
hash: Optional[str] = None  # Optional precomputed content hash / cache key for this file.
# Strategy used to compute `hash` while data is written; excluded from
# serialization (Field(exclude=True)) and from the JSON schema (SkipJsonSchema).
hash_method: Annotated[Optional[HashMethod], Field(default=None, exclude=True), SkipJsonSchema()] = None

class Config:
    # Allow non-Pydantic types (e.g. HashMethod) to be stored as field values.
    arbitrary_types_allowed = True
|
|
198
|
+
|
|
199
|
+
@model_validator(mode="before")
@classmethod
def pre_init(cls, data):
    """Internal: Pydantic validator to set default name from path. Not intended for direct use.

    Runs before field validation. When validating from a dict that carries a
    ``path`` but no ``name``, ``name`` defaults to the basename of the path.

    A ``mode="before"`` validator may receive non-dict payloads (e.g. an
    existing model instance during re-validation); those are passed through
    untouched. Likewise, a dict missing ``path`` is passed through so Pydantic
    reports its own clearer missing-field error instead of a ``KeyError`` here.

    Args:
        data: The raw input being validated (usually a dict of field values).

    Returns:
        The (possibly augmented) input, forwarded to normal field validation.
    """
    if isinstance(data, dict) and data.get("name") is None:
        path = data.get("path")
        if path is not None:
            data["name"] = Path(path).name
    return data
|
|
206
|
+
|
|
207
|
+
def _serialize(self) -> Dict[str, Optional[str]]:
    """Internal: Serialize this File to a plain dictionary. Not intended for direct use.

    Delegates to Pydantic's ``model_dump``; fields marked ``exclude=True``
    (``hash_method``) are omitted by their Field definition.
    """
    return self.model_dump()
|
|
211
|
+
|
|
212
|
+
@classmethod
def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> File:
    """Internal: Deserialize a File from a dictionary. Not intended for direct use.

    Uses ``cls`` (rather than hard-coding ``File``) so that subclasses
    round-trip back to their own type; behavior is unchanged for ``File``
    itself.

    Args:
        file_dump: Dictionary of field values, as produced by ``_serialize``.

    Returns:
        A validated instance of ``cls``.
    """
    return cls.model_validate(file_dump)
|
|
216
|
+
|
|
217
|
+
@classmethod
def schema_match(cls, incoming: dict) -> bool:
    """Internal: Check whether an incoming JSON schema matches this model's schema.

    Two schemas match when both declare a non-empty ``required`` list, their
    ``type`` and ``title`` entries are equal, and the required field sets are
    identical.

    Args:
        incoming: A JSON-schema dictionary to compare against
            ``cls.model_json_schema()``.

    Returns:
        True if the schemas match, False otherwise. (The original
        implementation implicitly returned ``None`` on mismatch; an explicit
        bool is returned now — both are falsy, so callers are unaffected.)
    """
    this_schema = cls.model_json_schema()
    current_required = this_schema.get("required")
    incoming_required = incoming.get("required")
    return bool(
        current_required
        and incoming_required
        and incoming.get("type") == this_schema.get("type")
        and incoming.get("title") == this_schema.get("title")
        and set(current_required) == set(incoming_required)
    )
|
|
231
|
+
|
|
232
|
+
@classmethod
@requires_initialization
def new_remote(cls, file_name: Optional[str] = None, hash_method: Optional[HashMethod | str] = None) -> File[T]:
    """
    Create a new File reference for a remote file that will be written to.

    Use this when you want to create a new file and write to it directly,
    without producing a local file first.

    Example (Async):

    ```python
    @env.task
    async def create_csv() -> File:
        df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
        file = File.new_remote()
        async with file.open("wb") as f:
            df.to_csv(f)
        return file
    ```

    Args:
        file_name: Optional string specifying a remote file name. If not set,
            a generated file name will be returned.
        hash_method: Optional HashMethod or string for cache key computation.
            A string is used as a precomputed cache key; a HashMethod is used
            to compute the hash as data is written.

    Returns:
        A new File instance with a generated remote path.
    """
    # A plain string is treated as a precomputed cache key; a HashMethod is
    # carried on the instance so the hash can be computed during writes.
    precomputed_key: Optional[str] = None
    method: Optional[HashMethod] = None
    if isinstance(hash_method, str):
        precomputed_key = hash_method
    elif isinstance(hash_method, HashMethod):
        method = hash_method

    remote_path = internal_ctx().raw_data.get_random_remote_path(file_name=file_name)
    return cls(path=remote_path, hash=precomputed_key, hash_method=method)
|
|
269
|
+
|
|
270
|
+
@classmethod
def from_existing_remote(cls, remote_path: str, file_cache_key: Optional[str] = None) -> File[T]:
    """
    Create a File reference for a file that already exists in remote storage.

    No upload occurs — the returned object simply points at the given path.

    Example:

    ```python
    @env.task
    async def process_existing_file() -> str:
        file = File.from_existing_remote("s3://my-bucket/data.csv")
        async with file.open("rb") as f:
            content = await f.read()
        return content.decode("utf-8")
    ```

    Args:
        remote_path: The remote path to the existing file.
        file_cache_key: Optional hash value used as the cache key. If omitted,
            the cache key is computed from the file's attributes
            (path, name, format).

    Returns:
        A new File instance pointing at the existing remote file.
    """
    file_ref = cls(path=remote_path, hash=file_cache_key)
    return file_ref
|
|
297
|
+
|
|
298
|
+
@asynccontextmanager
async def open(
    self,
    mode: str = "rb",
    block_size: Optional[int] = None,
    cache_type: str = "readahead",
    cache_options: Optional[dict] = None,
    compression: Optional[str] = None,
    **kwargs,
) -> AsyncGenerator[Union[AsyncWritableFile, AsyncReadableFile, "HashingWriter"], None]:
    """
    Asynchronously open the file and return a file-like object.

    Use this method in async tasks to read from or write to files directly.

    Example (Async Read):

    ```python
    @env.task
    async def read_file(f: File) -> str:
        async with f.open("rb") as fh:
            content = bytes(await fh.read())
        return content.decode("utf-8")
    ```

    Example (Async Write):

    ```python
    @env.task
    async def write_file() -> File:
        f = File.new_remote()
        async with f.open("wb") as fh:
            await fh.write(b"Hello, World!")
        return f
    ```

    Example (Streaming Read):

    ```python
    @env.task
    async def stream_read(f: File) -> str:
        content_parts = []
        async with f.open("rb", block_size=1024) as fh:
            while True:
                chunk = await fh.read()
                if not chunk:
                    break
                content_parts.append(chunk)
        return b"".join(content_parts).decode("utf-8")
    ```

    Args:
        mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
            'wb' (write binary), 'rt' (read text), 'wt' (write text)
        block_size: Size of blocks for reading in bytes. Useful for streaming large files.
        cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
        cache_options: Dictionary of options for the cache
        compression: Compression format or None for auto-detection
        **kwargs: Additional arguments passed to fsspec's open method

    Returns:
        An async file-like object that can be used with async read/write operations
    """
    # Check if we should use obstore bypass
    try:
        # Primary path: let the storage layer pick the async-capable backend.
        fh = await storage.open(
            self.path,
            mode=mode,
            cache_type=cache_type,
            cache_options=cache_options,
            compression=compression,
            block_size=block_size,
            **kwargs,
        )
        try:
            # Hand the handle to the caller; control resumes here when the
            # `async with` block exits (normally or via exception).
            yield fh
            return
        finally:
            # The handle's close() may be a coroutine (async backend) or a
            # plain function — support both so the handle is always closed.
            if inspect.iscoroutinefunction(fh.close):
                await fh.close()
            else:
                fh.close()
    except flyte.errors.OnlyAsyncIOSupportedError:
        # Fall back to aiofiles
        # Raised before the handle was produced, so nothing to close here.
        fs = storage.get_underlying_filesystem(path=self.path)
        if "file" in fs.protocol:
            # Local filesystem fallback. NOTE(review): block_size/cache/
            # compression arguments are not forwarded to aiofiles here —
            # presumably irrelevant for local files; confirm if that changes.
            async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
                yield f
            return
        # Non-local backend with no async support: propagate the error.
        raise
|
|
388
|
+
|
|
389
|
+
async def exists(self) -> bool:
    """
    Asynchronously check whether this file exists in its backing store.

    Example (Async):

    ```python
    @env.task
    async def check_file(f: File) -> bool:
        if await f.exists():
            print("File exists!")
            return True
        return False
    ```

    Returns:
        True if the file exists, False otherwise.
    """
    file_is_present = await storage.exists(self.path)
    return file_is_present
|
|
408
|
+
|
|
409
|
+
def exists_sync(self) -> bool:
    """
    Synchronously check whether this file exists in its backing store.

    Use this in non-async tasks where awaiting is not possible.

    Example (Sync):

    ```python
    @env.task
    def check_file_sync(f: File) -> bool:
        if f.exists_sync():
            print("File exists!")
            return True
        return False
    ```

    Returns:
        True if the file exists, False otherwise.
    """
    file_is_present = storage.exists_sync(self.path)
    return file_is_present
|
|
430
|
+
|
|
431
|
+
@contextmanager
def open_sync(
    self,
    mode: str = "rb",
    block_size: Optional[int] = None,
    cache_type: str = "readahead",
    cache_options: Optional[dict] = None,
    compression: Optional[str] = None,
    **kwargs,
) -> Generator[IO[Any], None, None]:
    """
    Synchronously open the file and return a file-like object.

    Use this method in non-async tasks to read from or write to files directly.

    Example (Sync Read):

    ```python
    @env.task
    def read_file_sync(f: File) -> str:
        with f.open_sync("rb") as fh:
            content = fh.read()
        return content.decode("utf-8")
    ```

    Example (Sync Write):

    ```python
    @env.task
    def write_file_sync() -> File:
        f = File.new_remote()
        with f.open_sync("wb") as fh:
            fh.write(b"Hello, World!")
        return f
    ```

    Args:
        mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
            'wb' (write binary), 'rt' (read text), 'wt' (write text)
        block_size: Size of blocks for reading in bytes. Useful for streaming large files.
        cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
        cache_options: Dictionary of options for the cache
        compression: Compression format or None for auto-detection
        **kwargs: Additional arguments passed to fsspec's open method

    Returns:
        A file-like object that can be used with standard read/write operations
    """
    filesystem = storage.get_underlying_filesystem(path=self.path)

    # Assemble the fsspec open() arguments. `mode` and `compression` are
    # dedicated parameters, so they cannot collide with **kwargs.
    open_kwargs: Dict[str, Any] = dict(kwargs)
    open_kwargs["mode"] = mode
    open_kwargs["compression"] = compression
    if block_size:
        open_kwargs["block_size"] = block_size

    # cache_type "none" disables fsspec read caching entirely.
    if cache_type != "none":
        open_kwargs["cache_type"] = cache_type
        open_kwargs["cache_options"] = cache_options if cache_options is not None else {}

    with filesystem.open(self.path, **open_kwargs) as handle:
        yield handle
|
|
498
|
+
|
|
499
|
+
async def download(self, local_path: Optional[Union[str, Path]] = None) -> str:
    """
    Asynchronously download the file to a local path.

    Use this when you need to download a remote file to your local filesystem for processing.

    Example (Async):

    ```python
    @env.task
    async def download_and_process(f: File) -> str:
        local_path = await f.download()
        # Now process the local file
        with open(local_path, "r") as fh:
            return fh.read()
    ```

    Example (Download to specific path):

    ```python
    @env.task
    async def download_to_path(f: File) -> str:
        local_path = await f.download("/tmp/myfile.csv")
        return local_path
    ```

    Args:
        local_path: The local path to download the file to. If None, a temporary
            directory will be used and a path will be generated. A path ending
            in a separator (or an existing directory) means "download into this
            directory, keeping the remote file name".

    Returns:
        The absolute path to the downloaded file.
    """
    if local_path is None:
        local_path = storage.get_random_local_path(file_path_or_file_name=self.path)
    else:
        # Preserve trailing separator if present (Path.absolute() strips it)
        local_path_str = str(local_path)
        has_trailing_sep = local_path_str.endswith(os.sep)
        local_path = str(Path(local_path).absolute())
        if has_trailing_sep:
            local_path = local_path + os.sep

    fs = storage.get_underlying_filesystem(path=self.path)

    # If it's already a local file, just copy it
    if "file" in fs.protocol:
        # Apply directory logic for local-to-local copies
        local_path_for_copy = local_path
        if isinstance(local_path, str):
            local_path_obj = Path(local_path)
            # A trailing separator or an existing directory means "copy into".
            if local_path.endswith(os.sep) or (local_path_obj.exists() and local_path_obj.is_dir()):
                remote_filename = Path(self.path).name
                local_path_for_copy = str(local_path_obj / remote_filename)

        # Ensure parent directory exists
        Path(local_path_for_copy).parent.mkdir(parents=True, exist_ok=True)

        # Stream in fixed-size chunks instead of buffering the whole file in
        # memory at once (the previous implementation read the entire file
        # with a single src.read()).
        chunk_size = 1024 * 1024
        async with aiofiles.open(self.path, "rb") as src:
            async with aiofiles.open(local_path_for_copy, "wb") as dst:
                while chunk := await src.read(chunk_size):
                    await dst.write(chunk)
        return str(local_path_for_copy)

    # Otherwise download from remote using async functionality
    result_path = await storage.get(self.path, str(local_path))
    return result_path
|
|
568
|
+
|
|
569
|
+
def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
    """
    Synchronously download the file to a local path.

    Use this in non-async tasks when you need to download a remote file to your local filesystem.

    Example (Sync):

    ```python
    @env.task
    def download_and_process_sync(f: File) -> str:
        local_path = f.download_sync()
        # Now process the local file
        with open(local_path, "r") as fh:
            return fh.read()
    ```

    Example (Download to specific path):

    ```python
    @env.task
    def download_to_path_sync(f: File) -> str:
        local_path = f.download_sync("/tmp/myfile.csv")
        return local_path
    ```

    Args:
        local_path: The local path to download the file to. If None, a temporary
            directory will be used and a path will be generated. A path ending
            in a separator (or an existing directory) means "download into this
            directory, keeping the remote file name".

    Returns:
        The absolute path to the downloaded file.
    """
    import shutil

    if local_path is None:
        local_path = storage.get_random_local_path(file_path_or_file_name=self.path)
    else:
        # Preserve trailing separator if present (Path.absolute() strips it)
        local_path_str = str(local_path)
        has_trailing_sep = local_path_str.endswith(os.sep)
        local_path = str(Path(local_path).absolute())
        if has_trailing_sep:
            local_path = local_path + os.sep

    fs = storage.get_underlying_filesystem(path=self.path)

    # If it's already a local file, just copy it
    if "file" in fs.protocol:
        # Apply directory logic for local-to-local copies
        local_path_for_copy = local_path
        if isinstance(local_path, str):
            local_path_obj = Path(local_path)
            # A trailing separator or an existing directory means "copy into".
            if local_path.endswith(os.sep) or (local_path_obj.exists() and local_path_obj.is_dir()):
                remote_filename = Path(self.path).name
                local_path_for_copy = str(local_path_obj / remote_filename)

        # Ensure parent directory exists
        Path(local_path_for_copy).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(self.path, local_path_for_copy)
        return str(local_path_for_copy)

    # Otherwise download from remote. Stream the object to disk in chunks
    # (shutil.copyfileobj) instead of buffering the whole file in memory as
    # the previous single src.read() did.
    with fs.open(self.path, "rb") as src:
        with open(local_path, "wb") as dst:
            shutil.copyfileobj(src, dst)
    return str(local_path)
|
|
640
|
+
|
|
641
|
+
@classmethod
@requires_initialization
def from_local_sync(
    cls,
    local_path: Union[str, Path],
    remote_destination: Optional[str] = None,
    hash_method: Optional[HashMethod | str] = None,
) -> File[T]:
    """
    Synchronously create a new File object from a local file by uploading it to remote storage.

    Use this in non-async tasks when you have a local file that needs to be uploaded to remote storage.

    Example (Sync):

    ```python
    @env.task
    def upload_local_file_sync() -> File:
        # Create a local file
        with open("/tmp/data.csv", "w") as f:
            f.write("col1,col2\n1,2\n3,4\n")

        # Upload to remote storage
        remote_file = File.from_local_sync("/tmp/data.csv")
        return remote_file
    ```

    Example (With specific destination):

    ```python
    @env.task
    def upload_to_specific_path() -> File:
        remote_file = File.from_local_sync("/tmp/data.csv", "s3://my-bucket/data.csv")
        return remote_file
    ```

    Args:
        local_path: Path to the local file
        remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
        hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
                     it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
                     the hash during upload. If not specified, the cache key will be based on file attributes.

    Returns:
        A new File instance pointing to the uploaded remote file

    Raises:
        ValueError: If the local file does not exist.
    """
    if not os.path.exists(local_path):
        raise ValueError(f"File not found: {local_path}")

    filename = Path(local_path).name
    # Pass the filename so an auto-generated remote path preserves the original
    # file name — this matches the behavior of the async `from_local`, which
    # previously diverged from this sync variant.
    remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(filename)
    protocol = get_protocol(remote_path)

    # A plain string hash_method is treated as a precomputed cache key;
    # a HashMethod instance computes the hash during the copy/upload.
    hash_value = hash_method if isinstance(hash_method, str) else None
    hash_method_obj = hash_method if isinstance(hash_method, HashMethod) else None

    if "file" in protocol:
        if remote_destination is None:
            # The configured raw data path is also local; optimize by not copying.
            path = str(Path(local_path).absolute())
        else:
            # Otherwise, actually make a copy of the file
            import shutil

            if hash_method_obj:
                # Stream through HashingWriter so the hash is accumulated while writing.
                with open(local_path, "rb") as src:
                    with open(remote_path, "wb") as dst:
                        dst_wrapper = HashingWriter(dst, accumulator=hash_method_obj)
                        dst_wrapper.write(src.read())
                        hash_value = dst_wrapper.result()
                        dst_wrapper.close()
            else:
                shutil.copy2(local_path, remote_path)
            path = str(Path(remote_path).absolute())
    else:
        # Otherwise upload to remote using the sync storage layer.
        fs = storage.get_underlying_filesystem(path=remote_path)

        if hash_method_obj:
            # We can skip the hash computation if the hash method is just a precomputed value.
            if not isinstance(hash_method_obj, PrecomputedValue):
                with open(local_path, "rb") as src:
                    # For sync operations, compute the hash manually over the file bytes.
                    data = src.read()
                    hash_method_obj.update(memoryview(data))
                    hash_value = hash_method_obj.result()

                # Now write the data to remote
                with fs.open(remote_path, "wb") as dst:
                    dst.write(data)
                path = remote_path
            else:
                with open(local_path, "rb") as src:
                    with fs.open(remote_path, "wb") as dst:
                        dst.write(src.read())
                path = remote_path
                hash_value = hash_method_obj.result()
        else:
            # Simple sync copy with no hashing.
            with open(local_path, "rb") as src:
                with fs.open(remote_path, "wb") as dst:
                    dst.write(src.read())
            path = remote_path

    f = cls(path=path, name=filename, hash_method=hash_method_obj, hash=hash_value)
    return f
|
750
|
+
|
|
751
|
+
@classmethod
@requires_initialization
async def from_local(
    cls,
    local_path: Union[str, Path],
    remote_destination: Optional[str] = None,
    hash_method: Optional[HashMethod | str] = None,
) -> File[T]:
    """
    Asynchronously create a new File object from a local file by uploading it to remote storage.

    Use this in async tasks when you have a local file that needs to be uploaded to remote storage.

    Example (Async):

    ```python
    @env.task
    async def upload_local_file() -> File:
        # Create a local file
        async with aiofiles.open("/tmp/data.csv", "w") as f:
            await f.write("col1,col2\n1,2\n3,4\n")

        # Upload to remote storage
        remote_file = await File.from_local("/tmp/data.csv")
        return remote_file
    ```

    Example (With specific destination):

    ```python
    @env.task
    async def upload_to_specific_path() -> File:
        remote_file = await File.from_local("/tmp/data.csv", "s3://my-bucket/data.csv")
        return remote_file
    ```

    Args:
        local_path: Path to the local file
        remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
        hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
                     it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
                     the hash during upload. If not specified, the cache key will be based on file attributes.

    Returns:
        A new File instance pointing to the uploaded remote file

    Raises:
        ValueError: If the local file does not exist.
    """
    if not os.path.exists(local_path):
        raise ValueError(f"File not found: {local_path}")

    filename = Path(local_path).name
    remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(filename)
    protocol = get_protocol(remote_path)

    # If remote_destination was not set by the user, and the configured raw data path is also local,
    # then let's optimize by not uploading.
    # A plain string hash_method is treated as a precomputed cache key.
    hash_value = hash_method if isinstance(hash_method, str) else None
    # From here on, hash_method is either a HashMethod instance or None.
    hash_method = hash_method if isinstance(hash_method, HashMethod) else None
    if "file" in protocol:
        if remote_destination is None:
            # Local raw data path and no explicit destination: skip the copy.
            path = str(Path(local_path).absolute())
        else:
            # Otherwise, actually make a copy of the file
            async with aiofiles.open(local_path, "rb") as src:
                async with aiofiles.open(remote_path, "wb") as dst:
                    if hash_method:
                        # Wrap the destination so the hash is accumulated while writing.
                        dst_wrapper = HashingWriter(dst, accumulator=hash_method)
                        await dst_wrapper.write(await src.read())
                        hash_value = dst_wrapper.result()
                    else:
                        await dst.write(await src.read())
            path = str(Path(remote_path).absolute())
    else:
        # Otherwise upload to remote using async storage layer
        if hash_method:
            # We can skip the wrapper if the hash method is just a precomputed value
            if not isinstance(hash_method, PrecomputedValue):
                async with aiofiles.open(local_path, "rb") as src:
                    # Hash is accumulated while streaming the upload.
                    src_wrapper = AsyncHashingReader(src, accumulator=hash_method)
                    path = await storage.put_stream(src_wrapper, to_path=remote_path)
                    hash_value = src_wrapper.result()
            else:
                path = await storage.put(str(local_path), remote_path)
                hash_value = hash_method.result()
        else:
            path = await storage.put(str(local_path), remote_path)

    f = cls(path=path, name=filename, hash_method=hash_method, hash=hash_value)
    return f
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
class FileTransformer(TypeTransformer[File]):
    """
    Transformer for File objects. This type transformer does not handle any i/o. That is now the responsibility of the
    user.
    """

    def __init__(self):
        super().__init__(name="File", t=File)

    def get_literal_type(self, t: Type[File]) -> types_pb2.LiteralType:
        """Get the Flyte literal type for a File type."""
        blob_type = types_pb2.BlobType(
            # todo: set format from generic
            format="",  # Format is determined by the generic type T
            dimensionality=types_pb2.BlobType.BlobDimensionality.SINGLE,
        )
        return types_pb2.LiteralType(blob=blob_type)

    async def to_literal(
        self,
        python_val: File,
        python_type: Type[File],
        expected: types_pb2.LiteralType,
    ) -> literals_pb2.Literal:
        """Convert a File object to a Flyte literal."""
        if not isinstance(python_val, File):
            raise TypeTransformerFailedError(f"Expected File object, received {type(python_val)}")

        # Build the blob literal piece by piece for readability.
        blob_metadata = literals_pb2.BlobMetadata(
            type=types_pb2.BlobType(
                format=python_val.format,
                dimensionality=types_pb2.BlobType.BlobDimensionality.SINGLE,
            )
        )
        blob = literals_pb2.Blob(metadata=blob_metadata, uri=python_val.path)
        return literals_pb2.Literal(
            scalar=literals_pb2.Scalar(blob=blob),
            hash=python_val.hash if python_val.hash else None,
        )

    async def to_python_value(
        self,
        lv: literals_pb2.Literal,
        expected_python_type: Type[File],
    ) -> File:
        """Convert a Flyte literal to a File object."""
        if not lv.scalar.HasField("blob"):
            raise TypeTransformerFailedError(f"Expected blob literal, received {lv}")
        if not lv.scalar.blob.metadata.type.dimensionality == types_pb2.BlobType.BlobDimensionality.SINGLE:
            raise TypeTransformerFailedError(
                f"Expected single part blob, received {lv.scalar.blob.metadata.type.dimensionality}"
            )

        blob_uri = lv.scalar.blob.uri
        result: File = File(
            path=blob_uri,
            name=Path(blob_uri).name,
            format=lv.scalar.blob.metadata.type.format,
            hash=lv.hash if lv.hash else None,
        )
        return result

    def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[File]:
        """Guess the Python type from a Flyte literal type."""
        is_single_part_blob = (
            literal_type.HasField("blob")
            and literal_type.blob.dimensionality == types_pb2.BlobType.BlobDimensionality.SINGLE
        )
        if is_single_part_blob and literal_type.blob.format != "PythonPickle":  # see pickle transformer
            return File
        raise ValueError(f"Cannot guess python type from {literal_type}")
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
# Register the transformer at import time so the TypeEngine can convert File values.
TypeEngine.register(FileTransformer())
|