flyte 2.0.0b13__py3-none-any.whl → 2.0.0b30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyte/__init__.py +18 -2
- flyte/_bin/debug.py +38 -0
- flyte/_bin/runtime.py +62 -8
- flyte/_cache/cache.py +4 -2
- flyte/_cache/local_cache.py +216 -0
- flyte/_code_bundle/_ignore.py +12 -4
- flyte/_code_bundle/_packaging.py +13 -9
- flyte/_code_bundle/_utils.py +18 -10
- flyte/_code_bundle/bundle.py +17 -9
- flyte/_constants.py +1 -0
- flyte/_context.py +4 -1
- flyte/_custom_context.py +73 -0
- flyte/_debug/constants.py +38 -0
- flyte/_debug/utils.py +17 -0
- flyte/_debug/vscode.py +307 -0
- flyte/_deploy.py +235 -61
- flyte/_environment.py +20 -6
- flyte/_excepthook.py +1 -1
- flyte/_hash.py +1 -16
- flyte/_image.py +178 -81
- flyte/_initialize.py +132 -51
- flyte/_interface.py +39 -2
- flyte/_internal/controllers/__init__.py +4 -5
- flyte/_internal/controllers/_local_controller.py +70 -29
- flyte/_internal/controllers/_trace.py +1 -1
- flyte/_internal/controllers/remote/__init__.py +0 -2
- flyte/_internal/controllers/remote/_action.py +14 -16
- flyte/_internal/controllers/remote/_client.py +1 -1
- flyte/_internal/controllers/remote/_controller.py +68 -70
- flyte/_internal/controllers/remote/_core.py +127 -99
- flyte/_internal/controllers/remote/_informer.py +19 -10
- flyte/_internal/controllers/remote/_service_protocol.py +7 -7
- flyte/_internal/imagebuild/docker_builder.py +181 -69
- flyte/_internal/imagebuild/image_builder.py +0 -5
- flyte/_internal/imagebuild/remote_builder.py +155 -64
- flyte/_internal/imagebuild/utils.py +51 -2
- flyte/_internal/resolvers/_task_module.py +5 -38
- flyte/_internal/resolvers/default.py +2 -2
- flyte/_internal/runtime/convert.py +110 -21
- flyte/_internal/runtime/entrypoints.py +27 -1
- flyte/_internal/runtime/io.py +21 -8
- flyte/_internal/runtime/resources_serde.py +20 -6
- flyte/_internal/runtime/reuse.py +1 -1
- flyte/_internal/runtime/rusty.py +20 -5
- flyte/_internal/runtime/task_serde.py +34 -19
- flyte/_internal/runtime/taskrunner.py +22 -4
- flyte/_internal/runtime/trigger_serde.py +160 -0
- flyte/_internal/runtime/types_serde.py +1 -1
- flyte/_keyring/__init__.py +0 -0
- flyte/_keyring/file.py +115 -0
- flyte/_logging.py +201 -39
- flyte/_map.py +111 -14
- flyte/_module.py +70 -0
- flyte/_pod.py +4 -3
- flyte/_resources.py +213 -31
- flyte/_run.py +110 -39
- flyte/_task.py +75 -16
- flyte/_task_environment.py +105 -29
- flyte/_task_plugins.py +4 -2
- flyte/_trace.py +5 -0
- flyte/_trigger.py +1000 -0
- flyte/_utils/__init__.py +2 -1
- flyte/_utils/asyn.py +3 -1
- flyte/_utils/coro_management.py +2 -1
- flyte/_utils/docker_credentials.py +173 -0
- flyte/_utils/module_loader.py +17 -2
- flyte/_version.py +3 -3
- flyte/cli/_abort.py +3 -3
- flyte/cli/_build.py +3 -6
- flyte/cli/_common.py +78 -7
- flyte/cli/_create.py +182 -4
- flyte/cli/_delete.py +23 -1
- flyte/cli/_deploy.py +63 -16
- flyte/cli/_get.py +79 -34
- flyte/cli/_params.py +26 -10
- flyte/cli/_plugins.py +209 -0
- flyte/cli/_run.py +151 -26
- flyte/cli/_serve.py +64 -0
- flyte/cli/_update.py +37 -0
- flyte/cli/_user.py +17 -0
- flyte/cli/main.py +30 -4
- flyte/config/_config.py +10 -6
- flyte/config/_internal.py +1 -0
- flyte/config/_reader.py +29 -8
- flyte/connectors/__init__.py +11 -0
- flyte/connectors/_connector.py +270 -0
- flyte/connectors/_server.py +197 -0
- flyte/connectors/utils.py +135 -0
- flyte/errors.py +22 -2
- flyte/extend.py +8 -1
- flyte/extras/_container.py +6 -1
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +21 -0
- flyte/io/__init__.py +2 -0
- flyte/io/_dataframe/__init__.py +2 -0
- flyte/io/_dataframe/basic_dfs.py +17 -8
- flyte/io/_dataframe/dataframe.py +98 -132
- flyte/io/_dir.py +575 -113
- flyte/io/_file.py +582 -139
- flyte/io/_hashing_io.py +342 -0
- flyte/models.py +74 -15
- flyte/remote/__init__.py +6 -1
- flyte/remote/_action.py +34 -26
- flyte/remote/_client/_protocols.py +39 -4
- flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
- flyte/remote/_client/auth/_authenticators/pkce.py +1 -1
- flyte/remote/_client/auth/_channel.py +10 -6
- flyte/remote/_client/controlplane.py +17 -5
- flyte/remote/_console.py +3 -2
- flyte/remote/_data.py +6 -6
- flyte/remote/_logs.py +3 -3
- flyte/remote/_run.py +64 -8
- flyte/remote/_secret.py +26 -17
- flyte/remote/_task.py +75 -33
- flyte/remote/_trigger.py +306 -0
- flyte/remote/_user.py +33 -0
- flyte/report/_report.py +1 -1
- flyte/storage/__init__.py +6 -1
- flyte/storage/_config.py +5 -1
- flyte/storage/_parallel_reader.py +274 -0
- flyte/storage/_storage.py +200 -103
- flyte/types/__init__.py +16 -0
- flyte/types/_interface.py +2 -2
- flyte/types/_pickle.py +35 -8
- flyte/types/_string_literals.py +8 -9
- flyte/types/_type_engine.py +40 -70
- flyte/types/_utils.py +1 -1
- flyte-2.0.0b30.data/scripts/debug.py +38 -0
- {flyte-2.0.0b13.data → flyte-2.0.0b30.data}/scripts/runtime.py +62 -8
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/METADATA +11 -3
- flyte-2.0.0b30.dist-info/RECORD +192 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/entry_points.txt +3 -0
- flyte/_protos/common/authorization_pb2.py +0 -66
- flyte/_protos/common/authorization_pb2.pyi +0 -108
- flyte/_protos/common/authorization_pb2_grpc.py +0 -4
- flyte/_protos/common/identifier_pb2.py +0 -93
- flyte/_protos/common/identifier_pb2.pyi +0 -110
- flyte/_protos/common/identifier_pb2_grpc.py +0 -4
- flyte/_protos/common/identity_pb2.py +0 -48
- flyte/_protos/common/identity_pb2.pyi +0 -72
- flyte/_protos/common/identity_pb2_grpc.py +0 -4
- flyte/_protos/common/list_pb2.py +0 -36
- flyte/_protos/common/list_pb2.pyi +0 -71
- flyte/_protos/common/list_pb2_grpc.py +0 -4
- flyte/_protos/common/policy_pb2.py +0 -37
- flyte/_protos/common/policy_pb2.pyi +0 -27
- flyte/_protos/common/policy_pb2_grpc.py +0 -4
- flyte/_protos/common/role_pb2.py +0 -37
- flyte/_protos/common/role_pb2.pyi +0 -53
- flyte/_protos/common/role_pb2_grpc.py +0 -4
- flyte/_protos/common/runtime_version_pb2.py +0 -28
- flyte/_protos/common/runtime_version_pb2.pyi +0 -24
- flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/definition_pb2.py +0 -59
- flyte/_protos/imagebuilder/definition_pb2.pyi +0 -140
- flyte/_protos/imagebuilder/definition_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/payload_pb2.py +0 -32
- flyte/_protos/imagebuilder/payload_pb2.pyi +0 -21
- flyte/_protos/imagebuilder/payload_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/service_pb2.py +0 -29
- flyte/_protos/imagebuilder/service_pb2.pyi +0 -5
- flyte/_protos/imagebuilder/service_pb2_grpc.py +0 -66
- flyte/_protos/logs/dataplane/payload_pb2.py +0 -100
- flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -177
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/definition_pb2.py +0 -49
- flyte/_protos/secret/definition_pb2.pyi +0 -93
- flyte/_protos/secret/definition_pb2_grpc.py +0 -4
- flyte/_protos/secret/payload_pb2.py +0 -62
- flyte/_protos/secret/payload_pb2.pyi +0 -94
- flyte/_protos/secret/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/secret_pb2.py +0 -38
- flyte/_protos/secret/secret_pb2.pyi +0 -6
- flyte/_protos/secret/secret_pb2_grpc.py +0 -198
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- flyte/_protos/validate/validate/validate_pb2.py +0 -76
- flyte/_protos/workflow/common_pb2.py +0 -27
- flyte/_protos/workflow/common_pb2.pyi +0 -14
- flyte/_protos/workflow/common_pb2_grpc.py +0 -4
- flyte/_protos/workflow/environment_pb2.py +0 -29
- flyte/_protos/workflow/environment_pb2.pyi +0 -12
- flyte/_protos/workflow/environment_pb2_grpc.py +0 -4
- flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
- flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- flyte/_protos/workflow/queue_service_pb2.py +0 -109
- flyte/_protos/workflow/queue_service_pb2.pyi +0 -166
- flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- flyte/_protos/workflow/run_definition_pb2.py +0 -121
- flyte/_protos/workflow/run_definition_pb2.pyi +0 -327
- flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
- flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- flyte/_protos/workflow/run_service_pb2.py +0 -137
- flyte/_protos/workflow/run_service_pb2.pyi +0 -185
- flyte/_protos/workflow/run_service_pb2_grpc.py +0 -446
- flyte/_protos/workflow/state_service_pb2.py +0 -67
- flyte/_protos/workflow/state_service_pb2.pyi +0 -76
- flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
- flyte/_protos/workflow/task_definition_pb2.py +0 -79
- flyte/_protos/workflow/task_definition_pb2.pyi +0 -81
- flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/task_service_pb2.py +0 -60
- flyte/_protos/workflow/task_service_pb2.pyi +0 -59
- flyte/_protos/workflow/task_service_pb2_grpc.py +0 -138
- flyte-2.0.0b13.dist-info/RECORD +0 -239
- /flyte/{_protos → _debug}/__init__.py +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/WHEEL +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/licenses/LICENSE +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/top_level.txt +0 -0
flyte/io/_file.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import inspect
|
|
3
4
|
import os
|
|
5
|
+
import typing
|
|
4
6
|
from contextlib import asynccontextmanager, contextmanager
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
from typing import (
|
|
7
9
|
IO,
|
|
10
|
+
Annotated,
|
|
8
11
|
Any,
|
|
9
12
|
AsyncGenerator,
|
|
10
13
|
Dict,
|
|
@@ -17,18 +20,25 @@ from typing import (
|
|
|
17
20
|
)
|
|
18
21
|
|
|
19
22
|
import aiofiles
|
|
20
|
-
from
|
|
21
|
-
from fsspec.asyn import AsyncFileSystem
|
|
23
|
+
from flyteidl2.core import literals_pb2, types_pb2
|
|
22
24
|
from fsspec.utils import get_protocol
|
|
23
25
|
from mashumaro.types import SerializableType
|
|
24
|
-
from pydantic import BaseModel, model_validator
|
|
26
|
+
from pydantic import BaseModel, Field, model_validator
|
|
27
|
+
from pydantic.json_schema import SkipJsonSchema
|
|
25
28
|
|
|
29
|
+
import flyte.errors
|
|
26
30
|
import flyte.storage as storage
|
|
27
31
|
from flyte._context import internal_ctx
|
|
28
32
|
from flyte._initialize import requires_initialization
|
|
29
|
-
from flyte.
|
|
33
|
+
from flyte.io._hashing_io import AsyncHashingReader, HashingWriter, HashMethod, PrecomputedValue
|
|
30
34
|
from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
|
|
31
35
|
|
|
36
|
+
if typing.TYPE_CHECKING:
|
|
37
|
+
from obstore import AsyncReadableFile, AsyncWritableFile
|
|
38
|
+
|
|
39
|
+
if typing.TYPE_CHECKING:
|
|
40
|
+
from obstore import AsyncReadableFile, AsyncWritableFile
|
|
41
|
+
|
|
32
42
|
# Type variable for the file format
|
|
33
43
|
T = TypeVar("T")
|
|
34
44
|
|
|
@@ -36,63 +46,139 @@ T = TypeVar("T")
|
|
|
36
46
|
class File(BaseModel, Generic[T], SerializableType):
|
|
37
47
|
"""
|
|
38
48
|
A generic file class representing a file with a specified format.
|
|
39
|
-
Provides both async and sync interfaces for file operations.
|
|
40
|
-
|
|
49
|
+
Provides both async and sync interfaces for file operations. All methods without _sync suffix are async.
|
|
50
|
+
|
|
51
|
+
The class should be instantiated using one of the class methods. The constructor should be used only to
|
|
52
|
+
instantiate references to existing remote objects.
|
|
41
53
|
|
|
42
54
|
The generic type T represents the format of the file.
|
|
43
55
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
from pandas import DataFrame
|
|
48
|
-
csv_file = File[DataFrame](path="s3://my-bucket/data.csv")
|
|
56
|
+
Important methods:
|
|
57
|
+
- `from_existing_remote`: Create a File object from an existing remote file.
|
|
58
|
+
- `new_remote`: Create a new File reference for a remote file that will be written to.
|
|
49
59
|
|
|
50
|
-
|
|
51
|
-
|
|
60
|
+
**Asynchronous methods**:
|
|
61
|
+
- `open`: Asynchronously open the file and return a file-like object.
|
|
62
|
+
- `download`: Asynchronously download the file to a local path.
|
|
63
|
+
- `from_local`: Asynchronously create a File object from a local file, uploading it to remote storage.
|
|
64
|
+
- `exists`: Asynchronously check if the file exists.
|
|
52
65
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
66
|
+
**Synchronous methods** (suffixed with `_sync`):
|
|
67
|
+
- `open_sync`: Synchronously open the file and return a file-like object.
|
|
68
|
+
- `download_sync`: Synchronously download the file to a local path.
|
|
69
|
+
- `from_local_sync`: Synchronously create a File object from a local file, uploading it to remote storage.
|
|
70
|
+
- `exists_sync`: Synchronously check if the file exists.
|
|
57
71
|
|
|
58
|
-
Example: Read a file input in a Task.
|
|
59
|
-
|
|
72
|
+
Example: Read a file input in a Task (Async).
|
|
73
|
+
|
|
74
|
+
```python
|
|
60
75
|
@env.task
|
|
61
|
-
async def
|
|
62
|
-
async with file.open() as f:
|
|
63
|
-
|
|
76
|
+
async def read_file(file: File) -> str:
|
|
77
|
+
async with file.open("rb") as f:
|
|
78
|
+
content = bytes(await f.read())
|
|
79
|
+
return content.decode("utf-8")
|
|
64
80
|
```
|
|
65
81
|
|
|
66
|
-
Example:
|
|
82
|
+
Example: Read a file input in a Task (Sync).
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
@env.task
|
|
86
|
+
def read_file_sync(file: File) -> str:
|
|
87
|
+
with file.open_sync("rb") as f:
|
|
88
|
+
content = f.read()
|
|
89
|
+
return content.decode("utf-8")
|
|
67
90
|
```
|
|
91
|
+
|
|
92
|
+
Example: Write a file by streaming it directly to blob storage (Async).
|
|
93
|
+
|
|
94
|
+
```python
|
|
68
95
|
@env.task
|
|
69
|
-
async def
|
|
70
|
-
df = pd.DataFrame(...)
|
|
96
|
+
async def write_file() -> File:
|
|
71
97
|
file = File.new_remote()
|
|
72
98
|
async with file.open("wb") as f:
|
|
73
|
-
|
|
74
|
-
# No additional uploading will be done here.
|
|
99
|
+
await f.write(b"Hello, World!")
|
|
75
100
|
return file
|
|
76
101
|
```
|
|
77
|
-
|
|
102
|
+
|
|
103
|
+
Example: Upload a local file to remote storage (Async).
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
@env.task
|
|
107
|
+
async def upload_file() -> File:
|
|
108
|
+
# Write to local file first
|
|
109
|
+
with open("/tmp/data.csv", "w") as f:
|
|
110
|
+
f.write("col1,col2\\n1,2\\n3,4\\n")
|
|
111
|
+
# Upload to remote storage
|
|
112
|
+
return await File.from_local("/tmp/data.csv")
|
|
78
113
|
```
|
|
114
|
+
|
|
115
|
+
Example: Upload a local file to remote storage (Sync).
|
|
116
|
+
|
|
117
|
+
```python
|
|
79
118
|
@env.task
|
|
80
|
-
|
|
81
|
-
#
|
|
82
|
-
|
|
119
|
+
def upload_file_sync() -> File:
|
|
120
|
+
# Write to local file first
|
|
121
|
+
with open("/tmp/data.csv", "w") as f:
|
|
122
|
+
f.write("col1,col2\\n1,2\\n3,4\\n")
|
|
123
|
+
# Upload to remote storage
|
|
124
|
+
return File.from_local_sync("/tmp/data.csv")
|
|
83
125
|
```
|
|
84
126
|
|
|
85
|
-
Example:
|
|
127
|
+
Example: Download a file to local storage (Async).
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
@env.task
|
|
131
|
+
async def download_file(file: File) -> str:
|
|
132
|
+
local_path = await file.download()
|
|
133
|
+
# Process the local file
|
|
134
|
+
with open(local_path, "r") as f:
|
|
135
|
+
return f.read()
|
|
86
136
|
```
|
|
137
|
+
|
|
138
|
+
Example: Download a file to local storage (Sync).
|
|
139
|
+
|
|
140
|
+
```python
|
|
87
141
|
@env.task
|
|
88
|
-
|
|
89
|
-
|
|
142
|
+
def download_file_sync(file: File) -> str:
|
|
143
|
+
local_path = file.download_sync()
|
|
144
|
+
# Process the local file
|
|
145
|
+
with open(local_path, "r") as f:
|
|
146
|
+
return f.read()
|
|
90
147
|
```
|
|
91
148
|
|
|
92
|
-
Example:
|
|
149
|
+
Example: Reference an existing remote file.
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
@env.task
|
|
153
|
+
async def process_existing_file() -> str:
|
|
154
|
+
file = File.from_existing_remote("s3://my-bucket/data.csv")
|
|
155
|
+
async with file.open("rb") as f:
|
|
156
|
+
content = await f.read()
|
|
157
|
+
return content.decode("utf-8")
|
|
93
158
|
```
|
|
159
|
+
|
|
160
|
+
Example: Check if a file exists (Async).
|
|
161
|
+
|
|
162
|
+
```python
|
|
94
163
|
@env.task
|
|
95
|
-
async def
|
|
164
|
+
async def check_file(file: File) -> bool:
|
|
165
|
+
return await file.exists()
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Example: Check if a file exists (Sync).
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
@env.task
|
|
172
|
+
def check_file_sync(file: File) -> bool:
|
|
173
|
+
return file.exists_sync()
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Example: Pass through a file without copying.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
@env.task
|
|
180
|
+
async def pass_through(file: File) -> File:
|
|
181
|
+
# No copy occurs - just passes the reference
|
|
96
182
|
return file
|
|
97
183
|
```
|
|
98
184
|
|
|
@@ -104,6 +190,8 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
104
190
|
path: str
|
|
105
191
|
name: Optional[str] = None
|
|
106
192
|
format: str = ""
|
|
193
|
+
hash: Optional[str] = None
|
|
194
|
+
hash_method: Annotated[Optional[HashMethod], Field(default=None, exclude=True), SkipJsonSchema()] = None
|
|
107
195
|
|
|
108
196
|
class Config:
|
|
109
197
|
arbitrary_types_allowed = True
|
|
@@ -111,20 +199,24 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
111
199
|
@model_validator(mode="before")
|
|
112
200
|
@classmethod
|
|
113
201
|
def pre_init(cls, data):
|
|
202
|
+
"""Internal: Pydantic validator to set default name from path. Not intended for direct use."""
|
|
114
203
|
if data.get("name") is None:
|
|
115
204
|
data["name"] = Path(data["path"]).name
|
|
116
205
|
return data
|
|
117
206
|
|
|
118
207
|
def _serialize(self) -> Dict[str, Optional[str]]:
|
|
208
|
+
"""Internal: Serialize File to dictionary. Not intended for direct use."""
|
|
119
209
|
pyd_dump = self.model_dump()
|
|
120
210
|
return pyd_dump
|
|
121
211
|
|
|
122
212
|
@classmethod
|
|
123
213
|
def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> File:
|
|
214
|
+
"""Internal: Deserialize File from dictionary. Not intended for direct use."""
|
|
124
215
|
return File.model_validate(file_dump)
|
|
125
216
|
|
|
126
217
|
@classmethod
|
|
127
218
|
def schema_match(cls, incoming: dict):
|
|
219
|
+
"""Internal: Check if incoming schema matches File schema. Not intended for direct use."""
|
|
128
220
|
this_schema = cls.model_json_schema()
|
|
129
221
|
current_required = this_schema.get("required")
|
|
130
222
|
incoming_required = incoming.get("required")
|
|
@@ -139,41 +231,65 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
139
231
|
|
|
140
232
|
@classmethod
|
|
141
233
|
@requires_initialization
|
|
142
|
-
def new_remote(cls) -> File[T]:
|
|
234
|
+
def new_remote(cls, hash_method: Optional[HashMethod | str] = None) -> File[T]:
|
|
143
235
|
"""
|
|
144
236
|
Create a new File reference for a remote file that will be written to.
|
|
145
237
|
|
|
146
|
-
|
|
147
|
-
|
|
238
|
+
Use this when you want to create a new file and write to it directly without creating a local file first.
|
|
239
|
+
|
|
240
|
+
Example (Async):
|
|
241
|
+
|
|
242
|
+
```python
|
|
148
243
|
@env.task
|
|
149
|
-
async def
|
|
150
|
-
df = pd.DataFrame(
|
|
244
|
+
async def create_csv() -> File:
|
|
245
|
+
df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
|
|
151
246
|
file = File.new_remote()
|
|
152
247
|
async with file.open("wb") as f:
|
|
153
248
|
df.to_csv(f)
|
|
154
249
|
return file
|
|
155
250
|
```
|
|
251
|
+
|
|
252
|
+
Args:
|
|
253
|
+
hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
|
|
254
|
+
it will be used as a precomputed cache key. If a HashMethod is provided, it will be used
|
|
255
|
+
to compute the hash as data is written.
|
|
256
|
+
|
|
257
|
+
Returns:
|
|
258
|
+
A new File instance with a generated remote path
|
|
156
259
|
"""
|
|
157
260
|
ctx = internal_ctx()
|
|
261
|
+
known_cache_key = hash_method if isinstance(hash_method, str) else None
|
|
262
|
+
method = hash_method if isinstance(hash_method, HashMethod) else None
|
|
158
263
|
|
|
159
|
-
return cls(path=ctx.raw_data.get_random_remote_path())
|
|
264
|
+
return cls(path=ctx.raw_data.get_random_remote_path(), hash=known_cache_key, hash_method=method)
|
|
160
265
|
|
|
161
266
|
@classmethod
|
|
162
|
-
def from_existing_remote(cls, remote_path: str) -> File[T]:
|
|
267
|
+
def from_existing_remote(cls, remote_path: str, file_cache_key: Optional[str] = None) -> File[T]:
|
|
163
268
|
"""
|
|
164
269
|
Create a File reference from an existing remote file.
|
|
165
270
|
|
|
271
|
+
Use this when you want to reference a file that already exists in remote storage without uploading it.
|
|
272
|
+
|
|
166
273
|
Example:
|
|
274
|
+
|
|
167
275
|
```python
|
|
168
276
|
@env.task
|
|
169
|
-
async def
|
|
170
|
-
|
|
277
|
+
async def process_existing_file() -> str:
|
|
278
|
+
file = File.from_existing_remote("s3://my-bucket/data.csv")
|
|
279
|
+
async with file.open("rb") as f:
|
|
280
|
+
content = await f.read()
|
|
281
|
+
return content.decode("utf-8")
|
|
171
282
|
```
|
|
172
283
|
|
|
173
284
|
Args:
|
|
174
285
|
remote_path: The remote path to the existing file
|
|
286
|
+
file_cache_key: Optional hash value to use for cache key computation. If not specified, the cache key
|
|
287
|
+
will be computed based on the file's attributes (path, name, format).
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
A new File instance pointing to the existing remote file
|
|
175
291
|
"""
|
|
176
|
-
return cls(path=remote_path)
|
|
292
|
+
return cls(path=remote_path, hash=file_cache_key)
|
|
177
293
|
|
|
178
294
|
@asynccontextmanager
|
|
179
295
|
async def open(
|
|
@@ -184,84 +300,129 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
184
300
|
cache_options: Optional[dict] = None,
|
|
185
301
|
compression: Optional[str] = None,
|
|
186
302
|
**kwargs,
|
|
187
|
-
) -> AsyncGenerator[
|
|
303
|
+
) -> AsyncGenerator[Union[AsyncWritableFile, AsyncReadableFile, "HashingWriter"], None]:
|
|
188
304
|
"""
|
|
189
305
|
Asynchronously open the file and return a file-like object.
|
|
190
306
|
|
|
307
|
+
Use this method in async tasks to read from or write to files directly.
|
|
308
|
+
|
|
309
|
+
Example (Async Read):
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
@env.task
|
|
313
|
+
async def read_file(f: File) -> str:
|
|
314
|
+
async with f.open("rb") as fh:
|
|
315
|
+
content = bytes(await fh.read())
|
|
316
|
+
return content.decode("utf-8")
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
Example (Async Write):
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
@env.task
|
|
323
|
+
async def write_file() -> File:
|
|
324
|
+
f = File.new_remote()
|
|
325
|
+
async with f.open("wb") as fh:
|
|
326
|
+
await fh.write(b"Hello, World!")
|
|
327
|
+
return f
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Example (Streaming Read):
|
|
331
|
+
|
|
332
|
+
```python
|
|
333
|
+
@env.task
|
|
334
|
+
async def stream_read(f: File) -> str:
|
|
335
|
+
content_parts = []
|
|
336
|
+
async with f.open("rb", block_size=1024) as fh:
|
|
337
|
+
while True:
|
|
338
|
+
chunk = await fh.read()
|
|
339
|
+
if not chunk:
|
|
340
|
+
break
|
|
341
|
+
content_parts.append(chunk)
|
|
342
|
+
return b"".join(content_parts).decode("utf-8")
|
|
343
|
+
```
|
|
344
|
+
|
|
191
345
|
Args:
|
|
192
|
-
mode: The mode to open the file in (default: 'rb')
|
|
193
|
-
|
|
346
|
+
mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
|
|
347
|
+
'wb' (write binary), 'rt' (read text), 'wt' (write text)
|
|
348
|
+
block_size: Size of blocks for reading in bytes. Useful for streaming large files.
|
|
194
349
|
cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
|
|
195
350
|
cache_options: Dictionary of options for the cache
|
|
196
351
|
compression: Compression format or None for auto-detection
|
|
197
352
|
**kwargs: Additional arguments passed to fsspec's open method
|
|
198
353
|
|
|
199
354
|
Returns:
|
|
200
|
-
An async file-like object
|
|
201
|
-
|
|
202
|
-
Example:
|
|
203
|
-
```python
|
|
204
|
-
async with file.open('rb') as f:
|
|
205
|
-
data = await f.read()
|
|
206
|
-
```
|
|
355
|
+
An async file-like object that can be used with async read/write operations
|
|
207
356
|
"""
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
if block_size:
|
|
220
|
-
open_kwargs["block_size"] = block_size
|
|
221
|
-
|
|
222
|
-
# Apply caching strategy
|
|
223
|
-
if cache_type != "none":
|
|
224
|
-
open_kwargs["cache_type"] = cache_type
|
|
225
|
-
open_kwargs["cache_options"] = cache_options
|
|
226
|
-
|
|
227
|
-
# Use aiofiles for local files
|
|
228
|
-
if fs.protocol == "file":
|
|
229
|
-
async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
|
|
230
|
-
yield f
|
|
231
|
-
else:
|
|
232
|
-
# This code is broadly similar to what storage.get_stream does, but without actually reading from the stream
|
|
233
|
-
file_handle = None
|
|
357
|
+
# Check if we should use obstore bypass
|
|
358
|
+
try:
|
|
359
|
+
fh = await storage.open(
|
|
360
|
+
self.path,
|
|
361
|
+
mode=mode,
|
|
362
|
+
cache_type=cache_type,
|
|
363
|
+
cache_options=cache_options,
|
|
364
|
+
compression=compression,
|
|
365
|
+
block_size=block_size,
|
|
366
|
+
**kwargs,
|
|
367
|
+
)
|
|
234
368
|
try:
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
if isinstance(fs, AsyncFileSystem):
|
|
238
|
-
file_handle = await fs.open_async(self.path, mode)
|
|
239
|
-
yield file_handle
|
|
240
|
-
return
|
|
241
|
-
except NotImplementedError:
|
|
242
|
-
logger.debug(f"{fs} doesn't implement 'open_async', falling back to sync")
|
|
369
|
+
yield fh
|
|
370
|
+
return
|
|
243
371
|
finally:
|
|
244
|
-
if
|
|
245
|
-
|
|
372
|
+
if inspect.iscoroutinefunction(fh.close):
|
|
373
|
+
await fh.close()
|
|
374
|
+
else:
|
|
375
|
+
fh.close()
|
|
376
|
+
except flyte.errors.OnlyAsyncIOSupportedError:
|
|
377
|
+
# Fall back to aiofiles
|
|
378
|
+
fs = storage.get_underlying_filesystem(path=self.path)
|
|
379
|
+
if "file" in fs.protocol:
|
|
380
|
+
async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
|
|
381
|
+
yield f
|
|
382
|
+
return
|
|
383
|
+
raise
|
|
384
|
+
|
|
385
|
+
async def exists(self) -> bool:
|
|
386
|
+
"""
|
|
387
|
+
Asynchronously check if the file exists.
|
|
388
|
+
|
|
389
|
+
Example (Async):
|
|
246
390
|
|
|
247
|
-
|
|
248
|
-
|
|
391
|
+
```python
|
|
392
|
+
@env.task
|
|
393
|
+
async def check_file(f: File) -> bool:
|
|
394
|
+
if await f.exists():
|
|
395
|
+
print("File exists!")
|
|
396
|
+
return True
|
|
397
|
+
return False
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
Returns:
|
|
401
|
+
True if the file exists, False otherwise
|
|
402
|
+
"""
|
|
403
|
+
return await storage.exists(self.path)
|
|
249
404
|
|
|
250
405
|
def exists_sync(self) -> bool:
|
|
251
406
|
"""
|
|
252
407
|
Synchronously check if the file exists.
|
|
253
408
|
|
|
409
|
+
Use this in non-async tasks or when you need synchronous file existence checking.
|
|
410
|
+
|
|
411
|
+
Example (Sync):
|
|
412
|
+
|
|
413
|
+
```python
|
|
414
|
+
@env.task
|
|
415
|
+
def check_file_sync(f: File) -> bool:
|
|
416
|
+
if f.exists_sync():
|
|
417
|
+
print("File exists!")
|
|
418
|
+
return True
|
|
419
|
+
return False
|
|
420
|
+
```
|
|
421
|
+
|
|
254
422
|
Returns:
|
|
255
423
|
True if the file exists, False otherwise
|
|
256
|
-
|
|
257
|
-
Example:
|
|
258
|
-
```python
|
|
259
|
-
if file.exists_sync():
|
|
260
|
-
# Process the file
|
|
261
|
-
```
|
|
262
424
|
"""
|
|
263
|
-
|
|
264
|
-
return fs.exists(self.path)
|
|
425
|
+
return storage.exists_sync(self.path)
|
|
265
426
|
|
|
266
427
|
@contextmanager
|
|
267
428
|
def open_sync(
|
|
@@ -272,26 +433,44 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
272
433
|
cache_options: Optional[dict] = None,
|
|
273
434
|
compression: Optional[str] = None,
|
|
274
435
|
**kwargs,
|
|
275
|
-
) -> Generator[IO[Any]]:
|
|
436
|
+
) -> Generator[IO[Any], None, None]:
|
|
276
437
|
"""
|
|
277
438
|
Synchronously open the file and return a file-like object.
|
|
278
439
|
|
|
440
|
+
Use this method in non-async tasks to read from or write to files directly.
|
|
441
|
+
|
|
442
|
+
Example (Sync Read):
|
|
443
|
+
|
|
444
|
+
```python
|
|
445
|
+
@env.task
|
|
446
|
+
def read_file_sync(f: File) -> str:
|
|
447
|
+
with f.open_sync("rb") as fh:
|
|
448
|
+
content = fh.read()
|
|
449
|
+
return content.decode("utf-8")
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
Example (Sync Write):
|
|
453
|
+
|
|
454
|
+
```python
|
|
455
|
+
@env.task
|
|
456
|
+
def write_file_sync() -> File:
|
|
457
|
+
f = File.new_remote()
|
|
458
|
+
with f.open_sync("wb") as fh:
|
|
459
|
+
fh.write(b"Hello, World!")
|
|
460
|
+
return f
|
|
461
|
+
```
|
|
462
|
+
|
|
279
463
|
Args:
|
|
280
|
-
mode: The mode to open the file in (default: 'rb')
|
|
281
|
-
|
|
464
|
+
mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
|
|
465
|
+
'wb' (write binary), 'rt' (read text), 'wt' (write text)
|
|
466
|
+
block_size: Size of blocks for reading in bytes. Useful for streaming large files.
|
|
282
467
|
cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
|
|
283
468
|
cache_options: Dictionary of options for the cache
|
|
284
469
|
compression: Compression format or None for auto-detection
|
|
285
470
|
**kwargs: Additional arguments passed to fsspec's open method
|
|
286
471
|
|
|
287
472
|
Returns:
|
|
288
|
-
A file-like object
|
|
289
|
-
|
|
290
|
-
Example:
|
|
291
|
-
```python
|
|
292
|
-
with file.open_sync('rb') as f:
|
|
293
|
-
data = f.read()
|
|
294
|
-
```
|
|
473
|
+
A file-like object that can be used with standard read/write operations
|
|
295
474
|
"""
|
|
296
475
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
297
476
|
|
|
@@ -318,54 +497,188 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
318
497
|
"""
|
|
319
498
|
Asynchronously download the file to a local path.
|
|
320
499
|
|
|
500
|
+
Use this when you need to download a remote file to your local filesystem for processing.
|
|
501
|
+
|
|
502
|
+
Example (Async):
|
|
503
|
+
|
|
504
|
+
```python
|
|
505
|
+
@env.task
|
|
506
|
+
async def download_and_process(f: File) -> str:
|
|
507
|
+
local_path = await f.download()
|
|
508
|
+
# Now process the local file
|
|
509
|
+
with open(local_path, "r") as fh:
|
|
510
|
+
return fh.read()
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
Example (Download to specific path):
|
|
514
|
+
|
|
515
|
+
```python
|
|
516
|
+
@env.task
|
|
517
|
+
async def download_to_path(f: File) -> str:
|
|
518
|
+
local_path = await f.download("/tmp/myfile.csv")
|
|
519
|
+
return local_path
|
|
520
|
+
```
|
|
521
|
+
|
|
321
522
|
Args:
|
|
322
523
|
local_path: The local path to download the file to. If None, a temporary
|
|
323
|
-
directory will be used.
|
|
524
|
+
directory will be used and a path will be generated.
|
|
324
525
|
|
|
325
526
|
Returns:
|
|
326
|
-
The path to the downloaded file
|
|
327
|
-
|
|
328
|
-
Example:
|
|
329
|
-
```python
|
|
330
|
-
local_file = await file.download('/tmp/myfile.csv')
|
|
331
|
-
```
|
|
527
|
+
The absolute path to the downloaded file
|
|
332
528
|
"""
|
|
333
529
|
if local_path is None:
|
|
334
|
-
local_path = storage.get_random_local_path(file_path_or_file_name=
|
|
530
|
+
local_path = storage.get_random_local_path(file_path_or_file_name=self.path)
|
|
335
531
|
else:
|
|
532
|
+
# Preserve trailing separator if present (Path.absolute() strips it)
|
|
533
|
+
local_path_str = str(local_path)
|
|
534
|
+
has_trailing_sep = local_path_str.endswith(os.sep)
|
|
336
535
|
local_path = str(Path(local_path).absolute())
|
|
536
|
+
if has_trailing_sep:
|
|
537
|
+
local_path = local_path + os.sep
|
|
337
538
|
|
|
338
539
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
339
540
|
|
|
340
541
|
# If it's already a local file, just copy it
|
|
341
542
|
if "file" in fs.protocol:
|
|
543
|
+
# Apply directory logic for local-to-local copies
|
|
544
|
+
local_path_for_copy = local_path
|
|
545
|
+
if isinstance(local_path, str):
|
|
546
|
+
local_path_obj = Path(local_path)
|
|
547
|
+
# Check if it's a directory or ends with separator
|
|
548
|
+
if local_path.endswith(os.sep) or (local_path_obj.exists() and local_path_obj.is_dir()):
|
|
549
|
+
remote_filename = Path(self.path).name
|
|
550
|
+
local_path_for_copy = str(local_path_obj / remote_filename)
|
|
551
|
+
|
|
552
|
+
# Ensure parent directory exists
|
|
553
|
+
Path(local_path_for_copy).parent.mkdir(parents=True, exist_ok=True)
|
|
554
|
+
|
|
342
555
|
# Use aiofiles for async copy
|
|
343
556
|
async with aiofiles.open(self.path, "rb") as src:
|
|
344
|
-
async with aiofiles.open(
|
|
557
|
+
async with aiofiles.open(local_path_for_copy, "wb") as dst:
|
|
345
558
|
await dst.write(await src.read())
|
|
346
|
-
return str(
|
|
559
|
+
return str(local_path_for_copy)
|
|
347
560
|
|
|
348
561
|
# Otherwise download from remote using async functionality
|
|
349
|
-
await storage.get(self.path, str(local_path))
|
|
562
|
+
result_path = await storage.get(self.path, str(local_path))
|
|
563
|
+
return result_path
|
|
564
|
+
|
|
565
|
+
def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
    """
    Synchronously download the file to a local path.

    Use this in non-async tasks when you need to download a remote file to your local filesystem.

    Example (Sync):

    ```python
    @env.task
    def download_and_process_sync(f: File) -> str:
        local_path = f.download_sync()
        # Now process the local file
        with open(local_path, "r") as fh:
            return fh.read()
    ```

    Example (Download to specific path):

    ```python
    @env.task
    def download_to_path_sync(f: File) -> str:
        local_path = f.download_sync("/tmp/myfile.csv")
        return local_path
    ```

    Args:
        local_path: The local path to download the file to. If None, a path is
            generated with ``storage.get_random_local_path``. If the value ends with
            a path separator or names an existing directory, the file is placed
            inside that directory under the source file's own name.

    Returns:
        The path of the downloaded file as a string (made absolute when
        ``local_path`` is supplied by the caller).
    """
    # Function-scope import, as in the original block.
    import shutil

    if local_path is None:
        target = str(storage.get_random_local_path(file_path_or_file_name=self.path))
    else:
        requested = str(local_path)
        target = str(Path(local_path).absolute())
        # Path.absolute() strips a trailing separator; restore it so that
        # "some/dir/" is still recognised as a directory destination below.
        if requested.endswith(os.sep):
            target += os.sep

    fs = storage.get_underlying_filesystem(path=self.path)

    # Resolve directory destinations for BOTH the local and the remote branch.
    # Previously only the local-to-local copy did this, so a remote download
    # into a directory failed when open() was called on the directory itself.
    if target.endswith(os.sep) or Path(target).is_dir():
        target = str(Path(target) / Path(self.path).name)

    # Ensure the parent directory exists before anything is written.
    Path(target).parent.mkdir(parents=True, exist_ok=True)

    # If it's already a local file, just copy it.
    if "file" in fs.protocol:
        shutil.copy2(self.path, target)
        return target

    # Otherwise download from remote through the filesystem object, streaming
    # in chunks instead of loading the whole object into memory.
    with fs.open(self.path, "rb") as src, open(target, "wb") as dst:
        shutil.copyfileobj(src, dst)
    return target
|
|
351
636
|
|
|
352
637
|
@classmethod
|
|
353
638
|
@requires_initialization
|
|
354
|
-
|
|
639
|
+
def from_local_sync(
|
|
640
|
+
cls,
|
|
641
|
+
local_path: Union[str, Path],
|
|
642
|
+
remote_destination: Optional[str] = None,
|
|
643
|
+
hash_method: Optional[HashMethod | str] = None,
|
|
644
|
+
) -> File[T]:
|
|
355
645
|
"""
|
|
356
|
-
|
|
646
|
+
Synchronously create a new File object from a local file by uploading it to remote storage.
|
|
647
|
+
|
|
648
|
+
Use this in non-async tasks when you have a local file that needs to be uploaded to remote storage.
|
|
649
|
+
|
|
650
|
+
Example (Sync):
|
|
651
|
+
|
|
652
|
+
```python
|
|
653
|
+
@env.task
|
|
654
|
+
def upload_local_file_sync() -> File:
|
|
655
|
+
# Create a local file
|
|
656
|
+
with open("/tmp/data.csv", "w") as f:
|
|
657
|
+
f.write("col1,col2\n1,2\n3,4\n")
|
|
658
|
+
|
|
659
|
+
# Upload to remote storage
|
|
660
|
+
remote_file = File.from_local_sync("/tmp/data.csv")
|
|
661
|
+
return remote_file
|
|
662
|
+
```
|
|
663
|
+
|
|
664
|
+
Example (With specific destination):
|
|
665
|
+
|
|
666
|
+
```python
|
|
667
|
+
@env.task
|
|
668
|
+
def upload_to_specific_path() -> File:
|
|
669
|
+
remote_file = File.from_local_sync("/tmp/data.csv", "s3://my-bucket/data.csv")
|
|
670
|
+
return remote_file
|
|
671
|
+
```
|
|
357
672
|
|
|
358
673
|
Args:
|
|
359
674
|
local_path: Path to the local file
|
|
360
|
-
remote_destination: Optional path to store the file
|
|
675
|
+
remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
|
|
676
|
+
hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
|
|
677
|
+
it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
|
|
678
|
+
the hash during upload. If not specified, the cache key will be based on file attributes.
|
|
361
679
|
|
|
362
680
|
Returns:
|
|
363
|
-
A new File instance pointing to the uploaded file
|
|
364
|
-
|
|
365
|
-
Example:
|
|
366
|
-
```python
|
|
367
|
-
remote_file = await File[DataFrame].from_local('/tmp/data.csv', 's3://bucket/data.csv')
|
|
368
|
-
```
|
|
681
|
+
A new File instance pointing to the uploaded remote file
|
|
369
682
|
"""
|
|
370
683
|
if not os.path.exists(local_path):
|
|
371
684
|
raise ValueError(f"File not found: {local_path}")
|
|
@@ -376,20 +689,148 @@ class File(BaseModel, Generic[T], SerializableType):
|
|
|
376
689
|
|
|
377
690
|
# If remote_destination was not set by the user, and the configured raw data path is also local,
|
|
378
691
|
# then let's optimize by not uploading.
|
|
692
|
+
hash_value = hash_method if isinstance(hash_method, str) else None
|
|
693
|
+
hash_method_obj = hash_method if isinstance(hash_method, HashMethod) else None
|
|
694
|
+
|
|
379
695
|
if "file" in protocol:
|
|
380
696
|
if remote_destination is None:
|
|
381
697
|
path = str(Path(local_path).absolute())
|
|
382
698
|
else:
|
|
383
699
|
# Otherwise, actually make a copy of the file
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
700
|
+
import shutil
|
|
701
|
+
|
|
702
|
+
if hash_method_obj:
|
|
703
|
+
# For hash computation, we need to read and write manually
|
|
704
|
+
with open(local_path, "rb") as src:
|
|
705
|
+
with open(remote_path, "wb") as dst:
|
|
706
|
+
dst_wrapper = HashingWriter(dst, accumulator=hash_method_obj)
|
|
707
|
+
dst_wrapper.write(src.read())
|
|
708
|
+
hash_value = dst_wrapper.result()
|
|
709
|
+
dst_wrapper.close()
|
|
710
|
+
else:
|
|
711
|
+
shutil.copy2(local_path, remote_path)
|
|
712
|
+
path = str(Path(remote_path).absolute())
|
|
713
|
+
else:
|
|
714
|
+
# Otherwise upload to remote using sync storage layer
|
|
715
|
+
fs = storage.get_underlying_filesystem(path=remote_path)
|
|
716
|
+
|
|
717
|
+
if hash_method_obj:
|
|
718
|
+
# We can skip the wrapper if the hash method is just a precomputed value
|
|
719
|
+
if not isinstance(hash_method_obj, PrecomputedValue):
|
|
720
|
+
with open(local_path, "rb") as src:
|
|
721
|
+
# For sync operations, we need to compute hash manually
|
|
722
|
+
data = src.read()
|
|
723
|
+
hash_method_obj.update(memoryview(data))
|
|
724
|
+
hash_value = hash_method_obj.result()
|
|
725
|
+
|
|
726
|
+
# Now write the data to remote
|
|
727
|
+
with fs.open(remote_path, "wb") as dst:
|
|
728
|
+
dst.write(data)
|
|
729
|
+
path = remote_path
|
|
730
|
+
else:
|
|
731
|
+
# Use sync file operations
|
|
732
|
+
with open(local_path, "rb") as src:
|
|
733
|
+
with fs.open(remote_path, "wb") as dst:
|
|
734
|
+
dst.write(src.read())
|
|
735
|
+
path = remote_path
|
|
736
|
+
hash_value = hash_method_obj.result()
|
|
737
|
+
else:
|
|
738
|
+
# Simple sync copy
|
|
739
|
+
with open(local_path, "rb") as src:
|
|
740
|
+
with fs.open(remote_path, "wb") as dst:
|
|
741
|
+
dst.write(src.read())
|
|
742
|
+
path = remote_path
|
|
743
|
+
|
|
744
|
+
f = cls(path=path, name=filename, hash_method=hash_method_obj, hash=hash_value)
|
|
745
|
+
return f
|
|
746
|
+
|
|
747
|
+
@classmethod
@requires_initialization
async def from_local(
    cls,
    local_path: Union[str, Path],
    remote_destination: Optional[str] = None,
    hash_method: Optional[HashMethod | str] = None,
) -> File[T]:
    """
    Asynchronously create a new File object from a local file by uploading it to remote storage.

    Use this in async tasks when you have a local file that needs to be uploaded to remote storage.

    Example (Async):

    ```python
    @env.task
    async def upload_local_file() -> File:
        # Create a local file
        async with aiofiles.open("/tmp/data.csv", "w") as f:
            await f.write("col1,col2\n1,2\n3,4\n")

        # Upload to remote storage
        remote_file = await File.from_local("/tmp/data.csv")
        return remote_file
    ```

    Example (With specific destination):

    ```python
    @env.task
    async def upload_to_specific_path() -> File:
        remote_file = await File.from_local("/tmp/data.csv", "s3://my-bucket/data.csv")
        return remote_file
    ```

    Args:
        local_path: Path to the local file
        remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
        hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
                    it will be used as a precomputed cache key. If a HashMethod is provided, the file content
                    is fed to it and its result becomes the hash, including when no copy or upload is
                    needed. If not specified, no hash is set on the returned File.

    Returns:
        A new File instance pointing to the stored file

    Raises:
        ValueError: If ``local_path`` does not exist.
    """
    if not os.path.exists(local_path):
        raise ValueError(f"File not found: {local_path}")

    filename = Path(local_path).name
    remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(filename)
    protocol = get_protocol(remote_path)

    # Split the union-typed argument into its two meanings without rebinding the
    # parameter: a str is already the hash, a HashMethod has to be fed the data.
    hash_value = hash_method if isinstance(hash_method, str) else None
    hasher = hash_method if isinstance(hash_method, HashMethod) else None
    chunk_size = 1024 * 1024  # read/copy granularity; avoids loading the whole file

    if "file" in protocol:
        if remote_destination is None:
            # If remote_destination was not set by the user, and the configured raw data
            # path is also local, then optimize by not copying the file at all.
            path = str(Path(local_path).absolute())
            if hasher is not None:
                # No bytes flow through a writer on this path, so feed the accumulator
                # explicitly; otherwise the requested hash would silently stay None.
                if not isinstance(hasher, PrecomputedValue):
                    async with aiofiles.open(local_path, "rb") as src:
                        while True:
                            chunk = await src.read(chunk_size)
                            if not chunk:
                                break
                            hasher.update(memoryview(chunk))
                hash_value = hasher.result()
        else:
            # Otherwise, actually make a copy of the file, hashing as we go.
            async with aiofiles.open(local_path, "rb") as src:
                async with aiofiles.open(remote_path, "wb") as dst:
                    while True:
                        chunk = await src.read(chunk_size)
                        if not chunk:
                            break
                        if hasher is not None:
                            hasher.update(memoryview(chunk))
                        await dst.write(chunk)
            if hasher is not None:
                hash_value = hasher.result()
            path = str(Path(remote_path).absolute())
    else:
        # Otherwise upload to remote using async storage layer
        if hasher is not None:
            # We can skip the wrapper if the hash method is just a precomputed value
            if not isinstance(hasher, PrecomputedValue):
                async with aiofiles.open(local_path, "rb") as src:
                    src_wrapper = AsyncHashingReader(src, accumulator=hasher)
                    path = await storage.put_stream(src_wrapper, to_path=remote_path)
                    hash_value = src_wrapper.result()
            else:
                path = await storage.put(str(local_path), remote_path)
                hash_value = hasher.result()
        else:
            path = await storage.put(str(local_path), remote_path)

    f = cls(path=path, name=filename, hash_method=hasher, hash=hash_value)
    return f
|
|
394
835
|
|
|
395
836
|
|
|
@@ -432,7 +873,8 @@ class FileTransformer(TypeTransformer[File]):
|
|
|
432
873
|
),
|
|
433
874
|
uri=python_val.path,
|
|
434
875
|
)
|
|
435
|
-
)
|
|
876
|
+
),
|
|
877
|
+
hash=python_val.hash if python_val.hash else None,
|
|
436
878
|
)
|
|
437
879
|
|
|
438
880
|
async def to_python_value(
|
|
@@ -450,7 +892,8 @@ class FileTransformer(TypeTransformer[File]):
|
|
|
450
892
|
|
|
451
893
|
uri = lv.scalar.blob.uri
|
|
452
894
|
filename = Path(uri).name
|
|
453
|
-
|
|
895
|
+
hash_value = lv.hash if lv.hash else None
|
|
896
|
+
f: File = File(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format, hash=hash_value)
|
|
454
897
|
return f
|
|
455
898
|
|
|
456
899
|
def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[File]:
|