flyte 2.0.0b13__py3-none-any.whl → 2.0.0b30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyte/__init__.py +18 -2
- flyte/_bin/debug.py +38 -0
- flyte/_bin/runtime.py +62 -8
- flyte/_cache/cache.py +4 -2
- flyte/_cache/local_cache.py +216 -0
- flyte/_code_bundle/_ignore.py +12 -4
- flyte/_code_bundle/_packaging.py +13 -9
- flyte/_code_bundle/_utils.py +18 -10
- flyte/_code_bundle/bundle.py +17 -9
- flyte/_constants.py +1 -0
- flyte/_context.py +4 -1
- flyte/_custom_context.py +73 -0
- flyte/_debug/constants.py +38 -0
- flyte/_debug/utils.py +17 -0
- flyte/_debug/vscode.py +307 -0
- flyte/_deploy.py +235 -61
- flyte/_environment.py +20 -6
- flyte/_excepthook.py +1 -1
- flyte/_hash.py +1 -16
- flyte/_image.py +178 -81
- flyte/_initialize.py +132 -51
- flyte/_interface.py +39 -2
- flyte/_internal/controllers/__init__.py +4 -5
- flyte/_internal/controllers/_local_controller.py +70 -29
- flyte/_internal/controllers/_trace.py +1 -1
- flyte/_internal/controllers/remote/__init__.py +0 -2
- flyte/_internal/controllers/remote/_action.py +14 -16
- flyte/_internal/controllers/remote/_client.py +1 -1
- flyte/_internal/controllers/remote/_controller.py +68 -70
- flyte/_internal/controllers/remote/_core.py +127 -99
- flyte/_internal/controllers/remote/_informer.py +19 -10
- flyte/_internal/controllers/remote/_service_protocol.py +7 -7
- flyte/_internal/imagebuild/docker_builder.py +181 -69
- flyte/_internal/imagebuild/image_builder.py +0 -5
- flyte/_internal/imagebuild/remote_builder.py +155 -64
- flyte/_internal/imagebuild/utils.py +51 -2
- flyte/_internal/resolvers/_task_module.py +5 -38
- flyte/_internal/resolvers/default.py +2 -2
- flyte/_internal/runtime/convert.py +110 -21
- flyte/_internal/runtime/entrypoints.py +27 -1
- flyte/_internal/runtime/io.py +21 -8
- flyte/_internal/runtime/resources_serde.py +20 -6
- flyte/_internal/runtime/reuse.py +1 -1
- flyte/_internal/runtime/rusty.py +20 -5
- flyte/_internal/runtime/task_serde.py +34 -19
- flyte/_internal/runtime/taskrunner.py +22 -4
- flyte/_internal/runtime/trigger_serde.py +160 -0
- flyte/_internal/runtime/types_serde.py +1 -1
- flyte/_keyring/__init__.py +0 -0
- flyte/_keyring/file.py +115 -0
- flyte/_logging.py +201 -39
- flyte/_map.py +111 -14
- flyte/_module.py +70 -0
- flyte/_pod.py +4 -3
- flyte/_resources.py +213 -31
- flyte/_run.py +110 -39
- flyte/_task.py +75 -16
- flyte/_task_environment.py +105 -29
- flyte/_task_plugins.py +4 -2
- flyte/_trace.py +5 -0
- flyte/_trigger.py +1000 -0
- flyte/_utils/__init__.py +2 -1
- flyte/_utils/asyn.py +3 -1
- flyte/_utils/coro_management.py +2 -1
- flyte/_utils/docker_credentials.py +173 -0
- flyte/_utils/module_loader.py +17 -2
- flyte/_version.py +3 -3
- flyte/cli/_abort.py +3 -3
- flyte/cli/_build.py +3 -6
- flyte/cli/_common.py +78 -7
- flyte/cli/_create.py +182 -4
- flyte/cli/_delete.py +23 -1
- flyte/cli/_deploy.py +63 -16
- flyte/cli/_get.py +79 -34
- flyte/cli/_params.py +26 -10
- flyte/cli/_plugins.py +209 -0
- flyte/cli/_run.py +151 -26
- flyte/cli/_serve.py +64 -0
- flyte/cli/_update.py +37 -0
- flyte/cli/_user.py +17 -0
- flyte/cli/main.py +30 -4
- flyte/config/_config.py +10 -6
- flyte/config/_internal.py +1 -0
- flyte/config/_reader.py +29 -8
- flyte/connectors/__init__.py +11 -0
- flyte/connectors/_connector.py +270 -0
- flyte/connectors/_server.py +197 -0
- flyte/connectors/utils.py +135 -0
- flyte/errors.py +22 -2
- flyte/extend.py +8 -1
- flyte/extras/_container.py +6 -1
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +21 -0
- flyte/io/__init__.py +2 -0
- flyte/io/_dataframe/__init__.py +2 -0
- flyte/io/_dataframe/basic_dfs.py +17 -8
- flyte/io/_dataframe/dataframe.py +98 -132
- flyte/io/_dir.py +575 -113
- flyte/io/_file.py +582 -139
- flyte/io/_hashing_io.py +342 -0
- flyte/models.py +74 -15
- flyte/remote/__init__.py +6 -1
- flyte/remote/_action.py +34 -26
- flyte/remote/_client/_protocols.py +39 -4
- flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
- flyte/remote/_client/auth/_authenticators/pkce.py +1 -1
- flyte/remote/_client/auth/_channel.py +10 -6
- flyte/remote/_client/controlplane.py +17 -5
- flyte/remote/_console.py +3 -2
- flyte/remote/_data.py +6 -6
- flyte/remote/_logs.py +3 -3
- flyte/remote/_run.py +64 -8
- flyte/remote/_secret.py +26 -17
- flyte/remote/_task.py +75 -33
- flyte/remote/_trigger.py +306 -0
- flyte/remote/_user.py +33 -0
- flyte/report/_report.py +1 -1
- flyte/storage/__init__.py +6 -1
- flyte/storage/_config.py +5 -1
- flyte/storage/_parallel_reader.py +274 -0
- flyte/storage/_storage.py +200 -103
- flyte/types/__init__.py +16 -0
- flyte/types/_interface.py +2 -2
- flyte/types/_pickle.py +35 -8
- flyte/types/_string_literals.py +8 -9
- flyte/types/_type_engine.py +40 -70
- flyte/types/_utils.py +1 -1
- flyte-2.0.0b30.data/scripts/debug.py +38 -0
- {flyte-2.0.0b13.data → flyte-2.0.0b30.data}/scripts/runtime.py +62 -8
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/METADATA +11 -3
- flyte-2.0.0b30.dist-info/RECORD +192 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/entry_points.txt +3 -0
- flyte/_protos/common/authorization_pb2.py +0 -66
- flyte/_protos/common/authorization_pb2.pyi +0 -108
- flyte/_protos/common/authorization_pb2_grpc.py +0 -4
- flyte/_protos/common/identifier_pb2.py +0 -93
- flyte/_protos/common/identifier_pb2.pyi +0 -110
- flyte/_protos/common/identifier_pb2_grpc.py +0 -4
- flyte/_protos/common/identity_pb2.py +0 -48
- flyte/_protos/common/identity_pb2.pyi +0 -72
- flyte/_protos/common/identity_pb2_grpc.py +0 -4
- flyte/_protos/common/list_pb2.py +0 -36
- flyte/_protos/common/list_pb2.pyi +0 -71
- flyte/_protos/common/list_pb2_grpc.py +0 -4
- flyte/_protos/common/policy_pb2.py +0 -37
- flyte/_protos/common/policy_pb2.pyi +0 -27
- flyte/_protos/common/policy_pb2_grpc.py +0 -4
- flyte/_protos/common/role_pb2.py +0 -37
- flyte/_protos/common/role_pb2.pyi +0 -53
- flyte/_protos/common/role_pb2_grpc.py +0 -4
- flyte/_protos/common/runtime_version_pb2.py +0 -28
- flyte/_protos/common/runtime_version_pb2.pyi +0 -24
- flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/definition_pb2.py +0 -59
- flyte/_protos/imagebuilder/definition_pb2.pyi +0 -140
- flyte/_protos/imagebuilder/definition_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/payload_pb2.py +0 -32
- flyte/_protos/imagebuilder/payload_pb2.pyi +0 -21
- flyte/_protos/imagebuilder/payload_pb2_grpc.py +0 -4
- flyte/_protos/imagebuilder/service_pb2.py +0 -29
- flyte/_protos/imagebuilder/service_pb2.pyi +0 -5
- flyte/_protos/imagebuilder/service_pb2_grpc.py +0 -66
- flyte/_protos/logs/dataplane/payload_pb2.py +0 -100
- flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -177
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/definition_pb2.py +0 -49
- flyte/_protos/secret/definition_pb2.pyi +0 -93
- flyte/_protos/secret/definition_pb2_grpc.py +0 -4
- flyte/_protos/secret/payload_pb2.py +0 -62
- flyte/_protos/secret/payload_pb2.pyi +0 -94
- flyte/_protos/secret/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/secret_pb2.py +0 -38
- flyte/_protos/secret/secret_pb2.pyi +0 -6
- flyte/_protos/secret/secret_pb2_grpc.py +0 -198
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- flyte/_protos/validate/validate/validate_pb2.py +0 -76
- flyte/_protos/workflow/common_pb2.py +0 -27
- flyte/_protos/workflow/common_pb2.pyi +0 -14
- flyte/_protos/workflow/common_pb2_grpc.py +0 -4
- flyte/_protos/workflow/environment_pb2.py +0 -29
- flyte/_protos/workflow/environment_pb2.pyi +0 -12
- flyte/_protos/workflow/environment_pb2_grpc.py +0 -4
- flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
- flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- flyte/_protos/workflow/queue_service_pb2.py +0 -109
- flyte/_protos/workflow/queue_service_pb2.pyi +0 -166
- flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- flyte/_protos/workflow/run_definition_pb2.py +0 -121
- flyte/_protos/workflow/run_definition_pb2.pyi +0 -327
- flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
- flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- flyte/_protos/workflow/run_service_pb2.py +0 -137
- flyte/_protos/workflow/run_service_pb2.pyi +0 -185
- flyte/_protos/workflow/run_service_pb2_grpc.py +0 -446
- flyte/_protos/workflow/state_service_pb2.py +0 -67
- flyte/_protos/workflow/state_service_pb2.pyi +0 -76
- flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
- flyte/_protos/workflow/task_definition_pb2.py +0 -79
- flyte/_protos/workflow/task_definition_pb2.pyi +0 -81
- flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/task_service_pb2.py +0 -60
- flyte/_protos/workflow/task_service_pb2.pyi +0 -59
- flyte/_protos/workflow/task_service_pb2_grpc.py +0 -138
- flyte-2.0.0b13.dist-info/RECORD +0 -239
- /flyte/{_protos → _debug}/__init__.py +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/WHEEL +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/licenses/LICENSE +0 -0
- {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/top_level.txt +0 -0
flyte/io/_dir.py
CHANGED
|
@@ -4,12 +4,14 @@ import os
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import AsyncIterator, Dict, Generic, Iterator, List, Optional, Type, TypeVar, Union
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from flyteidl2.core import literals_pb2, types_pb2
|
|
8
8
|
from fsspec.asyn import AsyncFileSystem
|
|
9
|
+
from fsspec.utils import get_protocol
|
|
9
10
|
from mashumaro.types import SerializableType
|
|
10
11
|
from pydantic import BaseModel, model_validator
|
|
11
12
|
|
|
12
13
|
import flyte.storage as storage
|
|
14
|
+
from flyte._context import internal_ctx
|
|
13
15
|
from flyte.io._file import File
|
|
14
16
|
from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
|
|
15
17
|
|
|
@@ -20,34 +22,183 @@ T = TypeVar("T")
|
|
|
20
22
|
class Dir(BaseModel, Generic[T], SerializableType):
|
|
21
23
|
"""
|
|
22
24
|
A generic directory class representing a directory with files of a specified format.
|
|
23
|
-
Provides both async and sync interfaces for directory operations.
|
|
24
|
-
Users are responsible for handling all I/O - the type transformer for Dir does not do any automatic uploading
|
|
25
|
-
or downloading of files.
|
|
25
|
+
Provides both async and sync interfaces for directory operations. Methods with a `_sync` suffix are synchronous; their unsuffixed counterparts are async.
|
|
26
26
|
|
|
27
|
-
The
|
|
27
|
+
The class should be instantiated using one of the class methods. The constructor should only be used to
|
|
28
|
+
instantiate references to existing remote directories.
|
|
28
29
|
|
|
29
|
-
|
|
30
|
-
```python
|
|
31
|
-
# Async usage
|
|
32
|
-
from pandas import DataFrame
|
|
33
|
-
data_dir = Dir[DataFrame](path="s3://my-bucket/data/")
|
|
30
|
+
The generic type T represents the format of the files in the directory.
|
|
34
31
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
Important methods:
|
|
33
|
+
- `from_existing_remote`: Create a Dir object referencing an existing remote directory.
|
|
34
|
+
- `from_local` / `from_local_sync`: Upload a local directory to remote storage.
|
|
35
|
+
|
|
36
|
+
**Asynchronous methods**:
|
|
37
|
+
- `walk`: Asynchronously iterate through files in the directory.
|
|
38
|
+
- `list_files`: Asynchronously get a list of all files (non-recursive).
|
|
39
|
+
- `download`: Asynchronously download the entire directory to a local path.
|
|
40
|
+
- `exists`: Asynchronously check if the directory exists.
|
|
41
|
+
- `get_file`: Asynchronously get a specific file from the directory by name.
|
|
42
|
+
|
|
43
|
+
**Synchronous methods** (suffixed with `_sync`):
|
|
44
|
+
- `walk_sync`: Synchronously iterate through files in the directory.
|
|
45
|
+
- `list_files_sync`: Synchronously get a list of all files (non-recursive).
|
|
46
|
+
- `download_sync`: Synchronously download the entire directory to a local path.
|
|
47
|
+
- `exists_sync`: Synchronously check if the directory exists.
|
|
48
|
+
- `get_file_sync`: Synchronously get a specific file from the directory by name.
|
|
49
|
+
|
|
50
|
+
Example: Walk through directory files recursively (Async).
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
@env.task
|
|
54
|
+
async def process_all_files(d: Dir) -> int:
|
|
55
|
+
file_count = 0
|
|
56
|
+
async for file in d.walk(recursive=True):
|
|
57
|
+
async with file.open("rb") as f:
|
|
38
58
|
content = await f.read()
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
59
|
+
# Process content
|
|
60
|
+
file_count += 1
|
|
61
|
+
return file_count
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Example: Walk through directory files recursively (Sync).
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
@env.task
|
|
68
|
+
def process_all_files_sync(d: Dir) -> int:
|
|
69
|
+
file_count = 0
|
|
70
|
+
for file in d.walk_sync(recursive=True):
|
|
71
|
+
with file.open_sync("rb") as f:
|
|
43
72
|
content = f.read()
|
|
44
|
-
|
|
73
|
+
# Process content
|
|
74
|
+
file_count += 1
|
|
75
|
+
return file_count
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Example: List files in directory (Async).
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
@env.task
|
|
82
|
+
async def count_files(d: Dir) -> int:
|
|
83
|
+
files = await d.list_files()
|
|
84
|
+
return len(files)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Example: List files in directory (Sync).
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
@env.task
|
|
91
|
+
def count_files_sync(d: Dir) -> int:
|
|
92
|
+
files = d.list_files_sync()
|
|
93
|
+
return len(files)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Example: Get a specific file from directory (Async).
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
@env.task
|
|
100
|
+
async def read_config_file(d: Dir) -> str:
|
|
101
|
+
config_file = await d.get_file("config.json")
|
|
102
|
+
if config_file:
|
|
103
|
+
async with config_file.open("rb") as f:
|
|
104
|
+
return (await f.read()).decode("utf-8")
|
|
105
|
+
return "Config not found"
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Example: Get a specific file from directory (Sync).
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
@env.task
|
|
112
|
+
def read_config_file_sync(d: Dir) -> str:
|
|
113
|
+
config_file = d.get_file_sync("config.json")
|
|
114
|
+
if config_file:
|
|
115
|
+
with config_file.open_sync("rb") as f:
|
|
116
|
+
return f.read().decode("utf-8")
|
|
117
|
+
return "Config not found"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Example: Upload a local directory to remote storage (Async).
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
@env.task
|
|
124
|
+
async def upload_directory() -> Dir:
|
|
125
|
+
# Create local directory with files
|
|
126
|
+
os.makedirs("/tmp/my_data", exist_ok=True)
|
|
127
|
+
with open("/tmp/my_data/file1.txt", "w") as f:
|
|
128
|
+
f.write("data1")
|
|
129
|
+
# Upload to remote storage
|
|
130
|
+
return await Dir.from_local("/tmp/my_data/")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Example: Upload a local directory to remote storage (Sync).
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
@env.task
|
|
137
|
+
def upload_directory_sync() -> Dir:
|
|
138
|
+
# Create local directory with files
|
|
139
|
+
os.makedirs("/tmp/my_data", exist_ok=True)
|
|
140
|
+
with open("/tmp/my_data/file1.txt", "w") as f:
|
|
141
|
+
f.write("data1")
|
|
142
|
+
# Upload to remote storage
|
|
143
|
+
return Dir.from_local_sync("/tmp/my_data/")
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Example: Download a directory to local storage (Async).
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
@env.task
|
|
150
|
+
async def download_directory(d: Dir) -> str:
|
|
151
|
+
local_path = await d.download()
|
|
152
|
+
# Process files in local directory
|
|
153
|
+
return local_path
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Example: Download a directory to local storage (Sync).
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
@env.task
|
|
160
|
+
def download_directory_sync(d: Dir) -> str:
|
|
161
|
+
local_path = d.download_sync()
|
|
162
|
+
# Process files in local directory
|
|
163
|
+
return local_path
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Example: Reference an existing remote directory.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
@env.task
|
|
170
|
+
async def process_existing_dir() -> int:
|
|
171
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/")
|
|
172
|
+
files = await d.list_files()
|
|
173
|
+
return len(files)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Example: Check if directory exists (Async).
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
@env.task
|
|
180
|
+
async def check_directory(d: Dir) -> bool:
|
|
181
|
+
return await d.exists()
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Example: Check if directory exists (Sync).
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
@env.task
|
|
188
|
+
def check_directory_sync(d: Dir) -> bool:
|
|
189
|
+
return d.exists_sync()
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Args:
|
|
193
|
+
path: The path to the directory (can be local or remote)
|
|
194
|
+
name: Optional name for the directory (defaults to basename of path)
|
|
45
195
|
"""
|
|
46
196
|
|
|
47
197
|
# Represents either a local or remote path.
|
|
48
198
|
path: str
|
|
49
199
|
name: Optional[str] = None
|
|
50
200
|
format: str = ""
|
|
201
|
+
hash: Optional[str] = None
|
|
51
202
|
|
|
52
203
|
class Config:
|
|
53
204
|
arbitrary_types_allowed = True
|
|
@@ -55,20 +206,24 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
55
206
|
@model_validator(mode="before")
|
|
56
207
|
@classmethod
|
|
57
208
|
def pre_init(cls, data):
|
|
209
|
+
"""Internal: Pydantic validator to set default name from path. Not intended for direct use."""
|
|
58
210
|
if data.get("name") is None:
|
|
59
211
|
data["name"] = Path(data["path"]).name
|
|
60
212
|
return data
|
|
61
213
|
|
|
62
214
|
def _serialize(self) -> Dict[str, Optional[str]]:
|
|
215
|
+
"""Internal: Serialize Dir to dictionary. Not intended for direct use."""
|
|
63
216
|
pyd_dump = self.model_dump()
|
|
64
217
|
return pyd_dump
|
|
65
218
|
|
|
66
219
|
@classmethod
|
|
67
220
|
def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> Dir:
|
|
221
|
+
"""Internal: Deserialize Dir from dictionary. Not intended for direct use."""
|
|
68
222
|
return cls.model_validate(file_dump)
|
|
69
223
|
|
|
70
224
|
@classmethod
|
|
71
225
|
def schema_match(cls, incoming: dict):
|
|
226
|
+
"""Internal: Check if incoming schema matches Dir schema. Not intended for direct use."""
|
|
72
227
|
this_schema = cls.model_json_schema()
|
|
73
228
|
current_required = this_schema.get("required")
|
|
74
229
|
incoming_required = incoming.get("required")
|
|
@@ -85,19 +240,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
85
240
|
"""
|
|
86
241
|
Asynchronously walk through the directory and yield File objects.
|
|
87
242
|
|
|
243
|
+
Use this to iterate through all files in a directory. Each yielded File can be read directly without
|
|
244
|
+
downloading.
|
|
245
|
+
|
|
246
|
+
Example (Async - Recursive):
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
@env.task
|
|
250
|
+
async def list_all_files(d: Dir) -> list[str]:
|
|
251
|
+
file_names = []
|
|
252
|
+
async for file in d.walk(recursive=True):
|
|
253
|
+
file_names.append(file.name)
|
|
254
|
+
return file_names
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Example (Async - Non-recursive):
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
@env.task
|
|
261
|
+
async def list_top_level_files(d: Dir) -> list[str]:
|
|
262
|
+
file_names = []
|
|
263
|
+
async for file in d.walk(recursive=False):
|
|
264
|
+
file_names.append(file.name)
|
|
265
|
+
return file_names
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Example (Async - With max depth):
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
@env.task
|
|
272
|
+
async def list_files_max_depth(d: Dir) -> list[str]:
|
|
273
|
+
file_names = []
|
|
274
|
+
async for file in d.walk(recursive=True, max_depth=2):
|
|
275
|
+
file_names.append(file.name)
|
|
276
|
+
return file_names
|
|
277
|
+
```
|
|
278
|
+
|
|
88
279
|
Args:
|
|
89
|
-
recursive: If True, recursively walk subdirectories
|
|
90
|
-
max_depth: Maximum depth for recursive walking
|
|
280
|
+
recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
|
|
281
|
+
max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
|
|
91
282
|
|
|
92
283
|
Yields:
|
|
93
284
|
File objects for each file found in the directory
|
|
94
|
-
|
|
95
|
-
Example:
|
|
96
|
-
```python
|
|
97
|
-
async for file in directory.walk():
|
|
98
|
-
local_path = await file.download()
|
|
99
|
-
# Process the file
|
|
100
|
-
```
|
|
101
285
|
"""
|
|
102
286
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
103
287
|
if recursive is False:
|
|
@@ -124,20 +308,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
124
308
|
"""
|
|
125
309
|
Synchronously walk through the directory and yield File objects.
|
|
126
310
|
|
|
311
|
+
Use this in non-async tasks to iterate through all files in a directory.
|
|
312
|
+
|
|
313
|
+
Example (Sync - Recursive):
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
@env.task
|
|
317
|
+
def list_all_files_sync(d: Dir) -> list[str]:
|
|
318
|
+
file_names = []
|
|
319
|
+
for file in d.walk_sync(recursive=True):
|
|
320
|
+
file_names.append(file.name)
|
|
321
|
+
return file_names
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
Example (Sync - With file pattern):
|
|
325
|
+
|
|
326
|
+
```python
|
|
327
|
+
@env.task
|
|
328
|
+
def list_text_files(d: Dir) -> list[str]:
|
|
329
|
+
file_names = []
|
|
330
|
+
for file in d.walk_sync(recursive=True, file_pattern="*.txt"):
|
|
331
|
+
file_names.append(file.name)
|
|
332
|
+
return file_names
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
Example (Sync - Recursive with max depth):
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
@env.task
|
|
339
|
+
def list_files_limited(d: Dir) -> list[str]:
|
|
340
|
+
file_names = []
|
|
341
|
+
for file in d.walk_sync(recursive=True, max_depth=2):
|
|
342
|
+
file_names.append(file.name)
|
|
343
|
+
return file_names
|
|
344
|
+
```
|
|
345
|
+
|
|
127
346
|
Args:
|
|
128
|
-
recursive: If True, recursively walk subdirectories
|
|
129
|
-
file_pattern: Glob pattern to filter files
|
|
130
|
-
max_depth: Maximum depth for recursive walking
|
|
347
|
+
recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
|
|
348
|
+
file_pattern: Glob pattern to filter files (e.g., "*.txt", "*.csv"). Default is "*" (all files).
|
|
349
|
+
max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
|
|
131
350
|
|
|
132
351
|
Yields:
|
|
133
352
|
File objects for each file found in the directory
|
|
134
|
-
|
|
135
|
-
Example:
|
|
136
|
-
```python
|
|
137
|
-
for file in directory.walk_sync():
|
|
138
|
-
local_path = file.download_sync()
|
|
139
|
-
# Process the file
|
|
140
|
-
```
|
|
141
353
|
"""
|
|
142
354
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
143
355
|
for parent, _, files in fs.walk(self.path, maxdepth=max_depth):
|
|
@@ -152,15 +364,33 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
152
364
|
"""
|
|
153
365
|
Asynchronously get a list of all files in the directory (non-recursive).
|
|
154
366
|
|
|
367
|
+
Use this when you need a list of all files in the top-level directory at once.
|
|
368
|
+
|
|
155
369
|
Returns:
|
|
156
|
-
A list of File objects
|
|
370
|
+
A list of File objects for files in the top-level directory
|
|
157
371
|
|
|
158
|
-
Example:
|
|
159
|
-
|
|
160
|
-
|
|
372
|
+
Example (Async):
|
|
373
|
+
|
|
374
|
+
```python
|
|
375
|
+
@env.task
|
|
376
|
+
async def count_files(d: Dir) -> int:
|
|
377
|
+
files = await d.list_files()
|
|
378
|
+
return len(files)
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
Example (Async - Process files):
|
|
382
|
+
|
|
383
|
+
```python
|
|
384
|
+
@env.task
|
|
385
|
+
async def process_all_files(d: Dir) -> list[str]:
|
|
386
|
+
files = await d.list_files()
|
|
387
|
+
contents = []
|
|
161
388
|
for file in files:
|
|
162
|
-
|
|
163
|
-
|
|
389
|
+
async with file.open("rb") as f:
|
|
390
|
+
content = await f.read()
|
|
391
|
+
contents.append(content.decode("utf-8"))
|
|
392
|
+
return contents
|
|
393
|
+
```
|
|
164
394
|
"""
|
|
165
395
|
# todo: this should probably also just defer to fsspec.find()
|
|
166
396
|
files = []
|
|
@@ -172,15 +402,33 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
172
402
|
"""
|
|
173
403
|
Synchronously get a list of all files in the directory (non-recursive).
|
|
174
404
|
|
|
405
|
+
Use this in non-async tasks when you need a list of all files in the top-level directory at once.
|
|
406
|
+
|
|
175
407
|
Returns:
|
|
176
|
-
A list of File objects
|
|
408
|
+
A list of File objects for files in the top-level directory
|
|
177
409
|
|
|
178
|
-
Example:
|
|
179
|
-
|
|
180
|
-
|
|
410
|
+
Example (Sync):
|
|
411
|
+
|
|
412
|
+
```python
|
|
413
|
+
@env.task
|
|
414
|
+
def count_files_sync(d: Dir) -> int:
|
|
415
|
+
files = d.list_files_sync()
|
|
416
|
+
return len(files)
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
Example (Sync - Process files):
|
|
420
|
+
|
|
421
|
+
```python
|
|
422
|
+
@env.task
|
|
423
|
+
def process_all_files_sync(d: Dir) -> list[str]:
|
|
424
|
+
files = d.list_files_sync()
|
|
425
|
+
contents = []
|
|
181
426
|
for file in files:
|
|
182
|
-
|
|
183
|
-
|
|
427
|
+
with file.open_sync("rb") as f:
|
|
428
|
+
content = f.read()
|
|
429
|
+
contents.append(content.decode("utf-8"))
|
|
430
|
+
return contents
|
|
431
|
+
```
|
|
184
432
|
"""
|
|
185
433
|
return list(self.walk_sync(recursive=False))
|
|
186
434
|
|
|
@@ -188,19 +436,43 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
188
436
|
"""
|
|
189
437
|
Asynchronously download the entire directory to a local path.
|
|
190
438
|
|
|
439
|
+
Use this when you need to download all files in a directory to your local filesystem for processing.
|
|
440
|
+
|
|
441
|
+
Example (Async):
|
|
442
|
+
|
|
443
|
+
```python
|
|
444
|
+
@env.task
|
|
445
|
+
async def download_directory(d: Dir) -> str:
|
|
446
|
+
local_dir = await d.download()
|
|
447
|
+
# Process files in the local directory
|
|
448
|
+
return local_dir
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
Example (Async - Download to specific path):
|
|
452
|
+
|
|
453
|
+
```python
|
|
454
|
+
@env.task
|
|
455
|
+
async def download_to_path(d: Dir) -> str:
|
|
456
|
+
local_dir = await d.download("/tmp/my_data/")
|
|
457
|
+
return local_dir
|
|
458
|
+
```
|
|
459
|
+
|
|
191
460
|
Args:
|
|
192
461
|
local_path: The local path to download the directory to. If None, a temporary
|
|
193
|
-
directory will be used.
|
|
462
|
+
directory will be used and a path will be generated.
|
|
194
463
|
|
|
195
464
|
Returns:
|
|
196
|
-
The path to the downloaded directory
|
|
197
|
-
|
|
198
|
-
Example:
|
|
199
|
-
```python
|
|
200
|
-
local_dir = await directory.download('/tmp/my_data/')
|
|
201
|
-
```
|
|
465
|
+
The absolute path to the downloaded directory
|
|
202
466
|
"""
|
|
203
|
-
|
|
467
|
+
# If no local_path specified, create a unique path + append source directory name
|
|
468
|
+
if local_path is None:
|
|
469
|
+
unique_path = storage.get_random_local_path()
|
|
470
|
+
source_dirname = Path(self.path).name # will need to be updated for windows
|
|
471
|
+
local_dest = str(Path(unique_path) / source_dirname)
|
|
472
|
+
else:
|
|
473
|
+
# If local_path is specified, use it directly (contents go into it)
|
|
474
|
+
local_dest = str(local_path)
|
|
475
|
+
|
|
204
476
|
if not storage.is_remote(self.path):
|
|
205
477
|
if not local_path or local_path == self.path:
|
|
206
478
|
# Skip copying
|
|
@@ -215,25 +487,49 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
215
487
|
await loop.run_in_executor(None, lambda: shutil.copytree(self.path, local_dest, dirs_exist_ok=True))
|
|
216
488
|
|
|
217
489
|
await copy_tree()
|
|
490
|
+
return local_dest
|
|
218
491
|
return await storage.get(self.path, local_dest, recursive=True)
|
|
219
492
|
|
|
220
493
|
def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
|
|
221
494
|
"""
|
|
222
495
|
Synchronously download the entire directory to a local path.
|
|
223
496
|
|
|
497
|
+
Use this in non-async tasks when you need to download all files in a directory to your local filesystem.
|
|
498
|
+
|
|
499
|
+
Example (Sync):
|
|
500
|
+
|
|
501
|
+
```python
|
|
502
|
+
@env.task
|
|
503
|
+
def download_directory_sync(d: Dir) -> str:
|
|
504
|
+
local_dir = d.download_sync()
|
|
505
|
+
# Process files in the local directory
|
|
506
|
+
return local_dir
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
Example (Sync - Download to specific path):
|
|
510
|
+
|
|
511
|
+
```python
|
|
512
|
+
@env.task
|
|
513
|
+
def download_to_path_sync(d: Dir) -> str:
|
|
514
|
+
local_dir = d.download_sync("/tmp/my_data/")
|
|
515
|
+
return local_dir
|
|
516
|
+
```
|
|
224
517
|
Args:
|
|
225
518
|
local_path: The local path to download the directory to. If None, a temporary
|
|
226
|
-
directory will be used.
|
|
519
|
+
directory will be used and a path will be generated.
|
|
227
520
|
|
|
228
521
|
Returns:
|
|
229
|
-
The path to the downloaded directory
|
|
230
|
-
|
|
231
|
-
Example:
|
|
232
|
-
```python
|
|
233
|
-
local_dir = directory.download_sync('/tmp/my_data/')
|
|
234
|
-
```
|
|
522
|
+
The absolute path to the downloaded directory
|
|
235
523
|
"""
|
|
236
|
-
|
|
524
|
+
# If no local_path specified, create a unique path + append source directory name
|
|
525
|
+
if local_path is None:
|
|
526
|
+
unique_path = storage.get_random_local_path()
|
|
527
|
+
source_dirname = Path(self.path).name
|
|
528
|
+
local_dest = str(Path(unique_path) / source_dirname)
|
|
529
|
+
else:
|
|
530
|
+
# If local_path is specified, use it directly (contents go into it)
|
|
531
|
+
local_dest = str(local_path)
|
|
532
|
+
|
|
237
533
|
if not storage.is_remote(self.path):
|
|
238
534
|
if not local_path or local_path == self.path:
|
|
239
535
|
# Skip copying
|
|
@@ -243,52 +539,188 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
243
539
|
import shutil
|
|
244
540
|
|
|
245
541
|
shutil.copytree(self.path, local_dest, dirs_exist_ok=True)
|
|
542
|
+
return local_dest
|
|
246
543
|
|
|
247
|
-
|
|
248
|
-
|
|
544
|
+
fs = storage.get_underlying_filesystem(path=self.path)
|
|
545
|
+
fs.get(self.path, local_dest, recursive=True)
|
|
546
|
+
return local_dest
|
|
249
547
|
|
|
250
548
|
@classmethod
|
|
251
|
-
async def from_local(
|
|
549
|
+
async def from_local(
|
|
550
|
+
cls,
|
|
551
|
+
local_path: Union[str, Path],
|
|
552
|
+
remote_destination: Optional[str] = None,
|
|
553
|
+
dir_cache_key: Optional[str] = None,
|
|
554
|
+
) -> Dir[T]:
|
|
252
555
|
"""
|
|
253
|
-
Asynchronously create a new Dir by uploading a local directory to
|
|
556
|
+
Asynchronously create a new Dir by uploading a local directory to remote storage.
|
|
557
|
+
|
|
558
|
+
Use this in async tasks when you have a local directory that needs to be uploaded to remote storage.
|
|
559
|
+
|
|
560
|
+
Example (Async):
|
|
561
|
+
|
|
562
|
+
```python
|
|
563
|
+
@env.task
|
|
564
|
+
async def upload_local_directory() -> Dir:
|
|
565
|
+
# Create a local directory with files
|
|
566
|
+
os.makedirs("/tmp/data_dir", exist_ok=True)
|
|
567
|
+
with open("/tmp/data_dir/file1.txt", "w") as f:
|
|
568
|
+
f.write("data1")
|
|
569
|
+
|
|
570
|
+
# Upload to remote storage
|
|
571
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/")
|
|
572
|
+
return remote_dir
|
|
573
|
+
```
|
|
574
|
+
|
|
575
|
+
Example (Async - With specific destination):
|
|
576
|
+
|
|
577
|
+
```python
|
|
578
|
+
@env.task
|
|
579
|
+
async def upload_to_specific_path() -> Dir:
|
|
580
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/", "s3://my-bucket/data/")
|
|
581
|
+
return remote_dir
|
|
582
|
+
```
|
|
254
583
|
|
|
584
|
+
Example (Async - With cache key):
|
|
585
|
+
|
|
586
|
+
```python
|
|
587
|
+
@env.task
|
|
588
|
+
async def upload_with_cache_key() -> Dir:
|
|
589
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
|
|
590
|
+
return remote_dir
|
|
591
|
+
```
|
|
255
592
|
Args:
|
|
256
593
|
local_path: Path to the local directory
|
|
257
|
-
|
|
594
|
+
remote_destination: Optional remote path to store the directory. If None, a path will be automatically
|
|
595
|
+
generated.
|
|
596
|
+
dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
|
|
597
|
+
as an input to discoverable tasks. If not specified, the cache key will be based on
|
|
598
|
+
directory attributes.
|
|
258
599
|
|
|
259
600
|
Returns:
|
|
260
601
|
A new Dir instance pointing to the uploaded directory
|
|
261
|
-
|
|
262
|
-
Example:
|
|
263
|
-
```python
|
|
264
|
-
remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/')
|
|
265
|
-
```
|
|
266
602
|
"""
|
|
267
603
|
local_path_str = str(local_path)
|
|
268
604
|
dirname = os.path.basename(os.path.normpath(local_path_str))
|
|
605
|
+
resolved_remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(dirname)
|
|
606
|
+
protocol = get_protocol(resolved_remote_path)
|
|
269
607
|
|
|
270
|
-
|
|
271
|
-
|
|
608
|
+
# Shortcut for local, don't copy and just return
|
|
609
|
+
if "file" in protocol and remote_destination is None:
|
|
610
|
+
output_path = str(Path(local_path).absolute())
|
|
611
|
+
return cls(path=output_path, name=dirname, hash=dir_cache_key)
|
|
612
|
+
|
|
613
|
+
# todo: in the future, mirror File and set the file to_path here
|
|
614
|
+
output_path = await storage.put(from_path=local_path_str, to_path=remote_destination, recursive=True)
|
|
615
|
+
return cls(path=output_path, name=dirname, hash=dir_cache_key)
|
|
272
616
|
|
|
273
617
|
@classmethod
|
|
274
|
-
def from_local_sync(
|
|
618
|
+
def from_local_sync(
|
|
619
|
+
cls,
|
|
620
|
+
local_path: Union[str, Path],
|
|
621
|
+
remote_destination: Optional[str] = None,
|
|
622
|
+
dir_cache_key: Optional[str] = None,
|
|
623
|
+
) -> Dir[T]:
|
|
275
624
|
"""
|
|
276
|
-
Synchronously create a new Dir by uploading a local directory to
|
|
625
|
+
Synchronously create a new Dir by uploading a local directory to remote storage.
|
|
626
|
+
|
|
627
|
+
Use this in non-async tasks when you have a local directory that needs to be uploaded to remote storage.
|
|
628
|
+
|
|
629
|
+
Example (Sync):
|
|
630
|
+
|
|
631
|
+
```python
|
|
632
|
+
@env.task
|
|
633
|
+
def upload_local_directory_sync() -> Dir:
|
|
634
|
+
# Create a local directory with files
|
|
635
|
+
os.makedirs("/tmp/data_dir", exist_ok=True)
|
|
636
|
+
with open("/tmp/data_dir/file1.txt", "w") as f:
|
|
637
|
+
f.write("data1")
|
|
638
|
+
|
|
639
|
+
# Upload to remote storage
|
|
640
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/")
|
|
641
|
+
return remote_dir
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
Example (Sync - With specific destination):
|
|
645
|
+
|
|
646
|
+
```python
|
|
647
|
+
@env.task
|
|
648
|
+
def upload_to_specific_path_sync() -> Dir:
|
|
649
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/", "s3://my-bucket/data/")
|
|
650
|
+
return remote_dir
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
Example (Sync - With cache key):
|
|
654
|
+
|
|
655
|
+
```python
|
|
656
|
+
@env.task
|
|
657
|
+
def upload_with_cache_key_sync() -> Dir:
|
|
658
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
|
|
659
|
+
return remote_dir
|
|
660
|
+
```
|
|
277
661
|
|
|
278
662
|
Args:
|
|
279
663
|
local_path: Path to the local directory
|
|
280
|
-
|
|
664
|
+
remote_destination: Optional remote path to store the directory. If None, a path will be automatically
|
|
665
|
+
generated.
|
|
666
|
+
dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
|
|
667
|
+
as an input to discoverable tasks. If not specified, the cache key will be based on
|
|
668
|
+
directory attributes.
|
|
281
669
|
|
|
282
670
|
Returns:
|
|
283
671
|
A new Dir instance pointing to the uploaded directory
|
|
672
|
+
"""
|
|
673
|
+
local_path_str = str(local_path)
|
|
674
|
+
dirname = os.path.basename(os.path.normpath(local_path_str))
|
|
675
|
+
|
|
676
|
+
resolved_remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(dirname)
|
|
677
|
+
protocol = get_protocol(resolved_remote_path)
|
|
678
|
+
|
|
679
|
+
# Shortcut for local, don't copy and just return
|
|
680
|
+
if "file" in protocol and remote_destination is None:
|
|
681
|
+
output_path = str(Path(local_path).absolute())
|
|
682
|
+
return cls(path=output_path, name=dirname, hash=dir_cache_key)
|
|
683
|
+
|
|
684
|
+
fs = storage.get_underlying_filesystem(path=resolved_remote_path)
|
|
685
|
+
fs.put(local_path_str, resolved_remote_path, recursive=True)
|
|
686
|
+
return cls(path=resolved_remote_path, name=dirname, hash=dir_cache_key)
|
|
687
|
+
|
|
688
|
+
@classmethod
|
|
689
|
+
def from_existing_remote(cls, remote_path: str, dir_cache_key: Optional[str] = None) -> Dir[T]:
|
|
690
|
+
"""
|
|
691
|
+
Create a Dir reference from an existing remote directory.
|
|
692
|
+
|
|
693
|
+
Use this when you want to reference a directory that already exists in remote storage without uploading it.
|
|
284
694
|
|
|
285
695
|
Example:
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
696
|
+
|
|
697
|
+
```python
|
|
698
|
+
@env.task
|
|
699
|
+
async def process_existing_directory() -> int:
|
|
700
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/")
|
|
701
|
+
files = await d.list_files()
|
|
702
|
+
return len(files)
|
|
703
|
+
```
|
|
704
|
+
|
|
705
|
+
Example (With cache key):
|
|
706
|
+
|
|
707
|
+
```python
|
|
708
|
+
@env.task
|
|
709
|
+
async def process_with_cache_key() -> int:
|
|
710
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/", dir_cache_key="abc123")
|
|
711
|
+
files = await d.list_files()
|
|
712
|
+
return len(files)
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
Args:
|
|
716
|
+
remote_path: The remote path to the existing directory
|
|
717
|
+
dir_cache_key: Optional hash value to use for cache key computation. If not specified,
|
|
718
|
+
the cache key will be computed based on the directory's attributes.
|
|
719
|
+
|
|
720
|
+
Returns:
|
|
721
|
+
A new Dir instance pointing to the existing remote directory
|
|
289
722
|
"""
|
|
290
|
-
|
|
291
|
-
raise NotImplementedError("Sync upload is not implemented for remote paths")
|
|
723
|
+
return cls(path=remote_path, hash=dir_cache_key)
|
|
292
724
|
|
|
293
725
|
async def exists(self) -> bool:
|
|
294
726
|
"""
|
|
@@ -297,11 +729,16 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
297
729
|
Returns:
|
|
298
730
|
True if the directory exists, False otherwise
|
|
299
731
|
|
|
300
|
-
Example:
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
732
|
+
Example (Async):
|
|
733
|
+
|
|
734
|
+
```python
|
|
735
|
+
@env.task
|
|
736
|
+
async def check_directory(d: Dir) -> bool:
|
|
737
|
+
if await d.exists():
|
|
738
|
+
print("Directory exists!")
|
|
739
|
+
return True
|
|
740
|
+
return False
|
|
741
|
+
```
|
|
305
742
|
"""
|
|
306
743
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
307
744
|
if isinstance(fs, AsyncFileSystem):
|
|
@@ -313,34 +750,49 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
313
750
|
"""
|
|
314
751
|
Synchronously check if the directory exists.
|
|
315
752
|
|
|
753
|
+
Use this in non-async tasks or when you need synchronous directory existence checking.
|
|
754
|
+
|
|
316
755
|
Returns:
|
|
317
756
|
True if the directory exists, False otherwise
|
|
318
757
|
|
|
319
|
-
Example:
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
758
|
+
Example (Sync):
|
|
759
|
+
|
|
760
|
+
```python
|
|
761
|
+
@env.task
|
|
762
|
+
def check_directory_sync(d: Dir) -> bool:
|
|
763
|
+
if d.exists_sync():
|
|
764
|
+
print("Directory exists!")
|
|
765
|
+
return True
|
|
766
|
+
return False
|
|
767
|
+
```
|
|
324
768
|
"""
|
|
325
769
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
326
770
|
return fs.exists(self.path)
|
|
327
771
|
|
|
328
772
|
async def get_file(self, file_name: str) -> Optional[File[T]]:
|
|
329
773
|
"""
|
|
330
|
-
Asynchronously get a specific file from the directory.
|
|
774
|
+
Asynchronously get a specific file from the directory by name.
|
|
775
|
+
|
|
776
|
+
Use this when you know the name of a specific file in the directory you want to access.
|
|
777
|
+
|
|
778
|
+
Example (Async):
|
|
779
|
+
|
|
780
|
+
```python
|
|
781
|
+
@env.task
|
|
782
|
+
async def read_specific_file(d: Dir) -> str:
|
|
783
|
+
file = await d.get_file("data.csv")
|
|
784
|
+
if file:
|
|
785
|
+
async with file.open("rb") as f:
|
|
786
|
+
content = await f.read()
|
|
787
|
+
return content.decode("utf-8")
|
|
788
|
+
return "File not found"
|
|
789
|
+
```
|
|
331
790
|
|
|
332
791
|
Args:
|
|
333
792
|
file_name: The name of the file to get
|
|
334
793
|
|
|
335
794
|
Returns:
|
|
336
795
|
A File instance if the file exists, None otherwise
|
|
337
|
-
|
|
338
|
-
Example:
|
|
339
|
-
```python
|
|
340
|
-
file = await directory.get_file("data.csv")
|
|
341
|
-
if file:
|
|
342
|
-
# Process the file
|
|
343
|
-
```
|
|
344
796
|
"""
|
|
345
797
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
346
798
|
file_path = fs.sep.join([self.path, file_name])
|
|
@@ -352,20 +804,28 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
352
804
|
|
|
353
805
|
def get_file_sync(self, file_name: str) -> Optional[File[T]]:
|
|
354
806
|
"""
|
|
355
|
-
Synchronously get a specific file from the directory.
|
|
807
|
+
Synchronously get a specific file from the directory by name.
|
|
808
|
+
|
|
809
|
+
Use this in non-async tasks when you know the name of a specific file in the directory you want to access.
|
|
810
|
+
|
|
811
|
+
Example (Sync):
|
|
812
|
+
|
|
813
|
+
```python
|
|
814
|
+
@env.task
|
|
815
|
+
def read_specific_file_sync(d: Dir) -> str:
|
|
816
|
+
file = d.get_file_sync("data.csv")
|
|
817
|
+
if file:
|
|
818
|
+
with file.open_sync("rb") as f:
|
|
819
|
+
content = f.read()
|
|
820
|
+
return content.decode("utf-8")
|
|
821
|
+
return "File not found"
|
|
822
|
+
```
|
|
356
823
|
|
|
357
824
|
Args:
|
|
358
825
|
file_name: The name of the file to get
|
|
359
826
|
|
|
360
827
|
Returns:
|
|
361
828
|
A File instance if the file exists, None otherwise
|
|
362
|
-
|
|
363
|
-
Example:
|
|
364
|
-
```python
|
|
365
|
-
file = directory.get_file_sync("data.csv")
|
|
366
|
-
if file:
|
|
367
|
-
# Process the file
|
|
368
|
-
```
|
|
369
829
|
"""
|
|
370
830
|
file_path = os.path.join(self.path, file_name)
|
|
371
831
|
file = File[T](path=file_path)
|
|
@@ -414,7 +874,8 @@ class DirTransformer(TypeTransformer[Dir]):
|
|
|
414
874
|
),
|
|
415
875
|
uri=python_val.path,
|
|
416
876
|
)
|
|
417
|
-
)
|
|
877
|
+
),
|
|
878
|
+
hash=python_val.hash if python_val.hash else None,
|
|
418
879
|
)
|
|
419
880
|
|
|
420
881
|
async def to_python_value(
|
|
@@ -432,7 +893,8 @@ class DirTransformer(TypeTransformer[Dir]):
|
|
|
432
893
|
|
|
433
894
|
uri = lv.scalar.blob.uri
|
|
434
895
|
filename = Path(uri).name
|
|
435
|
-
|
|
896
|
+
hash_value = lv.hash if lv.hash else None
|
|
897
|
+
f: Dir = Dir(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format, hash=hash_value)
|
|
436
898
|
return f
|
|
437
899
|
|
|
438
900
|
def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[Dir]:
|