flyte 2.0.0b22__py3-none-any.whl → 2.0.0b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic. Click here for more details.
- flyte/__init__.py +7 -1
- flyte/_bin/runtime.py +35 -5
- flyte/_cache/cache.py +4 -2
- flyte/_cache/local_cache.py +215 -0
- flyte/_code_bundle/bundle.py +10 -2
- flyte/_context.py +4 -1
- flyte/_debug/constants.py +0 -1
- flyte/_debug/vscode.py +6 -1
- flyte/_deploy.py +193 -52
- flyte/_environment.py +5 -0
- flyte/_excepthook.py +1 -1
- flyte/_image.py +104 -75
- flyte/_initialize.py +51 -0
- flyte/_internal/controllers/_local_controller.py +64 -24
- flyte/_internal/controllers/remote/_action.py +4 -1
- flyte/_internal/controllers/remote/_controller.py +5 -2
- flyte/_internal/controllers/remote/_core.py +6 -3
- flyte/_internal/controllers/remote/_informer.py +1 -1
- flyte/_internal/imagebuild/docker_builder.py +92 -28
- flyte/_internal/imagebuild/image_builder.py +7 -13
- flyte/_internal/imagebuild/remote_builder.py +6 -1
- flyte/_internal/runtime/io.py +13 -1
- flyte/_internal/runtime/rusty.py +17 -2
- flyte/_internal/runtime/task_serde.py +11 -21
- flyte/_internal/runtime/taskrunner.py +1 -1
- flyte/_internal/runtime/trigger_serde.py +153 -0
- flyte/_logging.py +1 -1
- flyte/_protos/common/identifier_pb2.py +19 -1
- flyte/_protos/common/identifier_pb2.pyi +22 -0
- flyte/_protos/workflow/common_pb2.py +14 -3
- flyte/_protos/workflow/common_pb2.pyi +49 -0
- flyte/_protos/workflow/queue_service_pb2.py +41 -35
- flyte/_protos/workflow/queue_service_pb2.pyi +26 -12
- flyte/_protos/workflow/queue_service_pb2_grpc.py +34 -0
- flyte/_protos/workflow/run_definition_pb2.py +38 -38
- flyte/_protos/workflow/run_definition_pb2.pyi +4 -2
- flyte/_protos/workflow/run_service_pb2.py +60 -50
- flyte/_protos/workflow/run_service_pb2.pyi +24 -6
- flyte/_protos/workflow/run_service_pb2_grpc.py +34 -0
- flyte/_protos/workflow/task_definition_pb2.py +15 -11
- flyte/_protos/workflow/task_definition_pb2.pyi +19 -2
- flyte/_protos/workflow/task_service_pb2.py +18 -17
- flyte/_protos/workflow/task_service_pb2.pyi +5 -2
- flyte/_protos/workflow/trigger_definition_pb2.py +66 -0
- flyte/_protos/workflow/trigger_definition_pb2.pyi +117 -0
- flyte/_protos/workflow/trigger_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/trigger_service_pb2.py +96 -0
- flyte/_protos/workflow/trigger_service_pb2.pyi +110 -0
- flyte/_protos/workflow/trigger_service_pb2_grpc.py +281 -0
- flyte/_run.py +42 -15
- flyte/_task.py +36 -4
- flyte/_task_environment.py +62 -15
- flyte/_trigger.py +382 -0
- flyte/_version.py +3 -3
- flyte/cli/_abort.py +3 -3
- flyte/cli/_build.py +1 -3
- flyte/cli/_common.py +29 -2
- flyte/cli/_create.py +74 -0
- flyte/cli/_delete.py +23 -1
- flyte/cli/_deploy.py +13 -9
- flyte/cli/_get.py +75 -34
- flyte/cli/_params.py +4 -2
- flyte/cli/_run.py +27 -22
- flyte/cli/_update.py +36 -0
- flyte/cli/_user.py +17 -0
- flyte/cli/main.py +9 -1
- flyte/errors.py +9 -0
- flyte/extend.py +4 -0
- flyte/io/_dir.py +513 -115
- flyte/io/_file.py +495 -135
- flyte/models.py +32 -0
- flyte/remote/__init__.py +6 -1
- flyte/remote/_client/_protocols.py +36 -2
- flyte/remote/_client/controlplane.py +19 -3
- flyte/remote/_run.py +42 -2
- flyte/remote/_task.py +14 -1
- flyte/remote/_trigger.py +308 -0
- flyte/remote/_user.py +33 -0
- flyte/storage/__init__.py +6 -1
- flyte/storage/_storage.py +119 -101
- flyte/types/_pickle.py +16 -3
- {flyte-2.0.0b22.data → flyte-2.0.0b24.data}/scripts/runtime.py +35 -5
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/METADATA +3 -1
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/RECORD +89 -77
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- {flyte-2.0.0b22.data → flyte-2.0.0b24.data}/scripts/debug.py +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/WHEEL +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/entry_points.txt +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/licenses/LICENSE +0 -0
- {flyte-2.0.0b22.dist-info → flyte-2.0.0b24.dist-info}/top_level.txt +0 -0
flyte/io/_dir.py
CHANGED
|
@@ -20,28 +20,176 @@ T = TypeVar("T")
|
|
|
20
20
|
class Dir(BaseModel, Generic[T], SerializableType):
|
|
21
21
|
"""
|
|
22
22
|
A generic directory class representing a directory with files of a specified format.
|
|
23
|
-
Provides both async and sync interfaces for directory operations.
|
|
24
|
-
|
|
25
|
-
|
|
23
|
+
Provides both async and sync interfaces for directory operations. Methods with a `_sync` suffix are synchronous; their unsuffixed counterparts are async.
|
|
24
|
+
|
|
25
|
+
The class should be instantiated using one of the class methods. The constructor should only be used to
|
|
26
|
+
instantiate references to existing remote directories.
|
|
26
27
|
|
|
27
28
|
The generic type T represents the format of the files in the directory.
|
|
28
29
|
|
|
29
|
-
|
|
30
|
+
Important methods:
|
|
31
|
+
- `from_existing_remote`: Create a Dir object referencing an existing remote directory.
|
|
32
|
+
- `from_local` / `from_local_sync`: Upload a local directory to remote storage.
|
|
33
|
+
|
|
34
|
+
**Asynchronous methods**:
|
|
35
|
+
- `walk`: Asynchronously iterate through files in the directory.
|
|
36
|
+
- `list_files`: Asynchronously get a list of all files (non-recursive).
|
|
37
|
+
- `download`: Asynchronously download the entire directory to a local path.
|
|
38
|
+
- `exists`: Asynchronously check if the directory exists.
|
|
39
|
+
- `get_file`: Asynchronously get a specific file from the directory by name.
|
|
40
|
+
|
|
41
|
+
**Synchronous methods** (suffixed with `_sync`):
|
|
42
|
+
- `walk_sync`: Synchronously iterate through files in the directory.
|
|
43
|
+
- `list_files_sync`: Synchronously get a list of all files (non-recursive).
|
|
44
|
+
- `download_sync`: Synchronously download the entire directory to a local path.
|
|
45
|
+
- `exists_sync`: Synchronously check if the directory exists.
|
|
46
|
+
- `get_file_sync`: Synchronously get a specific file from the directory by name.
|
|
47
|
+
|
|
48
|
+
Example: Walk through directory files recursively (Async).
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
@env.task
|
|
52
|
+
async def process_all_files(d: Dir) -> int:
|
|
53
|
+
file_count = 0
|
|
54
|
+
async for file in d.walk(recursive=True):
|
|
55
|
+
async with file.open("rb") as f:
|
|
56
|
+
content = await f.read()
|
|
57
|
+
# Process content
|
|
58
|
+
file_count += 1
|
|
59
|
+
return file_count
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Example: Walk through directory files recursively (Sync).
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
@env.task
|
|
66
|
+
def process_all_files_sync(d: Dir) -> int:
|
|
67
|
+
file_count = 0
|
|
68
|
+
for file in d.walk_sync(recursive=True):
|
|
69
|
+
with file.open_sync("rb") as f:
|
|
70
|
+
content = f.read()
|
|
71
|
+
# Process content
|
|
72
|
+
file_count += 1
|
|
73
|
+
return file_count
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Example: List files in directory (Async).
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
@env.task
|
|
80
|
+
async def count_files(d: Dir) -> int:
|
|
81
|
+
files = await d.list_files()
|
|
82
|
+
return len(files)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Example: List files in directory (Sync).
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
@env.task
|
|
89
|
+
def count_files_sync(d: Dir) -> int:
|
|
90
|
+
files = d.list_files_sync()
|
|
91
|
+
return len(files)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Example: Get a specific file from directory (Async).
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
@env.task
|
|
98
|
+
async def read_config_file(d: Dir) -> str:
|
|
99
|
+
config_file = await d.get_file("config.json")
|
|
100
|
+
if config_file:
|
|
101
|
+
async with config_file.open("rb") as f:
|
|
102
|
+
return (await f.read()).decode("utf-8")
|
|
103
|
+
return "Config not found"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Example: Get a specific file from directory (Sync).
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
@env.task
|
|
110
|
+
def read_config_file_sync(d: Dir) -> str:
|
|
111
|
+
config_file = d.get_file_sync("config.json")
|
|
112
|
+
if config_file:
|
|
113
|
+
with config_file.open_sync("rb") as f:
|
|
114
|
+
return f.read().decode("utf-8")
|
|
115
|
+
return "Config not found"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Example: Upload a local directory to remote storage (Async).
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
@env.task
|
|
122
|
+
async def upload_directory() -> Dir:
|
|
123
|
+
# Create local directory with files
|
|
124
|
+
os.makedirs("/tmp/my_data", exist_ok=True)
|
|
125
|
+
with open("/tmp/my_data/file1.txt", "w") as f:
|
|
126
|
+
f.write("data1")
|
|
127
|
+
# Upload to remote storage
|
|
128
|
+
return await Dir.from_local("/tmp/my_data/")
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Example: Upload a local directory to remote storage (Sync).
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
@env.task
|
|
135
|
+
def upload_directory_sync() -> Dir:
|
|
136
|
+
# Create local directory with files
|
|
137
|
+
os.makedirs("/tmp/my_data", exist_ok=True)
|
|
138
|
+
with open("/tmp/my_data/file1.txt", "w") as f:
|
|
139
|
+
f.write("data1")
|
|
140
|
+
# Upload to remote storage
|
|
141
|
+
return Dir.from_local_sync("/tmp/my_data/")
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Example: Download a directory to local storage (Async).
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
@env.task
|
|
148
|
+
async def download_directory(d: Dir) -> str:
|
|
149
|
+
local_path = await d.download()
|
|
150
|
+
# Process files in local directory
|
|
151
|
+
return local_path
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Example: Download a directory to local storage (Sync).
|
|
155
|
+
|
|
30
156
|
```python
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
async for file in data_dir.walk():
|
|
37
|
-
async with file.open() as f:
|
|
38
|
-
content = await f.read()
|
|
39
|
-
|
|
40
|
-
# Sync alternative
|
|
41
|
-
for file in data_dir.walk_sync():
|
|
42
|
-
with file.open_sync() as f:
|
|
43
|
-
content = f.read()
|
|
157
|
+
@env.task
|
|
158
|
+
def download_directory_sync(d: Dir) -> str:
|
|
159
|
+
local_path = d.download_sync()
|
|
160
|
+
# Process files in local directory
|
|
161
|
+
return local_path
|
|
44
162
|
```
|
|
163
|
+
|
|
164
|
+
Example: Reference an existing remote directory.
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
@env.task
|
|
168
|
+
async def process_existing_dir() -> int:
|
|
169
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/")
|
|
170
|
+
files = await d.list_files()
|
|
171
|
+
return len(files)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Example: Check if directory exists (Async).
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
@env.task
|
|
178
|
+
async def check_directory(d: Dir) -> bool:
|
|
179
|
+
return await d.exists()
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Example: Check if directory exists (Sync).
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
@env.task
|
|
186
|
+
def check_directory_sync(d: Dir) -> bool:
|
|
187
|
+
return d.exists_sync()
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
path: The path to the directory (can be local or remote)
|
|
192
|
+
name: Optional name for the directory (defaults to basename of path)
|
|
45
193
|
"""
|
|
46
194
|
|
|
47
195
|
# Represents either a local or remote path.
|
|
@@ -56,20 +204,24 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
56
204
|
@model_validator(mode="before")
|
|
57
205
|
@classmethod
|
|
58
206
|
def pre_init(cls, data):
|
|
207
|
+
"""Internal: Pydantic validator to set default name from path. Not intended for direct use."""
|
|
59
208
|
if data.get("name") is None:
|
|
60
209
|
data["name"] = Path(data["path"]).name
|
|
61
210
|
return data
|
|
62
211
|
|
|
63
212
|
def _serialize(self) -> Dict[str, Optional[str]]:
|
|
213
|
+
"""Internal: Serialize Dir to dictionary. Not intended for direct use."""
|
|
64
214
|
pyd_dump = self.model_dump()
|
|
65
215
|
return pyd_dump
|
|
66
216
|
|
|
67
217
|
@classmethod
|
|
68
218
|
def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> Dir:
|
|
219
|
+
"""Internal: Deserialize Dir from dictionary. Not intended for direct use."""
|
|
69
220
|
return cls.model_validate(file_dump)
|
|
70
221
|
|
|
71
222
|
@classmethod
|
|
72
223
|
def schema_match(cls, incoming: dict):
|
|
224
|
+
"""Internal: Check if incoming schema matches Dir schema. Not intended for direct use."""
|
|
73
225
|
this_schema = cls.model_json_schema()
|
|
74
226
|
current_required = this_schema.get("required")
|
|
75
227
|
incoming_required = incoming.get("required")
|
|
@@ -86,19 +238,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
86
238
|
"""
|
|
87
239
|
Asynchronously walk through the directory and yield File objects.
|
|
88
240
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
max_depth: Maximum depth for recursive walking
|
|
241
|
+
Use this to iterate through all files in a directory. Each yielded File can be read directly without
|
|
242
|
+
downloading.
|
|
92
243
|
|
|
93
|
-
|
|
94
|
-
|
|
244
|
+
Example (Async - Recursive):
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
@env.task
|
|
248
|
+
async def list_all_files(d: Dir) -> list[str]:
|
|
249
|
+
file_names = []
|
|
250
|
+
async for file in d.walk(recursive=True):
|
|
251
|
+
file_names.append(file.name)
|
|
252
|
+
return file_names
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
Example (Async - Non-recursive):
|
|
95
256
|
|
|
96
|
-
Example:
|
|
97
257
|
```python
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
258
|
+
@env.task
|
|
259
|
+
async def list_top_level_files(d: Dir) -> list[str]:
|
|
260
|
+
file_names = []
|
|
261
|
+
async for file in d.walk(recursive=False):
|
|
262
|
+
file_names.append(file.name)
|
|
263
|
+
return file_names
|
|
101
264
|
```
|
|
265
|
+
|
|
266
|
+
Example (Async - With max depth):
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
@env.task
|
|
270
|
+
async def list_files_max_depth(d: Dir) -> list[str]:
|
|
271
|
+
file_names = []
|
|
272
|
+
async for file in d.walk(recursive=True, max_depth=2):
|
|
273
|
+
file_names.append(file.name)
|
|
274
|
+
return file_names
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Args:
|
|
278
|
+
recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
|
|
279
|
+
max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
|
|
280
|
+
|
|
281
|
+
Yields:
|
|
282
|
+
File objects for each file found in the directory
|
|
102
283
|
"""
|
|
103
284
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
104
285
|
if recursive is False:
|
|
@@ -125,20 +306,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
125
306
|
"""
|
|
126
307
|
Synchronously walk through the directory and yield File objects.
|
|
127
308
|
|
|
128
|
-
|
|
129
|
-
recursive: If True, recursively walk subdirectories
|
|
130
|
-
file_pattern: Glob pattern to filter files
|
|
131
|
-
max_depth: Maximum depth for recursive walking
|
|
309
|
+
Use this in non-async tasks to iterate through all files in a directory.
|
|
132
310
|
|
|
133
|
-
|
|
134
|
-
|
|
311
|
+
Example (Sync - Recursive):
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
@env.task
|
|
315
|
+
def list_all_files_sync(d: Dir) -> list[str]:
|
|
316
|
+
file_names = []
|
|
317
|
+
for file in d.walk_sync(recursive=True):
|
|
318
|
+
file_names.append(file.name)
|
|
319
|
+
return file_names
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
Example (Sync - With file pattern):
|
|
323
|
+
|
|
324
|
+
```python
|
|
325
|
+
@env.task
|
|
326
|
+
def list_text_files(d: Dir) -> list[str]:
|
|
327
|
+
file_names = []
|
|
328
|
+
for file in d.walk_sync(recursive=True, file_pattern="*.txt"):
|
|
329
|
+
file_names.append(file.name)
|
|
330
|
+
return file_names
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
Example (Sync - With max depth):
|
|
135
334
|
|
|
136
|
-
Example:
|
|
137
335
|
```python
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
336
|
+
@env.task
|
|
337
|
+
def list_files_limited(d: Dir) -> list[str]:
|
|
338
|
+
file_names = []
|
|
339
|
+
for file in d.walk_sync(recursive=True, max_depth=2):
|
|
340
|
+
file_names.append(file.name)
|
|
341
|
+
return file_names
|
|
141
342
|
```
|
|
343
|
+
|
|
344
|
+
Args:
|
|
345
|
+
recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
|
|
346
|
+
file_pattern: Glob pattern to filter files (e.g., "*.txt", "*.csv"). Default is "*" (all files).
|
|
347
|
+
max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
|
|
348
|
+
|
|
349
|
+
Yields:
|
|
350
|
+
File objects for each file found in the directory
|
|
142
351
|
"""
|
|
143
352
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
144
353
|
for parent, _, files in fs.walk(self.path, maxdepth=max_depth):
|
|
@@ -153,14 +362,32 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
153
362
|
"""
|
|
154
363
|
Asynchronously get a list of all files in the directory (non-recursive).
|
|
155
364
|
|
|
365
|
+
Use this when you need a list of all files in the top-level directory at once.
|
|
366
|
+
|
|
156
367
|
Returns:
|
|
157
|
-
A list of File objects
|
|
368
|
+
A list of File objects for files in the top-level directory
|
|
369
|
+
|
|
370
|
+
Example (Async):
|
|
158
371
|
|
|
159
|
-
Example:
|
|
160
372
|
```python
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
373
|
+
@env.task
|
|
374
|
+
async def count_files(d: Dir) -> int:
|
|
375
|
+
files = await d.list_files()
|
|
376
|
+
return len(files)
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
Example (Async - Process files):
|
|
380
|
+
|
|
381
|
+
```python
|
|
382
|
+
@env.task
|
|
383
|
+
async def process_all_files(d: Dir) -> list[str]:
|
|
384
|
+
files = await d.list_files()
|
|
385
|
+
contents = []
|
|
386
|
+
for file in files:
|
|
387
|
+
async with file.open("rb") as f:
|
|
388
|
+
content = await f.read()
|
|
389
|
+
contents.append(content.decode("utf-8"))
|
|
390
|
+
return contents
|
|
164
391
|
```
|
|
165
392
|
"""
|
|
166
393
|
# todo: this should probably also just defer to fsspec.find()
|
|
@@ -173,14 +400,32 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
173
400
|
"""
|
|
174
401
|
Synchronously get a list of all files in the directory (non-recursive).
|
|
175
402
|
|
|
403
|
+
Use this in non-async tasks when you need a list of all files in the top-level directory at once.
|
|
404
|
+
|
|
176
405
|
Returns:
|
|
177
|
-
A list of File objects
|
|
406
|
+
A list of File objects for files in the top-level directory
|
|
407
|
+
|
|
408
|
+
Example (Sync):
|
|
178
409
|
|
|
179
|
-
Example:
|
|
180
410
|
```python
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
411
|
+
@env.task
|
|
412
|
+
def count_files_sync(d: Dir) -> int:
|
|
413
|
+
files = d.list_files_sync()
|
|
414
|
+
return len(files)
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
Example (Sync - Process files):
|
|
418
|
+
|
|
419
|
+
```python
|
|
420
|
+
@env.task
|
|
421
|
+
def process_all_files_sync(d: Dir) -> list[str]:
|
|
422
|
+
files = d.list_files_sync()
|
|
423
|
+
contents = []
|
|
424
|
+
for file in files:
|
|
425
|
+
with file.open_sync("rb") as f:
|
|
426
|
+
content = f.read()
|
|
427
|
+
contents.append(content.decode("utf-8"))
|
|
428
|
+
return contents
|
|
184
429
|
```
|
|
185
430
|
"""
|
|
186
431
|
return list(self.walk_sync(recursive=False))
|
|
@@ -189,17 +434,33 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
189
434
|
"""
|
|
190
435
|
Asynchronously download the entire directory to a local path.
|
|
191
436
|
|
|
192
|
-
|
|
193
|
-
local_path: The local path to download the directory to. If None, a temporary
|
|
194
|
-
directory will be used.
|
|
437
|
+
Use this when you need to download all files in a directory to your local filesystem for processing.
|
|
195
438
|
|
|
196
|
-
|
|
197
|
-
The path to the downloaded directory
|
|
439
|
+
Example (Async):
|
|
198
440
|
|
|
199
|
-
Example:
|
|
200
441
|
```python
|
|
201
|
-
|
|
442
|
+
@env.task
|
|
443
|
+
async def download_directory(d: Dir) -> str:
|
|
444
|
+
local_dir = await d.download()
|
|
445
|
+
# Process files in the local directory
|
|
446
|
+
return local_dir
|
|
202
447
|
```
|
|
448
|
+
|
|
449
|
+
Example (Async - Download to specific path):
|
|
450
|
+
|
|
451
|
+
```python
|
|
452
|
+
@env.task
|
|
453
|
+
async def download_to_path(d: Dir) -> str:
|
|
454
|
+
local_dir = await d.download("/tmp/my_data/")
|
|
455
|
+
return local_dir
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
Args:
|
|
459
|
+
local_path: The local path to download the directory to. If None, a temporary
|
|
460
|
+
directory will be used and a path will be generated.
|
|
461
|
+
|
|
462
|
+
Returns:
|
|
463
|
+
The absolute path to the downloaded directory
|
|
203
464
|
"""
|
|
204
465
|
local_dest = str(local_path) if local_path else str(storage.get_random_local_path())
|
|
205
466
|
if not storage.is_remote(self.path):
|
|
@@ -222,17 +483,32 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
222
483
|
"""
|
|
223
484
|
Synchronously download the entire directory to a local path.
|
|
224
485
|
|
|
225
|
-
|
|
226
|
-
local_path: The local path to download the directory to. If None, a temporary
|
|
227
|
-
directory will be used.
|
|
486
|
+
Use this in non-async tasks when you need to download all files in a directory to your local filesystem.
|
|
228
487
|
|
|
229
|
-
|
|
230
|
-
The path to the downloaded directory
|
|
488
|
+
Example (Sync):
|
|
231
489
|
|
|
232
|
-
Example:
|
|
233
490
|
```python
|
|
234
|
-
|
|
491
|
+
@env.task
|
|
492
|
+
def download_directory_sync(d: Dir) -> str:
|
|
493
|
+
local_dir = d.download_sync()
|
|
494
|
+
# Process files in the local directory
|
|
495
|
+
return local_dir
|
|
235
496
|
```
|
|
497
|
+
|
|
498
|
+
Example (Sync - Download to specific path):
|
|
499
|
+
|
|
500
|
+
```python
|
|
501
|
+
@env.task
|
|
502
|
+
def download_to_path_sync(d: Dir) -> str:
|
|
503
|
+
local_dir = d.download_sync("/tmp/my_data/")
|
|
504
|
+
return local_dir
|
|
505
|
+
```
|
|
506
|
+
Args:
|
|
507
|
+
local_path: The local path to download the directory to. If None, a temporary
|
|
508
|
+
directory will be used and a path will be generated.
|
|
509
|
+
|
|
510
|
+
Returns:
|
|
511
|
+
The absolute path to the downloaded directory
|
|
236
512
|
"""
|
|
237
513
|
local_dest = str(local_path) if local_path else str(storage.get_random_local_path())
|
|
238
514
|
if not storage.is_remote(self.path):
|
|
@@ -245,8 +521,9 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
245
521
|
|
|
246
522
|
shutil.copytree(self.path, local_dest, dirs_exist_ok=True)
|
|
247
523
|
|
|
248
|
-
|
|
249
|
-
|
|
524
|
+
fs = storage.get_underlying_filesystem(path=self.path)
|
|
525
|
+
fs.get(self.path, local_dest, recursive=True)
|
|
526
|
+
return local_dest
|
|
250
527
|
|
|
251
528
|
@classmethod
|
|
252
529
|
async def from_local(
|
|
@@ -256,23 +533,51 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
256
533
|
dir_cache_key: Optional[str] = None,
|
|
257
534
|
) -> Dir[T]:
|
|
258
535
|
"""
|
|
259
|
-
Asynchronously create a new Dir by uploading a local directory to
|
|
536
|
+
Asynchronously create a new Dir by uploading a local directory to remote storage.
|
|
537
|
+
|
|
538
|
+
Use this in async tasks when you have a local directory that needs to be uploaded to remote storage.
|
|
539
|
+
|
|
540
|
+
Example (Async):
|
|
541
|
+
|
|
542
|
+
```python
|
|
543
|
+
@env.task
|
|
544
|
+
async def upload_local_directory() -> Dir:
|
|
545
|
+
# Create a local directory with files
|
|
546
|
+
os.makedirs("/tmp/data_dir", exist_ok=True)
|
|
547
|
+
with open("/tmp/data_dir/file1.txt", "w") as f:
|
|
548
|
+
f.write("data1")
|
|
549
|
+
|
|
550
|
+
# Upload to remote storage
|
|
551
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/")
|
|
552
|
+
return remote_dir
|
|
553
|
+
```
|
|
260
554
|
|
|
555
|
+
Example (Async - With specific destination):
|
|
556
|
+
|
|
557
|
+
```python
|
|
558
|
+
@env.task
|
|
559
|
+
async def upload_to_specific_path() -> Dir:
|
|
560
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/", "s3://my-bucket/data/")
|
|
561
|
+
return remote_dir
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
Example (Async - With cache key):
|
|
565
|
+
|
|
566
|
+
```python
|
|
567
|
+
@env.task
|
|
568
|
+
async def upload_with_cache_key() -> Dir:
|
|
569
|
+
remote_dir = await Dir.from_local("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
|
|
570
|
+
return remote_dir
|
|
571
|
+
```
|
|
261
572
|
Args:
|
|
262
573
|
local_path: Path to the local directory
|
|
263
|
-
remote_path: Optional path to store the directory
|
|
264
|
-
dir_cache_key:
|
|
265
|
-
|
|
574
|
+
remote_path: Optional remote path to store the directory. If None, a path will be automatically generated.
|
|
575
|
+
dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
|
|
576
|
+
as an input to discoverable tasks. If not specified, the cache key will be based on
|
|
577
|
+
directory attributes.
|
|
266
578
|
|
|
267
579
|
Returns:
|
|
268
580
|
A new Dir instance pointing to the uploaded directory
|
|
269
|
-
|
|
270
|
-
Example:
|
|
271
|
-
```python
|
|
272
|
-
remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/')
|
|
273
|
-
# With a known hash value you want to use for cache key calculation
|
|
274
|
-
remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/', dir_cache_key='abc123')
|
|
275
|
-
```
|
|
276
581
|
"""
|
|
277
582
|
local_path_str = str(local_path)
|
|
278
583
|
dirname = os.path.basename(os.path.normpath(local_path_str))
|
|
@@ -281,43 +586,108 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
281
586
|
return cls(path=output_path, name=dirname, hash=dir_cache_key)
|
|
282
587
|
|
|
283
588
|
@classmethod
|
|
284
|
-
def
|
|
589
|
+
def from_local_sync(
|
|
590
|
+
cls,
|
|
591
|
+
local_path: Union[str, Path],
|
|
592
|
+
remote_path: Optional[str] = None,
|
|
593
|
+
dir_cache_key: Optional[str] = None,
|
|
594
|
+
) -> Dir[T]:
|
|
285
595
|
"""
|
|
286
|
-
|
|
596
|
+
Synchronously create a new Dir by uploading a local directory to remote storage.
|
|
287
597
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
the cache key will be computed based on this object's attributes.
|
|
598
|
+
Use this in non-async tasks when you have a local directory that needs to be uploaded to remote storage.
|
|
599
|
+
|
|
600
|
+
Example (Sync):
|
|
292
601
|
|
|
293
|
-
Example:
|
|
294
602
|
```python
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
603
|
+
@env.task
|
|
604
|
+
def upload_local_directory_sync() -> Dir:
|
|
605
|
+
# Create a local directory with files
|
|
606
|
+
os.makedirs("/tmp/data_dir", exist_ok=True)
|
|
607
|
+
with open("/tmp/data_dir/file1.txt", "w") as f:
|
|
608
|
+
f.write("data1")
|
|
609
|
+
|
|
610
|
+
# Upload to remote storage
|
|
611
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/")
|
|
612
|
+
return remote_dir
|
|
298
613
|
```
|
|
299
|
-
"""
|
|
300
|
-
return cls(path=remote_path, hash=dir_cache_key)
|
|
301
614
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
615
|
+
Example (Sync - With specific destination):
|
|
616
|
+
|
|
617
|
+
```python
|
|
618
|
+
@env.task
|
|
619
|
+
def upload_to_specific_path_sync() -> Dir:
|
|
620
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/", "s3://my-bucket/data/")
|
|
621
|
+
return remote_dir
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
Example (Sync - With cache key):
|
|
625
|
+
|
|
626
|
+
```python
|
|
627
|
+
@env.task
|
|
628
|
+
def upload_with_cache_key_sync() -> Dir:
|
|
629
|
+
remote_dir = Dir.from_local_sync("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
|
|
630
|
+
return remote_dir
|
|
631
|
+
```
|
|
306
632
|
|
|
307
633
|
Args:
|
|
308
634
|
local_path: Path to the local directory
|
|
309
|
-
remote_path: Optional path to store the directory
|
|
635
|
+
remote_path: Optional remote path to store the directory. If None, a path will be automatically generated.
|
|
636
|
+
dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
|
|
637
|
+
as an input to discoverable tasks. If not specified, the cache key will be based on
|
|
638
|
+
directory attributes.
|
|
310
639
|
|
|
311
640
|
Returns:
|
|
312
641
|
A new Dir instance pointing to the uploaded directory
|
|
642
|
+
"""
|
|
643
|
+
local_path_str = str(local_path)
|
|
644
|
+
dirname = os.path.basename(os.path.normpath(local_path_str))
|
|
645
|
+
|
|
646
|
+
if not remote_path:
|
|
647
|
+
from flyte._context import internal_ctx
|
|
648
|
+
|
|
649
|
+
ctx = internal_ctx()
|
|
650
|
+
remote_path = ctx.raw_data.get_random_remote_path(dirname)
|
|
651
|
+
fs = storage.get_underlying_filesystem(path=remote_path)
|
|
652
|
+
fs.put(local_path_str, remote_path, recursive=True)
|
|
653
|
+
return cls(path=remote_path, name=dirname, hash=dir_cache_key)
|
|
654
|
+
|
|
655
|
+
@classmethod
|
|
656
|
+
def from_existing_remote(cls, remote_path: str, dir_cache_key: Optional[str] = None) -> Dir[T]:
|
|
657
|
+
"""
|
|
658
|
+
Create a Dir reference from an existing remote directory.
|
|
659
|
+
|
|
660
|
+
Use this when you want to reference a directory that already exists in remote storage without uploading it.
|
|
313
661
|
|
|
314
662
|
Example:
|
|
663
|
+
|
|
315
664
|
```python
|
|
316
|
-
|
|
665
|
+
@env.task
|
|
666
|
+
async def process_existing_directory() -> int:
|
|
667
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/")
|
|
668
|
+
files = await d.list_files()
|
|
669
|
+
return len(files)
|
|
317
670
|
```
|
|
671
|
+
|
|
672
|
+
Example (With cache key):
|
|
673
|
+
|
|
674
|
+
```python
|
|
675
|
+
@env.task
|
|
676
|
+
async def process_with_cache_key() -> int:
|
|
677
|
+
d = Dir.from_existing_remote("s3://my-bucket/data/", dir_cache_key="abc123")
|
|
678
|
+
files = await d.list_files()
|
|
679
|
+
return len(files)
|
|
680
|
+
```
|
|
681
|
+
|
|
682
|
+
Args:
|
|
683
|
+
remote_path: The remote path to the existing directory
|
|
684
|
+
dir_cache_key: Optional hash value to use for cache key computation. If not specified,
|
|
685
|
+
the cache key will be computed based on the directory's attributes.
|
|
686
|
+
|
|
687
|
+
Returns:
|
|
688
|
+
A new Dir instance pointing to the existing remote directory
|
|
318
689
|
"""
|
|
319
|
-
|
|
320
|
-
raise NotImplementedError("Sync upload is not implemented for remote paths")
|
|
690
|
+
return cls(path=remote_path, hash=dir_cache_key)
|
|
321
691
|
|
|
322
692
|
async def exists(self) -> bool:
|
|
323
693
|
"""
|
|
@@ -326,10 +696,15 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
326
696
|
Returns:
|
|
327
697
|
True if the directory exists, False otherwise
|
|
328
698
|
|
|
329
|
-
Example:
|
|
699
|
+
Example (Async):
|
|
700
|
+
|
|
330
701
|
```python
|
|
331
|
-
|
|
332
|
-
|
|
702
|
+
@env.task
|
|
703
|
+
async def check_directory(d: Dir) -> bool:
|
|
704
|
+
if await d.exists():
|
|
705
|
+
print("Directory exists!")
|
|
706
|
+
return True
|
|
707
|
+
return False
|
|
333
708
|
```
|
|
334
709
|
"""
|
|
335
710
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
@@ -342,13 +717,20 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
342
717
|
"""
|
|
343
718
|
Synchronously check if the directory exists.
|
|
344
719
|
|
|
720
|
+
Use this in non-async tasks or when you need synchronous directory existence checking.
|
|
721
|
+
|
|
345
722
|
Returns:
|
|
346
723
|
True if the directory exists, False otherwise
|
|
347
724
|
|
|
348
|
-
Example:
|
|
725
|
+
Example (Sync):
|
|
726
|
+
|
|
349
727
|
```python
|
|
350
|
-
|
|
351
|
-
|
|
728
|
+
@env.task
|
|
729
|
+
def check_directory_sync(d: Dir) -> bool:
|
|
730
|
+
if d.exists_sync():
|
|
731
|
+
print("Directory exists!")
|
|
732
|
+
return True
|
|
733
|
+
return False
|
|
352
734
|
```
|
|
353
735
|
"""
|
|
354
736
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
@@ -356,20 +738,28 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
356
738
|
|
|
357
739
|
async def get_file(self, file_name: str) -> Optional[File[T]]:
|
|
358
740
|
"""
|
|
359
|
-
Asynchronously get a specific file from the directory.
|
|
741
|
+
Asynchronously get a specific file from the directory by name.
|
|
742
|
+
|
|
743
|
+
Use this when you know the name of a specific file in the directory you want to access.
|
|
744
|
+
|
|
745
|
+
Example (Async):
|
|
746
|
+
|
|
747
|
+
```python
|
|
748
|
+
@env.task
|
|
749
|
+
async def read_specific_file(d: Dir) -> str:
|
|
750
|
+
file = await d.get_file("data.csv")
|
|
751
|
+
if file:
|
|
752
|
+
async with file.open("rb") as f:
|
|
753
|
+
content = await f.read()
|
|
754
|
+
return content.decode("utf-8")
|
|
755
|
+
return "File not found"
|
|
756
|
+
```
|
|
360
757
|
|
|
361
758
|
Args:
|
|
362
759
|
file_name: The name of the file to get
|
|
363
760
|
|
|
364
761
|
Returns:
|
|
365
762
|
A File instance if the file exists, None otherwise
|
|
366
|
-
|
|
367
|
-
Example:
|
|
368
|
-
```python
|
|
369
|
-
file = await directory.get_file("data.csv")
|
|
370
|
-
if file:
|
|
371
|
-
# Process the file
|
|
372
|
-
```
|
|
373
763
|
"""
|
|
374
764
|
fs = storage.get_underlying_filesystem(path=self.path)
|
|
375
765
|
file_path = fs.sep.join([self.path, file_name])
|
|
@@ -381,20 +771,28 @@ class Dir(BaseModel, Generic[T], SerializableType):
|
|
|
381
771
|
|
|
382
772
|
def get_file_sync(self, file_name: str) -> Optional[File[T]]:
|
|
383
773
|
"""
|
|
384
|
-
Synchronously get a specific file from the directory.
|
|
774
|
+
Synchronously get a specific file from the directory by name.
|
|
775
|
+
|
|
776
|
+
Use this in non-async tasks when you know the name of a specific file in the directory you want to access.
|
|
777
|
+
|
|
778
|
+
Example (Sync):
|
|
779
|
+
|
|
780
|
+
```python
|
|
781
|
+
@env.task
|
|
782
|
+
def read_specific_file_sync(d: Dir) -> str:
|
|
783
|
+
file = d.get_file_sync("data.csv")
|
|
784
|
+
if file:
|
|
785
|
+
with file.open_sync("rb") as f:
|
|
786
|
+
content = f.read()
|
|
787
|
+
return content.decode("utf-8")
|
|
788
|
+
return "File not found"
|
|
789
|
+
```
|
|
385
790
|
|
|
386
791
|
Args:
|
|
387
792
|
file_name: The name of the file to get
|
|
388
793
|
|
|
389
794
|
Returns:
|
|
390
795
|
A File instance if the file exists, None otherwise
|
|
391
|
-
|
|
392
|
-
Example:
|
|
393
|
-
```python
|
|
394
|
-
file = directory.get_file_sync("data.csv")
|
|
395
|
-
if file:
|
|
396
|
-
# Process the file
|
|
397
|
-
```
|
|
398
796
|
"""
|
|
399
797
|
file_path = os.path.join(self.path, file_name)
|
|
400
798
|
file = File[T](path=file_path)
|