flyte 2.0.0b13__py3-none-any.whl → 2.0.0b30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. flyte/__init__.py +18 -2
  2. flyte/_bin/debug.py +38 -0
  3. flyte/_bin/runtime.py +62 -8
  4. flyte/_cache/cache.py +4 -2
  5. flyte/_cache/local_cache.py +216 -0
  6. flyte/_code_bundle/_ignore.py +12 -4
  7. flyte/_code_bundle/_packaging.py +13 -9
  8. flyte/_code_bundle/_utils.py +18 -10
  9. flyte/_code_bundle/bundle.py +17 -9
  10. flyte/_constants.py +1 -0
  11. flyte/_context.py +4 -1
  12. flyte/_custom_context.py +73 -0
  13. flyte/_debug/constants.py +38 -0
  14. flyte/_debug/utils.py +17 -0
  15. flyte/_debug/vscode.py +307 -0
  16. flyte/_deploy.py +235 -61
  17. flyte/_environment.py +20 -6
  18. flyte/_excepthook.py +1 -1
  19. flyte/_hash.py +1 -16
  20. flyte/_image.py +178 -81
  21. flyte/_initialize.py +132 -51
  22. flyte/_interface.py +39 -2
  23. flyte/_internal/controllers/__init__.py +4 -5
  24. flyte/_internal/controllers/_local_controller.py +70 -29
  25. flyte/_internal/controllers/_trace.py +1 -1
  26. flyte/_internal/controllers/remote/__init__.py +0 -2
  27. flyte/_internal/controllers/remote/_action.py +14 -16
  28. flyte/_internal/controllers/remote/_client.py +1 -1
  29. flyte/_internal/controllers/remote/_controller.py +68 -70
  30. flyte/_internal/controllers/remote/_core.py +127 -99
  31. flyte/_internal/controllers/remote/_informer.py +19 -10
  32. flyte/_internal/controllers/remote/_service_protocol.py +7 -7
  33. flyte/_internal/imagebuild/docker_builder.py +181 -69
  34. flyte/_internal/imagebuild/image_builder.py +0 -5
  35. flyte/_internal/imagebuild/remote_builder.py +155 -64
  36. flyte/_internal/imagebuild/utils.py +51 -2
  37. flyte/_internal/resolvers/_task_module.py +5 -38
  38. flyte/_internal/resolvers/default.py +2 -2
  39. flyte/_internal/runtime/convert.py +110 -21
  40. flyte/_internal/runtime/entrypoints.py +27 -1
  41. flyte/_internal/runtime/io.py +21 -8
  42. flyte/_internal/runtime/resources_serde.py +20 -6
  43. flyte/_internal/runtime/reuse.py +1 -1
  44. flyte/_internal/runtime/rusty.py +20 -5
  45. flyte/_internal/runtime/task_serde.py +34 -19
  46. flyte/_internal/runtime/taskrunner.py +22 -4
  47. flyte/_internal/runtime/trigger_serde.py +160 -0
  48. flyte/_internal/runtime/types_serde.py +1 -1
  49. flyte/_keyring/__init__.py +0 -0
  50. flyte/_keyring/file.py +115 -0
  51. flyte/_logging.py +201 -39
  52. flyte/_map.py +111 -14
  53. flyte/_module.py +70 -0
  54. flyte/_pod.py +4 -3
  55. flyte/_resources.py +213 -31
  56. flyte/_run.py +110 -39
  57. flyte/_task.py +75 -16
  58. flyte/_task_environment.py +105 -29
  59. flyte/_task_plugins.py +4 -2
  60. flyte/_trace.py +5 -0
  61. flyte/_trigger.py +1000 -0
  62. flyte/_utils/__init__.py +2 -1
  63. flyte/_utils/asyn.py +3 -1
  64. flyte/_utils/coro_management.py +2 -1
  65. flyte/_utils/docker_credentials.py +173 -0
  66. flyte/_utils/module_loader.py +17 -2
  67. flyte/_version.py +3 -3
  68. flyte/cli/_abort.py +3 -3
  69. flyte/cli/_build.py +3 -6
  70. flyte/cli/_common.py +78 -7
  71. flyte/cli/_create.py +182 -4
  72. flyte/cli/_delete.py +23 -1
  73. flyte/cli/_deploy.py +63 -16
  74. flyte/cli/_get.py +79 -34
  75. flyte/cli/_params.py +26 -10
  76. flyte/cli/_plugins.py +209 -0
  77. flyte/cli/_run.py +151 -26
  78. flyte/cli/_serve.py +64 -0
  79. flyte/cli/_update.py +37 -0
  80. flyte/cli/_user.py +17 -0
  81. flyte/cli/main.py +30 -4
  82. flyte/config/_config.py +10 -6
  83. flyte/config/_internal.py +1 -0
  84. flyte/config/_reader.py +29 -8
  85. flyte/connectors/__init__.py +11 -0
  86. flyte/connectors/_connector.py +270 -0
  87. flyte/connectors/_server.py +197 -0
  88. flyte/connectors/utils.py +135 -0
  89. flyte/errors.py +22 -2
  90. flyte/extend.py +8 -1
  91. flyte/extras/_container.py +6 -1
  92. flyte/git/__init__.py +3 -0
  93. flyte/git/_config.py +21 -0
  94. flyte/io/__init__.py +2 -0
  95. flyte/io/_dataframe/__init__.py +2 -0
  96. flyte/io/_dataframe/basic_dfs.py +17 -8
  97. flyte/io/_dataframe/dataframe.py +98 -132
  98. flyte/io/_dir.py +575 -113
  99. flyte/io/_file.py +582 -139
  100. flyte/io/_hashing_io.py +342 -0
  101. flyte/models.py +74 -15
  102. flyte/remote/__init__.py +6 -1
  103. flyte/remote/_action.py +34 -26
  104. flyte/remote/_client/_protocols.py +39 -4
  105. flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
  106. flyte/remote/_client/auth/_authenticators/pkce.py +1 -1
  107. flyte/remote/_client/auth/_channel.py +10 -6
  108. flyte/remote/_client/controlplane.py +17 -5
  109. flyte/remote/_console.py +3 -2
  110. flyte/remote/_data.py +6 -6
  111. flyte/remote/_logs.py +3 -3
  112. flyte/remote/_run.py +64 -8
  113. flyte/remote/_secret.py +26 -17
  114. flyte/remote/_task.py +75 -33
  115. flyte/remote/_trigger.py +306 -0
  116. flyte/remote/_user.py +33 -0
  117. flyte/report/_report.py +1 -1
  118. flyte/storage/__init__.py +6 -1
  119. flyte/storage/_config.py +5 -1
  120. flyte/storage/_parallel_reader.py +274 -0
  121. flyte/storage/_storage.py +200 -103
  122. flyte/types/__init__.py +16 -0
  123. flyte/types/_interface.py +2 -2
  124. flyte/types/_pickle.py +35 -8
  125. flyte/types/_string_literals.py +8 -9
  126. flyte/types/_type_engine.py +40 -70
  127. flyte/types/_utils.py +1 -1
  128. flyte-2.0.0b30.data/scripts/debug.py +38 -0
  129. {flyte-2.0.0b13.data → flyte-2.0.0b30.data}/scripts/runtime.py +62 -8
  130. {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/METADATA +11 -3
  131. flyte-2.0.0b30.dist-info/RECORD +192 -0
  132. {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/entry_points.txt +3 -0
  133. flyte/_protos/common/authorization_pb2.py +0 -66
  134. flyte/_protos/common/authorization_pb2.pyi +0 -108
  135. flyte/_protos/common/authorization_pb2_grpc.py +0 -4
  136. flyte/_protos/common/identifier_pb2.py +0 -93
  137. flyte/_protos/common/identifier_pb2.pyi +0 -110
  138. flyte/_protos/common/identifier_pb2_grpc.py +0 -4
  139. flyte/_protos/common/identity_pb2.py +0 -48
  140. flyte/_protos/common/identity_pb2.pyi +0 -72
  141. flyte/_protos/common/identity_pb2_grpc.py +0 -4
  142. flyte/_protos/common/list_pb2.py +0 -36
  143. flyte/_protos/common/list_pb2.pyi +0 -71
  144. flyte/_protos/common/list_pb2_grpc.py +0 -4
  145. flyte/_protos/common/policy_pb2.py +0 -37
  146. flyte/_protos/common/policy_pb2.pyi +0 -27
  147. flyte/_protos/common/policy_pb2_grpc.py +0 -4
  148. flyte/_protos/common/role_pb2.py +0 -37
  149. flyte/_protos/common/role_pb2.pyi +0 -53
  150. flyte/_protos/common/role_pb2_grpc.py +0 -4
  151. flyte/_protos/common/runtime_version_pb2.py +0 -28
  152. flyte/_protos/common/runtime_version_pb2.pyi +0 -24
  153. flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
  154. flyte/_protos/imagebuilder/definition_pb2.py +0 -59
  155. flyte/_protos/imagebuilder/definition_pb2.pyi +0 -140
  156. flyte/_protos/imagebuilder/definition_pb2_grpc.py +0 -4
  157. flyte/_protos/imagebuilder/payload_pb2.py +0 -32
  158. flyte/_protos/imagebuilder/payload_pb2.pyi +0 -21
  159. flyte/_protos/imagebuilder/payload_pb2_grpc.py +0 -4
  160. flyte/_protos/imagebuilder/service_pb2.py +0 -29
  161. flyte/_protos/imagebuilder/service_pb2.pyi +0 -5
  162. flyte/_protos/imagebuilder/service_pb2_grpc.py +0 -66
  163. flyte/_protos/logs/dataplane/payload_pb2.py +0 -100
  164. flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -177
  165. flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
  166. flyte/_protos/secret/definition_pb2.py +0 -49
  167. flyte/_protos/secret/definition_pb2.pyi +0 -93
  168. flyte/_protos/secret/definition_pb2_grpc.py +0 -4
  169. flyte/_protos/secret/payload_pb2.py +0 -62
  170. flyte/_protos/secret/payload_pb2.pyi +0 -94
  171. flyte/_protos/secret/payload_pb2_grpc.py +0 -4
  172. flyte/_protos/secret/secret_pb2.py +0 -38
  173. flyte/_protos/secret/secret_pb2.pyi +0 -6
  174. flyte/_protos/secret/secret_pb2_grpc.py +0 -198
  175. flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
  176. flyte/_protos/validate/validate/validate_pb2.py +0 -76
  177. flyte/_protos/workflow/common_pb2.py +0 -27
  178. flyte/_protos/workflow/common_pb2.pyi +0 -14
  179. flyte/_protos/workflow/common_pb2_grpc.py +0 -4
  180. flyte/_protos/workflow/environment_pb2.py +0 -29
  181. flyte/_protos/workflow/environment_pb2.pyi +0 -12
  182. flyte/_protos/workflow/environment_pb2_grpc.py +0 -4
  183. flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
  184. flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
  185. flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
  186. flyte/_protos/workflow/queue_service_pb2.py +0 -109
  187. flyte/_protos/workflow/queue_service_pb2.pyi +0 -166
  188. flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
  189. flyte/_protos/workflow/run_definition_pb2.py +0 -121
  190. flyte/_protos/workflow/run_definition_pb2.pyi +0 -327
  191. flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
  192. flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
  193. flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
  194. flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
  195. flyte/_protos/workflow/run_service_pb2.py +0 -137
  196. flyte/_protos/workflow/run_service_pb2.pyi +0 -185
  197. flyte/_protos/workflow/run_service_pb2_grpc.py +0 -446
  198. flyte/_protos/workflow/state_service_pb2.py +0 -67
  199. flyte/_protos/workflow/state_service_pb2.pyi +0 -76
  200. flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
  201. flyte/_protos/workflow/task_definition_pb2.py +0 -79
  202. flyte/_protos/workflow/task_definition_pb2.pyi +0 -81
  203. flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
  204. flyte/_protos/workflow/task_service_pb2.py +0 -60
  205. flyte/_protos/workflow/task_service_pb2.pyi +0 -59
  206. flyte/_protos/workflow/task_service_pb2_grpc.py +0 -138
  207. flyte-2.0.0b13.dist-info/RECORD +0 -239
  208. /flyte/{_protos → _debug}/__init__.py +0 -0
  209. {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/WHEEL +0 -0
  210. {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/licenses/LICENSE +0 -0
  211. {flyte-2.0.0b13.dist-info → flyte-2.0.0b30.dist-info}/top_level.txt +0 -0
flyte/io/_dir.py CHANGED
@@ -4,12 +4,14 @@ import os
4
4
  from pathlib import Path
5
5
  from typing import AsyncIterator, Dict, Generic, Iterator, List, Optional, Type, TypeVar, Union
6
6
 
7
- from flyteidl.core import literals_pb2, types_pb2
7
+ from flyteidl2.core import literals_pb2, types_pb2
8
8
  from fsspec.asyn import AsyncFileSystem
9
+ from fsspec.utils import get_protocol
9
10
  from mashumaro.types import SerializableType
10
11
  from pydantic import BaseModel, model_validator
11
12
 
12
13
  import flyte.storage as storage
14
+ from flyte._context import internal_ctx
13
15
  from flyte.io._file import File
14
16
  from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
15
17
 
@@ -20,34 +22,183 @@ T = TypeVar("T")
20
22
  class Dir(BaseModel, Generic[T], SerializableType):
21
23
  """
22
24
  A generic directory class representing a directory with files of a specified format.
23
- Provides both async and sync interfaces for directory operations.
24
- Users are responsible for handling all I/O - the type transformer for Dir does not do any automatic uploading
25
- or downloading of files.
25
+ Provides both async and sync interfaces for directory operations. Methods that perform I/O are async unless suffixed with _sync.
26
26
 
27
- The generic type T represents the format of the files in the directory.
27
+ The class should be instantiated using one of the class methods. The constructor should only be used to
28
+ instantiate references to existing remote directories.
28
29
 
29
- Example:
30
- ```python
31
- # Async usage
32
- from pandas import DataFrame
33
- data_dir = Dir[DataFrame](path="s3://my-bucket/data/")
30
+ The generic type T represents the format of the files in the directory.
34
31
 
35
- # Walk through files
36
- async for file in data_dir.walk():
37
- async with file.open() as f:
32
+ Important methods:
33
+ - `from_existing_remote`: Create a Dir object referencing an existing remote directory.
34
+ - `from_local` / `from_local_sync`: Upload a local directory to remote storage.
35
+
36
+ **Asynchronous methods**:
37
+ - `walk`: Asynchronously iterate through files in the directory.
38
+ - `list_files`: Asynchronously get a list of all files (non-recursive).
39
+ - `download`: Asynchronously download the entire directory to a local path.
40
+ - `exists`: Asynchronously check if the directory exists.
41
+ - `get_file`: Asynchronously get a specific file from the directory by name.
42
+
43
+ **Synchronous methods** (suffixed with `_sync`):
44
+ - `walk_sync`: Synchronously iterate through files in the directory.
45
+ - `list_files_sync`: Synchronously get a list of all files (non-recursive).
46
+ - `download_sync`: Synchronously download the entire directory to a local path.
47
+ - `exists_sync`: Synchronously check if the directory exists.
48
+ - `get_file_sync`: Synchronously get a specific file from the directory by name.
49
+
50
+ Example: Walk through directory files recursively (Async).
51
+
52
+ ```python
53
+ @env.task
54
+ async def process_all_files(d: Dir) -> int:
55
+ file_count = 0
56
+ async for file in d.walk(recursive=True):
57
+ async with file.open("rb") as f:
38
58
  content = await f.read()
39
-
40
- # Sync alternative
41
- for file in data_dir.walk_sync():
42
- with file.open_sync() as f:
59
+ # Process content
60
+ file_count += 1
61
+ return file_count
62
+ ```
63
+
64
+ Example: Walk through directory files recursively (Sync).
65
+
66
+ ```python
67
+ @env.task
68
+ def process_all_files_sync(d: Dir) -> int:
69
+ file_count = 0
70
+ for file in d.walk_sync(recursive=True):
71
+ with file.open_sync("rb") as f:
43
72
  content = f.read()
44
- ```
73
+ # Process content
74
+ file_count += 1
75
+ return file_count
76
+ ```
77
+
78
+ Example: List files in directory (Async).
79
+
80
+ ```python
81
+ @env.task
82
+ async def count_files(d: Dir) -> int:
83
+ files = await d.list_files()
84
+ return len(files)
85
+ ```
86
+
87
+ Example: List files in directory (Sync).
88
+
89
+ ```python
90
+ @env.task
91
+ def count_files_sync(d: Dir) -> int:
92
+ files = d.list_files_sync()
93
+ return len(files)
94
+ ```
95
+
96
+ Example: Get a specific file from directory (Async).
97
+
98
+ ```python
99
+ @env.task
100
+ async def read_config_file(d: Dir) -> str:
101
+ config_file = await d.get_file("config.json")
102
+ if config_file:
103
+ async with config_file.open("rb") as f:
104
+ return (await f.read()).decode("utf-8")
105
+ return "Config not found"
106
+ ```
107
+
108
+ Example: Get a specific file from directory (Sync).
109
+
110
+ ```python
111
+ @env.task
112
+ def read_config_file_sync(d: Dir) -> str:
113
+ config_file = d.get_file_sync("config.json")
114
+ if config_file:
115
+ with config_file.open_sync("rb") as f:
116
+ return f.read().decode("utf-8")
117
+ return "Config not found"
118
+ ```
119
+
120
+ Example: Upload a local directory to remote storage (Async).
121
+
122
+ ```python
123
+ @env.task
124
+ async def upload_directory() -> Dir:
125
+ # Create local directory with files
126
+ os.makedirs("/tmp/my_data", exist_ok=True)
127
+ with open("/tmp/my_data/file1.txt", "w") as f:
128
+ f.write("data1")
129
+ # Upload to remote storage
130
+ return await Dir.from_local("/tmp/my_data/")
131
+ ```
132
+
133
+ Example: Upload a local directory to remote storage (Sync).
134
+
135
+ ```python
136
+ @env.task
137
+ def upload_directory_sync() -> Dir:
138
+ # Create local directory with files
139
+ os.makedirs("/tmp/my_data", exist_ok=True)
140
+ with open("/tmp/my_data/file1.txt", "w") as f:
141
+ f.write("data1")
142
+ # Upload to remote storage
143
+ return Dir.from_local_sync("/tmp/my_data/")
144
+ ```
145
+
146
+ Example: Download a directory to local storage (Async).
147
+
148
+ ```python
149
+ @env.task
150
+ async def download_directory(d: Dir) -> str:
151
+ local_path = await d.download()
152
+ # Process files in local directory
153
+ return local_path
154
+ ```
155
+
156
+ Example: Download a directory to local storage (Sync).
157
+
158
+ ```python
159
+ @env.task
160
+ def download_directory_sync(d: Dir) -> str:
161
+ local_path = d.download_sync()
162
+ # Process files in local directory
163
+ return local_path
164
+ ```
165
+
166
+ Example: Reference an existing remote directory.
167
+
168
+ ```python
169
+ @env.task
170
+ async def process_existing_dir() -> int:
171
+ d = Dir.from_existing_remote("s3://my-bucket/data/")
172
+ files = await d.list_files()
173
+ return len(files)
174
+ ```
175
+
176
+ Example: Check if directory exists (Async).
177
+
178
+ ```python
179
+ @env.task
180
+ async def check_directory(d: Dir) -> bool:
181
+ return await d.exists()
182
+ ```
183
+
184
+ Example: Check if directory exists (Sync).
185
+
186
+ ```python
187
+ @env.task
188
+ def check_directory_sync(d: Dir) -> bool:
189
+ return d.exists_sync()
190
+ ```
191
+
192
+ Args:
193
+ path: The path to the directory (can be local or remote)
194
+ name: Optional name for the directory (defaults to basename of path)
45
195
  """
46
196
 
47
197
  # Represents either a local or remote path.
48
198
  path: str
49
199
  name: Optional[str] = None
50
200
  format: str = ""
201
+ hash: Optional[str] = None
51
202
 
52
203
  class Config:
53
204
  arbitrary_types_allowed = True
@@ -55,20 +206,24 @@ class Dir(BaseModel, Generic[T], SerializableType):
55
206
  @model_validator(mode="before")
56
207
  @classmethod
57
208
  def pre_init(cls, data):
209
+ """Internal: Pydantic validator to set default name from path. Not intended for direct use."""
58
210
  if data.get("name") is None:
59
211
  data["name"] = Path(data["path"]).name
60
212
  return data
61
213
 
62
214
  def _serialize(self) -> Dict[str, Optional[str]]:
215
+ """Internal: Serialize Dir to dictionary. Not intended for direct use."""
63
216
  pyd_dump = self.model_dump()
64
217
  return pyd_dump
65
218
 
66
219
  @classmethod
67
220
  def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> Dir:
221
+ """Internal: Deserialize Dir from dictionary. Not intended for direct use."""
68
222
  return cls.model_validate(file_dump)
69
223
 
70
224
  @classmethod
71
225
  def schema_match(cls, incoming: dict):
226
+ """Internal: Check if incoming schema matches Dir schema. Not intended for direct use."""
72
227
  this_schema = cls.model_json_schema()
73
228
  current_required = this_schema.get("required")
74
229
  incoming_required = incoming.get("required")
@@ -85,19 +240,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
85
240
  """
86
241
  Asynchronously walk through the directory and yield File objects.
87
242
 
243
+ Use this to iterate through all files in a directory. Each yielded File can be read directly without
244
+ downloading.
245
+
246
+ Example (Async - Recursive):
247
+
248
+ ```python
249
+ @env.task
250
+ async def list_all_files(d: Dir) -> list[str]:
251
+ file_names = []
252
+ async for file in d.walk(recursive=True):
253
+ file_names.append(file.name)
254
+ return file_names
255
+ ```
256
+
257
+ Example (Async - Non-recursive):
258
+
259
+ ```python
260
+ @env.task
261
+ async def list_top_level_files(d: Dir) -> list[str]:
262
+ file_names = []
263
+ async for file in d.walk(recursive=False):
264
+ file_names.append(file.name)
265
+ return file_names
266
+ ```
267
+
268
+ Example (Async - With max depth):
269
+
270
+ ```python
271
+ @env.task
272
+ async def list_files_max_depth(d: Dir) -> list[str]:
273
+ file_names = []
274
+ async for file in d.walk(recursive=True, max_depth=2):
275
+ file_names.append(file.name)
276
+ return file_names
277
+ ```
278
+
88
279
  Args:
89
- recursive: If True, recursively walk subdirectories
90
- max_depth: Maximum depth for recursive walking
280
+ recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
281
+ max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
91
282
 
92
283
  Yields:
93
284
  File objects for each file found in the directory
94
-
95
- Example:
96
- ```python
97
- async for file in directory.walk():
98
- local_path = await file.download()
99
- # Process the file
100
- ```
101
285
  """
102
286
  fs = storage.get_underlying_filesystem(path=self.path)
103
287
  if recursive is False:
@@ -124,20 +308,48 @@ class Dir(BaseModel, Generic[T], SerializableType):
124
308
  """
125
309
  Synchronously walk through the directory and yield File objects.
126
310
 
311
+ Use this in non-async tasks to iterate through all files in a directory.
312
+
313
+ Example (Sync - Recursive):
314
+
315
+ ```python
316
+ @env.task
317
+ def list_all_files_sync(d: Dir) -> list[str]:
318
+ file_names = []
319
+ for file in d.walk_sync(recursive=True):
320
+ file_names.append(file.name)
321
+ return file_names
322
+ ```
323
+
324
+ Example (Sync - With file pattern):
325
+
326
+ ```python
327
+ @env.task
328
+ def list_text_files(d: Dir) -> list[str]:
329
+ file_names = []
330
+ for file in d.walk_sync(recursive=True, file_pattern="*.txt"):
331
+ file_names.append(file.name)
332
+ return file_names
333
+ ```
334
+
335
+ Example (Sync - Non-recursive with max depth):
336
+
337
+ ```python
338
+ @env.task
339
+ def list_files_limited(d: Dir) -> list[str]:
340
+ file_names = []
341
+ for file in d.walk_sync(recursive=True, max_depth=2):
342
+ file_names.append(file.name)
343
+ return file_names
344
+ ```
345
+
127
346
  Args:
128
- recursive: If True, recursively walk subdirectories
129
- file_pattern: Glob pattern to filter files
130
- max_depth: Maximum depth for recursive walking
347
+ recursive: If True, recursively walk subdirectories. If False, only list files in the top-level directory.
348
+ file_pattern: Glob pattern to filter files (e.g., "*.txt", "*.csv"). Default is "*" (all files).
349
+ max_depth: Maximum depth for recursive walking. If None, walk through all subdirectories.
131
350
 
132
351
  Yields:
133
352
  File objects for each file found in the directory
134
-
135
- Example:
136
- ```python
137
- for file in directory.walk_sync():
138
- local_path = file.download_sync()
139
- # Process the file
140
- ```
141
353
  """
142
354
  fs = storage.get_underlying_filesystem(path=self.path)
143
355
  for parent, _, files in fs.walk(self.path, maxdepth=max_depth):
@@ -152,15 +364,33 @@ class Dir(BaseModel, Generic[T], SerializableType):
152
364
  """
153
365
  Asynchronously get a list of all files in the directory (non-recursive).
154
366
 
367
+ Use this when you need a list of all files in the top-level directory at once.
368
+
155
369
  Returns:
156
- A list of File objects
370
+ A list of File objects for files in the top-level directory
157
371
 
158
- Example:
159
- ```python
160
- files = await directory.list_files()
372
+ Example (Async):
373
+
374
+ ```python
375
+ @env.task
376
+ async def count_files(d: Dir) -> int:
377
+ files = await d.list_files()
378
+ return len(files)
379
+ ```
380
+
381
+ Example (Async - Process files):
382
+
383
+ ```python
384
+ @env.task
385
+ async def process_all_files(d: Dir) -> list[str]:
386
+ files = await d.list_files()
387
+ contents = []
161
388
  for file in files:
162
- # Process the file
163
- ```
389
+ async with file.open("rb") as f:
390
+ content = await f.read()
391
+ contents.append(content.decode("utf-8"))
392
+ return contents
393
+ ```
164
394
  """
165
395
  # todo: this should probably also just defer to fsspec.find()
166
396
  files = []
@@ -172,15 +402,33 @@ class Dir(BaseModel, Generic[T], SerializableType):
172
402
  """
173
403
  Synchronously get a list of all files in the directory (non-recursive).
174
404
 
405
+ Use this in non-async tasks when you need a list of all files in the top-level directory at once.
406
+
175
407
  Returns:
176
- A list of File objects
408
+ A list of File objects for files in the top-level directory
177
409
 
178
- Example:
179
- ```python
180
- files = directory.list_files_sync()
410
+ Example (Sync):
411
+
412
+ ```python
413
+ @env.task
414
+ def count_files_sync(d: Dir) -> int:
415
+ files = d.list_files_sync()
416
+ return len(files)
417
+ ```
418
+
419
+ Example (Sync - Process files):
420
+
421
+ ```python
422
+ @env.task
423
+ def process_all_files_sync(d: Dir) -> list[str]:
424
+ files = d.list_files_sync()
425
+ contents = []
181
426
  for file in files:
182
- # Process the file
183
- ```
427
+ with file.open_sync("rb") as f:
428
+ content = f.read()
429
+ contents.append(content.decode("utf-8"))
430
+ return contents
431
+ ```
184
432
  """
185
433
  return list(self.walk_sync(recursive=False))
186
434
 
@@ -188,19 +436,43 @@ class Dir(BaseModel, Generic[T], SerializableType):
188
436
  """
189
437
  Asynchronously download the entire directory to a local path.
190
438
 
439
+ Use this when you need to download all files in a directory to your local filesystem for processing.
440
+
441
+ Example (Async):
442
+
443
+ ```python
444
+ @env.task
445
+ async def download_directory(d: Dir) -> str:
446
+ local_dir = await d.download()
447
+ # Process files in the local directory
448
+ return local_dir
449
+ ```
450
+
451
+ Example (Async - Download to specific path):
452
+
453
+ ```python
454
+ @env.task
455
+ async def download_to_path(d: Dir) -> str:
456
+ local_dir = await d.download("/tmp/my_data/")
457
+ return local_dir
458
+ ```
459
+
191
460
  Args:
192
461
  local_path: The local path to download the directory to. If None, a temporary
193
- directory will be used.
462
+ directory will be used and a path will be generated.
194
463
 
195
464
  Returns:
196
- The path to the downloaded directory
197
-
198
- Example:
199
- ```python
200
- local_dir = await directory.download('/tmp/my_data/')
201
- ```
465
+ The absolute path to the downloaded directory
202
466
  """
203
- local_dest = str(local_path) if local_path else str(storage.get_random_local_path())
467
+ # If no local_path specified, create a unique path + append source directory name
468
+ if local_path is None:
469
+ unique_path = storage.get_random_local_path()
470
+ source_dirname = Path(self.path).name # will need to be updated for windows
471
+ local_dest = str(Path(unique_path) / source_dirname)
472
+ else:
473
+ # If local_path is specified, use it directly (contents go into it)
474
+ local_dest = str(local_path)
475
+
204
476
  if not storage.is_remote(self.path):
205
477
  if not local_path or local_path == self.path:
206
478
  # Skip copying
@@ -215,25 +487,49 @@ class Dir(BaseModel, Generic[T], SerializableType):
215
487
  await loop.run_in_executor(None, lambda: shutil.copytree(self.path, local_dest, dirs_exist_ok=True))
216
488
 
217
489
  await copy_tree()
490
+ return local_dest
218
491
  return await storage.get(self.path, local_dest, recursive=True)
219
492
 
220
493
  def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
221
494
  """
222
495
  Synchronously download the entire directory to a local path.
223
496
 
497
+ Use this in non-async tasks when you need to download all files in a directory to your local filesystem.
498
+
499
+ Example (Sync):
500
+
501
+ ```python
502
+ @env.task
503
+ def download_directory_sync(d: Dir) -> str:
504
+ local_dir = d.download_sync()
505
+ # Process files in the local directory
506
+ return local_dir
507
+ ```
508
+
509
+ Example (Sync - Download to specific path):
510
+
511
+ ```python
512
+ @env.task
513
+ def download_to_path_sync(d: Dir) -> str:
514
+ local_dir = d.download_sync("/tmp/my_data/")
515
+ return local_dir
516
+ ```
224
517
  Args:
225
518
  local_path: The local path to download the directory to. If None, a temporary
226
- directory will be used.
519
+ directory will be used and a path will be generated.
227
520
 
228
521
  Returns:
229
- The path to the downloaded directory
230
-
231
- Example:
232
- ```python
233
- local_dir = directory.download_sync('/tmp/my_data/')
234
- ```
522
+ The absolute path to the downloaded directory
235
523
  """
236
- local_dest = str(local_path) if local_path else str(storage.get_random_local_path())
524
+ # If no local_path specified, create a unique path + append source directory name
525
+ if local_path is None:
526
+ unique_path = storage.get_random_local_path()
527
+ source_dirname = Path(self.path).name
528
+ local_dest = str(Path(unique_path) / source_dirname)
529
+ else:
530
+ # If local_path is specified, use it directly (contents go into it)
531
+ local_dest = str(local_path)
532
+
237
533
  if not storage.is_remote(self.path):
238
534
  if not local_path or local_path == self.path:
239
535
  # Skip copying
@@ -243,52 +539,188 @@ class Dir(BaseModel, Generic[T], SerializableType):
243
539
  import shutil
244
540
 
245
541
  shutil.copytree(self.path, local_dest, dirs_exist_ok=True)
542
+ return local_dest
246
543
 
247
- # Figure this out when we figure out the final sync story
248
- raise NotImplementedError("Sync download is not implemented for remote paths")
544
+ fs = storage.get_underlying_filesystem(path=self.path)
545
+ fs.get(self.path, local_dest, recursive=True)
546
+ return local_dest
249
547
 
250
548
  @classmethod
251
- async def from_local(cls, local_path: Union[str, Path], remote_path: Optional[str] = None) -> Dir[T]:
549
+ async def from_local(
550
+ cls,
551
+ local_path: Union[str, Path],
552
+ remote_destination: Optional[str] = None,
553
+ dir_cache_key: Optional[str] = None,
554
+ ) -> Dir[T]:
252
555
  """
253
- Asynchronously create a new Dir by uploading a local directory to the configured remote store.
556
+ Asynchronously create a new Dir by uploading a local directory to remote storage.
557
+
558
+ Use this in async tasks when you have a local directory that needs to be uploaded to remote storage.
559
+
560
+ Example (Async):
561
+
562
+ ```python
563
+ @env.task
564
+ async def upload_local_directory() -> Dir:
565
+ # Create a local directory with files
566
+ os.makedirs("/tmp/data_dir", exist_ok=True)
567
+ with open("/tmp/data_dir/file1.txt", "w") as f:
568
+ f.write("data1")
569
+
570
+ # Upload to remote storage
571
+ remote_dir = await Dir.from_local("/tmp/data_dir/")
572
+ return remote_dir
573
+ ```
574
+
575
+ Example (Async - With specific destination):
576
+
577
+ ```python
578
+ @env.task
579
+ async def upload_to_specific_path() -> Dir:
580
+ remote_dir = await Dir.from_local("/tmp/data_dir/", "s3://my-bucket/data/")
581
+ return remote_dir
582
+ ```
254
583
 
584
+ Example (Async - With cache key):
585
+
586
+ ```python
587
+ @env.task
588
+ async def upload_with_cache_key() -> Dir:
589
+ remote_dir = await Dir.from_local("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
590
+ return remote_dir
591
+ ```
255
592
  Args:
256
593
  local_path: Path to the local directory
257
- remote_path: Optional path to store the directory remotely. If None, a path will be generated.
594
+ remote_destination: Optional remote path to store the directory. If None, a path will be automatically
595
+ generated.
596
+ dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
597
+ as an input to discoverable tasks. If not specified, the cache key will be based on
598
+ directory attributes.
258
599
 
259
600
  Returns:
260
601
  A new Dir instance pointing to the uploaded directory
261
-
262
- Example:
263
- ```python
264
- remote_dir = await Dir[DataFrame].from_local('/tmp/data_dir/', 's3://bucket/data/')
265
- ```
266
602
  """
267
603
  local_path_str = str(local_path)
268
604
  dirname = os.path.basename(os.path.normpath(local_path_str))
605
+ resolved_remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(dirname)
606
+ protocol = get_protocol(resolved_remote_path)
269
607
 
270
- output_path = await storage.put(from_path=local_path_str, to_path=remote_path, recursive=True)
271
- return cls(path=output_path, name=dirname)
608
+ # Shortcut for local, don't copy and just return
609
+ if "file" in protocol and remote_destination is None:
610
+ output_path = str(Path(local_path).absolute())
611
+ return cls(path=output_path, name=dirname, hash=dir_cache_key)
612
+
613
+ # todo: in the future, mirror File and set the file to_path here
614
+ output_path = await storage.put(from_path=local_path_str, to_path=remote_destination, recursive=True)
615
+ return cls(path=output_path, name=dirname, hash=dir_cache_key)
272
616
 
273
617
  @classmethod
274
- def from_local_sync(cls, local_path: Union[str, Path], remote_path: Optional[str] = None) -> Dir[T]:
618
+ def from_local_sync(
619
+ cls,
620
+ local_path: Union[str, Path],
621
+ remote_destination: Optional[str] = None,
622
+ dir_cache_key: Optional[str] = None,
623
+ ) -> Dir[T]:
275
624
  """
276
- Synchronously create a new Dir by uploading a local directory to the configured remote store.
625
+ Synchronously create a new Dir by uploading a local directory to remote storage.
626
+
627
+ Use this in non-async tasks when you have a local directory that needs to be uploaded to remote storage.
628
+
629
+ Example (Sync):
630
+
631
+ ```python
632
+ @env.task
633
+ def upload_local_directory_sync() -> Dir:
634
+ # Create a local directory with files
635
+ os.makedirs("/tmp/data_dir", exist_ok=True)
636
+ with open("/tmp/data_dir/file1.txt", "w") as f:
637
+ f.write("data1")
638
+
639
+ # Upload to remote storage
640
+ remote_dir = Dir.from_local_sync("/tmp/data_dir/")
641
+ return remote_dir
642
+ ```
643
+
644
+ Example (Sync - With specific destination):
645
+
646
+ ```python
647
+ @env.task
648
+ def upload_to_specific_path_sync() -> Dir:
649
+ remote_dir = Dir.from_local_sync("/tmp/data_dir/", "s3://my-bucket/data/")
650
+ return remote_dir
651
+ ```
652
+
653
+ Example (Sync - With cache key):
654
+
655
+ ```python
656
+ @env.task
657
+ def upload_with_cache_key_sync() -> Dir:
658
+ remote_dir = Dir.from_local_sync("/tmp/data_dir/", dir_cache_key="my_cache_key_123")
659
+ return remote_dir
660
+ ```
277
661
 
278
662
  Args:
279
663
  local_path: Path to the local directory
280
- remote_path: Optional path to store the directory remotely. If None, a path will be generated.
664
+ remote_destination: Optional remote path to store the directory. If None, a path will be automatically
665
+ generated.
666
+ dir_cache_key: Optional precomputed hash value to use for cache key computation when this Dir is used
667
+ as an input to discoverable tasks. If not specified, the cache key will be based on
668
+ directory attributes.
281
669
 
282
670
  Returns:
283
671
  A new Dir instance pointing to the uploaded directory
672
+ """
673
+ local_path_str = str(local_path)
674
+ dirname = os.path.basename(os.path.normpath(local_path_str))
675
+
676
+ resolved_remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path(dirname)
677
+ protocol = get_protocol(resolved_remote_path)
678
+
679
+ # Shortcut for local, don't copy and just return
680
+ if "file" in protocol and remote_destination is None:
681
+ output_path = str(Path(local_path).absolute())
682
+ return cls(path=output_path, name=dirname, hash=dir_cache_key)
683
+
684
+ fs = storage.get_underlying_filesystem(path=resolved_remote_path)
685
+ fs.put(local_path_str, resolved_remote_path, recursive=True)
686
+ return cls(path=resolved_remote_path, name=dirname, hash=dir_cache_key)
687
+
688
+ @classmethod
689
+ def from_existing_remote(cls, remote_path: str, dir_cache_key: Optional[str] = None) -> Dir[T]:
690
+ """
691
+ Create a Dir reference from an existing remote directory.
692
+
693
+ Use this when you want to reference a directory that already exists in remote storage without uploading it.
284
694
 
285
695
  Example:
286
- ```python
287
- remote_dir = Dir[DataFrame].from_local_sync('/tmp/data_dir/', 's3://bucket/data/')
288
- ```
696
+
697
+ ```python
698
+ @env.task
699
+ async def process_existing_directory() -> int:
700
+ d = Dir.from_existing_remote("s3://my-bucket/data/")
701
+ files = await d.list_files()
702
+ return len(files)
703
+ ```
704
+
705
+ Example (With cache key):
706
+
707
+ ```python
708
+ @env.task
709
+ async def process_with_cache_key() -> int:
710
+ d = Dir.from_existing_remote("s3://my-bucket/data/", dir_cache_key="abc123")
711
+ files = await d.list_files()
712
+ return len(files)
713
+ ```
714
+
715
+ Args:
716
+ remote_path: The remote path to the existing directory
717
+ dir_cache_key: Optional hash value to use for cache key computation. If not specified,
718
+ the cache key will be computed based on the directory's attributes.
719
+
720
+ Returns:
721
+ A new Dir instance pointing to the existing remote directory
289
722
  """
290
- # Implement this after we figure out the final sync story
291
- raise NotImplementedError("Sync upload is not implemented for remote paths")
723
+ return cls(path=remote_path, hash=dir_cache_key)
292
724
 
293
725
  async def exists(self) -> bool:
294
726
  """
@@ -297,11 +729,16 @@ class Dir(BaseModel, Generic[T], SerializableType):
297
729
  Returns:
298
730
  True if the directory exists, False otherwise
299
731
 
300
- Example:
301
- ```python
302
- if await directory.exists():
303
- # Process the directory
304
- ```
732
+ Example (Async):
733
+
734
+ ```python
735
+ @env.task
736
+ async def check_directory(d: Dir) -> bool:
737
+ if await d.exists():
738
+ print("Directory exists!")
739
+ return True
740
+ return False
741
+ ```
305
742
  """
306
743
  fs = storage.get_underlying_filesystem(path=self.path)
307
744
  if isinstance(fs, AsyncFileSystem):
@@ -313,34 +750,49 @@ class Dir(BaseModel, Generic[T], SerializableType):
313
750
  """
314
751
  Synchronously check if the directory exists.
315
752
 
753
+ Use this in non-async tasks or when you need synchronous directory existence checking.
754
+
316
755
  Returns:
317
756
  True if the directory exists, False otherwise
318
757
 
319
- Example:
320
- ```python
321
- if directory.exists_sync():
322
- # Process the directory
323
- ```
758
+ Example (Sync):
759
+
760
+ ```python
761
+ @env.task
762
+ def check_directory_sync(d: Dir) -> bool:
763
+ if d.exists_sync():
764
+ print("Directory exists!")
765
+ return True
766
+ return False
767
+ ```
324
768
  """
325
769
  fs = storage.get_underlying_filesystem(path=self.path)
326
770
  return fs.exists(self.path)
327
771
 
328
772
  async def get_file(self, file_name: str) -> Optional[File[T]]:
329
773
  """
330
- Asynchronously get a specific file from the directory.
774
+ Asynchronously get a specific file from the directory by name.
775
+
776
+ Use this when you know the name of a specific file in the directory you want to access.
777
+
778
+ Example (Async):
779
+
780
+ ```python
781
+ @env.task
782
+ async def read_specific_file(d: Dir) -> str:
783
+ file = await d.get_file("data.csv")
784
+ if file:
785
+ async with file.open("rb") as f:
786
+ content = await f.read()
787
+ return content.decode("utf-8")
788
+ return "File not found"
789
+ ```
331
790
 
332
791
  Args:
333
792
  file_name: The name of the file to get
334
793
 
335
794
  Returns:
336
795
  A File instance if the file exists, None otherwise
337
-
338
- Example:
339
- ```python
340
- file = await directory.get_file("data.csv")
341
- if file:
342
- # Process the file
343
- ```
344
796
  """
345
797
  fs = storage.get_underlying_filesystem(path=self.path)
346
798
  file_path = fs.sep.join([self.path, file_name])
@@ -352,20 +804,28 @@ class Dir(BaseModel, Generic[T], SerializableType):
352
804
 
353
805
  def get_file_sync(self, file_name: str) -> Optional[File[T]]:
354
806
  """
355
- Synchronously get a specific file from the directory.
807
+ Synchronously get a specific file from the directory by name.
808
+
809
+ Use this in non-async tasks when you know the name of a specific file in the directory you want to access.
810
+
811
+ Example (Sync):
812
+
813
+ ```python
814
+ @env.task
815
+ def read_specific_file_sync(d: Dir) -> str:
816
+ file = d.get_file_sync("data.csv")
817
+ if file:
818
+ with file.open_sync("rb") as f:
819
+ content = f.read()
820
+ return content.decode("utf-8")
821
+ return "File not found"
822
+ ```
356
823
 
357
824
  Args:
358
825
  file_name: The name of the file to get
359
826
 
360
827
  Returns:
361
828
  A File instance if the file exists, None otherwise
362
-
363
- Example:
364
- ```python
365
- file = directory.get_file_sync("data.csv")
366
- if file:
367
- # Process the file
368
- ```
369
829
  """
370
830
  file_path = os.path.join(self.path, file_name)
371
831
  file = File[T](path=file_path)
@@ -414,7 +874,8 @@ class DirTransformer(TypeTransformer[Dir]):
414
874
  ),
415
875
  uri=python_val.path,
416
876
  )
417
- )
877
+ ),
878
+ hash=python_val.hash if python_val.hash else None,
418
879
  )
419
880
 
420
881
  async def to_python_value(
@@ -432,7 +893,8 @@ class DirTransformer(TypeTransformer[Dir]):
432
893
 
433
894
  uri = lv.scalar.blob.uri
434
895
  filename = Path(uri).name
435
- f: Dir = Dir(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format)
896
+ hash_value = lv.hash if lv.hash else None
897
+ f: Dir = Dir(path=uri, name=filename, format=lv.scalar.blob.metadata.type.format, hash=hash_value)
436
898
  return f
437
899
 
438
900
  def guess_python_type(self, literal_type: types_pb2.LiteralType) -> Type[Dir]: