flyte 2.0.0b22__py3-none-any.whl → 2.0.0b23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of flyte might be problematic; consult the registry's advisory page for more details.

Files changed (88)
  1. flyte/__init__.py +5 -0
  2. flyte/_bin/runtime.py +35 -5
  3. flyte/_cache/cache.py +4 -2
  4. flyte/_cache/local_cache.py +215 -0
  5. flyte/_code_bundle/bundle.py +1 -0
  6. flyte/_debug/constants.py +0 -1
  7. flyte/_debug/vscode.py +6 -1
  8. flyte/_deploy.py +193 -52
  9. flyte/_environment.py +5 -0
  10. flyte/_excepthook.py +1 -1
  11. flyte/_image.py +101 -72
  12. flyte/_initialize.py +23 -0
  13. flyte/_internal/controllers/_local_controller.py +64 -24
  14. flyte/_internal/controllers/remote/_action.py +4 -1
  15. flyte/_internal/controllers/remote/_controller.py +5 -2
  16. flyte/_internal/controllers/remote/_core.py +6 -3
  17. flyte/_internal/controllers/remote/_informer.py +1 -1
  18. flyte/_internal/imagebuild/docker_builder.py +92 -28
  19. flyte/_internal/imagebuild/image_builder.py +7 -13
  20. flyte/_internal/imagebuild/remote_builder.py +6 -1
  21. flyte/_internal/runtime/io.py +13 -1
  22. flyte/_internal/runtime/rusty.py +17 -2
  23. flyte/_internal/runtime/task_serde.py +14 -20
  24. flyte/_internal/runtime/taskrunner.py +1 -1
  25. flyte/_internal/runtime/trigger_serde.py +153 -0
  26. flyte/_logging.py +1 -1
  27. flyte/_protos/common/identifier_pb2.py +19 -1
  28. flyte/_protos/common/identifier_pb2.pyi +22 -0
  29. flyte/_protos/workflow/common_pb2.py +14 -3
  30. flyte/_protos/workflow/common_pb2.pyi +49 -0
  31. flyte/_protos/workflow/queue_service_pb2.py +41 -35
  32. flyte/_protos/workflow/queue_service_pb2.pyi +26 -12
  33. flyte/_protos/workflow/queue_service_pb2_grpc.py +34 -0
  34. flyte/_protos/workflow/run_definition_pb2.py +38 -38
  35. flyte/_protos/workflow/run_definition_pb2.pyi +4 -2
  36. flyte/_protos/workflow/run_service_pb2.py +60 -50
  37. flyte/_protos/workflow/run_service_pb2.pyi +24 -6
  38. flyte/_protos/workflow/run_service_pb2_grpc.py +34 -0
  39. flyte/_protos/workflow/task_definition_pb2.py +15 -11
  40. flyte/_protos/workflow/task_definition_pb2.pyi +19 -2
  41. flyte/_protos/workflow/task_service_pb2.py +18 -17
  42. flyte/_protos/workflow/task_service_pb2.pyi +5 -2
  43. flyte/_protos/workflow/trigger_definition_pb2.py +66 -0
  44. flyte/_protos/workflow/trigger_definition_pb2.pyi +117 -0
  45. flyte/_protos/workflow/trigger_definition_pb2_grpc.py +4 -0
  46. flyte/_protos/workflow/trigger_service_pb2.py +96 -0
  47. flyte/_protos/workflow/trigger_service_pb2.pyi +110 -0
  48. flyte/_protos/workflow/trigger_service_pb2_grpc.py +281 -0
  49. flyte/_run.py +42 -15
  50. flyte/_task.py +35 -4
  51. flyte/_task_environment.py +60 -15
  52. flyte/_trigger.py +382 -0
  53. flyte/_version.py +3 -3
  54. flyte/cli/_abort.py +3 -3
  55. flyte/cli/_build.py +1 -3
  56. flyte/cli/_common.py +15 -2
  57. flyte/cli/_create.py +74 -0
  58. flyte/cli/_delete.py +23 -1
  59. flyte/cli/_deploy.py +5 -9
  60. flyte/cli/_get.py +75 -34
  61. flyte/cli/_params.py +4 -2
  62. flyte/cli/_run.py +12 -3
  63. flyte/cli/_update.py +36 -0
  64. flyte/cli/_user.py +17 -0
  65. flyte/cli/main.py +9 -1
  66. flyte/errors.py +9 -0
  67. flyte/io/_dir.py +513 -115
  68. flyte/io/_file.py +495 -135
  69. flyte/models.py +32 -0
  70. flyte/remote/__init__.py +6 -1
  71. flyte/remote/_client/_protocols.py +36 -2
  72. flyte/remote/_client/controlplane.py +19 -3
  73. flyte/remote/_run.py +42 -2
  74. flyte/remote/_task.py +14 -1
  75. flyte/remote/_trigger.py +308 -0
  76. flyte/remote/_user.py +33 -0
  77. flyte/storage/__init__.py +6 -1
  78. flyte/storage/_storage.py +119 -101
  79. flyte/types/_pickle.py +16 -3
  80. {flyte-2.0.0b22.data → flyte-2.0.0b23.data}/scripts/runtime.py +35 -5
  81. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/METADATA +3 -1
  82. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/RECORD +87 -75
  83. flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
  84. {flyte-2.0.0b22.data → flyte-2.0.0b23.data}/scripts/debug.py +0 -0
  85. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/WHEEL +0 -0
  86. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/entry_points.txt +0 -0
  87. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/licenses/LICENSE +0 -0
  88. {flyte-2.0.0b22.dist-info → flyte-2.0.0b23.dist-info}/top_level.txt +0 -0
flyte/io/_file.py CHANGED
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import inspect
3
4
  import os
5
+ import typing
4
6
  from contextlib import asynccontextmanager, contextmanager
5
7
  from pathlib import Path
6
8
  from typing import (
@@ -19,19 +21,21 @@ from typing import (
19
21
 
20
22
  import aiofiles
21
23
  from flyteidl.core import literals_pb2, types_pb2
22
- from fsspec.asyn import AsyncFileSystem
23
24
  from fsspec.utils import get_protocol
24
25
  from mashumaro.types import SerializableType
25
26
  from pydantic import BaseModel, Field, model_validator
26
27
  from pydantic.json_schema import SkipJsonSchema
27
28
 
29
+ import flyte.errors
28
30
  import flyte.storage as storage
29
31
  from flyte._context import internal_ctx
30
32
  from flyte._initialize import requires_initialization
31
- from flyte._logging import logger
32
33
  from flyte.io._hashing_io import AsyncHashingReader, HashingWriter, HashMethod, PrecomputedValue
33
34
  from flyte.types import TypeEngine, TypeTransformer, TypeTransformerFailedError
34
35
 
36
+ if typing.TYPE_CHECKING:
37
+ from obstore import AsyncReadableFile, AsyncWritableFile
38
+
35
39
  # Type variable for the file format
36
40
  T = TypeVar("T")
37
41
 
@@ -39,63 +43,139 @@ T = TypeVar("T")
39
43
  class File(BaseModel, Generic[T], SerializableType):
40
44
  """
41
45
  A generic file class representing a file with a specified format.
42
- Provides both async and sync interfaces for file operations.
43
- Users must handle all I/O operations themselves by instantiating this class with the appropriate class methods.
46
+ Provides both async and sync interfaces for file operations. All methods without _sync suffix are async.
47
+
48
+ The class should be instantiated using one of the class methods. The constructor should be used only to
49
+ instantiate references to existing remote objects.
44
50
 
45
51
  The generic type T represents the format of the file.
46
52
 
47
- Example:
48
- ```python
49
- # Async usage
50
- from pandas import DataFrame
51
- csv_file = File[DataFrame](path="s3://my-bucket/data.csv")
53
+ Important methods:
54
+ - `from_existing_remote`: Create a File object from an existing remote file.
55
+ - `new_remote`: Create a new File reference for a remote file that will be written to.
52
56
 
53
- async with csv_file.open() as f:
54
- content = await f.read()
57
+ **Asynchronous methods**:
58
+ - `open`: Asynchronously open the file and return a file-like object.
59
+ - `download`: Asynchronously download the file to a local path.
60
+ - `from_local`: Asynchronously create a File object from a local file, uploading it to remote storage.
61
+ - `exists`: Asynchronously check if the file exists.
55
62
 
56
- # Sync alternative
57
- with csv_file.open_sync() as f:
58
- content = f.read()
59
- ```
63
+ **Synchronous methods** (suffixed with `_sync`):
64
+ - `open_sync`: Synchronously open the file and return a file-like object.
65
+ - `download_sync`: Synchronously download the file to a local path.
66
+ - `from_local_sync`: Synchronously create a File object from a local file, uploading it to remote storage.
67
+ - `exists_sync`: Synchronously check if the file exists.
60
68
 
61
- Example: Read a file input in a Task.
62
- ```
69
+ Example: Read a file input in a Task (Async).
70
+
71
+ ```python
63
72
  @env.task
64
- async def my_task(file: File[DataFrame]):
65
- async with file.open() as f:
66
- df = pd.read_csv(f)
73
+ async def read_file(file: File) -> str:
74
+ async with file.open("rb") as f:
75
+ content = bytes(await f.read())
76
+ return content.decode("utf-8")
67
77
  ```
68
78
 
69
- Example: Write a file by streaming it directly to blob storage
79
+ Example: Read a file input in a Task (Sync).
80
+
81
+ ```python
82
+ @env.task
83
+ def read_file_sync(file: File) -> str:
84
+ with file.open_sync("rb") as f:
85
+ content = f.read()
86
+ return content.decode("utf-8")
70
87
  ```
88
+
89
+ Example: Write a file by streaming it directly to blob storage (Async).
90
+
91
+ ```python
71
92
  @env.task
72
- async def my_task() -> File[DataFrame]:
73
- df = pd.DataFrame(...)
93
+ async def write_file() -> File:
74
94
  file = File.new_remote()
75
95
  async with file.open("wb") as f:
76
- df.to_csv(f)
77
- # No additional uploading will be done here.
96
+ await f.write(b"Hello, World!")
78
97
  return file
79
98
  ```
80
- Example: Write a file by writing it locally first, and then uploading it.
99
+
100
+ Example: Upload a local file to remote storage (Async).
101
+
102
+ ```python
103
+ @env.task
104
+ async def upload_file() -> File:
105
+ # Write to local file first
106
+ with open("/tmp/data.csv", "w") as f:
107
+ f.write("col1,col2\\n1,2\\n3,4\\n")
108
+ # Upload to remote storage
109
+ return await File.from_local("/tmp/data.csv")
81
110
  ```
111
+
112
+ Example: Upload a local file to remote storage (Sync).
113
+
114
+ ```python
82
115
  @env.task
83
- async def my_task() -> File[DataFrame]:
84
- # write to /tmp/data.csv
85
- return File.from_local("/tmp/data.csv", optional="s3://my-bucket/data.csv")
116
+ def upload_file_sync() -> File:
117
+ # Write to local file first
118
+ with open("/tmp/data.csv", "w") as f:
119
+ f.write("col1,col2\\n1,2\\n3,4\\n")
120
+ # Upload to remote storage
121
+ return File.from_local_sync("/tmp/data.csv")
86
122
  ```
87
123
 
88
- Example: From an existing remote file
124
+ Example: Download a file to local storage (Async).
125
+
126
+ ```python
127
+ @env.task
128
+ async def download_file(file: File) -> str:
129
+ local_path = await file.download()
130
+ # Process the local file
131
+ with open(local_path, "r") as f:
132
+ return f.read()
89
133
  ```
134
+
135
+ Example: Download a file to local storage (Sync).
136
+
137
+ ```python
90
138
  @env.task
91
- async def my_task() -> File[DataFrame]:
92
- return File.from_existing_remote("s3://my-bucket/data.csv")
139
+ def download_file_sync(file: File) -> str:
140
+ local_path = file.download_sync()
141
+ # Process the local file
142
+ with open(local_path, "r") as f:
143
+ return f.read()
93
144
  ```
94
145
 
95
- Example: Take a remote file as input and return the same one, should not do any copy
146
+ Example: Reference an existing remote file.
147
+
148
+ ```python
149
+ @env.task
150
+ async def process_existing_file() -> str:
151
+ file = File.from_existing_remote("s3://my-bucket/data.csv")
152
+ async with file.open("rb") as f:
153
+ content = await f.read()
154
+ return content.decode("utf-8")
96
155
  ```
156
+
157
+ Example: Check if a file exists (Async).
158
+
159
+ ```python
97
160
  @env.task
98
- async def my_task(file: File[DataFrame]) -> File[DataFrame]:
161
+ async def check_file(file: File) -> bool:
162
+ return await file.exists()
163
+ ```
164
+
165
+ Example: Check if a file exists (Sync).
166
+
167
+ ```python
168
+ @env.task
169
+ def check_file_sync(file: File) -> bool:
170
+ return file.exists_sync()
171
+ ```
172
+
173
+ Example: Pass through a file without copying.
174
+
175
+ ```python
176
+ @env.task
177
+ async def pass_through(file: File) -> File:
178
+ # No copy occurs - just passes the reference
99
179
  return file
100
180
  ```
101
181
 
@@ -116,20 +196,24 @@ class File(BaseModel, Generic[T], SerializableType):
116
196
  @model_validator(mode="before")
117
197
  @classmethod
118
198
  def pre_init(cls, data):
199
+ """Internal: Pydantic validator to set default name from path. Not intended for direct use."""
119
200
  if data.get("name") is None:
120
201
  data["name"] = Path(data["path"]).name
121
202
  return data
122
203
 
123
204
  def _serialize(self) -> Dict[str, Optional[str]]:
205
+ """Internal: Serialize File to dictionary. Not intended for direct use."""
124
206
  pyd_dump = self.model_dump()
125
207
  return pyd_dump
126
208
 
127
209
  @classmethod
128
210
  def _deserialize(cls, file_dump: Dict[str, Optional[str]]) -> File:
211
+ """Internal: Deserialize File from dictionary. Not intended for direct use."""
129
212
  return File.model_validate(file_dump)
130
213
 
131
214
  @classmethod
132
215
  def schema_match(cls, incoming: dict):
216
+ """Internal: Check if incoming schema matches File schema. Not intended for direct use."""
133
217
  this_schema = cls.model_json_schema()
134
218
  current_required = this_schema.get("required")
135
219
  incoming_required = incoming.get("required")
@@ -148,16 +232,27 @@ class File(BaseModel, Generic[T], SerializableType):
148
232
  """
149
233
  Create a new File reference for a remote file that will be written to.
150
234
 
151
- Example:
152
- ```
235
+ Use this when you want to create a new file and write to it directly without creating a local file first.
236
+
237
+ Example (Async):
238
+
239
+ ```python
153
240
  @env.task
154
- async def my_task() -> File[DataFrame]:
155
- df = pd.DataFrame(...)
241
+ async def create_csv() -> File:
242
+ df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
156
243
  file = File.new_remote()
157
244
  async with file.open("wb") as f:
158
245
  df.to_csv(f)
159
246
  return file
160
247
  ```
248
+
249
+ Args:
250
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
251
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will be used
252
+ to compute the hash as data is written.
253
+
254
+ Returns:
255
+ A new File instance with a generated remote path
161
256
  """
162
257
  ctx = internal_ctx()
163
258
  known_cache_key = hash_method if isinstance(hash_method, str) else None
@@ -170,17 +265,26 @@ class File(BaseModel, Generic[T], SerializableType):
170
265
  """
171
266
  Create a File reference from an existing remote file.
172
267
 
268
+ Use this when you want to reference a file that already exists in remote storage without uploading it.
269
+
173
270
  Example:
271
+
174
272
  ```python
175
273
  @env.task
176
- async def my_task() -> File[DataFrame]:
177
- return File.from_existing_remote("s3://my-bucket/data.csv")
274
+ async def process_existing_file() -> str:
275
+ file = File.from_existing_remote("s3://my-bucket/data.csv")
276
+ async with file.open("rb") as f:
277
+ content = await f.read()
278
+ return content.decode("utf-8")
178
279
  ```
179
280
 
180
281
  Args:
181
282
  remote_path: The remote path to the existing file
182
- file_cache_key: Optional hash value to use for discovery purposes. If not specified, the value of this
183
- File object will be hashed (basically the path, not the contents).
283
+ file_cache_key: Optional hash value to use for cache key computation. If not specified, the cache key
284
+ will be computed based on the file's attributes (path, name, format).
285
+
286
+ Returns:
287
+ A new File instance pointing to the existing remote file
184
288
  """
185
289
  return cls(path=remote_path, hash=file_cache_key)
186
290
 
@@ -193,92 +297,129 @@ class File(BaseModel, Generic[T], SerializableType):
193
297
  cache_options: Optional[dict] = None,
194
298
  compression: Optional[str] = None,
195
299
  **kwargs,
196
- ) -> AsyncGenerator[Union[IO[Any], "HashingWriter"], None]:
300
+ ) -> AsyncGenerator[Union[AsyncWritableFile, AsyncReadableFile, "HashingWriter"], None]:
197
301
  """
198
302
  Asynchronously open the file and return a file-like object.
199
303
 
304
+ Use this method in async tasks to read from or write to files directly.
305
+
306
+ Example (Async Read):
307
+
308
+ ```python
309
+ @env.task
310
+ async def read_file(f: File) -> str:
311
+ async with f.open("rb") as fh:
312
+ content = bytes(await fh.read())
313
+ return content.decode("utf-8")
314
+ ```
315
+
316
+ Example (Async Write):
317
+
318
+ ```python
319
+ @env.task
320
+ async def write_file() -> File:
321
+ f = File.new_remote()
322
+ async with f.open("wb") as fh:
323
+ await fh.write(b"Hello, World!")
324
+ return f
325
+ ```
326
+
327
+ Example (Streaming Read):
328
+
329
+ ```python
330
+ @env.task
331
+ async def stream_read(f: File) -> str:
332
+ content_parts = []
333
+ async with f.open("rb", block_size=1024) as fh:
334
+ while True:
335
+ chunk = await fh.read()
336
+ if not chunk:
337
+ break
338
+ content_parts.append(chunk)
339
+ return b"".join(content_parts).decode("utf-8")
340
+ ```
341
+
200
342
  Args:
201
- mode: The mode to open the file in (default: 'rb')
202
- block_size: Size of blocks for reading (bytes)
343
+ mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
344
+ 'wb' (write binary), 'rt' (read text), 'wt' (write text)
345
+ block_size: Size of blocks for reading in bytes. Useful for streaming large files.
203
346
  cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
204
347
  cache_options: Dictionary of options for the cache
205
348
  compression: Compression format or None for auto-detection
206
349
  **kwargs: Additional arguments passed to fsspec's open method
207
350
 
208
351
  Returns:
209
- An async file-like object
210
-
211
- Example:
212
- ```python
213
- async with file.open('rb') as f:
214
- data = await f.read()
215
- ```
352
+ An async file-like object that can be used with async read/write operations
216
353
  """
217
- fs = storage.get_underlying_filesystem(path=self.path)
218
-
219
- # Set up cache options if provided
220
- if cache_options is None:
221
- cache_options = {}
222
-
223
- # Configure the open parameters
224
- open_kwargs = {"mode": mode, **kwargs}
225
- if compression:
226
- open_kwargs["compression"] = compression
227
-
228
- if block_size:
229
- open_kwargs["block_size"] = block_size
230
-
231
- # Apply caching strategy
232
- if cache_type != "none":
233
- open_kwargs["cache_type"] = cache_type
234
- open_kwargs["cache_options"] = cache_options
235
-
236
- # Use aiofiles for local files
237
- if fs.protocol == "file":
238
- async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
239
- yield f
240
- else:
241
- # This code is broadly similar to what storage.get_stream does, but without actually reading from the stream
242
- file_handle = None
354
+ # Check if we should use obstore bypass
355
+ try:
356
+ fh = await storage.open(
357
+ self.path,
358
+ mode=mode,
359
+ cache_type=cache_type,
360
+ cache_options=cache_options,
361
+ compression=compression,
362
+ block_size=block_size,
363
+ **kwargs,
364
+ )
243
365
  try:
244
- if "b" not in mode:
245
- raise ValueError("Mode must include 'b' for binary access, when using remote files.")
246
- if isinstance(fs, AsyncFileSystem):
247
- file_handle = await fs.open_async(self.path, mode)
248
- yield file_handle
249
- return
250
- except NotImplementedError:
251
- logger.debug(f"{fs} doesn't implement 'open_async', falling back to sync")
366
+ yield fh
367
+ return
252
368
  finally:
253
- if file_handle is not None:
254
- file_handle.close()
255
-
256
- with fs.open(self.path, mode) as file_handle:
257
- if self.hash_method and self.hash is None:
258
- logger.debug(f"Wrapping file handle with hashing writer using {self.hash_method}")
259
- fh = HashingWriter(file_handle, accumulator=self.hash_method)
260
- yield fh
261
- self.hash = fh.result()
262
- fh.close()
369
+ if inspect.iscoroutinefunction(fh.close):
370
+ await fh.close()
263
371
  else:
264
- yield file_handle
265
- file_handle.close()
372
+ fh.close()
373
+ except flyte.errors.OnlyAsyncIOSupportedError:
374
+ # Fall back to aiofiles
375
+ fs = storage.get_underlying_filesystem(path=self.path)
376
+ if "file" in fs.protocol:
377
+ async with aiofiles.open(self.path, mode=mode, **kwargs) as f:
378
+ yield f
379
+ return
380
+ raise
381
+
382
+ async def exists(self) -> bool:
383
+ """
384
+ Asynchronously check if the file exists.
385
+
386
+ Example (Async):
387
+
388
+ ```python
389
+ @env.task
390
+ async def check_file(f: File) -> bool:
391
+ if await f.exists():
392
+ print("File exists!")
393
+ return True
394
+ return False
395
+ ```
396
+
397
+ Returns:
398
+ True if the file exists, False otherwise
399
+ """
400
+ return await storage.exists(self.path)
266
401
 
267
402
  def exists_sync(self) -> bool:
268
403
  """
269
404
  Synchronously check if the file exists.
270
405
 
406
+ Use this in non-async tasks or when you need synchronous file existence checking.
407
+
408
+ Example (Sync):
409
+
410
+ ```python
411
+ @env.task
412
+ def check_file_sync(f: File) -> bool:
413
+ if f.exists_sync():
414
+ print("File exists!")
415
+ return True
416
+ return False
417
+ ```
418
+
271
419
  Returns:
272
420
  True if the file exists, False otherwise
273
-
274
- Example:
275
- ```python
276
- if file.exists_sync():
277
- # Process the file
278
- ```
279
421
  """
280
- fs = storage.get_underlying_filesystem(path=self.path)
281
- return fs.exists(self.path)
422
+ return storage.exists_sync(self.path)
282
423
 
283
424
  @contextmanager
284
425
  def open_sync(
@@ -289,26 +430,44 @@ class File(BaseModel, Generic[T], SerializableType):
289
430
  cache_options: Optional[dict] = None,
290
431
  compression: Optional[str] = None,
291
432
  **kwargs,
292
- ) -> Generator[IO[Any]]:
433
+ ) -> Generator[IO[Any], None, None]:
293
434
  """
294
435
  Synchronously open the file and return a file-like object.
295
436
 
437
+ Use this method in non-async tasks to read from or write to files directly.
438
+
439
+ Example (Sync Read):
440
+
441
+ ```python
442
+ @env.task
443
+ def read_file_sync(f: File) -> str:
444
+ with f.open_sync("rb") as fh:
445
+ content = fh.read()
446
+ return content.decode("utf-8")
447
+ ```
448
+
449
+ Example (Sync Write):
450
+
451
+ ```python
452
+ @env.task
453
+ def write_file_sync() -> File:
454
+ f = File.new_remote()
455
+ with f.open_sync("wb") as fh:
456
+ fh.write(b"Hello, World!")
457
+ return f
458
+ ```
459
+
296
460
  Args:
297
- mode: The mode to open the file in (default: 'rb')
298
- block_size: Size of blocks for reading (bytes)
461
+ mode: The mode to open the file in (default: 'rb'). Common modes: 'rb' (read binary),
462
+ 'wb' (write binary), 'rt' (read text), 'wt' (write text)
463
+ block_size: Size of blocks for reading in bytes. Useful for streaming large files.
299
464
  cache_type: Caching mechanism to use ('readahead', 'mmap', 'bytes', 'none')
300
465
  cache_options: Dictionary of options for the cache
301
466
  compression: Compression format or None for auto-detection
302
467
  **kwargs: Additional arguments passed to fsspec's open method
303
468
 
304
469
  Returns:
305
- A file-like object
306
-
307
- Example:
308
- ```python
309
- with file.open_sync('rb') as f:
310
- data = f.read()
311
- ```
470
+ A file-like object that can be used with standard read/write operations
312
471
  """
313
472
  fs = storage.get_underlying_filesystem(path=self.path)
314
473
 
@@ -335,17 +494,34 @@ class File(BaseModel, Generic[T], SerializableType):
335
494
  """
336
495
  Asynchronously download the file to a local path.
337
496
 
497
+ Use this when you need to download a remote file to your local filesystem for processing.
498
+
499
+ Example (Async):
500
+
501
+ ```python
502
+ @env.task
503
+ async def download_and_process(f: File) -> str:
504
+ local_path = await f.download()
505
+ # Now process the local file
506
+ with open(local_path, "r") as fh:
507
+ return fh.read()
508
+ ```
509
+
510
+ Example (Download to specific path):
511
+
512
+ ```python
513
+ @env.task
514
+ async def download_to_path(f: File) -> str:
515
+ local_path = await f.download("/tmp/myfile.csv")
516
+ return local_path
517
+ ```
518
+
338
519
  Args:
339
520
  local_path: The local path to download the file to. If None, a temporary
340
- directory will be used.
521
+ directory will be used and a path will be generated.
341
522
 
342
523
  Returns:
343
- The path to the downloaded file
344
-
345
- Example:
346
- ```python
347
- local_file = await file.download('/tmp/myfile.csv')
348
- ```
524
+ The absolute path to the downloaded file
349
525
  """
350
526
  if local_path is None:
351
527
  local_path = storage.get_random_local_path(file_path_or_file_name=local_path)
@@ -366,32 +542,216 @@ class File(BaseModel, Generic[T], SerializableType):
366
542
  await storage.get(self.path, str(local_path))
367
543
  return str(local_path)
368
544
 
545
+ def download_sync(self, local_path: Optional[Union[str, Path]] = None) -> str:
546
+ """
547
+ Synchronously download the file to a local path.
548
+
549
+ Use this in non-async tasks when you need to download a remote file to your local filesystem.
550
+
551
+ Example (Sync):
552
+
553
+ ```python
554
+ @env.task
555
+ def download_and_process_sync(f: File) -> str:
556
+ local_path = f.download_sync()
557
+ # Now process the local file
558
+ with open(local_path, "r") as fh:
559
+ return fh.read()
560
+ ```
561
+
562
+ Example (Download to specific path):
563
+
564
+ ```python
565
+ @env.task
566
+ def download_to_path_sync(f: File) -> str:
567
+ local_path = f.download_sync("/tmp/myfile.csv")
568
+ return local_path
569
+ ```
570
+
571
+ Args:
572
+ local_path: The local path to download the file to. If None, a temporary
573
+ directory will be used and a path will be generated.
574
+
575
+ Returns:
576
+ The absolute path to the downloaded file
577
+ """
578
+ if local_path is None:
579
+ local_path = storage.get_random_local_path(file_path_or_file_name=local_path)
580
+ else:
581
+ local_path = str(Path(local_path).absolute())
582
+
583
+ fs = storage.get_underlying_filesystem(path=self.path)
584
+
585
+ # If it's already a local file, just copy it
586
+ if "file" in fs.protocol:
587
+ # Use standard file operations for sync copy
588
+ import shutil
589
+
590
+ shutil.copy2(self.path, local_path)
591
+ return str(local_path)
592
+
593
+ # Otherwise download from remote using sync functionality
594
+ # Use the sync version of storage operations
595
+ with fs.open(self.path, "rb") as src:
596
+ with open(local_path, "wb") as dst:
597
+ dst.write(src.read())
598
+ return str(local_path)
599
+
369
600
  @classmethod
370
601
  @requires_initialization
371
- async def from_local(
602
+ def from_local_sync(
372
603
  cls,
373
604
  local_path: Union[str, Path],
374
605
  remote_destination: Optional[str] = None,
375
606
  hash_method: Optional[HashMethod | str] = None,
376
607
  ) -> File[T]:
377
608
  """
378
- Create a new File object from a local file that will be uploaded to the configured remote store.
609
+ Synchronously create a new File object from a local file by uploading it to remote storage.
610
+
611
+ Use this in non-async tasks when you have a local file that needs to be uploaded to remote storage.
612
+
613
+ Example (Sync):
614
+
615
+ ```python
616
+ @env.task
617
+ def upload_local_file_sync() -> File:
618
+ # Create a local file
619
+ with open("/tmp/data.csv", "w") as f:
620
+ f.write("col1,col2\n1,2\n3,4\n")
621
+
622
+ # Upload to remote storage
623
+ remote_file = File.from_local_sync("/tmp/data.csv")
624
+ return remote_file
625
+ ```
626
+
627
+ Example (With specific destination):
628
+
629
+ ```python
630
+ @env.task
631
+ def upload_to_specific_path() -> File:
632
+ remote_file = File.from_local_sync("/tmp/data.csv", "s3://my-bucket/data.csv")
633
+ return remote_file
634
+ ```
379
635
 
380
636
  Args:
381
637
  local_path: Path to the local file
382
- remote_destination: Optional path to store the file remotely. If None, a path will be generated.
383
- hash_method: Pass this argument either as a set string or a HashMethod to use for
384
- determining a task's cache key if this File object is used as an input to said task. If not specified,
385
- the cache key will just be computed based on this object's attributes (i.e. path, name, format, etc.).
386
- If there is a set value you want to use, please pass an instance of the PrecomputedValue HashMethod.
638
+ remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
639
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
640
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
641
+ the hash during upload. If not specified, the cache key will be based on file attributes.
387
642
 
388
643
  Returns:
389
- A new File instance pointing to the uploaded file
644
+ A new File instance pointing to the uploaded remote file
645
+ """
646
+ if not os.path.exists(local_path):
647
+ raise ValueError(f"File not found: {local_path}")
390
648
 
391
- Example:
392
- ```python
393
- remote_file = await File[DataFrame].from_local('/tmp/data.csv', 's3://bucket/data.csv')
394
- ```
649
+ remote_path = remote_destination or internal_ctx().raw_data.get_random_remote_path()
650
+ protocol = get_protocol(remote_path)
651
+ filename = Path(local_path).name
652
+
653
+ # If remote_destination was not set by the user, and the configured raw data path is also local,
654
+ # then let's optimize by not uploading.
655
+ hash_value = hash_method if isinstance(hash_method, str) else None
656
+ hash_method_obj = hash_method if isinstance(hash_method, HashMethod) else None
657
+
658
+ if "file" in protocol:
659
+ if remote_destination is None:
660
+ path = str(Path(local_path).absolute())
661
+ else:
662
+ # Otherwise, actually make a copy of the file
663
+ import shutil
664
+
665
+ if hash_method_obj:
666
+ # For hash computation, we need to read and write manually
667
+ with open(local_path, "rb") as src:
668
+ with open(remote_path, "wb") as dst:
669
+ dst_wrapper = HashingWriter(dst, accumulator=hash_method_obj)
670
+ dst_wrapper.write(src.read())
671
+ hash_value = dst_wrapper.result()
672
+ dst_wrapper.close()
673
+ else:
674
+ shutil.copy2(local_path, remote_path)
675
+ path = str(Path(remote_path).absolute())
676
+ else:
677
+ # Otherwise upload to remote using sync storage layer
678
+ fs = storage.get_underlying_filesystem(path=remote_path)
679
+
680
+ if hash_method_obj:
681
+ # We can skip the wrapper if the hash method is just a precomputed value
682
+ if not isinstance(hash_method_obj, PrecomputedValue):
683
+ with open(local_path, "rb") as src:
684
+ # For sync operations, we need to compute hash manually
685
+ data = src.read()
686
+ hash_method_obj.update(memoryview(data))
687
+ hash_value = hash_method_obj.result()
688
+
689
+ # Now write the data to remote
690
+ with fs.open(remote_path, "wb") as dst:
691
+ dst.write(data)
692
+ path = remote_path
693
+ else:
694
+ # Use sync file operations
695
+ with open(local_path, "rb") as src:
696
+ with fs.open(remote_path, "wb") as dst:
697
+ dst.write(src.read())
698
+ path = remote_path
699
+ hash_value = hash_method_obj.result()
700
+ else:
701
+ # Simple sync copy
702
+ with open(local_path, "rb") as src:
703
+ with fs.open(remote_path, "wb") as dst:
704
+ dst.write(src.read())
705
+ path = remote_path
706
+
707
+ f = cls(path=path, name=filename, hash_method=hash_method_obj, hash=hash_value)
708
+ return f
709
+
710
+ @classmethod
711
+ @requires_initialization
712
+ async def from_local(
713
+ cls,
714
+ local_path: Union[str, Path],
715
+ remote_destination: Optional[str] = None,
716
+ hash_method: Optional[HashMethod | str] = None,
717
+ ) -> File[T]:
718
+ """
719
+ Asynchronously create a new File object from a local file by uploading it to remote storage.
720
+
721
+ Use this in async tasks when you have a local file that needs to be uploaded to remote storage.
722
+
723
+ Example (Async):
724
+
725
+ ```python
726
+ @env.task
727
+ async def upload_local_file() -> File:
728
+ # Create a local file
729
+ async with aiofiles.open("/tmp/data.csv", "w") as f:
730
+ await f.write("col1,col2\n1,2\n3,4\n")
731
+
732
+ # Upload to remote storage
733
+ remote_file = await File.from_local("/tmp/data.csv")
734
+ return remote_file
735
+ ```
736
+
737
+ Example (With specific destination):
738
+
739
+ ```python
740
+ @env.task
741
+ async def upload_to_specific_path() -> File:
742
+ remote_file = await File.from_local("/tmp/data.csv", "s3://my-bucket/data.csv")
743
+ return remote_file
744
+ ```
745
+
746
+ Args:
747
+ local_path: Path to the local file
748
+ remote_destination: Optional remote path to store the file. If None, a path will be automatically generated.
749
+ hash_method: Optional HashMethod or string to use for cache key computation. If a string is provided,
750
+ it will be used as a precomputed cache key. If a HashMethod is provided, it will compute
751
+ the hash during upload. If not specified, the cache key will be based on file attributes.
752
+
753
+ Returns:
754
+ A new File instance pointing to the uploaded remote file
395
755
  """
396
756
  if not os.path.exists(local_path):
397
757
  raise ValueError(f"File not found: {local_path}")