flytekitplugins-async-fsspec 1.10.3b5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flytekitplugins-async-fsspec-1.10.3b5/PKG-INFO +21 -0
- flytekitplugins-async-fsspec-1.10.3b5/README.md +14 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins/async_fsspec/__init__.py +16 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins/async_fsspec/s3fs/__init__.py +0 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins/async_fsspec/s3fs/constants.py +6 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins/async_fsspec/s3fs/s3fs.py +240 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/PKG-INFO +21 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/SOURCES.txt +14 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/dependency_links.txt +1 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/entry_points.txt +2 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/namespace_packages.txt +1 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/requires.txt +1 -0
- flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/top_level.txt +1 -0
- flytekitplugins-async-fsspec-1.10.3b5/setup.cfg +4 -0
- flytekitplugins-async-fsspec-1.10.3b5/setup.py +37 -0
- flytekitplugins-async-fsspec-1.10.3b5/tests/test_s3fs.py +201 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: flytekitplugins-async-fsspec
|
|
3
|
+
Version: 1.10.3b5
|
|
4
|
+
Summary: This package holds the data persistence plugins for flytekit
|
|
5
|
+
Author: flyteorg
|
|
6
|
+
Author-email: admin@flyte.org
|
|
7
|
+
License: apache2
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Requires-Dist: flytekit
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Flytekit Async fsspec Plugin
|
|
2
|
+
|
|
3
|
+
The Flyte async fsspec plugin is a powerful addition to the Flyte ecosystem designed to optimize the performance of object transmission. This plugin focuses on overriding key methods of the file systems in fsspec to introduce efficiency improvements, resulting in accelerated data transfers between Flyte workflows and object storage.
|
|
4
|
+
|
|
5
|
+
Currently, the async fsspec plugin improves the following file systems:
|
|
6
|
+
1. s3fs
|
|
7
|
+
|
|
8
|
+
To install the plugin, run the following command:
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install flytekitplugins-async-fsspec
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Once installed, the plugin automatically registers its optimized implementation for the `s3` protocol with fsspec in place of the original file system, seamlessly integrating with your Flyte workflows.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
.. currentmodule:: flytekitplugins.async_fsspec
|
|
3
|
+
|
|
4
|
+
This package provides fsspec file system implementations that transfer objects concurrently, and registers them with fsspec when it is imported.
|
|
5
|
+
|
|
6
|
+
.. autosummary::
|
|
7
|
+
:template: custom.rst
|
|
8
|
+
:toctree: generated/
|
|
9
|
+
|
|
10
|
+
AsyncS3FileSystem
|
|
11
|
+
"""
|
|
12
|
+
import fsspec

from .s3fs.s3fs import AsyncS3FileSystem

# Replace whatever implementation fsspec currently has for the "s3" protocol.
# clobber=True is required for this to be a real override: without it,
# register_implementation raises ValueError when a different class is already
# present in fsspec's registry for "s3" (e.g. because an s3 file system was
# resolved before this plugin was imported).
fsspec.register_implementation("s3", AsyncS3FileSystem, clobber=True)
|
|
File without changes
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import mimetypes
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from fsspec.callbacks import _DEFAULT_CALLBACK
|
|
6
|
+
from s3fs import S3FileSystem
|
|
7
|
+
from s3fs.core import S3_RETRYABLE_ERRORS, version_id_kw
|
|
8
|
+
|
|
9
|
+
from .constants import (
|
|
10
|
+
DEFAULT_CONCURRENT_DOWNLOAD,
|
|
11
|
+
DEFAULT_CONCURRENT_UPLOAD,
|
|
12
|
+
DEFAULT_DOWNLOAD_BODY_READ_SIZE,
|
|
13
|
+
DEFAULT_DOWNLOAD_CHUNK_SIZE,
|
|
14
|
+
DEFAULT_UPLOAD_CHUNK_SIZE,
|
|
15
|
+
SINGLE_OBJECT_UPLOAD_LIMIT,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AsyncS3FileSystem(S3FileSystem):
    """S3 file system that transfers a single file as concurrently processed chunks.

    Overrides ``_put_file`` and ``_get_file`` of ``s3fs.S3FileSystem``; every
    other operation is inherited unchanged.
    """

    def __init__(self, **s3kwargs):
        """Forward all keyword arguments to ``S3FileSystem``."""
        super().__init__(**s3kwargs)

    async def _put_file(
        self,
        lpath,
        rpath,
        callback=_DEFAULT_CALLBACK,
        chunksize=DEFAULT_UPLOAD_CHUNK_SIZE,
        concurrent_upload=DEFAULT_CONCURRENT_UPLOAD,
        **kwargs,
    ):
        """
        Put a file from lpath to rpath.

        Files smaller than ``min(SINGLE_OBJECT_UPLOAD_LIMIT, 2 * chunksize)`` are sent
        with a single ``put_object`` call. Larger files use a multipart upload in which
        at most ``concurrent_upload`` parts are in flight at any time. If the multipart
        upload fails, outstanding part uploads are cancelled and the upload is aborted
        (best effort) before the error is re-raised.

        Args:
            lpath (str): The local path of the file to be uploaded.
            rpath (str): The remote path which the file should be uploaded to.
            callback (function, optional): The callback function.
            chunksize (int, optional): Upload chunksize. Defaults to DEFAULT_UPLOAD_CHUNK_SIZE.
            concurrent_upload (int, optional): The number of concurrent uploads when using
                multipart upload. Defaults to DEFAULT_CONCURRENT_UPLOAD.
            **kwargs: Extra arguments forwarded to ``put_object`` / ``create_multipart_upload``.
                ``ContentType`` is guessed from ``lpath`` when not supplied.
        """
        bucket, key, _ = self.split_path(rpath)
        if os.path.isdir(lpath):
            if key:
                # don't make remote "directory"
                return
            else:
                await self._mkdir(lpath)
        size = os.path.getsize(lpath)
        callback.set_size(size)

        if "ContentType" not in kwargs:
            content_type, _ = mimetypes.guess_type(lpath)
            if content_type is not None:
                kwargs["ContentType"] = content_type

        with open(lpath, "rb") as f0:
            if size < min(SINGLE_OBJECT_UPLOAD_LIMIT, 2 * chunksize):
                chunk = f0.read()
                await self._call_s3("put_object", Bucket=bucket, Key=key, Body=chunk, **kwargs)
                callback.relative_update(size)
            else:
                mpu = await self._call_s3("create_multipart_upload", Bucket=bucket, Key=key, **kwargs)

                # async function to upload a single chunk
                async def upload_chunk(chunk, part_number):
                    result = await self._call_s3(
                        "upload_part",
                        Bucket=bucket,
                        PartNumber=part_number,
                        UploadId=mpu["UploadId"],
                        Body=chunk,
                        Key=key,
                    )
                    callback.relative_update(len(chunk))
                    return {"PartNumber": part_number, "ETag": result["ETag"]}

                tasks = set()
                part_number = 1
                parts = []
                try:
                    read_end = False
                    while True:
                        # Keep the window full: read and schedule parts until
                        # `concurrent_upload` uploads are in flight or the file is exhausted.
                        while len(tasks) < concurrent_upload:
                            chunk = f0.read(chunksize)
                            if not chunk:
                                read_end = True
                                break
                            tasks.add(asyncio.create_task(upload_chunk(chunk, part_number)))
                            part_number += 1
                        if read_end:
                            break
                        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        # Rebind before collecting results so that, if a finished part
                        # raised, `tasks` holds exactly the uploads still to be cancelled.
                        tasks = pending
                        parts.extend(task.result() for task in done)

                    parts.extend(await asyncio.gather(*tasks))
                    # Parts finish in arbitrary order; the completion request lists
                    # them sorted by part number.
                    parts.sort(key=lambda part: part["PartNumber"])
                    await self._call_s3(
                        "complete_multipart_upload",
                        Bucket=bucket,
                        Key=key,
                        UploadId=mpu["UploadId"],
                        MultipartUpload={"Parts": parts},
                    )
                except BaseException:
                    # Stop in-flight part uploads and wait for them to settle before
                    # the local file is closed.
                    for task in tasks:
                        task.cancel()
                    await asyncio.gather(*tasks, return_exceptions=True)
                    try:
                        await self._call_s3(
                            "abort_multipart_upload",
                            Bucket=bucket,
                            Key=key,
                            UploadId=mpu["UploadId"],
                        )
                    except Exception:
                        # Best-effort cleanup: the original failure is the one
                        # the caller needs to see.
                        pass
                    raise
        # Invalidate cached listings for the target and each of its parents.
        while rpath:
            self.invalidate_cache(rpath)
            rpath = self._parent(rpath)

    async def _get_file(
        self,
        rpath,
        lpath,
        callback=_DEFAULT_CALLBACK,
        version_id=None,
        chunksize=DEFAULT_DOWNLOAD_CHUNK_SIZE,
        concurrent_download=DEFAULT_CONCURRENT_DOWNLOAD,
    ):
        """
        Get a file from rpath to lpath.

        When the remote size is known, the object is fetched as ranged ``get_object``
        requests of ``chunksize`` bytes, with at most ``concurrent_download`` requests in
        flight, each writing at its own offset of the local file. When the size is
        unknown, the object is streamed sequentially. A failure in any chunk cancels the
        remaining downloads and is raised to the caller.

        Args:
            rpath (str): The remote path of the file to be downloaded.
            lpath (str): The local path which the file should be downloaded to.
            callback (function, optional): The callback function.
            version_id (str, optional): The version id of the file. Defaults to None.
            chunksize (int, optional): Download chunksize. Defaults to DEFAULT_DOWNLOAD_CHUNK_SIZE.
            concurrent_download (int, optional): The number of concurrent ranged downloads.
                Defaults to DEFAULT_CONCURRENT_DOWNLOAD.
        """
        if os.path.isdir(lpath):
            return

        # get the file size
        file_info = await self._info(path=rpath, version_id=version_id)
        file_size = file_info["size"]

        bucket, key, vers = self.split_path(rpath)

        # the async function to get a range of the remote file
        async def _open_file(start_byte: int, end_byte: int = None):
            kw = self.req_kw.copy()
            # Compare against None: an inclusive end offset of 0 (a one-byte
            # object) is a valid bound and must not be treated as "no end".
            if end_byte is not None:
                kw["Range"] = f"bytes={start_byte}-{end_byte}"
            else:
                # NOTE(review): an open-ended HTTP byte range is normally written
                # "bytes=N-". The existing "bytes=N" form is kept because the tests
                # in this package assert it; confirm how S3 treats it, especially
                # for retries that restart at a non-zero offset.
                kw["Range"] = f"bytes={start_byte}"
            resp = await self._call_s3(
                "get_object",
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id or vers),
                **kw,
            )
            return resp["Body"], resp.get("ContentLength", None)

        # Refer to s3fs's implementation
        async def handle_read_error(body, failed_reads, restart_byte, end_byte=None):
            # Called from inside an `except` block, so the bare `raise` re-raises
            # the read error currently being handled once retries are exhausted.
            if failed_reads >= self.retries:
                raise
            try:
                body.close()
            except Exception:
                pass

            # Back off, growing with each failure and capped at 15 seconds.
            await asyncio.sleep(min(1.7**failed_reads * 0.1, 15))
            body, _ = await _open_file(restart_byte, end_byte)
            return body

        # According to s3fs documentation, some file systems might not be able to measure the file's size,
        # in which case, the returned dict will include 'size': None. When we cannot get the file size
        # in advance, we keep using the original implementation of s3fs.
        if file_size is None:
            # From s3fs
            body, content_length = await _open_file(start_byte=0)
            callback.set_size(content_length)

            failed_reads = 0
            bytes_read = 0

            try:
                with open(lpath, "wb") as f0:
                    while True:
                        try:
                            chunk = await body.read(DEFAULT_DOWNLOAD_BODY_READ_SIZE)
                        except S3_RETRYABLE_ERRORS:
                            failed_reads += 1
                            body = await handle_read_error(body, failed_reads, bytes_read)
                            continue

                        if not chunk:
                            break

                        f0.write(chunk)
                        bytes_read += len(chunk)
                        callback.relative_update(len(chunk))
            finally:
                try:
                    body.close()
                except Exception:
                    pass
        else:
            callback.set_size(file_size)
            with open(lpath, "wb") as f0:
                # async function to download a single chunk
                async def download_chunk(chunk_index: int):
                    start_byte = chunk_index * chunksize
                    # Range end offsets are inclusive, hence the -1.
                    end_byte = min(start_byte + chunksize, file_size) - 1
                    body, _ = await _open_file(start_byte, end_byte)
                    failed_reads = 0
                    bytes_read = 0
                    try:
                        while True:
                            try:
                                chunk = await body.read(DEFAULT_DOWNLOAD_BODY_READ_SIZE)
                            except S3_RETRYABLE_ERRORS:
                                failed_reads += 1
                                body = await handle_read_error(body, failed_reads, start_byte + bytes_read, end_byte)
                                continue

                            if not chunk:
                                break

                            # seek + write run without an intervening await, so
                            # chunk tasks sharing this file object cannot interleave here.
                            f0.seek(start_byte + bytes_read)
                            f0.write(chunk)
                            bytes_read += len(chunk)
                            callback.relative_update(len(chunk))
                    finally:
                        try:
                            body.close()
                        except Exception:
                            pass

                chunk_count = file_size // chunksize
                if file_size % chunksize > 0:
                    chunk_count += 1

                tasks = set()
                current_chunk = 0
                try:
                    while current_chunk < chunk_count:
                        while current_chunk < chunk_count and len(tasks) < concurrent_download:
                            tasks.add(asyncio.create_task(download_chunk(current_chunk)))
                            current_chunk += 1
                        done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        # Retrieve each finished task's result so a failed chunk raises
                        # here instead of being dropped and leaving a corrupt file.
                        for task in done:
                            task.result()
                    await asyncio.gather(*tasks)
                except BaseException:
                    # Stop the remaining downloads and let them settle while the
                    # local file is still open, then surface the failure.
                    for task in tasks:
                        task.cancel()
                    await asyncio.gather(*tasks, return_exceptions=True)
                    raise
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: flytekitplugins-async-fsspec
|
|
3
|
+
Version: 1.10.3b5
|
|
4
|
+
Summary: This package holds the data persistence plugins for flytekit
|
|
5
|
+
Author: flyteorg
|
|
6
|
+
Author-email: admin@flyte.org
|
|
7
|
+
License: apache2
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Requires-Dist: flytekit
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
flytekitplugins/async_fsspec/__init__.py
|
|
4
|
+
flytekitplugins/async_fsspec/s3fs/__init__.py
|
|
5
|
+
flytekitplugins/async_fsspec/s3fs/constants.py
|
|
6
|
+
flytekitplugins/async_fsspec/s3fs/s3fs.py
|
|
7
|
+
flytekitplugins_async_fsspec.egg-info/PKG-INFO
|
|
8
|
+
flytekitplugins_async_fsspec.egg-info/SOURCES.txt
|
|
9
|
+
flytekitplugins_async_fsspec.egg-info/dependency_links.txt
|
|
10
|
+
flytekitplugins_async_fsspec.egg-info/entry_points.txt
|
|
11
|
+
flytekitplugins_async_fsspec.egg-info/namespace_packages.txt
|
|
12
|
+
flytekitplugins_async_fsspec.egg-info/requires.txt
|
|
13
|
+
flytekitplugins_async_fsspec.egg-info/top_level.txt
|
|
14
|
+
tests/test_s3fs.py
|
flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
flytekitplugins-async-fsspec-1.10.3b5/flytekitplugins_async_fsspec.egg-info/namespace_packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
flytekitplugins
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
flytekit
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
flytekitplugins
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from setuptools import setup

# Short plugin name; used to build the package paths and the entry point below.
PLUGIN_NAME = "async_fsspec"

# Distribution name as published to the package index.
microlib_name = "flytekitplugins-async-fsspec"

# Runtime dependencies declared for the distribution.
plugin_requires = ["flytekit"]

__version__ = "1.10.3b5"

setup(
    name=microlib_name,
    version=__version__,
    author="flyteorg",
    author_email="admin@flyte.org",
    description="This package holds the data persistence plugins for flytekit",
    namespace_packages=["flytekitplugins"],
    # The plugin package and its s3fs subpackage are listed explicitly.
    packages=[f"flytekitplugins.{PLUGIN_NAME}", f"flytekitplugins.{PLUGIN_NAME}.s3fs"],
    install_requires=plugin_requires,
    license="apache2",
    python_requires=">=3.8",
    classifiers=[
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    # Advertises the package under the "flytekit.plugins" entry-point group.
    entry_points={"flytekit.plugins": [f"{PLUGIN_NAME}=flytekitplugins.{PLUGIN_NAME}"]},
)
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from unittest import mock
|
|
3
|
+
from unittest.mock import MagicMock, mock_open
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from flytekitplugins.async_fsspec import AsyncS3FileSystem
|
|
7
|
+
from flytekitplugins.async_fsspec.s3fs.constants import DEFAULT_DOWNLOAD_CHUNK_SIZE, DEFAULT_UPLOAD_CHUNK_SIZE
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._parent")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem.invalidate_cache")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("mimetypes.guess_type")
@mock.patch("os.path.getsize")
@pytest.mark.asyncio
async def test_put_file_single_object_upload(
    mock_getsize, mock_guess_type, mock_call_s3, mock_invalidate_cache, mock_parent
):
    """A 32MB file is expected to be uploaded with exactly one ``put_object`` call."""
    # Patch decorators apply bottom-up, so the mock arguments arrive in the
    # reverse of the decorator order above.
    mock_bucket = "mock-bucket"
    mock_file_name = "mock_file_name"
    mock_file_size = 32 * 2**20  # 32MB
    mock_getsize.return_value = mock_file_size
    # No guessed content type, so no ContentType kwarg is added to the request.
    mock_guess_type.return_value = (None, None)
    # A falsy parent ends the cache-invalidation loop after one iteration.
    mock_parent.return_value = None
    mock_body = os.urandom(mock_file_size)
    m = mock_open(read_data=mock_body)

    with mock.patch("builtins.open", m):
        asyncs3fs = AsyncS3FileSystem()
        await asyncs3fs._put_file(lpath=f"/{mock_file_name}", rpath=f"s3://{mock_bucket}/{mock_file_name}")

    mock_call_s3.assert_called_once_with("put_object", Bucket=mock_bucket, Key=mock_file_name, Body=mock_body)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._parent")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem.invalidate_cache")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("mimetypes.guess_type")
@mock.patch("os.path.getsize")
@pytest.mark.asyncio
async def test_put_file_multipart_upload(
    mock_getsize, mock_guess_type, mock_call_s3, mock_invalidate_cache, mock_parent
):
    """A 256MB file is expected to go through create / upload_part / complete multipart calls."""
    mock_bucket = "mock-bucket"
    mock_file_name = "mock_file_name"
    mock_upload_id = "mock_upload_id"
    mock_ETag = "mock_ETag"
    mock_file_size = 256 * 2**20  # 256MB
    mock_getsize.return_value = mock_file_size
    mock_guess_type.return_value = (None, None)

    # Fake S3 responses keyed on the operation name passed as the first positional argument.
    def call_s3_side_effect(*args, **kwargs):
        if args[0] == "create_multipart_upload":
            return {"UploadId": mock_upload_id}
        elif args[0] == "upload_part":
            part_number = kwargs["PartNumber"]
            # The ETag embeds the part number so the completion payload can be checked.
            return {"ETag": f"{mock_ETag}{part_number}"}
        elif args[0] == "complete_multipart_upload":
            return None

    mock_call_s3.side_effect = call_s3_side_effect

    mock_parent.return_value = None

    mock_body = os.urandom(mock_file_size)
    m = mock_open(read_data=mock_body)

    with mock.patch("builtins.open", m):
        asyncs3fs = AsyncS3FileSystem()
        await asyncs3fs._put_file(lpath=f"/{mock_file_name}", rpath=f"s3://{mock_bucket}/{mock_file_name}")

    # Expected number of parts: ceil(file size / upload chunk size).
    mock_chunk_count = mock_file_size // DEFAULT_UPLOAD_CHUNK_SIZE
    if mock_file_size % DEFAULT_UPLOAD_CHUNK_SIZE > 0:
        mock_chunk_count += 1
    # Despite the name, this list holds the expected "upload_part" calls.
    put_object_calls = []
    for i in range(mock_chunk_count):
        part_number = i + 1
        start_byte = i * DEFAULT_UPLOAD_CHUNK_SIZE
        end_byte = min(start_byte + DEFAULT_UPLOAD_CHUNK_SIZE, mock_file_size)
        body = mock_body[start_byte:end_byte]
        put_object_calls.append(
            mock.call(
                "upload_part",
                Bucket=mock_bucket,
                Key=mock_file_name,
                PartNumber=part_number,
                UploadId=mock_upload_id,
                Body=body,
            ),
        )

    # any_order=True: the order of the recorded calls is not asserted.
    mock_call_s3.assert_has_calls(
        put_object_calls
        + [
            mock.call("create_multipart_upload", Bucket=mock_bucket, Key=mock_file_name),
            mock.call(
                "complete_multipart_upload",
                Bucket=mock_bucket,
                Key=mock_file_name,
                UploadId=mock_upload_id,
                MultipartUpload={
                    "Parts": [{"PartNumber": i, "ETag": f"{mock_ETag}{i}"} for i in range(1, mock_chunk_count + 1)]
                },
            ),
        ],
        any_order=True,
    )
    # One create + one complete + one upload_part per chunk, and nothing else.
    assert mock_call_s3.call_count == 2 + mock_chunk_count
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._info")
@mock.patch("os.path.isdir")
@pytest.mark.asyncio
async def test_get_file_file_size_is_none(mock_isdir, mock_info, mock_call_s3):
    """When ``_info`` reports ``size: None``, the file is fetched with a single ``get_object`` call."""
    mock_bucket = "mock-bucket"
    mock_file_name = "mock_file_name"
    mock_file_size = 32 * 2**20  # 32MB
    mock_isdir.return_value = False
    mock_info.return_value = {"size": None}

    # Running total of bytes handed out by the fake response body.
    file_been_read = 0

    # Fake body.read(size): returns random bytes until mock_file_size bytes
    # have been served, then None to signal end of stream.
    async def read_side_effect(*args, **kwargs):
        read_size = args[0]
        nonlocal file_been_read
        real_read_size = min(read_size, mock_file_size - file_been_read)
        if real_read_size == 0:
            return None
        file_been_read += real_read_size
        return os.urandom(real_read_size)

    mock_chunk = MagicMock()
    mock_chunk.read.side_effect = read_side_effect
    mock_call_s3.return_value = {"Body": mock_chunk, "ContentLength": mock_file_size}

    m = mock_open()

    with mock.patch("builtins.open", m):
        asyncs3fs = AsyncS3FileSystem()
        await asyncs3fs._get_file(lpath=f"/{mock_file_name}", rpath=f"s3://{mock_bucket}/{mock_file_name}")

    # The whole body must have been consumed through one request starting at byte 0.
    assert file_been_read == mock_file_size
    mock_call_s3.assert_called_once_with("get_object", Bucket=mock_bucket, Key=mock_file_name, Range="bytes=0")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._info")
@mock.patch("os.path.isdir")
@pytest.mark.asyncio
async def test_get_file_file_size_is_not_none(mock_isdir, mock_info, mock_call_s3):
    """When the size is known, the file is fetched as one ranged ``get_object`` per chunk."""
    mock_bucket = "mock-bucket"
    mock_file_name = "mock_file_name"
    mock_file_size = 256 * 2**20  # 256MB
    mock_isdir.return_value = False
    mock_info.return_value = {"size": mock_file_size}

    # Running total of bytes handed out across all fake response bodies.
    file_been_read = 0

    # Fake get_object: parses the inclusive "bytes=start-end" range and returns
    # a body that serves exactly that many bytes.
    def call_s3_side_effect(*args, **kwargs):
        start_byte, end_byte = kwargs["Range"][6:].split("-")
        start_byte, end_byte = int(start_byte), int(end_byte)
        chunk_size = end_byte - start_byte + 1
        chunk_been_read = 0

        # Fake body.read(size): random bytes until the range is exhausted, then None.
        async def read_side_effect(*args, **kwargs):
            nonlocal chunk_been_read
            nonlocal file_been_read
            read_size = args[0]
            real_read_size = min(read_size, chunk_size - chunk_been_read)
            if real_read_size == 0:
                return None
            chunk_been_read += real_read_size
            file_been_read += real_read_size
            return os.urandom(real_read_size)

        mock_chunk = MagicMock()
        mock_chunk.read.side_effect = read_side_effect
        return {"Body": mock_chunk, "ContentLength": chunk_size}

    mock_call_s3.side_effect = call_s3_side_effect

    m = mock_open()
    with mock.patch("builtins.open", m):
        asyncs3fs = AsyncS3FileSystem()
        await asyncs3fs._get_file(lpath=f"/{mock_file_name}", rpath=f"s3://{mock_bucket}/{mock_file_name}")

    assert file_been_read == mock_file_size

    # Expected number of ranged requests: ceil(file size / download chunk size).
    mock_chunk_count = mock_file_size // DEFAULT_DOWNLOAD_CHUNK_SIZE
    if mock_file_size % DEFAULT_DOWNLOAD_CHUNK_SIZE > 0:
        mock_chunk_count += 1
    get_object_calls = []
    for i in range(mock_chunk_count):
        start_byte = i * DEFAULT_DOWNLOAD_CHUNK_SIZE
        end_byte = min(start_byte + DEFAULT_DOWNLOAD_CHUNK_SIZE, mock_file_size) - 1
        get_object_calls.append(
            mock.call("get_object", Bucket=mock_bucket, Key=mock_file_name, Range=f"bytes={start_byte}-{end_byte}")
        )
    # The ranged requests are issued from concurrently scheduled tasks, so their
    # relative order is a scheduling detail; assert the set of calls, not the
    # sequence (matching the multipart upload test).
    mock_call_s3.assert_has_calls(get_object_calls, any_order=True)
    assert mock_call_s3.call_count == len(get_object_calls)
|