flytekitplugins-async-fsspec 1.10.3b5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: flytekitplugins-async-fsspec
3
+ Version: 1.10.3b5
4
+ Summary: This package holds the data persistence plugins for flytekit
5
+ Author: flyteorg
6
+ Author-email: admin@flyte.org
7
+ License: apache2
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.8
21
+ Requires-Dist: flytekit
@@ -0,0 +1,14 @@
1
+ # Flytekit Async fsspec Plugin
2
+
3
+ The Flyte async fsspec plugin is a powerful addition to the Flyte ecosystem designed to optimize the performance of object transmission. This plugin focuses on overriding key methods of the file systems in fsspec to introduce efficiency improvements, resulting in accelerated data transfers between Flyte workflows and object storage.
4
+
5
+ Currently, the async fsspec plugin improves the following file systems:
6
+ 1. s3fs
7
+
8
+ To install the plugin, run the following command:
9
+
10
+ ```bash
11
+ pip install flytekitplugins-async-fsspec
12
+ ```
13
+
14
+ Once installed, the plugin automatically replaces the default fsspec implementation for the `s3` protocol with its optimized `AsyncS3FileSystem`, so it integrates with your existing Flyte workflows without further configuration.
@@ -0,0 +1,16 @@
1
+ """
2
+ .. currentmodule:: flytekitplugins.async_fsspec
3
+
4
+ This package provides an fsspec file system for S3 that speeds up Flytekit data transfers by uploading and downloading file chunks concurrently.
5
+
6
+ .. autosummary::
7
+ :template: custom.rst
8
+ :toctree: generated/
9
+
10
+ AsyncS3FileSystem
11
+ """
12
import fsspec

from .s3fs.s3fs import AsyncS3FileSystem

# Importing this package is what activates the plugin: the optimized class is
# installed as fsspec's implementation for the "s3" protocol.
# clobber=True is passed explicitly so the registration replaces whatever class
# is already registered for "s3" instead of raising ValueError on fsspec
# versions where clobber defaults to False.
fsspec.register_implementation("s3", AsyncS3FileSystem, clobber=True)
@@ -0,0 +1,6 @@
1
# Binary size units used to spell out the byte counts below.
_MIB = 2**20
_GIB = 2**30

# Upload tuning: size of each multipart part and how many parts are in flight.
DEFAULT_UPLOAD_CHUNK_SIZE = 50 * _MIB  # 50MB
DEFAULT_CONCURRENT_UPLOAD = 4
# Upper bound used when deciding between a single put_object and a multipart upload.
SINGLE_OBJECT_UPLOAD_LIMIT = 5 * _GIB  # 5GB

# Download tuning: size of each ranged request and how many run at once.
DEFAULT_DOWNLOAD_CHUNK_SIZE = 50 * _MIB  # 50MB
DEFAULT_CONCURRENT_DOWNLOAD = 4
# Number of bytes requested per read() on a response body.
DEFAULT_DOWNLOAD_BODY_READ_SIZE = 2**16  # from s3fs
@@ -0,0 +1,240 @@
1
+ import asyncio
2
+ import mimetypes
3
+ import os
4
+
5
+ from fsspec.callbacks import _DEFAULT_CALLBACK
6
+ from s3fs import S3FileSystem
7
+ from s3fs.core import S3_RETRYABLE_ERRORS, version_id_kw
8
+
9
+ from .constants import (
10
+ DEFAULT_CONCURRENT_DOWNLOAD,
11
+ DEFAULT_CONCURRENT_UPLOAD,
12
+ DEFAULT_DOWNLOAD_BODY_READ_SIZE,
13
+ DEFAULT_DOWNLOAD_CHUNK_SIZE,
14
+ DEFAULT_UPLOAD_CHUNK_SIZE,
15
+ SINGLE_OBJECT_UPLOAD_LIMIT,
16
+ )
17
+
18
+
19
class AsyncS3FileSystem(S3FileSystem):
    """``S3FileSystem`` whose single-file transfers run as concurrent chunked requests.

    ``_put_file`` uploads large files as a multipart upload with several parts
    in flight at once; ``_get_file`` downloads objects of known size as several
    concurrent ranged ``get_object`` requests.
    """

    def __init__(self, **s3kwargs):
        """Forward all keyword arguments unchanged to ``S3FileSystem``."""
        super().__init__(**s3kwargs)

    @staticmethod
    def _close_body_quietly(body):
        """Close a response body, ignoring any error (best-effort cleanup)."""
        try:
            body.close()
        except Exception:
            # Deliberately swallowed: a failure to close must not mask the
            # outcome of the transfer itself.
            pass

    @staticmethod
    async def _cancel_and_reap(tasks):
        """Cancel every task in ``tasks`` and wait until all of them have finished."""
        for task in tasks:
            task.cancel()
        # return_exceptions=True: we only want the tasks to be finished, their
        # individual outcomes are irrelevant at this point.
        await asyncio.gather(*tasks, return_exceptions=True)

    async def _put_file(
        self,
        lpath,
        rpath,
        callback=_DEFAULT_CALLBACK,
        chunksize=DEFAULT_UPLOAD_CHUNK_SIZE,
        concurrent_upload=DEFAULT_CONCURRENT_UPLOAD,
        **kwargs,
    ):
        """
        Put a file from lpath to rpath.

        Files smaller than ``min(SINGLE_OBJECT_UPLOAD_LIMIT, 2 * chunksize)`` are sent
        with one ``put_object`` call; larger files use a multipart upload with up to
        ``concurrent_upload`` parts in flight. If the multipart upload fails, the
        in-flight parts are cancelled and the upload is aborted before the error is
        re-raised.

        Args:
            lpath (str): The local path of the file to be uploaded.
            rpath (str): The remote path which the file should be uploaded to.
            callback (function, optional): The callback function.
            chunksize (int, optional): Upload chunksize. Defaults to 50 * 2**20 (50MB).
            concurrent_upload (int, optional): The number of concurrent upload when using multipart upload.
                Values below 1 are treated as 1. Defaults to 4.
            **kwargs: Extra arguments passed to ``put_object`` / ``create_multipart_upload``.
        """
        bucket, key, _ = self.split_path(rpath)
        if os.path.isdir(lpath):
            if key:
                # don't make remote "directory"
                return
            else:
                # NOTE(review): this passes the *local* path to _mkdir and then
                # falls through to open() on a directory; kept as-is - confirm intent.
                await self._mkdir(lpath)
        size = os.path.getsize(lpath)
        callback.set_size(size)

        if "ContentType" not in kwargs:
            content_type, _ = mimetypes.guess_type(lpath)
            if content_type is not None:
                kwargs["ContentType"] = content_type

        # asyncio.wait() raises on an empty set, so at least one part must be in flight.
        concurrent_upload = max(1, concurrent_upload)

        with open(lpath, "rb") as f0:
            if size < min(SINGLE_OBJECT_UPLOAD_LIMIT, 2 * chunksize):
                chunk = f0.read()
                await self._call_s3("put_object", Bucket=bucket, Key=key, Body=chunk, **kwargs)
                callback.relative_update(size)
            else:
                mpu = await self._call_s3("create_multipart_upload", Bucket=bucket, Key=key, **kwargs)
                upload_id = mpu["UploadId"]

                # async function to upload a single chunk
                async def upload_chunk(chunk, part_number):
                    result = await self._call_s3(
                        "upload_part",
                        Bucket=bucket,
                        PartNumber=part_number,
                        UploadId=upload_id,
                        Body=chunk,
                        Key=key,
                    )
                    callback.relative_update(len(chunk))
                    return {"PartNumber": part_number, "ETag": result["ETag"]}

                tasks = set()
                part_number = 1
                parts = []
                read_end = False
                try:
                    while True:
                        # Top up the window of in-flight parts.
                        while len(tasks) < concurrent_upload:
                            chunk = f0.read(chunksize)
                            if not chunk:
                                read_end = True
                                break
                            tasks.add(asyncio.create_task(upload_chunk(chunk, part_number)))
                            part_number += 1
                        if read_end:
                            break
                        done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        # result() re-raises the error of a failed part.
                        parts.extend(task.result() for task in done)

                    parts.extend(await asyncio.gather(*tasks))
                    # Parts finish in arbitrary order; S3 needs them in ascending order.
                    parts.sort(key=lambda part: part["PartNumber"])
                    await self._call_s3(
                        "complete_multipart_upload",
                        Bucket=bucket,
                        Key=key,
                        UploadId=upload_id,
                        MultipartUpload={"Parts": parts},
                    )
                except BaseException:
                    # Stop the remaining part uploads, then release the parts
                    # already stored so the failed upload is not left orphaned.
                    await self._cancel_and_reap(tasks)
                    try:
                        await self._call_s3(
                            "abort_multipart_upload",
                            Bucket=bucket,
                            Key=key,
                            UploadId=upload_id,
                        )
                    except Exception:
                        # Best effort: the original error is the one to report.
                        pass
                    raise
        # The new object changes the listing of every ancestor "directory".
        while rpath:
            self.invalidate_cache(rpath)
            rpath = self._parent(rpath)

    async def _get_file(
        self,
        rpath,
        lpath,
        callback=_DEFAULT_CALLBACK,
        version_id=None,
        chunksize=DEFAULT_DOWNLOAD_CHUNK_SIZE,
        concurrent_download=DEFAULT_CONCURRENT_DOWNLOAD,
    ):
        """
        Get a file from rpath to lpath.

        When the object's size is known it is fetched as ranged requests of
        ``chunksize`` bytes, up to ``concurrent_download`` at a time, each written at
        its own offset of the local file. When the size is unknown the object is
        streamed sequentially. A failure of any chunk cancels the remaining chunks and
        is re-raised.

        Args:
            rpath (str): The remote path of the file to be downloaded.
            lpath (str): The local path which the file should be downloaded to.
            callback (function, optional): The callback function.
            version_id (str, optional): The version id of the file. Defaults to None.
            chunksize (int, optional): Download chunksize. Defaults to 50 * 2**20 (50MB).
            concurrent_download (int, optional): The number of concurrent ranged downloads.
                Values below 1 are treated as 1. Defaults to 4.
        """
        if os.path.isdir(lpath):
            return

        # get the file size
        file_info = await self._info(path=rpath, version_id=version_id)
        file_size = file_info["size"]

        bucket, key, vers = self.split_path(rpath)

        # the async function to get a range of the remote file
        async def _open_file(start_byte, end_byte=None):
            kw = self.req_kw.copy()
            if end_byte is not None:
                # "is not None" so that an end offset of 0 (1-byte object) still
                # produces a bounded range.
                kw["Range"] = f"bytes={start_byte}-{end_byte}"
            elif start_byte:
                # Resuming an open-ended read: HTTP open-ended ranges are
                # written "bytes=N-" (trailing dash).
                kw["Range"] = f"bytes={start_byte}-"
            else:
                # Whole-object request; the header value is kept exactly as this
                # method has always sent it.
                # NOTE(review): "bytes=0" is not valid Range syntax ("bytes=0-");
                # the package test asserts this exact value - confirm before changing.
                kw["Range"] = f"bytes={start_byte}"
            resp = await self._call_s3(
                "get_object",
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id or vers),
                **kw,
            )
            return resp["Body"], resp.get("ContentLength", None)

        # Copy an open response body into f0, reopening the stream at the
        # current offset after a retryable read error (retry policy from s3fs).
        async def _copy_body(body, f0, start_byte, end_byte=None, seek=False):
            failed_reads = 0
            bytes_read = 0
            try:
                while True:
                    try:
                        chunk = await body.read(DEFAULT_DOWNLOAD_BODY_READ_SIZE)
                    except S3_RETRYABLE_ERRORS:
                        failed_reads += 1
                        if failed_reads >= self.retries:
                            raise
                        self._close_body_quietly(body)
                        # Exponential backoff, capped at 15 seconds.
                        await asyncio.sleep(min(1.7**failed_reads * 0.1, 15))
                        body, _ = await _open_file(start_byte + bytes_read, end_byte)
                        continue

                    if not chunk:
                        break

                    if seek:
                        # Other chunks write to the same file object, so the
                        # position must be set before every write.
                        f0.seek(start_byte + bytes_read)
                    f0.write(chunk)
                    bytes_read += len(chunk)
                    callback.relative_update(len(chunk))
            finally:
                self._close_body_quietly(body)

        # According to s3fs documentation, some file systems might not be able to measure the file's size,
        # in which case, the returned dict will include 'size': None. When we cannot get the file size
        # in advance, we keep using the original implementation of s3fs.
        if file_size is None:
            # From s3fs
            body, content_length = await _open_file(start_byte=0)
            callback.set_size(content_length)
            try:
                with open(lpath, "wb") as f0:
                    await _copy_body(body, f0, 0)
            finally:
                # Covers the case where opening the local file fails before
                # _copy_body takes ownership of the body.
                self._close_body_quietly(body)
        else:
            callback.set_size(file_size)
            # asyncio.wait() raises on an empty set, so at least one chunk must be in flight.
            concurrent_download = max(1, concurrent_download)
            chunk_count = (file_size + chunksize - 1) // chunksize

            with open(lpath, "wb") as f0:
                # async function to download a single chunk
                async def download_chunk(chunk_index):
                    start_byte = chunk_index * chunksize
                    # Range end offsets are inclusive.
                    end_byte = min(start_byte + chunksize, file_size) - 1
                    body, _ = await _open_file(start_byte, end_byte)
                    await _copy_body(body, f0, start_byte, end_byte, seek=True)

                tasks = set()
                current_chunk = 0
                try:
                    while current_chunk < chunk_count:
                        while current_chunk < chunk_count and len(tasks) < concurrent_download:
                            tasks.add(asyncio.create_task(download_chunk(current_chunk)))
                            current_chunk += 1
                        done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        for task in done:
                            # Re-raise the error of a failed chunk instead of
                            # reporting a truncated file as success.
                            task.result()
                    await asyncio.gather(*tasks)
                except BaseException:
                    # No task may keep writing once the local file is closed.
                    await self._cancel_and_reap(tasks)
                    raise
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: flytekitplugins-async-fsspec
3
+ Version: 1.10.3b5
4
+ Summary: This package holds the data persistence plugins for flytekit
5
+ Author: flyteorg
6
+ Author-email: admin@flyte.org
7
+ License: apache2
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.8
21
+ Requires-Dist: flytekit
@@ -0,0 +1,14 @@
1
+ README.md
2
+ setup.py
3
+ flytekitplugins/async_fsspec/__init__.py
4
+ flytekitplugins/async_fsspec/s3fs/__init__.py
5
+ flytekitplugins/async_fsspec/s3fs/constants.py
6
+ flytekitplugins/async_fsspec/s3fs/s3fs.py
7
+ flytekitplugins_async_fsspec.egg-info/PKG-INFO
8
+ flytekitplugins_async_fsspec.egg-info/SOURCES.txt
9
+ flytekitplugins_async_fsspec.egg-info/dependency_links.txt
10
+ flytekitplugins_async_fsspec.egg-info/entry_points.txt
11
+ flytekitplugins_async_fsspec.egg-info/namespace_packages.txt
12
+ flytekitplugins_async_fsspec.egg-info/requires.txt
13
+ flytekitplugins_async_fsspec.egg-info/top_level.txt
14
+ tests/test_s3fs.py
@@ -0,0 +1,2 @@
1
+ [flytekit.plugins]
2
+ async_fsspec = flytekitplugins.async_fsspec
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,37 @@
1
from setuptools import setup

# Packaging script for the flytekit async fsspec plugin.

PLUGIN_NAME = "async_fsspec"

microlib_name = "flytekitplugins-async-fsspec"

plugin_requires = ["flytekit"]

__version__ = "1.10.3b5"

# Import path of the plugin package, shared by the package list and the entry point.
_plugin_package = f"flytekitplugins.{PLUGIN_NAME}"

_classifiers = [
    "Intended Audience :: Science/Research",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

setup(
    name=microlib_name,
    version=__version__,
    author="flyteorg",
    author_email="admin@flyte.org",
    description="This package holds the data persistence plugins for flytekit",
    namespace_packages=["flytekitplugins"],
    packages=[_plugin_package, _plugin_package + ".s3fs"],
    install_requires=plugin_requires,
    license="apache2",
    python_requires=">=3.8",
    classifiers=_classifiers,
    # Registers the package under the "flytekit.plugins" entry-point group.
    entry_points={"flytekit.plugins": [PLUGIN_NAME + "=" + _plugin_package]},
)
@@ -0,0 +1,201 @@
1
+ import os
2
+ from unittest import mock
3
+ from unittest.mock import MagicMock, mock_open
4
+
5
+ import pytest
6
+ from flytekitplugins.async_fsspec import AsyncS3FileSystem
7
+ from flytekitplugins.async_fsspec.s3fs.constants import DEFAULT_DOWNLOAD_CHUNK_SIZE, DEFAULT_UPLOAD_CHUNK_SIZE
8
+
9
+
10
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._parent")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem.invalidate_cache")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("mimetypes.guess_type")
@mock.patch("os.path.getsize")
@pytest.mark.asyncio
async def test_put_file_single_object_upload(
    mock_getsize, mock_guess_type, mock_call_s3, mock_invalidate_cache, mock_parent
):
    """A 32MB file is uploaded with exactly one ``put_object`` call carrying the whole body."""
    bucket = "mock-bucket"
    file_name = "mock_file_name"
    payload_size = 32 * 2**20  # 32MB
    payload = os.urandom(payload_size)

    # No guessed content type, so no ContentType argument is added to the call.
    mock_guess_type.return_value = (None, None)
    mock_getsize.return_value = payload_size
    # A None parent ends the cache-invalidation walk after the first path.
    mock_parent.return_value = None

    with mock.patch("builtins.open", mock_open(read_data=payload)):
        fs = AsyncS3FileSystem()
        await fs._put_file(lpath=f"/{file_name}", rpath=f"s3://{bucket}/{file_name}")

    mock_call_s3.assert_called_once_with("put_object", Bucket=bucket, Key=file_name, Body=payload)
33
+
34
+
35
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._parent")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem.invalidate_cache")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("mimetypes.guess_type")
@mock.patch("os.path.getsize")
@pytest.mark.asyncio
async def test_put_file_multipart_upload(
    mock_getsize, mock_guess_type, mock_call_s3, mock_invalidate_cache, mock_parent
):
    """A 256MB file is uploaded as a multipart upload: create, one ``upload_part`` per chunk, complete."""
    bucket = "mock-bucket"
    file_name = "mock_file_name"
    upload_id = "mock_upload_id"
    etag_prefix = "mock_ETag"
    payload_size = 256 * 2**20  # 256MB
    payload = os.urandom(payload_size)

    mock_getsize.return_value = payload_size
    mock_guess_type.return_value = (None, None)
    mock_parent.return_value = None

    # Stand-in for S3: answers each operation with the minimal response the
    # upload code reads from it.
    def fake_call_s3(*args, **kwargs):
        operation = args[0]
        if operation == "create_multipart_upload":
            return {"UploadId": upload_id}
        if operation == "upload_part":
            return {"ETag": f"{etag_prefix}{kwargs['PartNumber']}"}
        return None

    mock_call_s3.side_effect = fake_call_s3

    with mock.patch("builtins.open", mock_open(read_data=payload)):
        fs = AsyncS3FileSystem()
        await fs._put_file(lpath=f"/{file_name}", rpath=f"s3://{bucket}/{file_name}")

    # Number of parts: ceil(payload_size / chunk size).
    chunk_count, remainder = divmod(payload_size, DEFAULT_UPLOAD_CHUNK_SIZE)
    if remainder:
        chunk_count += 1

    expected_part_calls = [
        mock.call(
            "upload_part",
            Bucket=bucket,
            Key=file_name,
            PartNumber=index + 1,
            UploadId=upload_id,
            Body=payload[
                index * DEFAULT_UPLOAD_CHUNK_SIZE : min((index + 1) * DEFAULT_UPLOAD_CHUNK_SIZE, payload_size)
            ],
        )
        for index in range(chunk_count)
    ]
    expected_parts = [
        {"PartNumber": number, "ETag": f"{etag_prefix}{number}"} for number in range(1, chunk_count + 1)
    ]
    bookend_calls = [
        mock.call("create_multipart_upload", Bucket=bucket, Key=file_name),
        mock.call(
            "complete_multipart_upload",
            Bucket=bucket,
            Key=file_name,
            UploadId=upload_id,
            MultipartUpload={"Parts": expected_parts},
        ),
    ]

    # Parts are uploaded concurrently, so only the set of calls is checked, not their order.
    mock_call_s3.assert_has_calls(expected_part_calls + bookend_calls, any_order=True)
    assert mock_call_s3.call_count == 2 + chunk_count
109
+
110
+
111
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._info")
@mock.patch("os.path.isdir")
@pytest.mark.asyncio
async def test_get_file_file_size_is_none(mock_isdir, mock_info, mock_call_s3):
    """When ``_info`` reports no size, the object is fetched with a single ``get_object`` call."""
    bucket = "mock-bucket"
    file_name = "mock_file_name"
    object_size = 32 * 2**20  # 32MB

    mock_info.return_value = {"size": None}
    mock_isdir.return_value = False

    # Total number of bytes handed out by the fake body so far.
    file_been_read = 0

    async def fake_read(*args, **kwargs):
        nonlocal file_been_read
        requested = args[0]
        served = min(requested, object_size - file_been_read)
        if served == 0:
            # End of the object: an empty result stops the read loop.
            return None
        file_been_read += served
        return os.urandom(served)

    body = MagicMock()
    body.read.side_effect = fake_read
    mock_call_s3.return_value = {"Body": body, "ContentLength": object_size}

    with mock.patch("builtins.open", mock_open()):
        fs = AsyncS3FileSystem()
        await fs._get_file(lpath=f"/{file_name}", rpath=f"s3://{bucket}/{file_name}")

    assert file_been_read == object_size
    mock_call_s3.assert_called_once_with("get_object", Bucket=bucket, Key=file_name, Range="bytes=0")
145
+
146
+
147
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._call_s3")
@mock.patch("flytekitplugins.async_fsspec.AsyncS3FileSystem._info")
@mock.patch("os.path.isdir")
@pytest.mark.asyncio
async def test_get_file_file_size_is_not_none(mock_isdir, mock_info, mock_call_s3):
    """When the size is known, the object is fetched as one ranged ``get_object`` call per chunk."""
    bucket = "mock-bucket"
    file_name = "mock_file_name"
    object_size = 256 * 2**20  # 256MB

    mock_info.return_value = {"size": object_size}
    mock_isdir.return_value = False

    # Total number of bytes handed out across all fake bodies.
    file_been_read = 0

    # Stand-in for get_object: builds a body that serves exactly the byte
    # count described by the request's Range header.
    def fake_call_s3(*args, **kwargs):
        # Drop the leading "bytes=" and split "start-end".
        first, last = (int(offset) for offset in kwargs["Range"][6:].split("-"))
        range_size = last - first + 1
        range_been_read = 0

        async def fake_read(*args, **kwargs):
            nonlocal range_been_read
            nonlocal file_been_read
            requested = args[0]
            served = min(requested, range_size - range_been_read)
            if served == 0:
                # End of this range: an empty result stops the read loop.
                return None
            range_been_read += served
            file_been_read += served
            return os.urandom(served)

        body = MagicMock()
        body.read.side_effect = fake_read
        return {"Body": body, "ContentLength": range_size}

    mock_call_s3.side_effect = fake_call_s3

    with mock.patch("builtins.open", mock_open()):
        fs = AsyncS3FileSystem()
        await fs._get_file(lpath=f"/{file_name}", rpath=f"s3://{bucket}/{file_name}")

    assert file_been_read == object_size

    # Number of ranged requests: ceil(object_size / chunk size).
    chunk_count, remainder = divmod(object_size, DEFAULT_DOWNLOAD_CHUNK_SIZE)
    if remainder:
        chunk_count += 1

    expected_calls = []
    for index in range(chunk_count):
        first = index * DEFAULT_DOWNLOAD_CHUNK_SIZE
        # Range end offsets are inclusive.
        last = min(first + DEFAULT_DOWNLOAD_CHUNK_SIZE, object_size) - 1
        expected_calls.append(
            mock.call("get_object", Bucket=bucket, Key=file_name, Range=f"bytes={first}-{last}")
        )

    mock_call_s3.assert_has_calls(expected_calls)
    assert mock_call_s3.call_count == len(expected_calls)