megfile 3.1.0.post2__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +94 -69
- megfile/lib/combine_reader.py +13 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +54 -55
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +61 -52
- megfile/lib/s3_cached_handler.py +14 -13
- megfile/lib/s3_limited_seekable_writer.py +38 -28
- megfile/lib/s3_memory_handler.py +35 -29
- megfile/lib/s3_pipe_handler.py +25 -24
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +8 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +75 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.0.post2.dist-info/RECORD +0 -55
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/interfaces.py
CHANGED
@@ -3,7 +3,16 @@ from abc import ABC, abstractmethod
 from io import IOBase, UnsupportedOperation
 from typing import IO, AnyStr, Iterable, List, Optional

-from megfile.pathlike import
+from megfile.pathlike import (
+    Access,
+    BasePath,
+    BaseURIPath,
+    FileEntry,
+    PathLike,
+    Self,
+    StatResult,
+    URIPath,
+)

 __all__ = [
     "Access",
@@ -28,33 +37,32 @@ __all__ = [
 def fullname(o):
     klass = o.__class__
     module = klass.__module__
-    if module ==
+    if module == "builtins":
         return klass.__qualname__  # avoid outputs like 'builtins.str'
-    return module +
+    return module + "." + klass.__qualname__


 # 1. Default value of closed is False
 # 2. closed is set to True when close() are called
 # 3. close() will only be called once
 class Closable(ABC):
-
     @property
     def closed(self) -> bool:
-
-        return getattr(self,
+        """Return True if the file-like object is closed."""
+        return getattr(self, "__closed__", False)

     @abstractmethod
     def _close(self) -> None:
         pass  # pragma: no cover

     def close(self) -> None:
-
+        """Flush and close the file-like object.

         This method has no effect if the file is already closed.
-
-        if not getattr(self,
+        """
+        if not getattr(self, "__closed__", False):
             self._close()
-        setattr(self,
+        setattr(self, "__closed__", True)

     def __enter__(self: Self) -> Self:
         return self
@@ -64,23 +72,25 @@ class Closable(ABC):


 class FileLike(Closable, IOBase, IO[AnyStr], ABC):  # pytype: disable=signature-mismatch
-
     def fileno(self) -> int:
-        raise UnsupportedOperation(
+        raise UnsupportedOperation("not a local file")

     def isatty(self) -> bool:
         return False

     def __repr__(self) -> str:
-        return
-            fullname(self),
+        return "<%s name=%r mode=%r>" % (
+            fullname(self),
+            self.name,
+            self.mode,
+        )  # pragma: no cover

     def seekable(self) -> bool:
-
+        """Return True if the file-like object can be sought."""
         return False

     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
-
+        """Change stream position.

         Seek to byte `offset` relative to position indicated by `whence`:
         0  Start of stream (the default). `offset` should be >= 0;
@@ -88,37 +98,37 @@ class FileLike(Closable, IOBase, IO[AnyStr], ABC):  # pytype: disable=signature-mismatch
         2  End of stream - `offset` usually negative.

         Return the new absolute position.
-
-        raise UnsupportedOperation(
+        """
+        raise UnsupportedOperation("not seekable")  # pragma: no cover

     def readable(self) -> bool:
-
+        """Return True if the file-like object can be read."""
         return False  # pragma: no cover

     def writable(self) -> bool:
-
+        """Return True if the file-like object can be written."""
         return False

     def flush(self) -> None:
-
+        """Flush write buffers, if applicable.

         This is not implemented for read-only and non-blocking streams.
-
+        """

     def __del__(self) -> None:
-        # TODO: Next version should turn on __del__ for auto closing,
+        # TODO: Next version should turn on __del__ for auto closing,
+        # and disable this in child class like CombineReader
         pass


 class Seekable(FileLike, ABC):
-
     def seekable(self) -> bool:
-
+        """Return True if the file-like object can be sought."""
         return True

     @abstractmethod
     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
-
+        """Change stream position.

         Seek to byte offset `cookie` relative to position indicated by `whence`:
         0  Start of stream (the default). `cookie` should be >= 0;
@@ -126,42 +136,41 @@ class Seekable(FileLike, ABC):
         2  End of stream - `cookie` usually negative.

         Return the new absolute position.
-
+        """


 class Readable(FileLike[AnyStr], ABC):
-
     def readable(self) -> bool:
-
+        """Return True if the file-like object can be read."""
         return True

     @abstractmethod
     def read(self, size: Optional[int] = None) -> AnyStr:
-
+        """Read at most `size` bytes or string, returned as a bytes or string object.

         If the `size` argument is negative, read until EOF is reached.
         Return an empty bytes or string object at EOF.
-
+        """

     @abstractmethod
     def readline(self, size: Optional[int] = None) -> AnyStr:  # pyre-ignore[15]
-
+        """Next line from the file, as a bytes or string object.

-        Retain newline. A non-negative `size` argument limits the maximum number of
+        Retain newline. A non-negative `size` argument limits the maximum number of
+        bytes or string to return (an incomplete line may be returned then).
         Return an empty bytes object at EOF.
-
+        """

-    def readlines(  # pyre-ignore[15]
-
-        '''Return a list of lines from the stream.'''
+    def readlines(self, hint: Optional[int] = None) -> List[AnyStr]:  # pyre-ignore[15]
+        """Return a list of lines from the stream."""
         return self.read(size=hint).splitlines(True)  # pyre-ignore[7]

     def readinto(self, buffer: bytearray) -> int:
-
+        """Read bytes into buffer.

         Returns number of bytes read (0 for EOF), or None if the object
         is set not to block and has no data to read.
-
+        """
         if "b" not in self.mode:
             raise OSError("'readinto' only works on binary files")

@@ -180,41 +189,40 @@ class Readable(FileLike[AnyStr], ABC):
         return self

     def truncate(self, size: Optional[int] = None) -> int:
-        raise OSError(
+        raise OSError("not writable")

     def write(self, data: AnyStr) -> int:
-        raise OSError(
+        raise OSError("not writable")

     def writelines(self, lines: Iterable[AnyStr]) -> None:  # pyre-ignore[14]
-        raise OSError(
+        raise OSError("not writable")


 class Writable(FileLike[AnyStr], ABC):
-
     def writable(self) -> bool:
-
+        """Return True if the file-like object can be written."""
         return True

     @abstractmethod
     def write(self, data: AnyStr) -> int:
-
+        """Write bytes or string to file.

         Return the number of bytes or string written.
-
+        """

     def writelines(self, lines: Iterable[AnyStr]) -> None:  # pyre-ignore[14]
-
+        """Write `lines` to the file.

-        Note that newlines are not added.
-        `lines` can be any iterable object producing bytes-like or string-like objects.
+        Note that newlines are not added.
+        `lines` can be any iterable object producing bytes-like or string-like objects.
         This is equivalent to calling write() for each element.
-
+        """
         for line in lines:
             self.write(line)

     def truncate(self, size: Optional[int] = None) -> int:
         """
-        Resize the stream to the given size in bytes.
+        Resize the stream to the given size in bytes.

         :param size: resize size, defaults to None
         :type size: int, optional
@@ -223,21 +231,19 @@ class Writable(FileLike[AnyStr], ABC):
         :return: The new file size.
         :rtype: int
         """
-        raise UnsupportedOperation(
+        raise UnsupportedOperation("not support truncate")

     def read(self, size: Optional[int] = None) -> AnyStr:
-        raise OSError(
+        raise OSError("not readable")

     def readline(self, size: Optional[int] = None) -> AnyStr:  # pyre-ignore[15]
-        raise OSError(
+        raise OSError("not readable")

-    def readlines(  # pyre-ignore[15]
-
-        raise OSError('not readable')
+    def readlines(self, hint: Optional[int] = None) -> List[AnyStr]:  # pyre-ignore[15]
+        raise OSError("not readable")


 class FileCacher(ABC):
-
     @property
     @abstractmethod
     def cache_path(self) -> str:
@@ -245,21 +251,21 @@ class FileCacher(ABC):

     @property
     def closed(self) -> bool:
-
-        return getattr(self,
+        """Return True if the file-like object is closed."""
+        return getattr(self, "__closed__", False)

     @abstractmethod
     def _close(self) -> None:
         pass  # pragma: no cover

     def close(self) -> None:
-
+        """Flush and close the file-like object.

         This method has no effect if the file is already closed.
-
-        if not getattr(self,
+        """
+        if not getattr(self, "__closed__", False):
             self._close()
-        setattr(self,
+        setattr(self, "__closed__", True)

     def __enter__(self) -> str:
         return self.cache_path
@@ -282,7 +288,6 @@ class NullCacher(FileCacher):


 class ContextIterator(Closable):
-
     def __init__(self, iterable: Iterable) -> None:
         self._iter = iter(iterable)

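The new docstrings above make the Closable contract explicit: closed defaults to False, close() is idempotent, and _close() is the only hook a subclass must supply. A minimal sketch of that behaviour, assuming only the megfile.interfaces API shown in this diff (the DummyFile class is illustrative, not part of megfile):

from megfile.interfaces import Closable

class DummyFile(Closable):
    # Illustrative subclass: _close() is the only abstract hook to implement.
    def __init__(self):
        self.close_calls = 0

    def _close(self) -> None:
        self.close_calls += 1

f = DummyFile()
assert f.closed is False     # 1. default value of closed is False
f.close()
f.close()                    # 3. second call is a no-op, close() only runs once
assert f.closed is True      # 2. closed is set to True after close()
assert f.close_calls == 1    # _close() ran exactly once

The same __closed__ flag convention is reused by FileCacher further down in the diff, so cachers and file-like objects share one close semantics.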
@@ -8,15 +8,22 @@ from math import ceil
|
|
|
8
8
|
from statistics import mean
|
|
9
9
|
from typing import Optional
|
|
10
10
|
|
|
11
|
-
from megfile.config import
|
|
11
|
+
from megfile.config import (
|
|
12
|
+
BACKOFF_FACTOR,
|
|
13
|
+
BACKOFF_INITIAL,
|
|
14
|
+
DEFAULT_BLOCK_CAPACITY,
|
|
15
|
+
DEFAULT_BLOCK_SIZE,
|
|
16
|
+
DEFAULT_MAX_RETRY_TIMES,
|
|
17
|
+
GLOBAL_MAX_WORKERS,
|
|
18
|
+
NEWLINE,
|
|
19
|
+
)
|
|
12
20
|
from megfile.interfaces import Readable, Seekable
|
|
13
|
-
from megfile.utils import get_human_size, process_local
|
|
21
|
+
from megfile.utils import ProcessLocal, get_human_size, process_local
|
|
14
22
|
|
|
15
23
|
_logger = get_logger(__name__)
|
|
16
24
|
|
|
17
25
|
|
|
18
26
|
class SeekRecord:
|
|
19
|
-
|
|
20
27
|
def __init__(self, seek_index: int):
|
|
21
28
|
self.seek_index = seek_index
|
|
22
29
|
self.seek_count = 0
|
|
@@ -24,38 +31,49 @@ class SeekRecord:
|
|
|
24
31
|
|
|
25
32
|
|
|
26
33
|
class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
27
|
-
|
|
28
|
-
Reader to fast read the remote file content.
|
|
29
|
-
This will divide the file content into equal parts of block_size size,
|
|
34
|
+
"""
|
|
35
|
+
Reader to fast read the remote file content.
|
|
36
|
+
This will divide the file content into equal parts of block_size size,
|
|
30
37
|
and will use LRU to cache at most block_capacity blocks in memory.
|
|
31
|
-
open(), seek() and read() will trigger prefetch read.
|
|
32
|
-
The prefetch will cached block_forward blocks of data from offset position
|
|
38
|
+
open(), seek() and read() will trigger prefetch read.
|
|
39
|
+
The prefetch will cached block_forward blocks of data from offset position
|
|
33
40
|
(the position after reading if the called function is read).
|
|
34
|
-
|
|
41
|
+
"""
|
|
35
42
|
|
|
36
43
|
def __init__(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
44
|
+
self,
|
|
45
|
+
*,
|
|
46
|
+
block_size: int = DEFAULT_BLOCK_SIZE,
|
|
47
|
+
block_capacity: int = DEFAULT_BLOCK_CAPACITY,
|
|
48
|
+
block_forward: Optional[int] = None,
|
|
49
|
+
max_retries: int = DEFAULT_MAX_RETRY_TIMES,
|
|
50
|
+
max_workers: Optional[int] = None,
|
|
51
|
+
**kwargs,
|
|
52
|
+
):
|
|
46
53
|
self._is_auto_scaling = block_forward is None
|
|
47
54
|
if block_forward is None:
|
|
48
55
|
block_forward = max(block_capacity - 1, 1)
|
|
49
56
|
|
|
50
|
-
|
|
51
|
-
|
|
57
|
+
if block_capacity <= block_forward:
|
|
58
|
+
# TODO: replace AssertionError with ValueError in 4.0.0
|
|
59
|
+
raise AssertionError(
|
|
60
|
+
"block_capacity should greater than block_forward, "
|
|
61
|
+
"got: block_capacity=%s, block_forward=%s"
|
|
62
|
+
% (block_capacity, block_forward)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
# user maybe put block_size with 'numpy.uint64' type
|
|
66
|
+
block_size = int(block_size)
|
|
52
67
|
|
|
53
68
|
self._max_retries = max_retries
|
|
54
69
|
self._block_size = block_size
|
|
55
70
|
self._block_capacity = block_capacity # Max number of blocks
|
|
56
|
-
self._block_forward = block_forward # Number of blocks every prefetch, which should be smaller than block_capacity
|
|
57
71
|
|
|
58
|
-
|
|
72
|
+
# Number of blocks every prefetch, which should be smaller than block_capacity
|
|
73
|
+
self._block_forward = block_forward
|
|
74
|
+
|
|
75
|
+
self._process_local = ProcessLocal()
|
|
76
|
+
|
|
59
77
|
self._content_size = self._get_content_size()
|
|
60
78
|
self._block_stop = ceil(self._content_size / block_size)
|
|
61
79
|
|
|
@@ -68,20 +86,25 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
68
86
|
self._is_global_executor = False
|
|
69
87
|
if max_workers is None:
|
|
70
88
|
self._executor = process_local(
|
|
71
|
-
f
|
|
89
|
+
f"{self.__class__.__name__}.executor",
|
|
72
90
|
ThreadPoolExecutor,
|
|
73
|
-
max_workers=GLOBAL_MAX_WORKERS
|
|
91
|
+
max_workers=GLOBAL_MAX_WORKERS,
|
|
92
|
+
)
|
|
74
93
|
self._is_global_executor = True
|
|
75
94
|
else:
|
|
76
95
|
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
|
77
96
|
self._seek_buffer(0)
|
|
78
97
|
|
|
79
|
-
_logger.debug(
|
|
98
|
+
_logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
|
|
80
99
|
|
|
81
100
|
@abstractmethod
|
|
82
101
|
def _get_content_size(self):
|
|
83
102
|
pass
|
|
84
103
|
|
|
104
|
+
@property
|
|
105
|
+
def _futures(self):
|
|
106
|
+
return self._process_local("futures", self._get_futures)
|
|
107
|
+
|
|
85
108
|
def _get_futures(self):
|
|
86
109
|
return LRUCacheFutureManager()
|
|
87
110
|
|
|
@@ -92,7 +115,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
92
115
|
|
|
93
116
|
@property
|
|
94
117
|
def mode(self) -> str:
|
|
95
|
-
return
|
|
118
|
+
return "rb"
|
|
96
119
|
|
|
97
120
|
def tell(self) -> int:
|
|
98
121
|
return self._offset
|
|
@@ -105,15 +128,15 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
105
128
|
def _offset(self, value: int):
|
|
106
129
|
if value > self._backoff_size:
|
|
107
130
|
_logger.debug(
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
131
|
+
"reading file: %r, current offset / total size: %s / %s"
|
|
132
|
+
% (self.name, get_human_size(value), get_human_size(self._content_size))
|
|
133
|
+
)
|
|
111
134
|
while value > self._backoff_size:
|
|
112
135
|
self._backoff_size *= BACKOFF_FACTOR
|
|
113
136
|
self.__offset = value
|
|
114
137
|
|
|
115
138
|
def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
|
|
116
|
-
|
|
139
|
+
"""Change stream position.
|
|
117
140
|
|
|
118
141
|
Seek to byte offset pos relative to position indicated by whence:
|
|
119
142
|
|
|
@@ -122,10 +145,10 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
122
145
|
2 End of stream - pos usually negative.
|
|
123
146
|
|
|
124
147
|
Returns the new absolute position.
|
|
125
|
-
|
|
148
|
+
"""
|
|
149
|
+
offset = int(offset) # user maybe put offset with 'numpy.uint64' type
|
|
126
150
|
if self.closed:
|
|
127
|
-
raise IOError(
|
|
128
|
-
|
|
151
|
+
raise IOError("file already closed: %r" % self.name)
|
|
129
152
|
if whence == os.SEEK_CUR:
|
|
130
153
|
target_offset = self._offset + offset
|
|
131
154
|
elif whence == os.SEEK_END:
|
|
@@ -133,7 +156,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
133
156
|
elif whence == os.SEEK_SET:
|
|
134
157
|
target_offset = offset
|
|
135
158
|
else:
|
|
136
|
-
raise ValueError(
|
|
159
|
+
raise ValueError("invalid whence: %r" % whence)
|
|
137
160
|
|
|
138
161
|
if target_offset == self._offset:
|
|
139
162
|
return target_offset
|
|
@@ -145,35 +168,34 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
145
168
|
return self._offset
|
|
146
169
|
|
|
147
170
|
def read(self, size: Optional[int] = None) -> bytes:
|
|
148
|
-
|
|
171
|
+
"""Read at most size bytes, returned as a bytes object.
|
|
149
172
|
|
|
150
173
|
If the size argument is negative, read until EOF is reached.
|
|
151
174
|
Return an empty bytes object at EOF.
|
|
152
|
-
|
|
175
|
+
"""
|
|
153
176
|
if self.closed:
|
|
154
|
-
raise IOError(
|
|
177
|
+
raise IOError("file already closed: %r" % self.name)
|
|
155
178
|
|
|
156
179
|
if len(self._seek_history) > 0:
|
|
157
180
|
self._seek_history[-1].read_count += 1
|
|
158
181
|
|
|
159
182
|
if self._offset >= self._content_size:
|
|
160
|
-
return b
|
|
183
|
+
return b""
|
|
161
184
|
|
|
162
|
-
if size is None:
|
|
185
|
+
if size is None or size < 0:
|
|
163
186
|
size = self._content_size - self._offset
|
|
164
187
|
else:
|
|
165
|
-
assert size >= 0, 'size should greater than 0, got: %r' % size
|
|
166
188
|
size = min(size, self._content_size - self._offset)
|
|
167
189
|
|
|
168
190
|
if self._block_forward == 1:
|
|
169
191
|
block_index = self._offset // self._block_size
|
|
170
192
|
if len(self._seek_history) > 0:
|
|
171
|
-
mean_read_count = mean(
|
|
172
|
-
item.read_count for item in self._seek_history)
|
|
193
|
+
mean_read_count = mean(item.read_count for item in self._seek_history)
|
|
173
194
|
else:
|
|
174
195
|
mean_read_count = 0
|
|
175
196
|
if block_index not in self._futures and mean_read_count < 3:
|
|
176
|
-
# No using LRP will be better if read() are always called less than 3
|
|
197
|
+
# No using LRP will be better if read() are always called less than 3
|
|
198
|
+
# times after seek()
|
|
177
199
|
return self._read(size)
|
|
178
200
|
|
|
179
201
|
data = self._buffer.read(size)
|
|
@@ -192,24 +214,24 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
192
214
|
return buffer.getvalue()
|
|
193
215
|
|
|
194
216
|
def readline(self, size: Optional[int] = None) -> bytes:
|
|
195
|
-
|
|
217
|
+
"""Next line from the file, as a bytes object.
|
|
196
218
|
|
|
197
219
|
Retain newline. A non-negative size argument limits the maximum
|
|
198
220
|
number of bytes to return (an incomplete line may be returned then).
|
|
221
|
+
If the size argument is negative, read until EOF is reached.
|
|
199
222
|
Return an empty bytes object at EOF.
|
|
200
|
-
|
|
223
|
+
"""
|
|
201
224
|
if self.closed:
|
|
202
|
-
raise IOError(
|
|
225
|
+
raise IOError("file already closed: %r" % self.name)
|
|
203
226
|
|
|
204
227
|
if len(self._seek_history) > 0:
|
|
205
228
|
self._seek_history[-1].read_count += 1
|
|
206
229
|
if self._offset >= self._content_size:
|
|
207
|
-
return b
|
|
230
|
+
return b""
|
|
208
231
|
|
|
209
|
-
if size is None:
|
|
232
|
+
if size is None or size < 0:
|
|
210
233
|
size = self._content_size - self._offset
|
|
211
234
|
else:
|
|
212
|
-
assert size >= 0, 'size should greater than 0, got: %r' % size
|
|
213
235
|
size = min(size, self._content_size - self._offset)
|
|
214
236
|
|
|
215
237
|
data = self._buffer.readline(size)
|
|
@@ -231,21 +253,22 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
231
253
|
|
|
232
254
|
def _read(self, size: int) -> bytes:
|
|
233
255
|
if size == 0 or self._offset >= self._content_size:
|
|
234
|
-
return b
|
|
256
|
+
return b""
|
|
235
257
|
|
|
236
|
-
data = self._fetch_response(
|
|
237
|
-
|
|
258
|
+
data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
|
|
259
|
+
"Body"
|
|
260
|
+
].read()
|
|
238
261
|
self.seek(size, os.SEEK_CUR)
|
|
239
262
|
return data
|
|
240
263
|
|
|
241
264
|
def readinto(self, buffer: bytearray) -> int:
|
|
242
|
-
|
|
265
|
+
"""Read bytes into buffer.
|
|
243
266
|
|
|
244
267
|
Returns number of bytes read (0 for EOF), or None if the object
|
|
245
268
|
is set not to block and has no data to read.
|
|
246
|
-
|
|
269
|
+
"""
|
|
247
270
|
if self.closed:
|
|
248
|
-
raise IOError(
|
|
271
|
+
raise IOError("file already closed: %r" % self.name)
|
|
249
272
|
|
|
250
273
|
if self._offset >= self._content_size:
|
|
251
274
|
return 0
|
|
@@ -254,7 +277,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
254
277
|
size = min(size, self._content_size - self._offset)
|
|
255
278
|
|
|
256
279
|
data = self._buffer.read(size)
|
|
257
|
-
buffer[:len(data)] = data
|
|
280
|
+
buffer[: len(data)] = data
|
|
258
281
|
if len(data) == size:
|
|
259
282
|
self._offset += len(data)
|
|
260
283
|
return size
|
|
@@ -263,7 +286,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
263
286
|
while offset < size:
|
|
264
287
|
remain_size = size - offset
|
|
265
288
|
data = self._next_buffer.read(remain_size)
|
|
266
|
-
buffer[offset:offset + len(data)] = data
|
|
289
|
+
buffer[offset : offset + len(data)] = data
|
|
267
290
|
offset += len(data)
|
|
268
291
|
|
|
269
292
|
self._offset += offset
|
|
@@ -300,8 +323,11 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
300
323
|
|
|
301
324
|
@property
|
|
302
325
|
def _next_buffer(self) -> BytesIO:
|
|
303
|
-
# Get next buffer by this function when finished reading current buffer
|
|
304
|
-
#
|
|
326
|
+
# Get next buffer by this function when finished reading current buffer
|
|
327
|
+
# (self._buffer)
|
|
328
|
+
#
|
|
329
|
+
# Make sure that _buffer is used before using _next_buffer(), or will make
|
|
330
|
+
# _cached_offset invalid
|
|
305
331
|
self._block_index += 1
|
|
306
332
|
self._cached_offset = 0
|
|
307
333
|
return self._buffer
|
|
@@ -313,7 +339,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
313
339
|
history = []
|
|
314
340
|
for item in self._seek_history:
|
|
315
341
|
if item.seek_count > self._block_capacity * 2:
|
|
316
|
-
# seek interval is bigger than self._block_capacity * 2, drop it
|
|
342
|
+
# seek interval is bigger than self._block_capacity * 2, drop it
|
|
343
|
+
# from self._seek_history
|
|
317
344
|
continue
|
|
318
345
|
if index - 1 < item.seek_index < index + 2:
|
|
319
346
|
continue
|
|
@@ -322,23 +349,22 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
322
349
|
history.append(SeekRecord(index))
|
|
323
350
|
self._seek_history = history
|
|
324
351
|
self._block_forward = max(
|
|
325
|
-
(self._block_capacity - 1) // len(self._seek_history), 1
|
|
352
|
+
(self._block_capacity - 1) // len(self._seek_history), 1
|
|
353
|
+
)
|
|
326
354
|
|
|
327
355
|
self._cached_offset = offset
|
|
328
356
|
self._block_index = index
|
|
329
357
|
|
|
330
358
|
@abstractmethod
|
|
331
359
|
def _fetch_response(
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
end: Optional[int] = None) -> dict:
|
|
360
|
+
self, start: Optional[int] = None, end: Optional[int] = None
|
|
361
|
+
) -> dict:
|
|
335
362
|
pass
|
|
336
363
|
|
|
337
364
|
def _fetch_buffer(self, index: int) -> BytesIO:
|
|
338
|
-
start, end = index * self._block_size, (
|
|
339
|
-
index + 1) * self._block_size - 1
|
|
365
|
+
start, end = index * self._block_size, (index + 1) * self._block_size - 1
|
|
340
366
|
response = self._fetch_response(start=start, end=end)
|
|
341
|
-
return response[
|
|
367
|
+
return response["Body"]
|
|
342
368
|
|
|
343
369
|
def _submit_future(self, index: int):
|
|
344
370
|
if index < 0 or index >= self._block_stop:
|
|
@@ -355,7 +381,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
355
381
|
self._futures.cleanup(self._block_capacity)
|
|
356
382
|
|
|
357
383
|
def _close(self):
|
|
358
|
-
_logger.debug(
|
|
384
|
+
_logger.debug("close file: %r" % self.name)
|
|
359
385
|
|
|
360
386
|
if not self._is_global_executor:
|
|
361
387
|
self._executor.shutdown()
|
|
@@ -363,7 +389,6 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
363
389
|
|
|
364
390
|
|
|
365
391
|
class LRUCacheFutureManager(OrderedDict):
|
|
366
|
-
|
|
367
392
|
def __init__(self):
|
|
368
393
|
super().__init__()
|
|
369
394
|
|