megfile 3.1.0.post2__py3-none-any.whl → 3.1.2__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (55)
  1. docs/conf.py +2 -4
  2. megfile/__init__.py +394 -203
  3. megfile/cli.py +258 -238
  4. megfile/config.py +25 -21
  5. megfile/errors.py +124 -114
  6. megfile/fs.py +174 -140
  7. megfile/fs_path.py +462 -354
  8. megfile/hdfs.py +133 -101
  9. megfile/hdfs_path.py +290 -236
  10. megfile/http.py +15 -14
  11. megfile/http_path.py +111 -107
  12. megfile/interfaces.py +70 -65
  13. megfile/lib/base_prefetch_reader.py +94 -69
  14. megfile/lib/combine_reader.py +13 -12
  15. megfile/lib/compare.py +17 -13
  16. megfile/lib/compat.py +1 -5
  17. megfile/lib/fnmatch.py +29 -30
  18. megfile/lib/glob.py +54 -55
  19. megfile/lib/hdfs_prefetch_reader.py +40 -25
  20. megfile/lib/hdfs_tools.py +1 -3
  21. megfile/lib/http_prefetch_reader.py +69 -46
  22. megfile/lib/joinpath.py +5 -5
  23. megfile/lib/lazy_handler.py +7 -3
  24. megfile/lib/s3_buffered_writer.py +61 -52
  25. megfile/lib/s3_cached_handler.py +14 -13
  26. megfile/lib/s3_limited_seekable_writer.py +38 -28
  27. megfile/lib/s3_memory_handler.py +35 -29
  28. megfile/lib/s3_pipe_handler.py +25 -24
  29. megfile/lib/s3_prefetch_reader.py +71 -52
  30. megfile/lib/s3_share_cache_reader.py +37 -24
  31. megfile/lib/shadow_handler.py +8 -3
  32. megfile/lib/stdio_handler.py +9 -8
  33. megfile/lib/url.py +3 -3
  34. megfile/pathlike.py +259 -228
  35. megfile/s3.py +220 -153
  36. megfile/s3_path.py +977 -802
  37. megfile/sftp.py +190 -156
  38. megfile/sftp_path.py +540 -450
  39. megfile/smart.py +397 -330
  40. megfile/smart_path.py +100 -105
  41. megfile/stdio.py +10 -9
  42. megfile/stdio_path.py +32 -35
  43. megfile/utils/__init__.py +75 -54
  44. megfile/utils/mutex.py +11 -14
  45. megfile/version.py +1 -1
  46. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
  47. megfile-3.1.2.dist-info/RECORD +55 -0
  48. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
  49. scripts/convert_results_to_sarif.py +45 -78
  50. scripts/generate_file.py +140 -64
  51. megfile-3.1.0.post2.dist-info/RECORD +0 -55
  52. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
  53. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
  54. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
  55. {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/lib/http_prefetch_reader.py CHANGED
@@ -3,8 +3,17 @@ from typing import Optional
 
 import requests
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
-from megfile.errors import HttpBodyIncompleteError, UnsupportedError, http_should_retry, patch_method
+from megfile.config import (
+    DEFAULT_BLOCK_CAPACITY,
+    DEFAULT_BLOCK_SIZE,
+    HTTP_MAX_RETRY_TIMES,
+)
+from megfile.errors import (
+    HttpBodyIncompleteError,
+    UnsupportedError,
+    http_should_retry,
+    patch_method,
+)
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
 from megfile.lib.compat import fspath
 from megfile.pathlike import PathLike
@@ -13,24 +22,29 @@ DEFAULT_TIMEOUT = (60, 60 * 60 * 24)
 
 
 class HttpPrefetchReader(BasePrefetchReader):
-    '''
-    Reader to fast read the http content, service must support Accept-Ranges.
-    This will divide the file content into equal parts of block_size size, and will use LRU to cache at most block_capacity blocks in memory.
-    open(), seek() and read() will trigger prefetch read.
-    The prefetch will cached block_forward blocks of data from offset position (the position after reading if the called function is read).
-    '''
+    """
+    Reader to fast read the http content, service must support Accept-Ranges.
 
-    def __init__(
-            self,
-            url: PathLike,
-            *,
-            content_size: Optional[int] = None,
-            block_size: int = DEFAULT_BLOCK_SIZE,
-            block_capacity: int = DEFAULT_BLOCK_CAPACITY,
-            block_forward: Optional[int] = None,
-            max_retries: int = HTTP_MAX_RETRY_TIMES,
-            max_workers: Optional[int] = None):
+    This will divide the file content into equal parts of block_size size, and will use
+    LRU to cache at most block_capacity blocks in memory.
+
+    open(), seek() and read() will trigger prefetch read.
+
+    The prefetch will cached block_forward blocks of data from offset position
+    (the position after reading if the called function is read).
+    """
 
+    def __init__(
+        self,
+        url: PathLike,
+        *,
+        content_size: Optional[int] = None,
+        block_size: int = DEFAULT_BLOCK_SIZE,
+        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_forward: Optional[int] = None,
+        max_retries: int = HTTP_MAX_RETRY_TIMES,
+        max_workers: Optional[int] = None,
+    ):
         self._url = url
         self._content_size = content_size
 
@@ -39,68 +53,77 @@ class HttpPrefetchReader(BasePrefetchReader):
             block_capacity=block_capacity,
             block_forward=block_forward,
             max_retries=max_retries,
-            max_workers=max_workers)
+            max_workers=max_workers,
+        )
 
     def _get_content_size(self) -> int:
         if self._content_size is not None:
             return self._content_size
 
         first_index_response = self._fetch_response()
-        if first_index_response['Headers'].get('Accept-Ranges') != 'bytes':
+        if first_index_response["Headers"].get("Accept-Ranges") != "bytes":
             raise UnsupportedError(
-                f'Unsupported server, server must support Accept-Ranges: {self._url}',
+                f"Unsupported server, server must support Accept-Ranges: {self._url}",
                 path=fspath(self._url),
             )
-        return first_index_response['Headers']['Content-Length']
+        return first_index_response["Headers"]["Content-Length"]
 
     @property
     def name(self) -> str:
         return fspath(self._url)
 
     def _fetch_response(
-            self,
-            start: Optional[int] = None,
-            end: Optional[int] = None) -> dict:
-
+        self, start: Optional[int] = None, end: Optional[int] = None
+    ) -> dict:
         def fetch_response() -> dict:
             request_kwargs = {}
-            if hasattr(self._url, 'request_kwargs'):
+            if hasattr(self._url, "request_kwargs"):
                 request_kwargs = self._url.request_kwargs  # pyre-ignore[16]
-            timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
-            stream = request_kwargs.pop('stream', True)
+            timeout = request_kwargs.pop("timeout", DEFAULT_TIMEOUT)
+            stream = request_kwargs.pop("stream", True)
 
             if start is None or end is None:
-                with requests.get(fspath(self._url), timeout=timeout,
-                                  stream=stream, **request_kwargs) as response:
+                with requests.get(
+                    fspath(self._url), timeout=timeout, stream=stream, **request_kwargs
+                ) as response:
                     return {
-                        'Headers': response.headers,
-                        'Cookies': response.cookies,
-                        'StatusCode': response.status_code,
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                     }
             else:
                 range_end = end
                 if self._content_size is not None:
                     range_end = min(range_end, self._content_size - 1)
-                headers = request_kwargs.pop('headers', {})
+                headers = request_kwargs.pop("headers", {})
                 headers["Range"] = f"bytes={start}-{range_end}"
-                with requests.get(fspath(self._url), timeout=timeout,
-                                  headers=headers, stream=stream,
-                                  **request_kwargs) as response:
-                    if len(response.content) != int(
-                            response.headers['Content-Length']):
+                with requests.get(
+                    fspath(self._url),
+                    timeout=timeout,
+                    headers=headers,
+                    stream=stream,
+                    **request_kwargs,
+                ) as response:
+                    if len(response.content) != int(response.headers["Content-Length"]):
                         raise HttpBodyIncompleteError(
-                            f"The downloaded content is incomplete, expected size: {response.headers['Content-Length']}, actual size: {len(response.content)}",
+                            "The downloaded content is incomplete, "
+                            "expected size: %s, actual size: %d"
+                            % (
+                                response.headers["Content-Length"],
+                                len(response.content),
+                            )
                         )
                     return {
-                        'Body': BytesIO(response.content),
-                        'Headers': response.headers,
-                        'Cookies': response.cookies,
-                        'StatusCode': response.status_code,
+                        "Body": BytesIO(response.content),
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                     }
 
         fetch_response = patch_method(
             fetch_response,
             max_retries=self._max_retries,
-            should_retry=http_should_retry)
+            should_retry=http_should_retry,
+        )
 
         return fetch_response()
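For orientation, here is a minimal usage sketch of the reader reformatted above. It only assumes what the hunks show: HttpPrefetchReader takes a URL plus keyword-only tuning options and behaves like a read-only file object whose open(), seek() and read() trigger prefetching. The URL and block settings below are illustrative, and the server must answer range requests with Accept-Ranges: bytes.

    from megfile.lib.http_prefetch_reader import HttpPrefetchReader

    # Illustrative URL; any server that supports "Accept-Ranges: bytes" works.
    url = "https://example.com/large-file.bin"

    reader = HttpPrefetchReader(
        url,
        block_size=8 * 2**20,  # fetch the content in 8 MiB blocks
        block_capacity=16,     # keep at most 16 blocks in the LRU cache
        max_retries=3,         # retry transient HTTP errors
    )
    try:
        header = reader.read(1024)  # prefetches the leading blocks
        reader.seek(0)              # cheap while the blocks are still cached
        first_block = reader.read(8 * 2**20)
    finally:
        reader.close()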
megfile/lib/joinpath.py CHANGED
@@ -10,22 +10,22 @@ def uri_join(path: str, *other_paths: str) -> str:
         return path
 
     first_path = path
-    if first_path.endswith('/'):
+    if first_path.endswith("/"):
         first_path = first_path[:-1]
 
     last_path = other_paths[-1]
-    if last_path.startswith('/'):
+    if last_path.startswith("/"):
         last_path = last_path[1:]
 
     middle_paths = []
     for other_path in other_paths[:-1]:
-        if other_path.startswith('/'):
+        if other_path.startswith("/"):
             other_path = other_path[1:]
-        if other_path.endswith('/'):
+        if other_path.endswith("/"):
             other_path = other_path[:-1]
         middle_paths.append(other_path)
 
-    return '/'.join([first_path, *middle_paths, last_path])
+    return "/".join([first_path, *middle_paths, last_path])
 
     # Imp. 2
     # other_paths = (other_path.lstrip('/') for other_path in other_paths)
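The quote-style changes above do not alter uri_join's behavior. As a quick reference, the slash handling the function implements (expected results traced from the code in the hunk, not taken from separate documentation):

    from megfile.lib.joinpath import uri_join

    # A trailing slash on the first part and surrounding slashes on the middle
    # parts are stripped, so exactly one "/" separates each component.
    print(uri_join("s3://bucket/", "/dir/", "file.txt"))  # s3://bucket/dir/file.txt
    print(uri_join("s3://bucket", "dir", "file.txt"))     # s3://bucket/dir/file.txt

    # Only the leading slash of the last component is stripped, so a trailing
    # slash on the final argument is preserved.
    print(uri_join("s3://bucket", "/dir/"))               # s3://bucket/dir/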
megfile/lib/lazy_handler.py CHANGED
@@ -7,9 +7,13 @@ from megfile.utils import get_content_size
 
 
 class LazyHandler(Readable, Seekable, Writable):
-    ''' Create a File-Like Object, maintaining file pointer, to avoid misunderstanding the position when read / write / seek
-    It can be roughly regarded as the copy function of the file handle, but you need to be careful with the write handle, because no matter which copy will modify the data itself
-    '''
+    """Create a File-Like Object, maintaining file pointer,
+    to avoid misunderstanding the position when read / write / seek.
+
+    It can be roughly regarded as the copy function of the file handle,
+    but you need to be careful with the write handle,
+    because no matter which copy will modify the data itself.
+    """
 
     def __init__(self, path: str, mode: str, open_func: Callable, **options):
         self._open_func = open_func
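The reflowed docstring above describes LazyHandler as a copy of a file handle that keeps its own position. The sketch below illustrates that idea with a plain stand-in class rather than megfile's implementation; only the notion of independent positions over one shared handle comes from the docstring, everything else is hypothetical.

    import io

    class PointerKeepingReader:
        """Concept stand-in (not megfile's LazyHandler): every wrapper keeps its
        own offset and re-positions the shared handle before each read."""

        def __init__(self, shared_handle):
            self._handle = shared_handle
            self._offset = 0

        def read(self, size=-1):
            self._handle.seek(self._offset)
            data = self._handle.read(size)
            self._offset = self._handle.tell()
            return data

    shared = io.BytesIO(b"0123456789")
    a, b = PointerKeepingReader(shared), PointerKeepingReader(shared)
    assert a.read(4) == b"0123"
    assert b.read(4) == b"0123"  # b's position is unaffected by a's reads
    assert a.read(2) == b"45"

As the docstring warns, the same trick does not make write handles independent: every copy still mutates the one underlying file.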
megfile/lib/s3_buffered_writer.py CHANGED
@@ -5,13 +5,20 @@ from logging import getLogger as get_logger
 from threading import Lock
 from typing import NamedTuple, Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_MAX_BLOCK_SIZE, DEFAULT_MAX_BUFFER_SIZE, DEFAULT_MIN_BLOCK_SIZE, GLOBAL_MAX_WORKERS
+from megfile.config import (
+    BACKOFF_FACTOR,
+    BACKOFF_INITIAL,
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+    GLOBAL_MAX_WORKERS,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Writable
 from megfile.utils import get_human_size, process_local
 
 _logger = get_logger(__name__)
-'''
+"""
 
 class PartResult(NamedTuple):
 
@@ -19,41 +26,39 @@ class PartResult(NamedTuple):
     content_size: int
 
 in Python 3.6+
-'''
+"""
 
 _PartResult = NamedTuple(
-    'PartResult', [('etag', str), ('part_number', int), ('content_size', int)])
+    "PartResult", [("etag", str), ("part_number", int), ("content_size", int)]
+)
 
 
 class PartResult(_PartResult):
-
     def asdict(self):
-        return {
-            'PartNumber': self.part_number,
-            'ETag': self.etag,
-        }
+        return {"PartNumber": self.part_number, "ETag": self.etag}
 
 
 class S3BufferedWriter(Writable[bytes]):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-            max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-            max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
-
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         self._bucket = bucket
         self._key = key
         self._client = s3_client
         self._profile_name = profile_name
 
-        self._block_size = block_size
+        # user maybe put block_size with 'numpy.uint64' type
+        self._block_size = int(block_size)
+
         self._max_block_size = max_block_size
         self._max_buffer_size = max_buffer_size
         self._total_buffer_size = 0
@@ -66,9 +71,10 @@ class S3BufferedWriter(Writable[bytes]):
         self._is_global_executor = False
         if max_workers is None:
             self._executor = process_local(
-                'S3BufferedWriter.executor',
+                "S3BufferedWriter.executor",
                 ThreadPoolExecutor,
-                max_workers=GLOBAL_MAX_WORKERS)
+                max_workers=GLOBAL_MAX_WORKERS,
+            )
             self._is_global_executor = True
         else:
             self._executor = ThreadPoolExecutor(max_workers=max_workers)
@@ -77,17 +83,19 @@ class S3BufferedWriter(Writable[bytes]):
         self.__upload_id = None
         self.__upload_id_lock = Lock()
 
-        _logger.debug('open file: %r, mode: %s' % (self.name, self.mode))
+        _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
 
     @property
     def name(self) -> str:
-        return 's3%s://%s/%s' % (
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket, self._key)
+            self._bucket,
+            self._key,
+        )
 
     @property
     def mode(self) -> str:
-        return 'wb'
+        return "wb"
 
     def tell(self) -> int:
         return self._offset
@@ -100,8 +108,9 @@ class S3BufferedWriter(Writable[bytes]):
     def _content_size(self, value: int):
         if value > self._backoff_size:
             _logger.debug(
-                'writing file: %r, current size: %s' %
-                (self.name, get_human_size(value)))
+                "writing file: %r, current size: %s"
+                % (self.name, get_human_size(value))
+            )
         while value > self._backoff_size:
             self._backoff_size *= BACKOFF_FACTOR
         self.__content_size = value
@@ -116,9 +125,8 @@ class S3BufferedWriter(Writable[bytes]):
         if self.__upload_id is None:
             with raise_s3_error(self.name):
                 self.__upload_id = self._client.create_multipart_upload(
-                    Bucket=self._bucket,
-                    Key=self._key,
-                )['UploadId']
+                    Bucket=self._bucket, Key=self._key
+                )["UploadId"]
         return self.__upload_id
 
     @property
@@ -126,22 +134,19 @@ class S3BufferedWriter(Writable[bytes]):
         return self._total_buffer_size - sum(
             future.result().content_size
             for future in self._futures.values()
-            if future.done())
+            if future.done()
+        )
 
     @property
     def _uploading_futures(self):
-        return [
-            future for future in self._futures.values() if not future.done()
-        ]
+        return [future for future in self._futures.values() if not future.done()]
 
     @property
     def _multipart_upload(self):
         return {
-            'Parts':
-                [
-                    future.result().asdict()
-                    for _, future in sorted(self._futures.items())
-                ],
+            "Parts": [
+                future.result().asdict() for _, future in sorted(self._futures.items())
+            ]
         }
 
     def _upload_buffer(self, part_number, content):
@@ -153,24 +158,29 @@ class S3BufferedWriter(Writable[bytes]):
                     UploadId=self._upload_id,
                     PartNumber=part_number,
                     Body=content,
-                )['ETag'], part_number, len(content))
+                )["ETag"],
+                part_number,
+                len(content),
+            )
 
     def _submit_upload_buffer(self, part_number, content):
         self._futures[part_number] = self._executor.submit(
-            self._upload_buffer, part_number, content)
+            self._upload_buffer, part_number, content
+        )
         self._total_buffer_size += len(content)
         while self._buffer_size > self._max_buffer_size:
             wait(self._uploading_futures, return_when=FIRST_COMPLETED)
 
     def _submit_upload_content(self, content: bytes):
-        # s3 part needs at least 5MB, so we need to divide content into equal-size parts, and give last part more size
+        # s3 part needs at least 5MB,
+        # so we need to divide content into equal-size parts,
+        # and give last part more size.
         # e.g. 257MB can be divided into 2 parts, 128MB and 129MB
         offset = 0
         while len(content) - offset - self._max_block_size > self._block_size:
             self._part_number += 1
             offset_stop = offset + self._max_block_size
-            self._submit_upload_buffer(
-                self._part_number, content[offset:offset_stop])
+            self._submit_upload_buffer(self._part_number, content[offset:offset_stop])
             offset = offset_stop
         self._part_number += 1
         self._submit_upload_buffer(self._part_number, content[offset:])
@@ -184,7 +194,7 @@ class S3BufferedWriter(Writable[bytes]):
 
     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)
 
         result = self._buffer.write(data)
         if self._buffer.tell() >= self._block_size:
@@ -198,14 +208,13 @@ class S3BufferedWriter(Writable[bytes]):
             self._executor.shutdown()
 
     def _close(self):
-        _logger.debug('close file: %r' % self.name)
+        _logger.debug("close file: %r" % self.name)
 
         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
-                    Bucket=self._bucket,
-                    Key=self._key,
-                    Body=self._buffer.getvalue())
+                    Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue()
+                )
             self._shutdown()
             return
 
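The reworded comment in _submit_upload_content above describes how a flushed buffer is split into multipart-upload parts. The standalone sketch below mirrors that loop; the 8 MiB and 128 MiB limits are illustrative values standing in for DEFAULT_MIN_BLOCK_SIZE and DEFAULT_MAX_BLOCK_SIZE, not necessarily the library's defaults.

    MiB = 2**20

    def split_part_sizes(total: int, block_size: int, max_block_size: int):
        """Mirror the loop in _submit_upload_content: emit max_block_size parts
        while more than block_size would still remain afterwards, then put the
        whole remainder into the final part (S3 parts must be at least 5 MB)."""
        sizes, offset = [], 0
        while total - offset - max_block_size > block_size:
            sizes.append(max_block_size)
            offset += max_block_size
        sizes.append(total - offset)
        return sizes

    # 257 MiB with illustrative 8 MiB / 128 MiB limits reproduces the example
    # from the comment: two parts of 128 MiB and 129 MiB.
    print([size // MiB for size in split_part_sizes(257 * MiB, 8 * MiB, 128 * MiB)])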
megfile/lib/s3_cached_handler.py CHANGED
@@ -7,19 +7,20 @@ from megfile.utils import generate_cache_path
 
 
 class S3CachedHandler(S3MemoryHandler):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            mode: str,
-            *,
-            s3_client,
-            cache_path: Optional[str] = None,
-            remove_cache_when_open: bool = True,
-            profile_name: Optional[str] = None):
-
-        assert mode in ('rb', 'wb', 'ab', 'rb+', 'wb+', 'ab+')
+        self,
+        bucket: str,
+        key: str,
+        mode: str,
+        *,
+        s3_client,
+        cache_path: Optional[str] = None,
+        remove_cache_when_open: bool = True,
+        profile_name: Optional[str] = None,
+    ):
+        if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
+            # TODO: replace AssertionError with ValueError in 4.0.0
+            raise AssertionError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
@@ -31,7 +32,7 @@ class S3CachedHandler(S3MemoryHandler):
             cache_path = generate_cache_path(self.name)
 
         self._cache_path = cache_path
-        self._fileobj = open(self._cache_path, 'wb+')
+        self._fileobj = open(self._cache_path, "wb+")
         self._download_fileobj()
 
         if remove_cache_when_open:
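The mode check above moves from a bare assert to an explicit raise. A practical consequence worth noting (general Python behavior, not something stated in the diff): assert statements are skipped entirely when the interpreter runs with -O, while an explicit raise always fires. A small sketch with a hypothetical validator mirroring the new check:

    def validate_mode(mode: str) -> None:
        # Hypothetical helper mirroring the new check in S3CachedHandler.__init__;
        # unlike `assert`, this still runs under `python -O`.
        if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
            raise AssertionError("unacceptable mode: %r" % mode)

    validate_mode("rb")      # accepted
    try:
        validate_mode("r")   # text modes are rejected
    except AssertionError as error:
        print(error)         # unacceptable mode: 'r'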
megfile/lib/s3_limited_seekable_writer.py CHANGED
@@ -3,7 +3,11 @@ from io import BytesIO
 from logging import getLogger as get_logger
 from typing import Optional
 
-from megfile.config import DEFAULT_MAX_BLOCK_SIZE, DEFAULT_MAX_BUFFER_SIZE, DEFAULT_MIN_BLOCK_SIZE
+from megfile.config import (
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Seekable
 from megfile.lib.s3_buffered_writer import S3BufferedWriter
@@ -12,27 +16,27 @@ _logger = get_logger(__name__)
 
 
 class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
-    ''' For file format like msgpack and mp4, it's a pain that you need to write
+    """For file format like msgpack and mp4, it's a pain that you need to write
     header before writing the data. So it's kind of hard to make streaming write
     to unseekable file system like s3. In this case, we will try to keep the first
     and last parts of data in memory, so we can come back to head again and write
     the header at the last second.
-    '''
+    """
 
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-            head_block_size: Optional[int] = None,
-            tail_block_size: Optional[int] = None,
-            max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-            max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
-
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        head_block_size: Optional[int] = None,
+        tail_block_size: Optional[int] = None,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         super().__init__(
             bucket,
             key,
@@ -41,7 +45,8 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
-            profile_name=profile_name)
+            profile_name=profile_name,
+        )
 
         self._head_block_size = head_block_size or block_size
         self._tail_block_size = tail_block_size or block_size
@@ -61,8 +66,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
 
     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)
 
+        offset = int(offset)  # user maybe put offset with 'numpy.uint64' type
         if whence == os.SEEK_SET:
             target_offset = offset
         elif whence == os.SEEK_CUR:
@@ -70,7 +76,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         elif whence == os.SEEK_END:
             target_offset = self._content_size + offset
         else:
-            raise OSError('Unsupported whence value: %d' % whence)
+            raise OSError("Unsupported whence value: %d" % whence)
 
         if target_offset < self._head_block_size:
             self._head_buffer.seek(target_offset)
@@ -78,15 +84,16 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._buffer.seek(target_offset - self._tail_offset)
         else:
             raise OSError(
-                'Can only seek inside of head, or seek to tail, target offset: %d'
-                % target_offset)
+                "Can only seek inside of head, or seek to tail, target offset: %d"
+                % target_offset
+            )
 
         self._offset = target_offset
         return self._offset
 
     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)
 
         if self._head_size != self._head_block_size:  # no tail part yet
             self._write_to_head(data)
@@ -96,8 +103,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._write_to_tail(data)
         else:
             raise OSError(
-                'Can only write inside of head, or write to tail, current offset: %d'
-                % self._offset)
+                "Can only write inside of head, or write to tail, current offset: %d"
+                % self._offset
+            )
         return len(data)
 
     def _write_to_head(self, data: bytes):
@@ -116,8 +124,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
     def _write_to_head_after_tail_part_created(self, data: bytes):
         if self._offset + len(data) > self._head_block_size:
             raise Exception(
-                'Head part overflow, %d bytes left but try to write %d bytes' %
-                (self._head_block_size - self._offset, len(data)))
+                "Head part overflow, %d bytes left but try to write %d bytes"
+                % (self._head_block_size - self._offset, len(data))
+            )
         self._head_buffer.write(data)
         self._offset += len(data)
 
@@ -139,14 +148,15 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         self._submit_upload_content(content[:offset])
 
     def _close(self):
-        _logger.debug('close file: %r' % self.name)
+        _logger.debug("close file: %r" % self.name)
 
         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
                     Bucket=self._bucket,
                     Key=self._key,
-                    Body=self._head_buffer.getvalue() + self._buffer.getvalue())
+                    Body=self._head_buffer.getvalue() + self._buffer.getvalue(),
+                )
             self._shutdown()
             return
 
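The S3LimitedSeekableWriter docstring above explains why the class exists: formats like mp4 or msgpack need a header written after the body size is known. The sketch below shows that write pattern; io.BytesIO stands in for the writer so the example is self-contained, and the 8-byte length header is purely illustrative. Note that the real class only allows seeking back inside its in-memory head block (or to the tail), not to arbitrary positions.

    import io
    import struct

    writer = io.BytesIO()  # stand-in for an S3LimitedSeekableWriter

    writer.write(b"\x00" * 8)  # reserve space for a hypothetical 8-byte header
    body = b"payload " * 1000
    writer.write(body)         # stream the body

    writer.seek(0)             # come back to the head at the last second
    writer.write(struct.pack(">Q", len(body)))  # fill in the real header

    assert struct.unpack(">Q", writer.getvalue()[:8])[0] == len(body)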