megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +27 -39
- megfile/fs.py +169 -12
- megfile/fs_path.py +183 -260
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +71 -65
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +10 -19
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +126 -209
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +3 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
- megfile-4.0.0.post1.dist-info/RECORD +52 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.post1.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/hdfs.py
CHANGED
@@ -1,11 +1,8 @@
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
 
+from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
 from megfile.hdfs_path import (
     HdfsPath,
-    hdfs_glob,
-    hdfs_glob_stat,
-    hdfs_iglob,
-    hdfs_makedirs,
     is_hdfs,
 )
 from megfile.interfaces import FileEntry, PathLike, StatResult
@@ -300,8 +297,112 @@ def hdfs_open(
     buffering: Optional[int] = None,
     encoding: Optional[str] = None,
     errors: Optional[str] = None,
+    max_workers: Optional[int] = None,
+    max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+    block_forward: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
     **kwargs,
 ) -> IO:
+    """
+    Open a file on the specified path.
+
+    :param path: Given path
+    :param mode: Mode to open the file. Supports 'r', 'rb', 'w', 'wb', 'a', 'ab'.
+    :param buffering: Optional integer used to set the buffering policy.
+    :param encoding: Name of the encoding used to decode or encode the file.
+        Should only be used in text mode.
+    :param errors: Optional string specifying how encoding and decoding errors are
+        to be handled. Cannot be used in binary mode.
+    :param max_workers: Max download thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: Number of blocks of data for reader cached from the
+        offset position.
+    :param block_size: Size of a single block for reader, default is 8MB.
+    :returns: A file-like object.
+    :raises ValueError: If an unacceptable mode is provided.
+    """
     return HdfsPath(path).open(
-        mode,
+        mode,
+        buffering=buffering,
+        encoding=encoding,
+        errors=errors,
+        max_workers=max_workers,
+        max_buffer_size=max_buffer_size,
+        block_forward=block_forward,
+        block_size=block_size,
     )
+
+
+def hdfs_glob(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> List[str]:
+    """Return hdfs path list in ascending alphabetical order,
+    in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: A list contains paths match `path`
+    """
+    return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
+
+
+def hdfs_glob_stat(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> Iterator[FileEntry]:
+    """Return a generator contains tuples of path and file stat,
+    in ascending alphabetical order, in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: A generator contains tuples of path and file stat,
+        in which paths match `path`
+    """
+    return HdfsPath(path).glob_stat(
+        pattern="", recursive=recursive, missing_ok=missing_ok
+    )
+
+
+def hdfs_iglob(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> Iterator[str]:
+    """Return hdfs path iterator in ascending alphabetical order,
+    in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: An iterator contains paths match `path`
+    """
+    for path_obj in HdfsPath(path).iglob(
+        pattern="", recursive=recursive, missing_ok=missing_ok
+    ):
+        yield path_obj.path_with_protocol
+
+
+def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
+    """
+    Create an hdfs directory.
+    Purely creating directory is invalid because it's unavailable on OSS.
+    This function is to test the target bucket have WRITE access.
+
+    :param path: Given path
+    :param exist_ok: If False and target directory exists, raise S3FileExistsError
+    :raises: FileExistsError
+    """
+    return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
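
In 4.0.0, `hdfs_open` takes the prefetch-reader tuning options as explicit keyword arguments instead of passing them through `**kwargs`. A minimal usage sketch of the new signature; the HDFS path and the tuning values below are illustrative, not defaults:

from megfile.hdfs import hdfs_open

# Hypothetical path; values are illustrative overrides of the documented defaults.
with hdfs_open(
    "hdfs://mybucket/data/sample.bin",
    "rb",
    max_workers=4,               # download threads (None = shared 8-thread pool)
    max_buffer_size=32 * 2**20,  # cap the in-memory prefetch cache at 32MB (0 disables it)
    block_size=8 * 2**20,        # one 8MB block per download request
) as f:
    header = f.read(1024)
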
megfile/hdfs_path.py
CHANGED
@@ -6,6 +6,11 @@ import sys
 from functools import cached_property, lru_cache
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
 
+from megfile.config import (
+    HDFS_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
+)
 from megfile.errors import _create_missing_ok_generator, raise_hdfs_error
 from megfile.interfaces import FileEntry, PathLike, StatResult, URIPath
 from megfile.lib.compat import fspath
@@ -19,17 +24,13 @@ from megfile.utils import _is_pickle
 __all__ = [
     "HdfsPath",
     "is_hdfs",
-    "hdfs_glob",
-    "hdfs_glob_stat",
-    "hdfs_iglob",
-    "hdfs_makedirs",
 ]
 
 HDFS_USER = "HDFS_USER"
 HDFS_URL = "HDFS_URL"
 HDFS_ROOT = "HDFS_ROOT"
 HDFS_TIMEOUT = "HDFS_TIMEOUT"
-HDFS_TOKEN = "HDFS_TOKEN"
+HDFS_TOKEN = "HDFS_TOKEN"  # nosec B105
 HDFS_CONFIG_PATH = "HDFS_CONFIG_PATH"
 MAX_RETRIES = 10
 DEFAULT_HDFS_TIMEOUT = 10
@@ -97,79 +98,6 @@ def get_hdfs_client(profile_name: Optional[str] = None):
     return hdfs_api.InsecureClient(**config)
 
 
-def hdfs_glob(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> List[str]:
-    """Return hdfs path list in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A list contains paths match `path`
-    """
-    return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
-
-
-def hdfs_glob_stat(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> Iterator[FileEntry]:
-    """Return a generator contains tuples of path and file stat,
-    in ascending alphabetical order, in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A generator contains tuples of path and file stat,
-        in which paths match `path`
-    """
-    return HdfsPath(path).glob_stat(
-        pattern="", recursive=recursive, missing_ok=missing_ok
-    )
-
-
-def hdfs_iglob(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> Iterator[str]:
-    """Return hdfs path iterator in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: An iterator contains paths match `path`
-    """
-    for path_obj in HdfsPath(path).iglob(
-        pattern="", recursive=recursive, missing_ok=missing_ok
-    ):
-        yield path_obj.path_with_protocol
-
-
-def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
-    """
-    Create an hdfs directory.
-    Purely creating directory is invalid because it's unavailable on OSS.
-    This function is to test the target bucket have WRITE access.
-
-    :param path: Given path
-    :param exist_ok: If False and target directory exists, raise S3FileExistsError
-    :raises: FileExistsError
-    """
-    return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
-
-
 @SmartPath.register
 class HdfsPath(URIPath):
     protocol = "hdfs"
@@ -641,8 +569,31 @@ class HdfsPath(URIPath):
         buffering: Optional[int] = None,
         encoding: Optional[str] = None,
         errors: Optional[str] = None,
+        max_workers: Optional[int] = None,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+        block_forward: Optional[int] = None,
+        block_size: int = READER_BLOCK_SIZE,
         **kwargs,
     ) -> IO:
+        """
+        Open a file on the specified path.
+
+        :param mode: Mode to open the file. Supports 'r', 'rb', 'w', 'wb', 'a', 'ab'.
+        :param buffering: Optional integer used to set the buffering policy.
+        :param encoding: Name of the encoding used to decode or encode the file.
+            Should only be used in text mode.
+        :param errors: Optional string specifying how encoding and decoding errors are
+            to be handled. Cannot be used in binary mode.
+        :param max_workers: Max download thread number, `None` by default,
+            will use global thread pool with 8 threads.
+        :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+            Set to `0` will disable cache.
+        :param block_forward: Number of blocks of data for reader cached from the
+            offset position.
+        :param block_size: Size of a single block for reader, default is 8MB.
+        :returns: A file-like object.
+        :raises ValueError: If an unacceptable mode is provided.
+        """
         if "+" in mode:
             raise ValueError("unacceptable mode: %r" % mode)
 
@@ -653,22 +604,15 @@ class HdfsPath(URIPath):
 
         with raise_hdfs_error(self.path_with_protocol):
             if mode in ("r", "rb"):
-                keys = [
-                    "block_size",
-                    "block_capacity",
-                    "block_forward",
-                    "max_retries",
-                    "max_workers",
-                ]
-                input_kwargs = {}
-                for key in keys:
-                    if key in kwargs:
-                        input_kwargs[key] = kwargs[key]
                 file_obj = HdfsPrefetchReader(
                     hdfs_path=self.path_without_protocol,
                     client=self._client,
                     profile_name=self._profile_name,
-                    **input_kwargs,
+                    block_size=block_size,
+                    max_buffer_size=max_buffer_size,
+                    block_forward=block_forward,
+                    max_retries=HDFS_MAX_RETRY_TIMES,
+                    max_workers=max_workers,
                 )
                 if _is_pickle(file_obj):
                     file_obj = io.BufferedReader(file_obj)  # type: ignore
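
Because the `hdfs_glob` family now lives in `megfile.hdfs` and is gone from `megfile.hdfs_path`'s `__all__`, imports from the old location need updating. A sketch of the one-line migration (the paths shown are hypothetical):

# megfile < 4.0.0
# from megfile.hdfs_path import hdfs_glob, hdfs_makedirs

# megfile >= 4.0.0
from megfile.hdfs import hdfs_glob, hdfs_makedirs

hdfs_makedirs("hdfs://mybucket/output", exist_ok=True)
for path in hdfs_glob("hdfs://mybucket/output/**/*.json"):
    print(path)
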
megfile/http.py
CHANGED
@@ -1,4 +1,8 @@
-from
+from io import BufferedReader
+from typing import Optional, Union
+
+from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
+from megfile.http_path import HttpPath, HttpPrefetchReader, get_http_session, is_http
 from megfile.interfaces import PathLike, StatResult
 
 __all__ = [
@@ -12,6 +16,51 @@ __all__ = [
 ]
 
 
+def http_open(
+    path: PathLike,
+    mode: str = "rb",
+    *,
+    encoding: Optional[str] = None,
+    errors: Optional[str] = None,
+    max_workers: Optional[int] = None,
+    max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+    block_forward: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
+    **kwargs,
+) -> Union[BufferedReader, HttpPrefetchReader]:
+    """Open a BytesIO to read binary data of given http(s) url
+
+    .. note ::
+
+        Essentially, it reads data of http(s) url to memory by requests,
+        and then return BytesIO to user.
+
+    :param path: Given path
+    :param mode: Only supports 'r' or 'rb' mode now
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and decoding
+        errors are to be handled—this cannot be used in binary mode.
+    :param max_workers: Max download thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: How many blocks of data cached from offset position
+    :param block_size: Size of single block, 8MB by default. Each block will be uploaded
+        or downloaded by single thread.
+    :return: A file-like object with http(s) data
+    """
+    return HttpPath(path).open(
+        mode,
+        encoding=encoding,
+        errors=errors,
+        max_workers=max_workers,
+        max_buffer_size=max_buffer_size,
+        block_forward=block_forward,
+        block_size=block_size,
+    )
+
+
 def http_stat(path: PathLike, follow_symlinks=True) -> StatResult:
     """
     Get StatResult of http_url response, including size and mtime,
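
`http_open` is now defined in `megfile.http` and accepts the same reader-tuning keywords as the other protocols. A sketch, assuming the (hypothetical) URL is reachable:

from megfile.http import http_open

with http_open(
    "https://example.com/dataset.bin",  # hypothetical URL
    "rb",
    max_workers=2,    # per-reader download threads (None = shared pool)
    block_forward=4,  # prefetch at most 4 blocks ahead of the current offset
) as f:
    chunk = f.read(4096)
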
megfile/http_path.py
CHANGED
@@ -9,20 +9,27 @@ from typing import Iterable, Iterator, Optional, Tuple, Union
 import requests
 from urllib3 import HTTPResponse
 
-from megfile.config import
+from megfile.config import (
+    HTTP_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
+)
 from megfile.errors import http_should_retry, patch_method, translate_http_error
 from megfile.interfaces import PathLike, Readable, StatResult, URIPath
 from megfile.lib.compat import fspath
 from megfile.lib.http_prefetch_reader import DEFAULT_TIMEOUT, HttpPrefetchReader
-from megfile.lib.s3_buffered_writer import DEFAULT_MAX_BUFFER_SIZE
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
 from megfile.utils import _is_pickle, binary_open
 
-__all__ = [
+__all__ = [
+    "HttpPath",
+    "HttpsPath",
+    "get_http_session",
+    "is_http",
+]
 
 _logger = get_logger(__name__)
-max_retries = HTTP_MAX_RETRY_TIMES
 
 
 def get_http_session(
@@ -101,7 +108,7 @@ def get_http_session(
 
     session.request = patch_method(
         partial(session.request, timeout=timeout),
-        max_retries=max_retries,
+        max_retries=HTTP_MAX_RETRY_TIMES,
         should_retry=http_should_retry,
         before_callback=before_callback,
         after_callback=after_callback,
@@ -127,48 +134,6 @@ def is_http(path: PathLike) -> bool:
     return scheme == "http" or scheme == "https"
 
 
-def http_open(
-    path: PathLike,
-    mode: str = "rb",
-    *,
-    encoding: Optional[str] = None,
-    errors: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
-    max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-    forward_ratio: Optional[float] = None,
-    block_size: int = DEFAULT_BLOCK_SIZE,
-    **kwargs,
-) -> Union[BufferedReader, HttpPrefetchReader]:
-    """Open a BytesIO to read binary data of given http(s) url
-
-    .. note ::
-
-        Essentially, it reads data of http(s) url to memory by requests,
-        and then return BytesIO to user.
-
-    :param path: Given path
-    :param mode: Only supports 'rb' mode now
-    :param encoding: encoding is the name of the encoding used to decode or encode
-        the file. This should only be used in text mode.
-    :param errors: errors is an optional string that specifies how encoding and decoding
-        errors are to be handled—this cannot be used in binary mode.
-    :param max_concurrency: Max download thread number, None by default
-    :param max_buffer_size: Max cached buffer size in memory, 128MB by default
-    :param block_size: Size of single block, 8MB by default. Each block will be uploaded
-        or downloaded by single thread.
-    :return: BytesIO initialized with http(s) data
-    """
-    return HttpPath(path).open(
-        mode,
-        encoding=encoding,
-        errors=errors,
-        max_concurrency=max_concurrency,
-        max_buffer_size=max_buffer_size,
-        forward_ratio=forward_ratio,
-        block_size=block_size,
-    )
-
-
 @SmartPath.register
 class HttpPath(URIPath):
     protocol = "http"
@@ -185,10 +150,10 @@ class HttpPath(URIPath):
         self,
         mode: str = "rb",
         *,
-        max_concurrency: Optional[int] = None,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-        forward_ratio: Optional[float] = None,
-        block_size: int = DEFAULT_BLOCK_SIZE,
+        max_workers: Optional[int] = None,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+        block_forward: Optional[int] = None,
+        block_size: int = READER_BLOCK_SIZE,
         **kwargs,
     ) -> Union[BufferedReader, HttpPrefetchReader]:
         """Open a BytesIO to read binary data of given http(s) url
@@ -198,16 +163,19 @@ class HttpPath(URIPath):
         Essentially, it reads data of http(s) url to memory by requests,
         and then return BytesIO to user.
 
-        :param mode: Only supports 'rb' mode now
+        :param mode: Only supports 'r' or 'rb' mode now
         :param encoding: encoding is the name of the encoding used to decode or encode
             the file. This should only be used in text mode.
         :param errors: errors is an optional string that specifies how encoding and
             decoding errors are to be handled—this cannot be used in binary mode.
-        :param max_concurrency: Max download thread number, None by default
-        :param max_buffer_size: Max cached buffer size in memory, 128MB by default
+        :param max_workers: Max download thread number, `None` by default,
+            will use global thread pool with 8 threads.
+        :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+            Set to `0` will disable cache.
+        :param block_forward: How many blocks of data cached from offset position
         :param block_size: Size of single block, 8MB by default. Each block will
             be uploaded or downloaded by single thread.
-        :return: BytesIO initialized with http(s) data
+        :return: A file-like object with http(s) data
         """
         if mode not in ("rb",):
             raise ValueError("unacceptable mode: %r" % mode)
@@ -234,20 +202,14 @@ class HttpPath(URIPath):
         ):
             response.close()
 
-        block_capacity = max_buffer_size // block_size
-        if forward_ratio is None:
-            block_forward = None
-        else:
-            block_forward = max(int(block_capacity * forward_ratio), 1)
-
         reader = HttpPrefetchReader(
             self,
             content_size=content_size,
-            max_retries=max_retries,
-            max_workers=max_concurrency,
-            block_capacity=block_capacity,
-            block_forward=block_forward,
             block_size=block_size,
+            max_buffer_size=max_buffer_size,
+            block_forward=block_forward,
+            max_retries=HTTP_MAX_RETRY_TIMES,
+            max_workers=max_workers,
         )
         if _is_pickle(reader):
             reader = BufferedReader(reader)  # type: ignore
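
For callers of `HttpPath.open`, the old keywords map onto the new ones: `max_concurrency` became `max_workers`, and the relative `forward_ratio` was replaced by an absolute `block_forward`. The removed body above shows the old conversion, so an equivalent `block_forward` can still be derived from a former ratio. A sketch with illustrative values (the URL is hypothetical):

from megfile.http_path import HttpPath

block_size = 8 * 2**20         # 8MB, the documented default
max_buffer_size = 128 * 2**20  # 128MB, the documented default
forward_ratio = 0.25           # illustrative pre-4.0.0 value

# Mirrors the removed pre-4.0.0 computation:
block_forward = max(int((max_buffer_size // block_size) * forward_ratio), 1)  # -> 4

reader = HttpPath("https://example.com/big.bin").open(
    "rb",
    max_workers=2,                # was: max_concurrency=2
    max_buffer_size=max_buffer_size,
    block_forward=block_forward,  # was: forward_ratio=0.25
    block_size=block_size,
)
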
megfile/interfaces.py
CHANGED
@@ -6,7 +6,6 @@ from typing import IO, AnyStr, Iterable, List, Optional
 from megfile.pathlike import (
     Access,
     BasePath,
-    BaseURIPath,
     FileEntry,
     PathLike,
     Self,
@@ -17,11 +16,9 @@ from megfile.pathlike import (
 __all__ = [
     "Access",
     "BasePath",
-    "BaseURIPath",
     "FileEntry",
     "PathLike",
     "StatResult",
-    "URIPath",
     "fullname",
     "Closable",
     "FileLike",
@@ -31,6 +28,7 @@ __all__ = [
     "FileCacher",
     "NullCacher",
     "ContextIterator",
+    "URIPath",
 ]
 
 
@@ -115,11 +113,6 @@ class FileLike(Closable, IOBase, IO[AnyStr], ABC):  # pytype: disable=signature-
         This is not implemented for read-only and non-blocking streams.
         """
 
-    def __del__(self) -> None:
-        # TODO: Next version should turn on __del__ for auto closing,
-        # and disable this in child class like CombineReader
-        pass
-
 
 class Seekable(FileLike, ABC):
     def seekable(self) -> bool: