megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +27 -39
- megfile/fs.py +169 -12
- megfile/fs_path.py +183 -260
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +71 -65
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +10 -19
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +126 -209
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +3 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
- megfile-4.0.0.post1.dist-info/RECORD +52 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.post1.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/cli.py
CHANGED
|
@@ -10,7 +10,7 @@ from functools import partial
|
|
|
10
10
|
import click
|
|
11
11
|
from tqdm import tqdm
|
|
12
12
|
|
|
13
|
-
from megfile.config import
|
|
13
|
+
from megfile.config import READER_BLOCK_SIZE
|
|
14
14
|
from megfile.hdfs_path import DEFAULT_HDFS_TIMEOUT
|
|
15
15
|
from megfile.interfaces import FileEntry
|
|
16
16
|
from megfile.lib.glob import get_non_glob_dir, has_magic
|
|
@@ -349,6 +349,9 @@ def sync(
|
|
|
349
349
|
quiet: bool,
|
|
350
350
|
skip: bool,
|
|
351
351
|
):
|
|
352
|
+
if not smart_exists(dst_path):
|
|
353
|
+
force = True
|
|
354
|
+
|
|
352
355
|
with ThreadPoolExecutor(max_workers=worker) as executor:
|
|
353
356
|
if has_magic(src_path):
|
|
354
357
|
src_root_path = get_non_glob_dir(src_path)
|
|
@@ -411,7 +414,7 @@ def sync(
|
|
|
411
414
|
dict(
|
|
412
415
|
src_root_path=src_root_path,
|
|
413
416
|
dst_root_path=dst_path,
|
|
414
|
-
|
|
417
|
+
src_file_entry=file_entry,
|
|
415
418
|
callback=callback,
|
|
416
419
|
followlinks=True,
|
|
417
420
|
callback_after_copy_file=callback_after_copy_file,
|
|
@@ -484,11 +487,11 @@ def tail(path: str, lines: int, follow: bool):
|
|
|
484
487
|
f.seek(0, os.SEEK_SET)
|
|
485
488
|
|
|
486
489
|
for current_offset in range(
|
|
487
|
-
file_size -
|
|
490
|
+
file_size - READER_BLOCK_SIZE, 0 - READER_BLOCK_SIZE, -READER_BLOCK_SIZE
|
|
488
491
|
):
|
|
489
492
|
current_offset = max(0, current_offset)
|
|
490
493
|
f.seek(current_offset)
|
|
491
|
-
block_lines = f.read(
|
|
494
|
+
block_lines = f.read(READER_BLOCK_SIZE).split(b"\n")
|
|
492
495
|
if len(line_list) > 0:
|
|
493
496
|
block_lines[-1] += line_list[0]
|
|
494
497
|
block_lines.extend(line_list[1:])
|
|
@@ -524,9 +527,11 @@ def to(path: str, append: bool, stdout: bool):
|
|
|
524
527
|
mode = "wb"
|
|
525
528
|
if append:
|
|
526
529
|
mode = "ab"
|
|
527
|
-
with
|
|
528
|
-
|
|
529
|
-
|
|
530
|
+
with (
|
|
531
|
+
smart_open("stdio://0", "rb") as stdin,
|
|
532
|
+
smart_open(path, mode) as f,
|
|
533
|
+
smart_open("stdio://1", "wb") as stdout_fd,
|
|
534
|
+
):
|
|
530
535
|
length = 16 * 1024
|
|
531
536
|
while True:
|
|
532
537
|
buf = stdin.read(length)
|
megfile/config.py
CHANGED
|
@@ -1,53 +1,39 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from logging import getLogger
|
|
3
2
|
|
|
4
|
-
_logger = getLogger(__name__)
|
|
5
3
|
|
|
6
|
-
|
|
4
|
+
def to_boolean(value):
|
|
5
|
+
return value.lower() in ("true", "yes", "1")
|
|
7
6
|
|
|
8
|
-
if os.getenv("MEGFILE_MAX_BUFFER_SIZE"):
|
|
9
|
-
DEFAULT_MAX_BUFFER_SIZE = int(os.environ["MEGFILE_MAX_BUFFER_SIZE"])
|
|
10
|
-
if DEFAULT_MAX_BUFFER_SIZE < DEFAULT_BLOCK_SIZE:
|
|
11
|
-
DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE
|
|
12
|
-
_logger.warning(
|
|
13
|
-
"Env 'MEGFILE_MAX_BUFFER_SIZE' is smaller than block size, "
|
|
14
|
-
"will not use buffer."
|
|
15
|
-
)
|
|
16
|
-
DEFAULT_BLOCK_CAPACITY = DEFAULT_MAX_BUFFER_SIZE // DEFAULT_BLOCK_SIZE
|
|
17
|
-
if os.getenv("MEGFILE_BLOCK_CAPACITY"):
|
|
18
|
-
_logger.warning(
|
|
19
|
-
"Env 'MEGFILE_MAX_BUFFER_SIZE' and 'MEGFILE_BLOCK_CAPACITY' are both set, "
|
|
20
|
-
"'MEGFILE_BLOCK_CAPACITY' will be ignored."
|
|
21
|
-
)
|
|
22
|
-
elif os.getenv("MEGFILE_BLOCK_CAPACITY"):
|
|
23
|
-
DEFAULT_BLOCK_CAPACITY = int(os.environ["MEGFILE_BLOCK_CAPACITY"])
|
|
24
|
-
DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE * DEFAULT_BLOCK_CAPACITY
|
|
25
|
-
else:
|
|
26
|
-
DEFAULT_MAX_BUFFER_SIZE = 128 * 2**20
|
|
27
|
-
DEFAULT_BLOCK_CAPACITY = 16
|
|
28
7
|
|
|
29
|
-
|
|
8
|
+
READER_BLOCK_SIZE = int(os.getenv("MEGFILE_READER_BLOCK_SIZE") or 8 * 2**20)
|
|
9
|
+
if READER_BLOCK_SIZE <= 0:
|
|
10
|
+
raise ValueError(
|
|
11
|
+
f"'MEGFILE_READER_BLOCK_SIZE' must bigger than 0, got {READER_BLOCK_SIZE}"
|
|
12
|
+
)
|
|
13
|
+
READER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20)
|
|
30
14
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
15
|
+
# Multi-upload in aws s3 has a maximum of 10,000 parts,
|
|
16
|
+
# so the maximum supported file size is MEGFILE_WRITER_BLOCK_SIZE * 10,000,
|
|
17
|
+
# the largest object that can be uploaded in a single PUT is 5 TB in aws s3.
|
|
18
|
+
WRITER_BLOCK_SIZE = int(os.getenv("MEGFILE_WRITER_BLOCK_SIZE") or 8 * 2**20)
|
|
19
|
+
if WRITER_BLOCK_SIZE <= 0:
|
|
20
|
+
raise ValueError(
|
|
21
|
+
f"'MEGFILE_WRITER_BLOCK_SIZE' must bigger than 0, got {WRITER_BLOCK_SIZE}"
|
|
22
|
+
)
|
|
23
|
+
WRITER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_WRITER_MAX_BUFFER_SIZE") or 128 * 2**20)
|
|
24
|
+
DEFAULT_WRITER_BLOCK_AUTOSCALE = not os.getenv("MEGFILE_WRITER_BLOCK_SIZE")
|
|
25
|
+
if os.getenv("MEGFILE_WRITER_BLOCK_AUTOSCALE"):
|
|
26
|
+
DEFAULT_WRITER_BLOCK_AUTOSCALE = to_boolean(
|
|
27
|
+
os.environ["MEGFILE_WRITER_BLOCK_AUTOSCALE"].lower()
|
|
28
|
+
)
|
|
40
29
|
|
|
41
|
-
GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or
|
|
42
|
-
DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
|
|
43
|
-
|
|
44
|
-
# for logging the size of file had read or wrote
|
|
45
|
-
BACKOFF_INITIAL = 64 * 2**20 # 64MB
|
|
46
|
-
BACKOFF_FACTOR = 4
|
|
30
|
+
GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
|
|
47
31
|
|
|
48
32
|
NEWLINE = ord("\n")
|
|
49
33
|
|
|
50
34
|
S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
|
|
35
|
+
|
|
36
|
+
DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
|
|
51
37
|
S3_MAX_RETRY_TIMES = int(
|
|
52
38
|
os.getenv("MEGFILE_S3_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
|
|
53
39
|
)
|
|
@@ -61,4 +47,6 @@ SFTP_MAX_RETRY_TIMES = int(
|
|
|
61
47
|
os.getenv("MEGFILE_SFTP_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
|
|
62
48
|
)
|
|
63
49
|
|
|
50
|
+
SFTP_HOST_KEY_POLICY = os.getenv("MEGFILE_SFTP_HOST_KEY_POLICY")
|
|
51
|
+
|
|
64
52
|
HTTP_AUTH_HEADERS = ("Authorization", "Www-Authenticate", "Cookie", "Cookie2")
|
megfile/fs.py
CHANGED
|
@@ -1,20 +1,10 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from typing import BinaryIO, Callable, Iterator, List, Optional, Tuple
|
|
2
3
|
|
|
3
4
|
from megfile.fs_path import (
|
|
4
5
|
FSPath,
|
|
5
6
|
_make_stat,
|
|
6
|
-
fs_cwd,
|
|
7
|
-
fs_glob,
|
|
8
|
-
fs_glob_stat,
|
|
9
|
-
fs_home,
|
|
10
|
-
fs_iglob,
|
|
11
|
-
fs_lstat,
|
|
12
|
-
fs_makedirs,
|
|
13
|
-
fs_move,
|
|
14
7
|
fs_path_join,
|
|
15
|
-
fs_readlink,
|
|
16
|
-
fs_rename,
|
|
17
|
-
fs_resolve,
|
|
18
8
|
is_fs,
|
|
19
9
|
)
|
|
20
10
|
from megfile.interfaces import Access, FileEntry, PathLike, StatResult
|
|
@@ -22,7 +12,6 @@ from megfile.interfaces import Access, FileEntry, PathLike, StatResult
|
|
|
22
12
|
__all__ = [
|
|
23
13
|
"is_fs",
|
|
24
14
|
"fs_path_join",
|
|
25
|
-
"_make_stat",
|
|
26
15
|
"fs_readlink",
|
|
27
16
|
"fs_cwd",
|
|
28
17
|
"fs_home",
|
|
@@ -430,3 +419,171 @@ def fs_save_as(file_object: BinaryIO, path: PathLike):
|
|
|
430
419
|
:param file_object: stream to be read
|
|
431
420
|
"""
|
|
432
421
|
return FSPath(path).save(file_object)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def fs_readlink(path) -> str:
|
|
425
|
+
"""
|
|
426
|
+
Return a string representing the path to which the symbolic link points.
|
|
427
|
+
:returns: Return a string representing the path to which the symbolic link points.
|
|
428
|
+
"""
|
|
429
|
+
return FSPath(path).readlink().path_without_protocol # pyre-ignore[7]
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def fs_cwd() -> str:
|
|
433
|
+
"""Return current working directory
|
|
434
|
+
|
|
435
|
+
:returns: Current working directory
|
|
436
|
+
"""
|
|
437
|
+
return os.getcwd()
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def fs_home():
|
|
441
|
+
"""Return the home directory
|
|
442
|
+
|
|
443
|
+
:returns: Home directory path
|
|
444
|
+
"""
|
|
445
|
+
return os.path.expanduser("~")
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def fs_iglob(
|
|
449
|
+
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
450
|
+
) -> Iterator[str]:
|
|
451
|
+
"""Return path iterator in ascending alphabetical order,
|
|
452
|
+
in which path matches glob pattern
|
|
453
|
+
|
|
454
|
+
1. If doesn't match any path, return empty list
|
|
455
|
+
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
456
|
+
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
457
|
+
fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
458
|
+
2. No guarantee that each path in result is different, which means:
|
|
459
|
+
Assume there exists a path `/a/b/c/b/d.txt`
|
|
460
|
+
use path pattern like `/**/b/**/*.txt` to glob,
|
|
461
|
+
the path above will be returned twice
|
|
462
|
+
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
463
|
+
when recursive is `True`
|
|
464
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True)
|
|
465
|
+
in ascending alphabetical order.
|
|
466
|
+
5. Hidden files (filename starts with '.') will not be found in the result
|
|
467
|
+
|
|
468
|
+
:param recursive: If False, `**` will not search directory recursively
|
|
469
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
470
|
+
raise FileNotFoundError
|
|
471
|
+
:returns: An iterator contains paths match `pathname`
|
|
472
|
+
"""
|
|
473
|
+
for path_obj in FSPath(path).iglob(
|
|
474
|
+
pattern="", recursive=recursive, missing_ok=missing_ok
|
|
475
|
+
):
|
|
476
|
+
yield path_obj.path_without_protocol # pyre-ignore[7]
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def fs_glob(
|
|
480
|
+
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
481
|
+
) -> List[str]:
|
|
482
|
+
"""Return path list in ascending alphabetical order,
|
|
483
|
+
in which path matches glob pattern
|
|
484
|
+
|
|
485
|
+
1. If doesn't match any path, return empty list
|
|
486
|
+
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
487
|
+
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
488
|
+
fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
489
|
+
2. No guarantee that each path in result is different, which means:
|
|
490
|
+
Assume there exists a path `/a/b/c/b/d.txt`
|
|
491
|
+
use path pattern like `/**/b/**/*.txt` to glob,
|
|
492
|
+
the path above will be returned twice
|
|
493
|
+
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
494
|
+
when recursive is `True`
|
|
495
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True)
|
|
496
|
+
in ascending alphabetical order.
|
|
497
|
+
5. Hidden files (filename starts with '.') will not be found in the result
|
|
498
|
+
|
|
499
|
+
:param recursive: If False, `**` will not search directory recursively
|
|
500
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
501
|
+
raise FileNotFoundError
|
|
502
|
+
:returns: A list contains paths match `pathname`
|
|
503
|
+
"""
|
|
504
|
+
return list(fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok))
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def fs_glob_stat(
|
|
508
|
+
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
509
|
+
) -> Iterator[FileEntry]:
|
|
510
|
+
"""Return an iterator of file entries (name, path and file stat),
|
|
511
|
+
in ascending alphabetical order, in which path matches glob pattern
|
|
512
|
+
|
|
513
|
+
1. If doesn't match any path, return empty list
|
|
514
|
+
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
515
|
+
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
516
|
+
fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
517
|
+
2. No guarantee that each path in result is different, which means:
|
|
518
|
+
Assume there exists a path `/a/b/c/b/d.txt`
|
|
519
|
+
use path pattern like `/**/b/**/*.txt` to glob,
|
|
520
|
+
the path above will be returned twice.
|
|
521
|
+
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
522
|
+
when recursive is `True`
|
|
523
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True)
|
|
524
|
+
in ascending alphabetical order.
|
|
525
|
+
5. Hidden files (filename starts with '.') will not be found in the result
|
|
526
|
+
|
|
527
|
+
:param recursive: If False, `**` will not search directory recursively
|
|
528
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
529
|
+
raise FileNotFoundError
|
|
530
|
+
:returns: An iterator of file entries (name, path and file stat),
|
|
531
|
+
in which paths match `pathname`
|
|
532
|
+
"""
|
|
533
|
+
for path in fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok):
|
|
534
|
+
yield FileEntry(os.path.basename(path), path, _make_stat(os.lstat(path)))
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def fs_resolve(path: PathLike) -> str:
|
|
538
|
+
"""Equal to fs_realpath, return the real path of given path
|
|
539
|
+
|
|
540
|
+
:param path: Given path
|
|
541
|
+
:returns: Real path of given path
|
|
542
|
+
"""
|
|
543
|
+
return FSPath(path).realpath()
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def fs_makedirs(path: PathLike, exist_ok: bool = False):
|
|
547
|
+
"""
|
|
548
|
+
make a directory on fs, including parent directory
|
|
549
|
+
|
|
550
|
+
If there exists a file on the path, raise FileExistsError
|
|
551
|
+
|
|
552
|
+
:param path: Given path
|
|
553
|
+
:param exist_ok: If False and target directory exists, raise FileExistsError
|
|
554
|
+
:raises: FileExistsError
|
|
555
|
+
"""
|
|
556
|
+
return FSPath(path).mkdir(parents=True, exist_ok=exist_ok)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def fs_lstat(path: PathLike) -> StatResult:
|
|
560
|
+
"""
|
|
561
|
+
Like Path.stat() but, if the path points to a symbolic link,
|
|
562
|
+
return the symbolic link’s information rather than its target’s.
|
|
563
|
+
|
|
564
|
+
:param path: Given path
|
|
565
|
+
:returns: StatResult
|
|
566
|
+
"""
|
|
567
|
+
return FSPath(path).lstat()
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def fs_rename(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
|
|
571
|
+
"""
|
|
572
|
+
rename file on fs
|
|
573
|
+
|
|
574
|
+
:param src_path: Given path
|
|
575
|
+
:param dst_path: Given destination path
|
|
576
|
+
:param overwrite: whether or not overwrite file when exists
|
|
577
|
+
"""
|
|
578
|
+
FSPath(src_path).rename(dst_path, overwrite)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def fs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
|
|
582
|
+
"""
|
|
583
|
+
rename file on fs
|
|
584
|
+
|
|
585
|
+
:param src_path: Given path
|
|
586
|
+
:param dst_path: Given destination path
|
|
587
|
+
:param overwrite: whether or not overwrite file when exists
|
|
588
|
+
"""
|
|
589
|
+
return fs_rename(src_path, dst_path, overwrite)
|