megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. megfile/cli.py +12 -7
  2. megfile/config.py +27 -39
  3. megfile/fs.py +169 -12
  4. megfile/fs_path.py +183 -260
  5. megfile/hdfs.py +106 -5
  6. megfile/hdfs_path.py +34 -90
  7. megfile/http.py +50 -1
  8. megfile/http_path.py +27 -65
  9. megfile/interfaces.py +1 -8
  10. megfile/lib/base_prefetch_reader.py +62 -78
  11. megfile/lib/combine_reader.py +5 -0
  12. megfile/lib/glob.py +3 -6
  13. megfile/lib/hdfs_prefetch_reader.py +7 -7
  14. megfile/lib/http_prefetch_reader.py +6 -6
  15. megfile/lib/s3_buffered_writer.py +71 -65
  16. megfile/lib/s3_cached_handler.py +1 -2
  17. megfile/lib/s3_limited_seekable_writer.py +3 -7
  18. megfile/lib/s3_memory_handler.py +1 -2
  19. megfile/lib/s3_pipe_handler.py +1 -2
  20. megfile/lib/s3_prefetch_reader.py +10 -19
  21. megfile/lib/s3_share_cache_reader.py +8 -5
  22. megfile/pathlike.py +397 -401
  23. megfile/s3.py +118 -17
  24. megfile/s3_path.py +126 -209
  25. megfile/sftp.py +300 -10
  26. megfile/sftp_path.py +46 -322
  27. megfile/smart.py +33 -27
  28. megfile/smart_path.py +9 -14
  29. megfile/stdio.py +1 -1
  30. megfile/stdio_path.py +2 -2
  31. megfile/utils/__init__.py +3 -4
  32. megfile/version.py +1 -1
  33. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
  34. megfile-4.0.0.post1.dist-info/RECORD +52 -0
  35. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
  36. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
  37. docs/conf.py +0 -65
  38. megfile-3.1.6.post1.dist-info/RECORD +0 -55
  39. scripts/convert_results_to_sarif.py +0 -91
  40. scripts/generate_file.py +0 -344
  41. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
  42. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
  43. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/cli.py CHANGED
@@ -10,7 +10,7 @@ from functools import partial
10
10
  import click
11
11
  from tqdm import tqdm
12
12
 
13
- from megfile.config import DEFAULT_BLOCK_SIZE
13
+ from megfile.config import READER_BLOCK_SIZE
14
14
  from megfile.hdfs_path import DEFAULT_HDFS_TIMEOUT
15
15
  from megfile.interfaces import FileEntry
16
16
  from megfile.lib.glob import get_non_glob_dir, has_magic
@@ -349,6 +349,9 @@ def sync(
349
349
  quiet: bool,
350
350
  skip: bool,
351
351
  ):
352
+ if not smart_exists(dst_path):
353
+ force = True
354
+
352
355
  with ThreadPoolExecutor(max_workers=worker) as executor:
353
356
  if has_magic(src_path):
354
357
  src_root_path = get_non_glob_dir(src_path)
@@ -411,7 +414,7 @@ def sync(
411
414
  dict(
412
415
  src_root_path=src_root_path,
413
416
  dst_root_path=dst_path,
414
- src_file_path=file_entry.path,
417
+ src_file_entry=file_entry,
415
418
  callback=callback,
416
419
  followlinks=True,
417
420
  callback_after_copy_file=callback_after_copy_file,
@@ -484,11 +487,11 @@ def tail(path: str, lines: int, follow: bool):
484
487
  f.seek(0, os.SEEK_SET)
485
488
 
486
489
  for current_offset in range(
487
- file_size - DEFAULT_BLOCK_SIZE, 0 - DEFAULT_BLOCK_SIZE, -DEFAULT_BLOCK_SIZE
490
+ file_size - READER_BLOCK_SIZE, 0 - READER_BLOCK_SIZE, -READER_BLOCK_SIZE
488
491
  ):
489
492
  current_offset = max(0, current_offset)
490
493
  f.seek(current_offset)
491
- block_lines = f.read(DEFAULT_BLOCK_SIZE).split(b"\n")
494
+ block_lines = f.read(READER_BLOCK_SIZE).split(b"\n")
492
495
  if len(line_list) > 0:
493
496
  block_lines[-1] += line_list[0]
494
497
  block_lines.extend(line_list[1:])
@@ -524,9 +527,11 @@ def to(path: str, append: bool, stdout: bool):
524
527
  mode = "wb"
525
528
  if append:
526
529
  mode = "ab"
527
- with smart_open("stdio://0", "rb") as stdin, smart_open(
528
- path, mode
529
- ) as f, smart_open("stdio://1", "wb") as stdout_fd:
530
+ with (
531
+ smart_open("stdio://0", "rb") as stdin,
532
+ smart_open(path, mode) as f,
533
+ smart_open("stdio://1", "wb") as stdout_fd,
534
+ ):
530
535
  length = 16 * 1024
531
536
  while True:
532
537
  buf = stdin.read(length)
megfile/config.py CHANGED
@@ -1,53 +1,39 @@
1
1
  import os
2
- from logging import getLogger
3
2
 
4
- _logger = getLogger(__name__)
5
3
 
6
- DEFAULT_BLOCK_SIZE = int(os.getenv("MEGFILE_BLOCK_SIZE") or 8 * 2**20)
4
+ def to_boolean(value):
5
+ return value.lower() in ("true", "yes", "1")
7
6
 
8
- if os.getenv("MEGFILE_MAX_BUFFER_SIZE"):
9
- DEFAULT_MAX_BUFFER_SIZE = int(os.environ["MEGFILE_MAX_BUFFER_SIZE"])
10
- if DEFAULT_MAX_BUFFER_SIZE < DEFAULT_BLOCK_SIZE:
11
- DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE
12
- _logger.warning(
13
- "Env 'MEGFILE_MAX_BUFFER_SIZE' is smaller than block size, "
14
- "will not use buffer."
15
- )
16
- DEFAULT_BLOCK_CAPACITY = DEFAULT_MAX_BUFFER_SIZE // DEFAULT_BLOCK_SIZE
17
- if os.getenv("MEGFILE_BLOCK_CAPACITY"):
18
- _logger.warning(
19
- "Env 'MEGFILE_MAX_BUFFER_SIZE' and 'MEGFILE_BLOCK_CAPACITY' are both set, "
20
- "'MEGFILE_BLOCK_CAPACITY' will be ignored."
21
- )
22
- elif os.getenv("MEGFILE_BLOCK_CAPACITY"):
23
- DEFAULT_BLOCK_CAPACITY = int(os.environ["MEGFILE_BLOCK_CAPACITY"])
24
- DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE * DEFAULT_BLOCK_CAPACITY
25
- else:
26
- DEFAULT_MAX_BUFFER_SIZE = 128 * 2**20
27
- DEFAULT_BLOCK_CAPACITY = 16
28
7
 
29
- DEFAULT_MIN_BLOCK_SIZE = int(os.getenv("MEGFILE_MIN_BLOCK_SIZE") or DEFAULT_BLOCK_SIZE)
8
+ READER_BLOCK_SIZE = int(os.getenv("MEGFILE_READER_BLOCK_SIZE") or 8 * 2**20)
9
+ if READER_BLOCK_SIZE <= 0:
10
+ raise ValueError(
11
+ f"'MEGFILE_READER_BLOCK_SIZE' must bigger than 0, got {READER_BLOCK_SIZE}"
12
+ )
13
+ READER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20)
30
14
 
31
- if os.getenv("MEGFILE_MAX_BLOCK_SIZE"):
32
- DEFAULT_MAX_BLOCK_SIZE = int(os.environ["MEGFILE_MAX_BLOCK_SIZE"])
33
- if DEFAULT_MAX_BLOCK_SIZE < DEFAULT_BLOCK_SIZE:
34
- DEFAULT_MAX_BLOCK_SIZE = DEFAULT_BLOCK_SIZE
35
- _logger.warning(
36
- "Env 'MEGFILE_MAX_BLOCK_SIZE' is smaller than block size, will be ignored."
37
- )
38
- else:
39
- DEFAULT_MAX_BLOCK_SIZE = max(128 * 2**20, DEFAULT_BLOCK_SIZE)
15
+ # Multi-upload in aws s3 has a maximum of 10,000 parts,
16
+ # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
17
+ # the largest object that can be uploaded in a single PUT is 5 TB in aws s3.
18
+ WRITER_BLOCK_SIZE = int(os.getenv("MEGFILE_WRITER_BLOCK_SIZE") or 8 * 2**20)
19
+ if WRITER_BLOCK_SIZE <= 0:
20
+ raise ValueError(
21
+ f"'MEGFILE_WRITER_BLOCK_SIZE' must bigger than 0, got {WRITER_BLOCK_SIZE}"
22
+ )
23
+ WRITER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_WRITER_MAX_BUFFER_SIZE") or 128 * 2**20)
24
+ DEFAULT_WRITER_BLOCK_AUTOSCALE = not os.getenv("MEGFILE_WRITER_BLOCK_SIZE")
25
+ if os.getenv("MEGFILE_WRITER_BLOCK_AUTOSCALE"):
26
+ DEFAULT_WRITER_BLOCK_AUTOSCALE = to_boolean(
27
+ os.environ["MEGFILE_WRITER_BLOCK_AUTOSCALE"].lower()
28
+ )
40
29
 
41
- GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 32)
42
- DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
43
-
44
- # for logging the size of file had read or wrote
45
- BACKOFF_INITIAL = 64 * 2**20 # 64MB
46
- BACKOFF_FACTOR = 4
30
+ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
47
31
 
48
32
  NEWLINE = ord("\n")
49
33
 
50
34
  S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
35
+
36
+ DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
51
37
  S3_MAX_RETRY_TIMES = int(
52
38
  os.getenv("MEGFILE_S3_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
53
39
  )
@@ -61,4 +47,6 @@ SFTP_MAX_RETRY_TIMES = int(
61
47
  os.getenv("MEGFILE_SFTP_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
62
48
  )
63
49
 
50
+ SFTP_HOST_KEY_POLICY = os.getenv("MEGFILE_SFTP_HOST_KEY_POLICY")
51
+
64
52
  HTTP_AUTH_HEADERS = ("Authorization", "Www-Authenticate", "Cookie", "Cookie2")
megfile/fs.py CHANGED
@@ -1,20 +1,10 @@
1
+ import os
1
2
  from typing import BinaryIO, Callable, Iterator, List, Optional, Tuple
2
3
 
3
4
  from megfile.fs_path import (
4
5
  FSPath,
5
6
  _make_stat,
6
- fs_cwd,
7
- fs_glob,
8
- fs_glob_stat,
9
- fs_home,
10
- fs_iglob,
11
- fs_lstat,
12
- fs_makedirs,
13
- fs_move,
14
7
  fs_path_join,
15
- fs_readlink,
16
- fs_rename,
17
- fs_resolve,
18
8
  is_fs,
19
9
  )
20
10
  from megfile.interfaces import Access, FileEntry, PathLike, StatResult
@@ -22,7 +12,6 @@ from megfile.interfaces import Access, FileEntry, PathLike, StatResult
22
12
  __all__ = [
23
13
  "is_fs",
24
14
  "fs_path_join",
25
- "_make_stat",
26
15
  "fs_readlink",
27
16
  "fs_cwd",
28
17
  "fs_home",
@@ -430,3 +419,171 @@ def fs_save_as(file_object: BinaryIO, path: PathLike):
430
419
  :param file_object: stream to be read
431
420
  """
432
421
  return FSPath(path).save(file_object)
422
+
423
+
424
+ def fs_readlink(path) -> str:
425
+ """
426
+ Return a string representing the path to which the symbolic link points.
427
+ :returns: Return a string representing the path to which the symbolic link points.
428
+ """
429
+ return FSPath(path).readlink().path_without_protocol # pyre-ignore[7]
430
+
431
+
432
+ def fs_cwd() -> str:
433
+ """Return current working directory
434
+
435
+ returns: Current working directory
436
+ """
437
+ return os.getcwd()
438
+
439
+
440
+ def fs_home():
441
+ """Return the home directory
442
+
443
+ returns: Home directory path
444
+ """
445
+ return os.path.expanduser("~")
446
+
447
+
448
+ def fs_iglob(
449
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
450
+ ) -> Iterator[str]:
451
+ """Return path iterator in ascending alphabetical order,
452
+ in which path matches glob pattern
453
+
454
+ 1. If doesn't match any path, return empty list
455
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
456
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
457
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
458
+ 2. No guarantee that each path in result is different, which means:
459
+ Assume there exists a path `/a/b/c/b/d.txt`
460
+ use path pattern like `/**/b/**/*.txt` to glob,
461
+ the path above will be returned twice
462
+ 3. `**` will match any matched file, directory, symlink and '' by default,
463
+ when recursive is `True`
464
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
465
+ in ascending alphabetical order.
466
+ 5. Hidden files (filename stars with '.') will not be found in the result
467
+
468
+ :param recursive: If False, `**` will not search directory recursively
469
+ :param missing_ok: If False and target path doesn't match any file,
470
+ raise FileNotFoundError
471
+ :returns: An iterator contains paths match `pathname`
472
+ """
473
+ for path_obj in FSPath(path).iglob(
474
+ pattern="", recursive=recursive, missing_ok=missing_ok
475
+ ):
476
+ yield path_obj.path_without_protocol # pyre-ignore[7]
477
+
478
+
479
+ def fs_glob(
480
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
481
+ ) -> List[str]:
482
+ """Return path list in ascending alphabetical order,
483
+ in which path matches glob pattern
484
+
485
+ 1. If doesn't match any path, return empty list
486
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
487
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
488
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
489
+ 2. No guarantee that each path in result is different, which means:
490
+ Assume there exists a path `/a/b/c/b/d.txt`
491
+ use path pattern like `/**/b/**/*.txt` to glob,
492
+ the path above will be returned twice
493
+ 3. `**` will match any matched file, directory, symlink and '' by default,
494
+ when recursive is `True`
495
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
496
+ in ascending alphabetical order.
497
+ 5. Hidden files (filename stars with '.') will not be found in the result
498
+
499
+ :param recursive: If False, `**` will not search directory recursively
500
+ :param missing_ok: If False and target path doesn't match any file,
501
+ raise FileNotFoundError
502
+ :returns: A list contains paths match `pathname`
503
+ """
504
+ return list(fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok))
505
+
506
+
507
+ def fs_glob_stat(
508
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
509
+ ) -> Iterator[FileEntry]:
510
+ """Return a list contains tuples of path and file stat,
511
+ in ascending alphabetical order, in which path matches glob pattern
512
+
513
+ 1. If doesn't match any path, return empty list
514
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
515
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
516
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
517
+ 2. No guarantee that each path in result is different, which means:
518
+ Assume there exists a path `/a/b/c/b/d.txt`
519
+ use path pattern like `/**/b/**/*.txt` to glob,
520
+ the path above will be returned twice.
521
+ 3. `**` will match any matched file, directory, symlink and '' by default,
522
+ when recursive is `True`
523
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
524
+ in ascending alphabetical order.
525
+ 5. Hidden files (filename stars with '.') will not be found in the result
526
+
527
+ :param recursive: If False, `**` will not search directory recursively
528
+ :param missing_ok: If False and target path doesn't match any file,
529
+ raise FileNotFoundError
530
+ :returns: A list contains tuples of path and file stat,
531
+ in which paths match `pathname`
532
+ """
533
+ for path in fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok):
534
+ yield FileEntry(os.path.basename(path), path, _make_stat(os.lstat(path)))
535
+
536
+
537
+ def fs_resolve(path: PathLike) -> str:
538
+ """Equal to fs_realpath, return the real path of given path
539
+
540
+ :param path: Given path
541
+ :returns: Real path of given path
542
+ """
543
+ return FSPath(path).realpath()
544
+
545
+
546
+ def fs_makedirs(path: PathLike, exist_ok: bool = False):
547
+ """
548
+ make a directory on fs, including parent directory
549
+
550
+ If there exists a file on the path, raise FileExistsError
551
+
552
+ :param path: Given path
553
+ :param exist_ok: If False and target directory exists, raise FileExistsError
554
+ :raises: FileExistsError
555
+ """
556
+ return FSPath(path).mkdir(parents=True, exist_ok=exist_ok)
557
+
558
+
559
+ def fs_lstat(path: PathLike) -> StatResult:
560
+ """
561
+ Like Path.stat() but, if the path points to a symbolic link,
562
+ return the symbolic link’s information rather than its target’s.
563
+
564
+ :param path: Given path
565
+ :returns: StatResult
566
+ """
567
+ return FSPath(path).lstat()
568
+
569
+
570
+ def fs_rename(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
571
+ """
572
+ rename file on fs
573
+
574
+ :param src_path: Given path
575
+ :param dst_path: Given destination path
576
+ :param overwrite: whether or not overwrite file when exists
577
+ """
578
+ FSPath(src_path).rename(dst_path, overwrite)
579
+
580
+
581
+ def fs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
582
+ """
583
+ rename file on fs
584
+
585
+ :param src_path: Given path
586
+ :param dst_path: Given destination path
587
+ :param overwrite: whether or not overwrite file when exists
588
+ """
589
+ return fs_rename(src_path, dst_path, overwrite)