megfile 3.1.6__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. megfile/cli.py +12 -7
  2. megfile/config.py +34 -44
  3. megfile/fs.py +169 -11
  4. megfile/fs_path.py +183 -259
  5. megfile/hdfs.py +106 -5
  6. megfile/hdfs_path.py +34 -90
  7. megfile/http.py +50 -1
  8. megfile/http_path.py +27 -65
  9. megfile/interfaces.py +1 -8
  10. megfile/lib/base_prefetch_reader.py +62 -78
  11. megfile/lib/combine_reader.py +5 -0
  12. megfile/lib/glob.py +3 -6
  13. megfile/lib/hdfs_prefetch_reader.py +7 -7
  14. megfile/lib/http_prefetch_reader.py +6 -6
  15. megfile/lib/s3_buffered_writer.py +67 -64
  16. megfile/lib/s3_cached_handler.py +1 -2
  17. megfile/lib/s3_limited_seekable_writer.py +3 -7
  18. megfile/lib/s3_memory_handler.py +1 -2
  19. megfile/lib/s3_pipe_handler.py +1 -2
  20. megfile/lib/s3_prefetch_reader.py +15 -20
  21. megfile/lib/s3_share_cache_reader.py +8 -5
  22. megfile/pathlike.py +397 -401
  23. megfile/s3.py +118 -17
  24. megfile/s3_path.py +150 -224
  25. megfile/sftp.py +300 -10
  26. megfile/sftp_path.py +46 -322
  27. megfile/smart.py +33 -27
  28. megfile/smart_path.py +9 -14
  29. megfile/stdio.py +1 -1
  30. megfile/stdio_path.py +2 -2
  31. megfile/utils/__init__.py +11 -4
  32. megfile/version.py +1 -1
  33. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/METADATA +7 -7
  34. megfile-4.0.0.dist-info/RECORD +52 -0
  35. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/WHEEL +1 -1
  36. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/top_level.txt +0 -2
  37. docs/conf.py +0 -65
  38. megfile-3.1.6.dist-info/RECORD +0 -55
  39. scripts/convert_results_to_sarif.py +0 -91
  40. scripts/generate_file.py +0 -344
  41. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE +0 -0
  42. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE.pyre +0 -0
  43. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/entry_points.txt +0 -0
megfile/cli.py CHANGED
@@ -10,7 +10,7 @@ from functools import partial
10
10
  import click
11
11
  from tqdm import tqdm
12
12
 
13
- from megfile.config import DEFAULT_BLOCK_SIZE
13
+ from megfile.config import READER_BLOCK_SIZE
14
14
  from megfile.hdfs_path import DEFAULT_HDFS_TIMEOUT
15
15
  from megfile.interfaces import FileEntry
16
16
  from megfile.lib.glob import get_non_glob_dir, has_magic
@@ -349,6 +349,9 @@ def sync(
349
349
  quiet: bool,
350
350
  skip: bool,
351
351
  ):
352
+ if not smart_exists(dst_path):
353
+ force = True
354
+
352
355
  with ThreadPoolExecutor(max_workers=worker) as executor:
353
356
  if has_magic(src_path):
354
357
  src_root_path = get_non_glob_dir(src_path)
@@ -411,7 +414,7 @@ def sync(
411
414
  dict(
412
415
  src_root_path=src_root_path,
413
416
  dst_root_path=dst_path,
414
- src_file_path=file_entry.path,
417
+ src_file_entry=file_entry,
415
418
  callback=callback,
416
419
  followlinks=True,
417
420
  callback_after_copy_file=callback_after_copy_file,
@@ -484,11 +487,11 @@ def tail(path: str, lines: int, follow: bool):
484
487
  f.seek(0, os.SEEK_SET)
485
488
 
486
489
  for current_offset in range(
487
- file_size - DEFAULT_BLOCK_SIZE, 0 - DEFAULT_BLOCK_SIZE, -DEFAULT_BLOCK_SIZE
490
+ file_size - READER_BLOCK_SIZE, 0 - READER_BLOCK_SIZE, -READER_BLOCK_SIZE
488
491
  ):
489
492
  current_offset = max(0, current_offset)
490
493
  f.seek(current_offset)
491
- block_lines = f.read(DEFAULT_BLOCK_SIZE).split(b"\n")
494
+ block_lines = f.read(READER_BLOCK_SIZE).split(b"\n")
492
495
  if len(line_list) > 0:
493
496
  block_lines[-1] += line_list[0]
494
497
  block_lines.extend(line_list[1:])
@@ -524,9 +527,11 @@ def to(path: str, append: bool, stdout: bool):
524
527
  mode = "wb"
525
528
  if append:
526
529
  mode = "ab"
527
- with smart_open("stdio://0", "rb") as stdin, smart_open(
528
- path, mode
529
- ) as f, smart_open("stdio://1", "wb") as stdout_fd:
530
+ with (
531
+ smart_open("stdio://0", "rb") as stdin,
532
+ smart_open(path, mode) as f,
533
+ smart_open("stdio://1", "wb") as stdout_fd,
534
+ ):
530
535
  length = 16 * 1024
531
536
  while True:
532
537
  buf = stdin.read(length)
megfile/config.py CHANGED
@@ -1,53 +1,39 @@
1
1
  import os
2
- from logging import getLogger
3
-
4
- _logger = getLogger(__name__)
5
-
6
- DEFAULT_BLOCK_SIZE = int(os.getenv("MEGFILE_BLOCK_SIZE") or 8 * 2**20)
7
-
8
- if os.getenv("MEGFILE_MAX_BUFFER_SIZE"):
9
- DEFAULT_MAX_BUFFER_SIZE = int(os.environ["MEGFILE_MAX_BUFFER_SIZE"])
10
- if DEFAULT_MAX_BUFFER_SIZE < DEFAULT_BLOCK_SIZE:
11
- DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE
12
- _logger.warning(
13
- "Env 'MEGFILE_MAX_BUFFER_SIZE' is smaller than block size, "
14
- "will not use buffer."
15
- )
16
- DEFAULT_BLOCK_CAPACITY = DEFAULT_MAX_BUFFER_SIZE // DEFAULT_BLOCK_SIZE
17
- if os.getenv("MEGFILE_BLOCK_CAPACITY"):
18
- _logger.warning(
19
- "Env 'MEGFILE_MAX_BUFFER_SIZE' and 'MEGFILE_BLOCK_CAPACITY' are both set, "
20
- "'MEGFILE_BLOCK_CAPACITY' will be ignored."
21
- )
22
- elif os.getenv("MEGFILE_BLOCK_CAPACITY"):
23
- DEFAULT_BLOCK_CAPACITY = int(os.environ["MEGFILE_BLOCK_CAPACITY"])
24
- DEFAULT_MAX_BUFFER_SIZE = DEFAULT_BLOCK_SIZE * DEFAULT_BLOCK_CAPACITY
25
- else:
26
- DEFAULT_MAX_BUFFER_SIZE = 128 * 2**20
27
- DEFAULT_BLOCK_CAPACITY = 16
28
-
29
- DEFAULT_MIN_BLOCK_SIZE = int(os.getenv("MEGFILE_MIN_BLOCK_SIZE") or DEFAULT_BLOCK_SIZE)
30
-
31
- if os.getenv("MEGFILE_MAX_BLOCK_SIZE"):
32
- DEFAULT_MAX_BLOCK_SIZE = int(os.environ["MEGFILE_MAX_BLOCK_SIZE"])
33
- if DEFAULT_MAX_BLOCK_SIZE < DEFAULT_BLOCK_SIZE:
34
- DEFAULT_MAX_BLOCK_SIZE = DEFAULT_BLOCK_SIZE
35
- _logger.warning(
36
- "Env 'MEGFILE_MAX_BLOCK_SIZE' is smaller than block size, will be ignored."
37
- )
38
- else:
39
- DEFAULT_MAX_BLOCK_SIZE = max(128 * 2**20, DEFAULT_BLOCK_SIZE)
40
-
41
- GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 32)
42
- DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
43
2
 
44
- # for logging the size of file had read or wrote
45
- BACKOFF_INITIAL = 64 * 2**20 # 64MB
46
- BACKOFF_FACTOR = 4
3
+
4
+ def to_boolean(value):
5
+ return value.lower() in ("true", "yes", "1")
6
+
7
+
8
+ READER_BLOCK_SIZE = int(os.getenv("MEGFILE_READER_BLOCK_SIZE") or 8 * 2**20)
9
+ if READER_BLOCK_SIZE <= 0:
10
+ raise ValueError(
11
+ f"'MEGFILE_READER_BLOCK_SIZE' must bigger than 0, got {READER_BLOCK_SIZE}"
12
+ )
13
+ READER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20)
14
+
15
+ # Multi-upload in aws s3 has a maximum of 10,000 parts,
16
+ # so the maximum supported file size is MEGFILE_WRITER_BLOCK_SIZE * 10,000,
17
+ # the largest object that can be uploaded in a single PUT is 5 TB in aws s3.
18
+ WRITER_BLOCK_SIZE = int(os.getenv("MEGFILE_WRITER_BLOCK_SIZE") or 8 * 2**20)
19
+ if WRITER_BLOCK_SIZE <= 0:
20
+ raise ValueError(
21
+ f"'MEGFILE_WRITER_BLOCK_SIZE' must bigger than 0, got {WRITER_BLOCK_SIZE}"
22
+ )
23
+ WRITER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_WRITER_MAX_BUFFER_SIZE") or 128 * 2**20)
24
+ DEFAULT_WRITER_BLOCK_AUTOSCALE = not os.getenv("MEGFILE_WRITER_BLOCK_SIZE")
25
+ if os.getenv("MEGFILE_WRITER_BLOCK_AUTOSCALE"):
26
+ DEFAULT_WRITER_BLOCK_AUTOSCALE = to_boolean(
27
+ os.environ["MEGFILE_WRITER_BLOCK_AUTOSCALE"].lower()
28
+ )
29
+
30
+ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
47
31
 
48
32
  NEWLINE = ord("\n")
49
33
 
50
34
  S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
35
+
36
+ DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
51
37
  S3_MAX_RETRY_TIMES = int(
52
38
  os.getenv("MEGFILE_S3_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
53
39
  )
@@ -60,3 +46,7 @@ HDFS_MAX_RETRY_TIMES = int(
60
46
  SFTP_MAX_RETRY_TIMES = int(
61
47
  os.getenv("MEGFILE_SFTP_MAX_RETRY_TIMES") or DEFAULT_MAX_RETRY_TIMES
62
48
  )
49
+
50
+ SFTP_HOST_KEY_POLICY = os.getenv("MEGFILE_SFTP_HOST_KEY_POLICY")
51
+
52
+ HTTP_AUTH_HEADERS = ("Authorization", "Www-Authenticate", "Cookie", "Cookie2")
megfile/fs.py CHANGED
@@ -1,20 +1,10 @@
1
+ import os
1
2
  from typing import BinaryIO, Callable, Iterator, List, Optional, Tuple
2
3
 
3
4
  from megfile.fs_path import (
4
5
  FSPath,
5
6
  _make_stat,
6
- fs_cwd,
7
- fs_glob,
8
- fs_glob_stat,
9
- fs_home,
10
- fs_iglob,
11
- fs_lstat,
12
- fs_makedirs,
13
- fs_move,
14
7
  fs_path_join,
15
- fs_readlink,
16
- fs_rename,
17
- fs_resolve,
18
8
  is_fs,
19
9
  )
20
10
  from megfile.interfaces import Access, FileEntry, PathLike, StatResult
@@ -430,3 +420,171 @@ def fs_save_as(file_object: BinaryIO, path: PathLike):
430
420
  :param file_object: stream to be read
431
421
  """
432
422
  return FSPath(path).save(file_object)
423
+
424
+
425
+ def fs_readlink(path) -> str:
426
+ """
427
+ Return a string representing the path to which the symbolic link points.
428
+ :returns: Return a string representing the path to which the symbolic link points.
429
+ """
430
+ return FSPath(path).readlink().path_without_protocol # pyre-ignore[7]
431
+
432
+
433
+ def fs_cwd() -> str:
434
+ """Return current working directory
435
+
436
+ returns: Current working directory
437
+ """
438
+ return os.getcwd()
439
+
440
+
441
+ def fs_home():
442
+ """Return the home directory
443
+
444
+ returns: Home directory path
445
+ """
446
+ return os.path.expanduser("~")
447
+
448
+
449
+ def fs_iglob(
450
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
451
+ ) -> Iterator[str]:
452
+ """Return path iterator in ascending alphabetical order,
453
+ in which path matches glob pattern
454
+
455
+ 1. If doesn't match any path, return empty list
456
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
457
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
458
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
459
+ 2. No guarantee that each path in result is different, which means:
460
+ Assume there exists a path `/a/b/c/b/d.txt`
461
+ use path pattern like `/**/b/**/*.txt` to glob,
462
+ the path above will be returned twice
463
+ 3. `**` will match any matched file, directory, symlink and '' by default,
464
+ when recursive is `True`
465
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
466
+ in ascending alphabetical order.
467
+ 5. Hidden files (filename starts with '.') will not be found in the result
468
+
469
+ :param recursive: If False, `**` will not search directory recursively
470
+ :param missing_ok: If False and target path doesn't match any file,
471
+ raise FileNotFoundError
472
+ :returns: An iterator of paths matching `path`
473
+ """
474
+ for path_obj in FSPath(path).iglob(
475
+ pattern="", recursive=recursive, missing_ok=missing_ok
476
+ ):
477
+ yield path_obj.path_without_protocol # pyre-ignore[7]
478
+
479
+
480
+ def fs_glob(
481
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
482
+ ) -> List[str]:
483
+ """Return path list in ascending alphabetical order,
484
+ in which path matches glob pattern
485
+
486
+ 1. If doesn't match any path, return empty list
487
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
488
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
489
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
490
+ 2. No guarantee that each path in result is different, which means:
491
+ Assume there exists a path `/a/b/c/b/d.txt`
492
+ use path pattern like `/**/b/**/*.txt` to glob,
493
+ the path above will be returned twice
494
+ 3. `**` will match any matched file, directory, symlink and '' by default,
495
+ when recursive is `True`
496
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
497
+ in ascending alphabetical order.
498
+ 5. Hidden files (filename starts with '.') will not be found in the result
499
+
500
+ :param recursive: If False, `**` will not search directory recursively
501
+ :param missing_ok: If False and target path doesn't match any file,
502
+ raise FileNotFoundError
503
+ :returns: A list of paths matching `path`
504
+ """
505
+ return list(fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok))
506
+
507
+
508
+ def fs_glob_stat(
509
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
510
+ ) -> Iterator[FileEntry]:
511
+ """Return an iterator of file entries (name, path and file stat),
512
+ in ascending alphabetical order, in which path matches glob pattern
513
+
514
+ 1. If doesn't match any path, return empty list
515
+ Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
516
+ when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
517
+ fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
518
+ 2. No guarantee that each path in result is different, which means:
519
+ Assume there exists a path `/a/b/c/b/d.txt`
520
+ use path pattern like `/**/b/**/*.txt` to glob,
521
+ the path above will be returned twice.
522
+ 3. `**` will match any matched file, directory, symlink and '' by default,
523
+ when recursive is `True`
524
+ 4. fs_glob returns same as glob.glob(pathname, recursive=True)
525
+ in ascending alphabetical order.
526
+ 5. Hidden files (filename starts with '.') will not be found in the result
527
+
528
+ :param recursive: If False, `**` will not search directory recursively
529
+ :param missing_ok: If False and target path doesn't match any file,
530
+ raise FileNotFoundError
531
+ :returns: An iterator of file entries (name, path and file stat),
532
+ in which paths match `pathname`
533
+ """
534
+ for path in fs_iglob(path=path, recursive=recursive, missing_ok=missing_ok):
535
+ yield FileEntry(os.path.basename(path), path, _make_stat(os.lstat(path)))
536
+
537
+
538
+ def fs_resolve(path: PathLike) -> str:
539
+ """Equal to fs_realpath, return the real path of given path
540
+
541
+ :param path: Given path
542
+ :returns: Real path of given path
543
+ """
544
+ return FSPath(path).realpath()
545
+
546
+
547
+ def fs_makedirs(path: PathLike, exist_ok: bool = False):
548
+ """
549
+ make a directory on fs, including parent directory
550
+
551
+ If there exists a file on the path, raise FileExistsError
552
+
553
+ :param path: Given path
554
+ :param exist_ok: If False and target directory exists, raise FileExistsError
555
+ :raises: FileExistsError
556
+ """
557
+ return FSPath(path).mkdir(parents=True, exist_ok=exist_ok)
558
+
559
+
560
+ def fs_lstat(path: PathLike) -> StatResult:
561
+ """
562
+ Like Path.stat() but, if the path points to a symbolic link,
563
+ return the symbolic link’s information rather than its target’s.
564
+
565
+ :param path: Given path
566
+ :returns: StatResult
567
+ """
568
+ return FSPath(path).lstat()
569
+
570
+
571
+ def fs_rename(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
572
+ """
573
+ rename file on fs
574
+
575
+ :param src_path: Given path
576
+ :param dst_path: Given destination path
577
+ :param overwrite: whether or not overwrite file when exists
578
+ """
579
+ FSPath(src_path).rename(dst_path, overwrite)
580
+
581
+
582
+ def fs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
583
+ """
584
+ rename file on fs
585
+
586
+ :param src_path: Given path
587
+ :param dst_path: Given destination path
588
+ :param overwrite: whether or not overwrite file when exists
589
+ """
590
+ return fs_rename(src_path, dst_path, overwrite)