megfile 4.0.0.post1__py3-none-any.whl → 4.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/__init__.py CHANGED
@@ -121,6 +121,7 @@ from megfile.s3_path import S3Path
121
121
  from megfile.sftp import (
122
122
  is_sftp,
123
123
  sftp_absolute,
124
+ sftp_add_host_key,
124
125
  sftp_chmod,
125
126
  sftp_concat,
126
127
  sftp_copy,
@@ -371,6 +372,7 @@ __all__ = [
371
372
  "sftp_copy",
372
373
  "sftp_sync",
373
374
  "sftp_concat",
375
+ "sftp_add_host_key",
374
376
  "is_hdfs",
375
377
  "hdfs_exists",
376
378
  "hdfs_stat",
megfile/cli.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import configparser
2
- import logging
3
2
  import os
4
3
  import shutil
5
4
  import sys
@@ -10,10 +9,11 @@ from functools import partial
10
9
  import click
11
10
  from tqdm import tqdm
12
11
 
13
- from megfile.config import READER_BLOCK_SIZE
12
+ from megfile.config import READER_BLOCK_SIZE, SFTP_HOST_KEY_POLICY, set_log_level
14
13
  from megfile.hdfs_path import DEFAULT_HDFS_TIMEOUT
15
14
  from megfile.interfaces import FileEntry
16
15
  from megfile.lib.glob import get_non_glob_dir, has_magic
16
+ from megfile.sftp import sftp_add_host_key
17
17
  from megfile.smart import (
18
18
  _smart_sync_single_file,
19
19
  smart_copy,
@@ -44,29 +44,34 @@ from megfile.smart_path import SmartPath
44
44
  from megfile.utils import get_human_size
45
45
  from megfile.version import VERSION
46
46
 
47
- logging.basicConfig(level=logging.ERROR)
48
- logging.getLogger("megfile").setLevel(level=logging.INFO)
49
- DEBUG = False
47
+ options = {}
48
+ set_log_level()
50
49
 
51
50
 
52
51
  @click.group()
53
52
  @click.option("--debug", is_flag=True, help="Enable debug mode.")
54
- def cli(debug):
53
+ @click.option(
54
+ "--log-level",
55
+ type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
56
+ help="Set logging level.",
57
+ )
58
+ def cli(debug, log_level):
55
59
  """
56
60
  Client for megfile.
57
61
 
58
62
  If you install megfile with ``--user``,
59
63
  you also need configure ``$HOME/.local/bin`` into ``$PATH``.
60
64
  """
61
- global DEBUG
62
- DEBUG = debug
65
+ options["debug"] = debug
66
+ options["log_level"] = log_level or ("DEBUG" if debug else "INFO")
67
+ set_log_level(options["log_level"])
63
68
 
64
69
 
65
70
  def safe_cli(): # pragma: no cover
66
71
  try:
67
72
  cli()
68
73
  except Exception as e:
69
- if DEBUG:
74
+ if options.get("debug", False):
70
75
  raise
71
76
  else:
72
77
  click.echo(f"\n[{type(e).__name__}] {e}", err=True)
@@ -110,6 +115,23 @@ def smart_list_stat(path):
110
115
  yield from smart_scandir(path)
111
116
 
112
117
 
118
+ def _sftp_prompt_host_key(path):
119
+ if SFTP_HOST_KEY_POLICY == "auto":
120
+ return
121
+
122
+ path = SmartPath(path)
123
+ if path.protocol == "sftp":
124
+ hostname = (
125
+ path.pathlike._urlsplit_parts.hostname # pytype: disable=attribute-error
126
+ )
127
+ port = path.pathlike._urlsplit_parts.port # pytype: disable=attribute-error
128
+ sftp_add_host_key(
129
+ hostname=hostname,
130
+ port=port,
131
+ prompt=True,
132
+ )
133
+
134
+
113
135
  def _ls(path: str, long: bool, recursive: bool, human_readable: bool):
114
136
  base_path = path
115
137
  full_path = False
@@ -121,6 +143,9 @@ def _ls(path: str, long: bool, recursive: bool, human_readable: bool):
121
143
  scan_func = smart_scan_stat
122
144
  else:
123
145
  scan_func = smart_list_stat
146
+
147
+ _sftp_prompt_host_key(base_path)
148
+
124
149
  if long:
125
150
  if human_readable:
126
151
  echo_func = human_echo
@@ -209,6 +234,10 @@ def cp(
209
234
  ):
210
235
  if not no_target_directory and (dst_path.endswith("/") or smart_isdir(dst_path)):
211
236
  dst_path = smart_path_join(dst_path, os.path.basename(src_path))
237
+
238
+ _sftp_prompt_host_key(src_path)
239
+ _sftp_prompt_host_key(dst_path)
240
+
212
241
  if recursive:
213
242
  with ThreadPoolExecutor(max_workers=(os.cpu_count() or 1) * 2) as executor:
214
243
  if progress_bar:
@@ -274,6 +303,10 @@ def mv(
274
303
  ):
275
304
  if not no_target_directory and (dst_path.endswith("/") or smart_isdir(dst_path)):
276
305
  dst_path = smart_path_join(dst_path, os.path.basename(src_path))
306
+
307
+ _sftp_prompt_host_key(src_path)
308
+ _sftp_prompt_host_key(dst_path)
309
+
277
310
  if progress_bar:
278
311
  src_protocol, _ = SmartPath._extract_protocol(src_path)
279
312
  dst_protocol, _ = SmartPath._extract_protocol(dst_path)
@@ -324,6 +357,8 @@ def mv(
324
357
  "under the specified directory or prefix.",
325
358
  )
326
359
  def rm(path: str, recursive: bool):
360
+ _sftp_prompt_host_key(path)
361
+
327
362
  remove_func = smart_remove if recursive else smart_unlink
328
363
  remove_func(path)
329
364
 
@@ -349,6 +384,9 @@ def sync(
349
384
  quiet: bool,
350
385
  skip: bool,
351
386
  ):
387
+ _sftp_prompt_host_key(src_path)
388
+ _sftp_prompt_host_key(dst_path)
389
+
352
390
  if not smart_exists(dst_path):
353
391
  force = True
354
392
 
@@ -434,18 +472,24 @@ def sync(
434
472
  @cli.command(short_help="Make the path if it doesn't already exist.")
435
473
  @click.argument("path")
436
474
  def mkdir(path: str):
475
+ _sftp_prompt_host_key(path)
476
+
437
477
  smart_makedirs(path)
438
478
 
439
479
 
440
480
  @cli.command(short_help="Make the file if it doesn't already exist.")
441
481
  @click.argument("path")
442
482
  def touch(path: str):
483
+ _sftp_prompt_host_key(path)
484
+
443
485
  smart_touch(path)
444
486
 
445
487
 
446
488
  @cli.command(short_help="Concatenate any files and send them to stdout.")
447
489
  @click.argument("path")
448
490
  def cat(path: str):
491
+ _sftp_prompt_host_key(path)
492
+
449
493
  with smart_open(path, "rb") as f:
450
494
  shutil.copyfileobj(f, sys.stdout.buffer) # pytype: disable=wrong-arg-types
451
495
 
@@ -458,6 +502,8 @@ def cat(path: str):
458
502
  "-n", "--lines", type=click.INT, default=10, help="print the first NUM lines"
459
503
  )
460
504
  def head(path: str, lines: int):
505
+ _sftp_prompt_host_key(path)
506
+
461
507
  with smart_open(path, "rb") as f:
462
508
  for _ in range(lines):
463
509
  try:
@@ -480,6 +526,8 @@ def head(path: str, lines: int):
480
526
  "-f", "--follow", is_flag=True, help="output appended data as the file grows"
481
527
  )
482
528
  def tail(path: str, lines: int, follow: bool):
529
+ _sftp_prompt_host_key(path)
530
+
483
531
  line_list = []
484
532
  with smart_open(path, "rb") as f:
485
533
  f.seek(0, os.SEEK_END)
@@ -524,6 +572,8 @@ def tail(path: str, lines: int, follow: bool):
524
572
  @click.option("-a", "--append", is_flag=True, help="Append to the given file")
525
573
  @click.option("-o", "--stdout", is_flag=True, help="File content to standard output")
526
574
  def to(path: str, append: bool, stdout: bool):
575
+ _sftp_prompt_host_key(path)
576
+
527
577
  mode = "wb"
528
578
  if append:
529
579
  mode = "ab"
@@ -545,24 +595,32 @@ def to(path: str, append: bool, stdout: bool):
545
595
  @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
546
596
  @click.argument("path")
547
597
  def md5sum(path: str):
598
+ _sftp_prompt_host_key(path)
599
+
548
600
  click.echo(smart_getmd5(path, recalculate=True))
549
601
 
550
602
 
551
603
  @cli.command(short_help="Return the total size and number of objects in remote:path.")
552
604
  @click.argument("path")
553
605
  def size(path: str):
606
+ _sftp_prompt_host_key(path)
607
+
554
608
  click.echo(smart_getsize(path))
555
609
 
556
610
 
557
611
  @cli.command(short_help="Return the mtime and number of objects in remote:path.")
558
612
  @click.argument("path")
559
613
  def mtime(path: str):
614
+ _sftp_prompt_host_key(path)
615
+
560
616
  click.echo(smart_getmtime(path))
561
617
 
562
618
 
563
619
  @cli.command(short_help="Return the stat and number of objects in remote:path.")
564
620
  @click.argument("path")
565
621
  def stat(path: str):
622
+ _sftp_prompt_host_key(path)
623
+
566
624
  click.echo(smart_stat(path))
567
625
 
568
626
 
megfile/config.py CHANGED
@@ -1,30 +1,109 @@
1
+ import logging
1
2
  import os
3
+ import typing as T
2
4
 
3
5
 
4
- def to_boolean(value):
6
+ def parse_quantity(quantity: T.Union[str, int]) -> int:
7
+ """
8
+ Parse kubernetes canonical form quantity like 200Mi to a int number.
9
+ Supported SI suffixes:
10
+ base1024: Ki | Mi | Gi | Ti | Pi | Ei
11
+ base1000: "" | k | M | G | T | P | E
12
+
13
+ (International System of units; See: http://physics.nist.gov/cuu/Units/binary.html)
14
+
15
+ Input:
16
+ quantity: string. kubernetes canonical form quantity
17
+
18
+ Returns:
19
+ Int
20
+
21
+ Raises:
22
+ ValueError on invalid or unknown input
23
+ """
24
+ if isinstance(quantity, int):
25
+ return quantity
26
+
27
+ exponents = {"K": 1, "k": 1, "M": 2, "G": 3, "T": 4, "P": 5, "E": 6}
28
+
29
+ number = quantity
30
+ suffix = None
31
+ if len(quantity) >= 2 and quantity[-1] == "i":
32
+ if quantity[-2] in exponents:
33
+ number = quantity[:-2]
34
+ suffix = quantity[-2:]
35
+ elif len(quantity) >= 1 and quantity[-1] in exponents:
36
+ number = quantity[:-1]
37
+ suffix = quantity[-1:]
38
+
39
+ try:
40
+ number = int(number)
41
+ except ValueError:
42
+ raise ValueError("Invalid number format: {}".format(number))
43
+
44
+ if suffix is None:
45
+ return number
46
+
47
+ if suffix.endswith("i"):
48
+ base = 1024
49
+ elif len(suffix) == 1:
50
+ base = 1000
51
+ else:
52
+ raise ValueError("{} has unknown suffix".format(quantity))
53
+
54
+ # handle SI inconsistency
55
+ if suffix == "ki":
56
+ raise ValueError("{} has unknown suffix".format(quantity))
57
+
58
+ if suffix[0] not in exponents:
59
+ raise ValueError("{} has unknown suffix".format(quantity))
60
+
61
+ exponent = int(exponents[suffix[0]])
62
+ return number * (base**exponent) # pytype: disable=bad-return-type
63
+
64
+
65
+ def parse_boolean(value: T.Optional[str], default: bool = False):
66
+ if value is None:
67
+ return default
5
68
  return value.lower() in ("true", "yes", "1")
6
69
 
7
70
 
8
- READER_BLOCK_SIZE = int(os.getenv("MEGFILE_READER_BLOCK_SIZE") or 8 * 2**20)
71
+ def set_log_level(level: T.Optional[T.Union[int, str]] = None):
72
+ logging.basicConfig(
73
+ level=logging.ERROR,
74
+ format=(
75
+ "%(asctime)s | %(levelname)-8s | "
76
+ "%(name)s:%(funcName)s:%(lineno)d - %(message)s"
77
+ ),
78
+ )
79
+ level = level or os.getenv("MEGFILE_LOG_LEVEL") or logging.INFO
80
+ logging.getLogger("megfile").setLevel(level)
81
+
82
+
83
+ READER_BLOCK_SIZE = parse_quantity(os.getenv("MEGFILE_READER_BLOCK_SIZE") or 8 * 2**20)
9
84
  if READER_BLOCK_SIZE <= 0:
10
85
  raise ValueError(
11
86
  f"'MEGFILE_READER_BLOCK_SIZE' must bigger than 0, got {READER_BLOCK_SIZE}"
12
87
  )
13
- READER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20)
88
+ READER_MAX_BUFFER_SIZE = parse_quantity(
89
+ os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
90
+ )
14
91
 
15
92
  # Multi-upload in aws s3 has a maximum of 10,000 parts,
16
93
  # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
17
94
  # the largest object that can be uploaded in a single PUT is 5 TB in aws s3.
18
- WRITER_BLOCK_SIZE = int(os.getenv("MEGFILE_WRITER_BLOCK_SIZE") or 8 * 2**20)
95
+ WRITER_BLOCK_SIZE = parse_quantity(os.getenv("MEGFILE_WRITER_BLOCK_SIZE") or 8 * 2**20)
19
96
  if WRITER_BLOCK_SIZE <= 0:
20
97
  raise ValueError(
21
98
  f"'MEGFILE_WRITER_BLOCK_SIZE' must bigger than 0, got {WRITER_BLOCK_SIZE}"
22
99
  )
23
- WRITER_MAX_BUFFER_SIZE = int(os.getenv("MEGFILE_WRITER_MAX_BUFFER_SIZE") or 128 * 2**20)
100
+ WRITER_MAX_BUFFER_SIZE = parse_quantity(
101
+ os.getenv("MEGFILE_WRITER_MAX_BUFFER_SIZE") or 128 * 2**20
102
+ )
24
103
  DEFAULT_WRITER_BLOCK_AUTOSCALE = not os.getenv("MEGFILE_WRITER_BLOCK_SIZE")
25
104
  if os.getenv("MEGFILE_WRITER_BLOCK_AUTOSCALE"):
26
- DEFAULT_WRITER_BLOCK_AUTOSCALE = to_boolean(
27
- os.environ["MEGFILE_WRITER_BLOCK_AUTOSCALE"].lower()
105
+ DEFAULT_WRITER_BLOCK_AUTOSCALE = parse_boolean(
106
+ os.environ["MEGFILE_WRITER_BLOCK_AUTOSCALE"]
28
107
  )
29
108
 
30
109
  GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
@@ -50,3 +129,6 @@ SFTP_MAX_RETRY_TIMES = int(
50
129
  SFTP_HOST_KEY_POLICY = os.getenv("MEGFILE_SFTP_HOST_KEY_POLICY")
51
130
 
52
131
  HTTP_AUTH_HEADERS = ("Authorization", "Www-Authenticate", "Cookie", "Cookie2")
132
+
133
+ if os.getenv("MEGFILE_LOG_LEVEL"):
134
+ set_log_level()
@@ -52,7 +52,6 @@ class S3PrefetchReader(BasePrefetchReader):
52
52
  self._client = s3_client
53
53
  self._profile_name = profile_name
54
54
  self._content_etag = None
55
- self._content_info = None
56
55
 
57
56
  super().__init__(
58
57
  block_size=block_size,
@@ -63,6 +62,11 @@ class S3PrefetchReader(BasePrefetchReader):
63
62
  )
64
63
 
65
64
  def _get_content_size(self):
65
+ if self._block_capacity <= 0:
66
+ response = self._client.head_object(Bucket=self._bucket, Key=self._key)
67
+ self._content_etag = response["ETag"]
68
+ return int(response["ContentLength"])
69
+
66
70
  try:
67
71
  start, end = 0, self._block_size - 1
68
72
  first_index_response = self._fetch_response(start=start, end=end)
@@ -77,12 +81,10 @@ class S3PrefetchReader(BasePrefetchReader):
77
81
  first_index_response = self._fetch_response()
78
82
  content_size = int(first_index_response["ContentLength"])
79
83
 
80
- if self._block_capacity > 0:
81
- first_future = Future()
82
- first_future.set_result(first_index_response["Body"])
83
- self._insert_futures(index=0, future=first_future)
84
+ first_future = Future()
85
+ first_future.set_result(first_index_response["Body"])
86
+ self._insert_futures(index=0, future=first_future)
84
87
  self._content_etag = first_index_response["ETag"]
85
- self._content_info = first_index_response
86
88
  return content_size
87
89
 
88
90
  @property
@@ -121,7 +123,7 @@ class S3PrefetchReader(BasePrefetchReader):
121
123
  if etag is not None and etag != self._content_etag:
122
124
  raise S3FileChangedError(
123
125
  "File changed: %r, etag before: %s, after: %s"
124
- % (self.name, self._content_info, response)
126
+ % (self.name, self._content_etag, etag)
125
127
  )
126
128
 
127
129
  return response["Body"]
megfile/s3_path.py CHANGED
@@ -23,7 +23,7 @@ from megfile.config import (
23
23
  S3_MAX_RETRY_TIMES,
24
24
  WRITER_BLOCK_SIZE,
25
25
  WRITER_MAX_BUFFER_SIZE,
26
- to_boolean,
26
+ parse_boolean,
27
27
  )
28
28
  from megfile.errors import (
29
29
  S3BucketNotFoundError,
@@ -253,12 +253,6 @@ def get_env_var(env_name: str, profile_name=None):
253
253
  return os.getenv(env_name.upper())
254
254
 
255
255
 
256
- def parse_boolean(value: Optional[str], default: bool = False) -> bool:
257
- if value is None:
258
- return default
259
- return to_boolean(value)
260
-
261
-
262
256
  def get_access_token(profile_name=None):
263
257
  access_key = get_env_var("AWS_ACCESS_KEY_ID", profile_name=profile_name)
264
258
  secret_key = get_env_var("AWS_SECRET_ACCESS_KEY", profile_name=profile_name)
@@ -1003,13 +997,15 @@ def s3_buffered_open(
1003
997
  profile_name=s3_url._profile_name,
1004
998
  )
1005
999
  else:
1000
+ if max_buffer_size is None:
1001
+ max_buffer_size = READER_MAX_BUFFER_SIZE
1006
1002
  reader = S3PrefetchReader(
1007
1003
  bucket,
1008
1004
  key,
1009
1005
  s3_client=client,
1010
1006
  max_retries=max_retries,
1011
1007
  max_workers=max_workers,
1012
- max_buffer_size=max_buffer_size or READER_MAX_BUFFER_SIZE,
1008
+ max_buffer_size=max_buffer_size,
1013
1009
  block_forward=block_forward,
1014
1010
  block_size=block_size or READER_BLOCK_SIZE,
1015
1011
  profile_name=s3_url._profile_name,
@@ -1019,23 +1015,27 @@ def s3_buffered_open(
1019
1015
  return reader
1020
1016
 
1021
1017
  if limited_seekable:
1018
+ if max_buffer_size is None:
1019
+ max_buffer_size = WRITER_MAX_BUFFER_SIZE
1022
1020
  writer = S3LimitedSeekableWriter(
1023
1021
  bucket,
1024
1022
  key,
1025
1023
  s3_client=client,
1026
1024
  max_workers=max_workers,
1027
1025
  block_size=block_size or WRITER_BLOCK_SIZE,
1028
- max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
1026
+ max_buffer_size=max_buffer_size,
1029
1027
  profile_name=s3_url._profile_name,
1030
1028
  )
1031
1029
  else:
1030
+ if max_buffer_size is None:
1031
+ max_buffer_size = WRITER_MAX_BUFFER_SIZE
1032
1032
  writer = S3BufferedWriter(
1033
1033
  bucket,
1034
1034
  key,
1035
1035
  s3_client=client,
1036
1036
  max_workers=max_workers,
1037
1037
  block_size=block_size or WRITER_BLOCK_SIZE,
1038
- max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
1038
+ max_buffer_size=max_buffer_size,
1039
1039
  profile_name=s3_url._profile_name,
1040
1040
  )
1041
1041
  if buffered or _is_pickle(writer):
megfile/sftp.py CHANGED
@@ -1,7 +1,11 @@
1
+ import base64
2
+ import hashlib
1
3
  import os
2
4
  from logging import getLogger as get_logger
3
5
  from typing import IO, BinaryIO, Callable, Iterator, List, Optional, Tuple
4
6
 
7
+ import paramiko
8
+
5
9
  from megfile.interfaces import FileEntry, PathLike, StatResult
6
10
  from megfile.lib.compat import fspath
7
11
  from megfile.lib.joinpath import uri_join
@@ -52,6 +56,7 @@ __all__ = [
52
56
  "sftp_rmdir",
53
57
  "sftp_copy",
54
58
  "sftp_sync",
59
+ "sftp_add_host_key",
55
60
  ]
56
61
 
57
62
 
@@ -739,3 +744,75 @@ def sftp_sync(
739
744
  :param overwrite: whether or not overwrite file when exists, default is True
740
745
  """
741
746
  return SftpPath(src_path).sync(dst_path, followlinks, force, overwrite)
747
+
748
+
749
+ def _check_input(input_str: str, fingerprint: str, times: int = 0) -> bool:
750
+ answers = input_str.strip()
751
+ if answers.lower() in ("yes", "y") or answers == fingerprint:
752
+ return True
753
+ elif answers.lower() in ("no", "n"):
754
+ return False
755
+ elif times >= 10:
756
+ _logger.warning("Retried more than 10 times, give up")
757
+ return False
758
+ else:
759
+ input_str = input("Please type 'yes', 'no' or the fingerprint: ")
760
+ return _check_input(input_str, fingerprint, times=times + 1)
761
+
762
+
763
+ def _prompt_add_to_known_hosts(hostname, key) -> bool:
764
+ fingerprint = hashlib.sha256(key.asbytes()).digest()
765
+ fingerprint = f"SHA256:{base64.b64encode(fingerprint).decode('utf-8')}"
766
+ answers = input(f"""The authenticity of host '{hostname}' can't be established.
767
+ {key.get_name().upper()} key fingerprint is {fingerprint}.
768
+ This key is not known by any other names.
769
+ Are you sure you want to continue connecting (yes/no/[fingerprint])? """)
770
+ return _check_input(answers, fingerprint)
771
+
772
+
773
+ def sftp_add_host_key(
774
+ hostname: str,
775
+ port: int = 22,
776
+ prompt: bool = False,
777
+ host_key_path: Optional["str"] = None,
778
+ ):
779
+ """Add a host key to known_hosts.
780
+
781
+ :param hostname: hostname
782
+ :param port: port, default is 22
783
+ :param prompt: If True, requires user input of 'yes' or 'no' to decide whether to
784
+ add this host key
785
+ :param host_key_path: path of known_hosts, default is ~/.ssh/known_hosts
786
+ """
787
+ if not host_key_path:
788
+ host_key_path = os.path.expanduser("~/.ssh/known_hosts")
789
+
790
+ if not os.path.exists(host_key_path):
791
+ dirname = os.path.dirname(host_key_path)
792
+ if dirname and dirname != ".":
793
+ os.makedirs(dirname, exist_ok=True, mode=0o700)
794
+ with open(host_key_path, "w"):
795
+ pass
796
+ os.chmod(host_key_path, 0o600)
797
+
798
+ host_key = paramiko.hostkeys.HostKeys(host_key_path)
799
+ if host_key.lookup(hostname):
800
+ return
801
+
802
+ transport = paramiko.Transport(
803
+ (
804
+ hostname,
805
+ port,
806
+ )
807
+ )
808
+ transport.connect()
809
+ key = transport.get_remote_server_key()
810
+ transport.close()
811
+
812
+ if prompt:
813
+ result = _prompt_add_to_known_hosts(hostname, key)
814
+ if not result:
815
+ return
816
+
817
+ host_key.add(hostname, key.get_name(), key)
818
+ host_key.save(host_key_path)
megfile/sftp_path.py CHANGED
@@ -208,6 +208,7 @@ def _get_ssh_client(
208
208
  policy = policies.get(SFTP_HOST_KEY_POLICY, default_policy)() # pyre-ignore[29]
209
209
 
210
210
  ssh_client = paramiko.SSHClient()
211
+ ssh_client.load_system_host_keys()
211
212
  ssh_client.set_missing_host_key_policy(policy)
212
213
  max_unauth_connections = int(os.getenv(SFTP_MAX_UNAUTH_CONN, 10))
213
214
  try:
megfile/smart.py CHANGED
@@ -1001,11 +1001,13 @@ def smart_load_content(
1001
1001
  return s3_load_content(path, start, stop)
1002
1002
 
1003
1003
  with smart_open(path, "rb") as fd:
1004
- if start:
1004
+ if start is not None:
1005
1005
  fd.seek(start)
1006
1006
  offset = -1
1007
- if start and stop:
1008
- offset = stop - start
1007
+ if stop is not None:
1008
+ offset = stop - (start or 0) # start may be None
1009
+ if offset < 0:
1010
+ raise ValueError("stop should be greater than start")
1009
1011
  return fd.read(offset) # pytype: disable=bad-return-type
1010
1012
 
1011
1013
 
megfile/version.py CHANGED
@@ -1 +1 @@
1
- VERSION = "4.0.0.post1"
1
+ VERSION = "4.0.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: megfile
3
- Version: 4.0.0.post1
3
+ Version: 4.0.2
4
4
  Summary: Megvii file operation library
5
5
  Author-email: megvii <megfile@megvii.com>
6
6
  Project-URL: Homepage, https://github.com/megvii-research/megfile
@@ -200,10 +200,10 @@ s3 =
200
200
  $ megfile config s3 accesskey secretkey \
201
201
  --addressing-style virtual \
202
202
  --endpoint-url https://tos-s3-cn-beijing.ivolces.com \
203
- --profile tos
203
+ --profile-name tos
204
204
 
205
205
  # create alias
206
- $ megfile alias tos s3+tos
206
+ $ megfile config alias tos s3+tos
207
207
  ```
208
208
 
209
209
  You can get the configuration from `~/.config/megfile/aliases.conf`, like:
@@ -212,6 +212,9 @@ You can get the configuration from `~/.config/megfile/aliases.conf`, like:
212
212
  protocol = s3+tos
213
213
  ```
214
214
 
215
+ ## Speed Test
216
+ [![10GiB](https://github.com/megvii-research/megfile/blob/main/scripts/speed_test/10GiB.png?raw=true)](https://megvii-research.github.io/megfile/speed_test.html)
217
+ [![10MiB](https://github.com/megvii-research/megfile/blob/main/scripts/speed_test/10MiB.png?raw=true)](https://megvii-research.github.io/megfile/speed_test.html)
215
218
 
216
219
  ## How to Contribute
217
220
  * We welcome everyone to contribute code to the `megfile` project, but the contributed code needs to meet the following conditions as much as possible:
@@ -1,6 +1,6 @@
1
- megfile/__init__.py,sha256=i2Lbq_VxIgppaqwkxG0_H35dRfcjJ4mCYWjprOf4hHo,7318
2
- megfile/cli.py,sha256=yRRd555mk-yxxg0XShnot068oEuvX1E4qY-hb-uVLcc,23549
3
- megfile/config.py,sha256=kcnqVGrUKSuZ3YqLHpeJPYj94WO2SIH2S6n755lUXew,1942
1
+ megfile/__init__.py,sha256=7oEfu410CFKzDWZ9RjL5xEJ1gtkJkTfvPrL_7TWdJuY,7366
2
+ megfile/cli.py,sha256=e3VVr8oe8iR7L_PtpNtyqAvQL_WgJzzEz8oewSAlgX4,24887
3
+ megfile/config.py,sha256=_6HiGeXEyk6RjPdjA0eEj1unq9iLJV_vQJBzQ-eHNvs,4185
4
4
  megfile/errors.py,sha256=a55qKQgyfiLmV-qnojUFzq2gu9JXpj3ZiC2qVaWyUTA,14160
5
5
  megfile/fs.py,sha256=bPGbapv41FzME87X3MhSNQRjNmHrI23FuFnjPT0ukQs,18239
6
6
  megfile/fs_path.py,sha256=ZK-po1xqhHocMb9Vrxf5K9tDx3zxQmGxNIHY3Z7Akp8,39085
@@ -11,14 +11,14 @@ megfile/http_path.py,sha256=c-xAu5wDxcTevmIUmrNEy-m-QiCfDJToaVI7y8SVIUI,14492
11
11
  megfile/interfaces.py,sha256=p4UvVZpeLx5djd6bqqDaygIx_s-_AxIVj-gudTch4JE,8467
12
12
  megfile/pathlike.py,sha256=vfuTBqSTIciRxkkqMfLfnBxWTEl9yns1yR8zgK4Raw0,31268
13
13
  megfile/s3.py,sha256=zqAegH5tijcztEKcfHXmOYhAR880nTxaAzc2O0JJnjc,16661
14
- megfile/s3_path.py,sha256=oBA9GdOseEtQJmh7LMDOf1sGamsEERs6Sm1jHpdksO8,93343
15
- megfile/sftp.py,sha256=WP7soS6WuSzHPc72iRHklkBoQG_4lNPbOCGwEy2Zwhw,24075
16
- megfile/sftp_path.py,sha256=jzLEt1RnzX00GCApyy1NSCGj7eQKen_bz0DETD6oVyg,43565
17
- megfile/smart.py,sha256=WS9ZZbMAMPrOR2z8sb8hxVYCVsCRMhGGUvcboQjCkBw,36771
14
+ megfile/s3_path.py,sha256=lpUKy4n5DTf6hK6TvPhMjt_ZgdIXO4vcyK_VLaGkvhg,93395
15
+ megfile/sftp.py,sha256=0ZnQlmhgvs7pYjFTcvvOyxTo2IUurE-hp1GN0hnIrdQ,26473
16
+ megfile/sftp_path.py,sha256=4rcbn3wqcOEs71W6qWu1efcj6MZUgrZm6U0Jan-eB70,43604
17
+ megfile/smart.py,sha256=Ps8acPx6jeG1UJnRD8xL2aQjRp7IMW8sV6VFkMF0TQk,36910
18
18
  megfile/smart_path.py,sha256=Bqg95T2-XZrRXWhH7GT-jMCYzD7i1SIXdczQxtOxiPs,7583
19
19
  megfile/stdio.py,sha256=C_cGID_npthpwoPcsJMMEqqbVUPUnDxxJV9jLY2_D7c,635
20
20
  megfile/stdio_path.py,sha256=L8ODNIwO79UIv13YYc2OTr6f4XTv4ZPyvBeRk83-AjA,2700
21
- megfile/version.py,sha256=4XWC9d4BvU4TuG3QOaPY1dppiaFLKYtd6zGozoT0xdM,25
21
+ megfile/version.py,sha256=secXeY4K_CdlbaUs9G--LGWfWlf1VqdtopDxqRzq2JQ,19
22
22
  megfile/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  megfile/lib/base_prefetch_reader.py,sha256=6Dy2ZwlowqAvyUUa7bpQLCKOclmmUDhqEF-_CDDp0Og,13100
24
24
  megfile/lib/combine_reader.py,sha256=nKGAug29lOpNIZuLKu7_qVrJJRpXL_J4jxLglWbGJ1w,4808
@@ -36,17 +36,27 @@ megfile/lib/s3_cached_handler.py,sha256=X8PdeRC-BY6eSmOO5f2BeyjTPxyEwNtHgmAm9Vgm
36
36
  megfile/lib/s3_limited_seekable_writer.py,sha256=mUeoTS98LHluwDN7zxdCVcsjOGBT1bOYV8nRvi9QMGE,6212
37
37
  megfile/lib/s3_memory_handler.py,sha256=4uzBzz2jfRI_u6jl0CpOGAhpNJhDQo18FSAweauCUFs,4136
38
38
  megfile/lib/s3_pipe_handler.py,sha256=dm7NnZd1Ym5ABS1GvOQtoCJEO_CB8e6p4sUhLiid0go,3622
39
- megfile/lib/s3_prefetch_reader.py,sha256=wnc-0K6USDBXKgVRZtp-W8aWf5J87J_ediJK6IJWLho,4332
39
+ megfile/lib/s3_prefetch_reader.py,sha256=dHltiM5Ui-SY4pqhvIsmC0iNmprXwlczDD4lNHB5WrQ,4418
40
40
  megfile/lib/s3_share_cache_reader.py,sha256=LVWKxHdHo0_zUIW4o8yqNvplqqwezUPeYEt02Vj-WNM,3754
41
41
  megfile/lib/shadow_handler.py,sha256=TntewlvIW9ZxCfmqASDQREHoiZ8v42faOe9sovQYQz0,2779
42
42
  megfile/lib/stdio_handler.py,sha256=IDdgENLQlhigEwkLL4zStueVSzdWg7xVcTF_koof_Ek,1987
43
43
  megfile/lib/url.py,sha256=ER32pWy9Q2MAk3TraAaNEBWIqUeBmLuM57ol2cs7-Ks,103
44
44
  megfile/utils/__init__.py,sha256=sATf_NlsSTYIMEiA8-gM6K1M-Q1K6_7rx2VM31hrqaA,10838
45
45
  megfile/utils/mutex.py,sha256=asb8opGLgK22RiuBJUnfsvB8LnMmodP8KzCVHKmQBWA,2561
46
- megfile-4.0.0.post1.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
47
- megfile-4.0.0.post1.dist-info/LICENSE.pyre,sha256=9lf5nT-5ZH25JijpYAequ0bl8E8z5JmZB1qrjiUMp84,1080
48
- megfile-4.0.0.post1.dist-info/METADATA,sha256=wBX578pPpvPTAxGBK4kiszH6Qi7kwhUMruNOOIIFQHE,9234
49
- megfile-4.0.0.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
50
- megfile-4.0.0.post1.dist-info/entry_points.txt,sha256=M6ZWSSv5_5_QtIpZafy3vq7WuOJ_5dSGQQnEZbByt2Q,49
51
- megfile-4.0.0.post1.dist-info/top_level.txt,sha256=IaHHoRXeemLL6kTM5YuC3H0UyOnTdZH9J324TkeBneo,36
52
- megfile-4.0.0.post1.dist-info/RECORD,,
46
+ scripts/speed_test/code/iopath_read.py,sha256=O1Qs3mpvD9S_XCuRH2A2FpGWxCOSw6qZvEBrtPeRL1E,661
47
+ scripts/speed_test/code/iopath_write.py,sha256=Mm0efW1J09RJ_CK5i1xtG2hJuaaslikin8qVpuRFP_Q,704
48
+ scripts/speed_test/code/megfile_read.py,sha256=sAMebUiMColHDv3JEkXplImAHvn_IF1-g3BIJxhcQYE,239
49
+ scripts/speed_test/code/megfile_write.py,sha256=bzn-i2bGH4eRcsVvkhXK35KsQkX2v2oEsOJ0Ft5saj0,257
50
+ scripts/speed_test/code/pyarrow_read.py,sha256=2QBGKjGV2Dvl2ukOntLSag33pF55l3tfZ2Z6dLUjovw,305
51
+ scripts/speed_test/code/pyarrow_write.py,sha256=U1puLh-ljSXv772bZYAyhzmxhPOq4aR4j-QtwdM9hG0,328
52
+ scripts/speed_test/code/s3fs_read.py,sha256=XiTA-qrYblUs-jQWXSnvNg5Wo722C_g47aMMfo5XJBY,380
53
+ scripts/speed_test/code/s3fs_write.py,sha256=gdXKkWXYGjLJlRT_J64pJN85XvRg3bZexcAJQEMXwtw,402
54
+ scripts/speed_test/code/smart_open_read.py,sha256=SA02jHwS9Y31yFtV9CoJcfND5dR0eA_HsGmGNUrpQls,515
55
+ scripts/speed_test/code/smart_open_write.py,sha256=jDxFJdY97yNH889jz3pawBoei3yaqy8pEMvC_ymHFtM,537
56
+ megfile-4.0.2.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
57
+ megfile-4.0.2.dist-info/LICENSE.pyre,sha256=9lf5nT-5ZH25JijpYAequ0bl8E8z5JmZB1qrjiUMp84,1080
58
+ megfile-4.0.2.dist-info/METADATA,sha256=A8TXdy5RWnQFzA7e2r9h757InExc7l7JaSN3O3felgk,9578
59
+ megfile-4.0.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
60
+ megfile-4.0.2.dist-info/entry_points.txt,sha256=M6ZWSSv5_5_QtIpZafy3vq7WuOJ_5dSGQQnEZbByt2Q,49
61
+ megfile-4.0.2.dist-info/top_level.txt,sha256=fVg49lk5B9L7jyfWUXWxb0DDSuw5pbr0OU62Tvx8J8M,44
62
+ megfile-4.0.2.dist-info/RECORD,,
@@ -3,3 +3,4 @@ docs
3
3
  html_cov
4
4
  html_doc
5
5
  megfile
6
+ scripts
@@ -0,0 +1,29 @@
1
+ import os
2
+ import time
3
+
4
+ import boto3
5
+ from iopath.common.file_io import PathManager
6
+ from iopath.common.s3 import S3PathHandler
7
+
8
+ times = 10240
9
+ s3_path = "s3://bucketA/large.txt"
10
+
11
+ start = time.time()
12
+
13
+ path_manager = PathManager()
14
+
15
+ session = boto3.Session(
16
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
17
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
18
+ )
19
+ client = session.client("s3", endpoint_url=os.environ["OSS_ENDPOINT"])
20
+ handler = S3PathHandler()
21
+ handler.client = client
22
+
23
+ path_manager.register_handler(handler)
24
+
25
+ with path_manager.open(s3_path, "rb") as f:
26
+ for i in range(times):
27
+ f.read(1024 * 1024)
28
+
29
+ print(time.time() - start)
@@ -0,0 +1,30 @@
1
+ import os
2
+ import time
3
+
4
+ import boto3
5
+ from iopath.common.file_io import PathManager
6
+ from iopath.common.s3 import S3PathHandler
7
+
8
+ times = 10240
9
+ s3_path = "s3://bucketA/large.txt"
10
+ block = b"1" * 1024 * 1024
11
+
12
+ start = time.time()
13
+
14
+ path_manager = PathManager()
15
+
16
+ session = boto3.Session(
17
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
18
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
19
+ )
20
+ client = session.client("s3", endpoint_url=os.environ["OSS_ENDPOINT"])
21
+ handler = S3PathHandler()
22
+ handler.client = client
23
+
24
+ path_manager.register_handler(handler)
25
+
26
+ with path_manager.open(s3_path, "wb") as f:
27
+ for i in range(times):
28
+ f.write(block)
29
+
30
+ print(time.time() - start) # write 10GB 91.642
@@ -0,0 +1,13 @@
1
+ import time
2
+
3
+ from megfile import smart_open
4
+
5
+ times = 10240
6
+ s3_path = "s3://bucketA/large.txt"
7
+
8
+ start = time.time()
9
+ with smart_open(s3_path, "rb") as f:
10
+ for i in range(times):
11
+ f.read(1024 * 1024 * 1)
12
+
13
+ print(time.time() - start)
@@ -0,0 +1,14 @@
1
+ import time
2
+
3
+ from megfile import smart_open
4
+
5
+ times = 10240
6
+ s3_path = "s3://bucketA/large.txt"
7
+ block = b"1" * 1024 * 1024
8
+
9
+ start = time.time()
10
+ with smart_open(s3_path, "wb") as f:
11
+ for i in range(times):
12
+ f.write(block)
13
+
14
+ print(time.time() - start)
@@ -0,0 +1,17 @@
1
+ import os
2
+ import time
3
+
4
+ from pyarrow import fs
5
+
6
+ times = 10240
7
+ s3_path = "bucketA/large.txt"
8
+
9
+ start = time.time()
10
+
11
+ s3 = fs.S3FileSystem(endpoint_override=os.environ["OSS_ENDPOINT"])
12
+
13
+ with s3.open_input_stream(s3_path) as f:
14
+ for i in range(times):
15
+ f.read(1024 * 1024)
16
+
17
+ print(time.time() - start)
@@ -0,0 +1,18 @@
1
+ import os
2
+ import time
3
+
4
+ from pyarrow import fs
5
+
6
+ times = 10240
7
+ block = b"1" * 1024 * 1024
8
+ s3_path = "bucketA/large.txt"
9
+
10
+ start = time.time()
11
+
12
+ s3 = fs.S3FileSystem(endpoint_override=os.environ["OSS_ENDPOINT"])
13
+
14
+ with s3.open_output_stream(s3_path) as f:
15
+ for i in range(times):
16
+ f.write(block)
17
+
18
+ print(time.time() - start)
@@ -0,0 +1,21 @@
1
+ import os
2
+ import time
3
+
4
+ import s3fs
5
+
6
+ times = 10240
7
+ s3_path = "bucketA/large.txt"
8
+
9
+ start = time.time()
10
+
11
+ s3 = s3fs.S3FileSystem(
12
+ endpoint_url=os.environ["OSS_ENDPOINT"],
13
+ key=os.environ["AWS_ACCESS_KEY_ID"],
14
+ secret=os.environ["AWS_SECRET_ACCESS_KEY"],
15
+ )
16
+
17
+ with s3.open(s3_path, "rb") as f:
18
+ for i in range(times):
19
+ f.read(1024 * 1024)
20
+
21
+ print(time.time() - start)
@@ -0,0 +1,22 @@
1
+ import os
2
+ import time
3
+
4
+ import s3fs
5
+
6
+ times = 10240
7
+ block = b"1" * 1024 * 1024
8
+ s3_path = "bucketA/large.txt"
9
+
10
+ start = time.time()
11
+
12
+ s3 = s3fs.S3FileSystem(
13
+ endpoint_url=os.environ["OSS_ENDPOINT"],
14
+ key=os.environ["AWS_ACCESS_KEY_ID"],
15
+ secret=os.environ["AWS_SECRET_ACCESS_KEY"],
16
+ )
17
+
18
+ with s3.open(s3_path, "wb") as f:
19
+ for i in range(times):
20
+ f.write(block)
21
+
22
+ print(time.time() - start)
@@ -0,0 +1,25 @@
1
+ import os
2
+ import time
3
+
4
+ import boto3
5
+ from smart_open import open
6
+
7
+ times = 10240
8
+ s3_path = "s3://bucketA/large.txt"
9
+
10
+ start = time.time()
11
+ session = boto3.Session(
12
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
13
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
14
+ )
15
+ with open(
16
+ s3_path,
17
+ "rb",
18
+ transport_params={
19
+ "client": session.client("s3", endpoint_url=os.environ["OSS_ENDPOINT"])
20
+ },
21
+ ) as f:
22
+ for i in range(times):
23
+ f.read(1024 * 1024)
24
+
25
+ print(time.time() - start)
@@ -0,0 +1,26 @@
1
+ import os
2
+ import time
3
+
4
+ import boto3
5
+ from smart_open import open
6
+
7
+ times = 10240
8
+ s3_path = "s3://bucketA/large.txt"
9
+ block = b"1" * 1024 * 1024
10
+
11
+ start = time.time()
12
+ session = boto3.Session(
13
+ aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
14
+ aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
15
+ )
16
+ with open(
17
+ s3_path,
18
+ "wb",
19
+ transport_params={
20
+ "client": session.client("s3", endpoint_url=os.environ["OSS_ENDPOINT"])
21
+ },
22
+ ) as f:
23
+ for i in range(times):
24
+ f.write(block)
25
+
26
+ print(time.time() - start)