megfile 2.2.7__py3-none-any.whl → 2.2.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/__init__.py +26 -0
- megfile/cli.py +57 -5
- megfile/errors.py +25 -0
- megfile/fs.py +2 -12
- megfile/fs_path.py +11 -8
- megfile/hdfs.py +269 -0
- megfile/hdfs_path.py +630 -0
- megfile/http_path.py +61 -2
- megfile/lib/hdfs_prefetch_reader.py +51 -0
- megfile/lib/hdfs_tools.py +21 -0
- megfile/pathlike.py +4 -0
- megfile/s3.py +2 -7
- megfile/s3_path.py +17 -5
- megfile/sftp.py +2 -12
- megfile/sftp_path.py +12 -8
- megfile/version.py +1 -1
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/METADATA +41 -87
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/RECORD +23 -19
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/WHEEL +1 -1
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/LICENSE +0 -0
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/entry_points.txt +0 -0
- {megfile-2.2.7.dist-info → megfile-2.2.8.post1.dist-info}/top_level.txt +0 -0
megfile/__init__.py
CHANGED
```diff
@@ -1,5 +1,7 @@
 from megfile.fs import fs_abspath, fs_access, fs_cwd, fs_exists, fs_expanduser, fs_getmd5, fs_getmtime, fs_getsize, fs_glob, fs_glob_stat, fs_home, fs_iglob, fs_isabs, fs_isdir, fs_isfile, fs_islink, fs_ismount, fs_listdir, fs_load_from, fs_lstat, fs_makedirs, fs_move, fs_readlink, fs_realpath, fs_relpath, fs_remove, fs_rename, fs_resolve, fs_save_as, fs_scan, fs_scan_stat, fs_scandir, fs_stat, fs_symlink, fs_sync, fs_unlink, fs_walk, is_fs
 from megfile.fs_path import FSPath
+from megfile.hdfs import hdfs_exists, hdfs_getmd5, hdfs_getmtime, hdfs_getsize, hdfs_glob, hdfs_glob_stat, hdfs_iglob, hdfs_isdir, hdfs_isfile, hdfs_listdir, hdfs_load_from, hdfs_makedirs, hdfs_move, hdfs_open, hdfs_remove, hdfs_save_as, hdfs_scan, hdfs_scan_stat, hdfs_scandir, hdfs_stat, hdfs_unlink, hdfs_walk, is_hdfs
+from megfile.hdfs_path import HdfsPath
 from megfile.http import http_exists, http_getmtime, http_getsize, http_open, http_stat, is_http
 from megfile.http_path import HttpPath, HttpsPath
 from megfile.s3 import is_s3, s3_access, s3_buffered_open, s3_cached_open, s3_concat, s3_copy, s3_download, s3_exists, s3_getmd5, s3_getmtime, s3_getsize, s3_glob, s3_glob_stat, s3_hasbucket, s3_iglob, s3_isdir, s3_isfile, s3_listdir, s3_load_content, s3_load_from, s3_lstat, s3_makedirs, s3_memory_open, s3_move, s3_open, s3_path_join, s3_pipe_open, s3_prefetch_open, s3_readlink, s3_remove, s3_rename, s3_save_as, s3_scan, s3_scan_stat, s3_scandir, s3_stat, s3_symlink, s3_sync, s3_unlink, s3_upload, s3_walk
@@ -178,6 +180,29 @@ __all__ = [
     'sftp_copy',
     'sftp_sync',
     'sftp_concat',
+    'is_hdfs',
+    'hdfs_exists',
+    'hdfs_stat',
+    'hdfs_getmtime',
+    'hdfs_getsize',
+    'hdfs_isdir',
+    'hdfs_isfile',
+    'hdfs_listdir',
+    'hdfs_load_from',
+    'hdfs_move',
+    'hdfs_remove',
+    'hdfs_scan',
+    'hdfs_scan_stat',
+    'hdfs_scandir',
+    'hdfs_unlink',
+    'hdfs_walk',
+    'hdfs_getmd5',
+    'hdfs_save_as',
+    'hdfs_open',
+    'hdfs_glob',
+    'hdfs_glob_stat',
+    'hdfs_iglob',
+    'hdfs_makedirs',
     'S3Path',
     'FSPath',
     'HttpPath',
@@ -185,4 +210,5 @@ __all__ = [
     'StdioPath',
     'SmartPath',
     'SftpPath',
+    'HdfsPath',
 ]
```
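The practical effect of this `__init__.py` change is that the new hdfs helpers become importable straight from the top-level package, alongside the existing fs/s3/http/sftp names. A minimal sketch, assuming a configured hdfs profile and using a made-up `hdfs://` URL:

```python
from megfile import HdfsPath, hdfs_open, is_hdfs

print(is_hdfs('hdfs://bucket/key.txt'))   # True: the hdfs:// scheme is recognised
print(is_hdfs('/local/file.txt'))         # False: plain filesystem path

# HdfsPath is now exported next to S3Path, FSPath, HttpPath, SftpPath, ...
path = HdfsPath('hdfs://bucket/key.txt')  # hypothetical path; needs a reachable cluster
```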
megfile/cli.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+import configparser
 import logging
 import os
 import shutil
@@ -9,9 +10,10 @@ from functools import partial
 import click
 from tqdm import tqdm

+from megfile.hdfs_path import DEFAULT_HDFS_TIMEOUT
 from megfile.interfaces import FileEntry
 from megfile.lib.glob import get_non_glob_dir, has_magic
-from megfile.smart import _smart_sync_single_file, smart_copy, smart_getmd5, smart_getmtime, smart_getsize, smart_glob_stat, smart_isdir, smart_isfile, smart_makedirs, smart_move, smart_open, smart_path_join, smart_remove, smart_rename, smart_scan_stat, smart_scandir, smart_stat, smart_sync, smart_sync_with_progress, smart_touch, smart_unlink
+from megfile.smart import _smart_sync_single_file, smart_copy, smart_exists, smart_getmd5, smart_getmtime, smart_getsize, smart_glob_stat, smart_isdir, smart_isfile, smart_makedirs, smart_move, smart_open, smart_path_join, smart_remove, smart_rename, smart_scan_stat, smart_scandir, smart_stat, smart_sync, smart_sync_with_progress, smart_touch, smart_unlink
 from megfile.smart_path import SmartPath
 from megfile.utils import get_human_size
 from megfile.version import VERSION
@@ -286,7 +288,7 @@ def sync(
         src_root_path = get_non_glob_dir(src_path)

         def scan_func(path):
-            for glob_file_entry in smart_glob_stat(path):
+            for glob_file_entry in smart_glob_stat(path, missing_ok=False):
                 if glob_file_entry.is_file():
                     yield glob_file_entry
                 else:
@@ -295,7 +297,8 @@ def sync(
                         yield file_entry
     else:
         src_root_path = src_path
-        scan_func = partial(
+        scan_func = partial(
+            smart_scan_stat, followlinks=True, missing_ok=False)

     if progress_bar and not quiet:
         print('building progress bar', end='\r')
@@ -493,7 +496,8 @@ def config():
     '--path',
     type=str,
     default='~/.aws/credentials',
-    help='s3 config file'
+    help='s3 config file, default is $HOME/.aws/credentials',
+)
 @click.option(
     '-n', '--profile-name', type=str, default='default', help='s3 config file')
 @click.argument('aws_access_key_id')
@@ -504,6 +508,8 @@ def config():
 def s3(
         path, profile_name, aws_access_key_id, aws_secret_access_key,
         endpoint_url, addressing_style, no_cover):
+    path = os.path.expanduser(path)
+
     config_dict = {
         'name': profile_name,
         'aws_access_key_id': aws_access_key_id,
@@ -533,6 +539,7 @@ def s3(
             s3['addressing_style'])
         return content

+    os.makedirs(os.path.dirname(path), exist_ok=True) # make sure dirpath exist
     if not os.path.exists(path): #If this file doesn't exist.
         content_str = dumps(config_dict)
         with open(path, 'w') as fp:
@@ -556,7 +563,7 @@ def s3(
         # Given profile_name has been used.
         if cur_name == profile_name:
             if no_cover: # default True(cover the same-name config).
-                raise NameError(f'
+                raise NameError(f'profile-name has been used: {profile_name}')
             used = True
             sections[i] = dumps(config_dict)
             continue
@@ -570,6 +577,51 @@ def s3(
     click.echo(f'Your oss config has been saved into {path}')


+@config.command(short_help='Return the config file for s3')
+@click.argument('url')
+@click.option(
+    '-p',
+    '--path',
+    default='~/.hdfscli.cfg',
+    help='s3 config file, default is $HOME/.hdfscli.cfg',
+)
+@click.option('-n', '--profile-name', default='default', help='s3 config file')
+@click.option('-u', '--user', help='user name')
+@click.option('-r', '--root', help="hdfs path's root dir")
+@click.option('-t', '--token', help="token for requesting hdfs server")
+@click.option(
+    '-o',
+    '--timeout',
+    help=f"request hdfs server timeout, default {DEFAULT_HDFS_TIMEOUT}")
+@click.option('--no-cover', is_flag=True, help='Not cover the same-name config')
+def hdfs(url, path, profile_name, user, root, token, timeout, no_cover):
+    path = os.path.expanduser(path)
+    current_config = {
+        'url': url,
+        'user': user,
+        'root': root,
+        'token': token,
+        'timeout': timeout,
+    }
+    profile_name = f"{profile_name}.alias"
+    config = configparser.ConfigParser()
+    if os.path.exists(path):
+        config.read(path)
+    if 'global' not in config.sections():
+        config['global'] = {'default.alias': 'default'}
+    if profile_name in config.sections():
+        if no_cover:
+            raise NameError(f'profile-name has been used: {profile_name[:-6]}')
+    else:
+        config[profile_name] = {}
+    for key, value in current_config.items():
+        if value:
+            config[profile_name][key] = value
+    with open(path, 'w') as fp:
+        config.write(fp)
+    click.echo(f'Your hdfs config has been saved into {path}')
+
+
 if __name__ == '__main__':
     # Usage: python -m megfile.cli
     safe_cli()  # pragma: no cover
```
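The new `config hdfs` subcommand writes an hdfscli-style INI file through `configparser`: a `[global]` section with a `default.alias` key plus one `<profile>.alias` section per saved profile. A hedged sketch of inspecting the result, assuming the command was run via the module's own `python -m megfile.cli` entry point; the URL and user are placeholder values:

```python
# After e.g.:  python -m megfile.cli config hdfs http://namenode:50070 -u hadoop
# the command should have written ~/.hdfscli.cfg as sketched below.
import configparser
import os

config = configparser.ConfigParser()
config.read(os.path.expanduser('~/.hdfscli.cfg'))

print(config.sections())                  # e.g. ['global', 'default.alias']
print(config['global']['default.alias'])  # 'default'
print(dict(config['default.alias']))      # e.g. {'url': 'http://namenode:50070', 'user': 'hadoop'}
```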
megfile/errors.py
CHANGED
```diff
@@ -384,3 +384,28 @@ def s3_error_code_should_retry(error: str) -> bool:
     if error in ['InternalError', 'ServiceUnavailable', 'SlowDown']:
         return True
     return False
+
+
+def translate_hdfs_error(hdfs_error: Exception, hdfs_path: PathLike):
+    from megfile.lib.hdfs_tools import hdfs_api
+
+    if hdfs_api and isinstance(hdfs_error, hdfs_api.HdfsError):
+        if hdfs_error.message and 'Path is not a file' in hdfs_error.message: # pytype: disable=attribute-error
+            return IsADirectoryError('Is a directory: %r' % hdfs_path)
+        elif hdfs_error.message and 'Path is not a directory' in hdfs_error.message: # pytype: disable=attribute-error
+            return NotADirectoryError('Not a directory: %r' % hdfs_path)
+        elif hdfs_error.status_code in (401, 403): # pytype: disable=attribute-error
+            return PermissionError('Permission denied: %r' % hdfs_path)
+        elif hdfs_error.status_code == 400: # pytype: disable=attribute-error
+            return ValueError(f'{hdfs_error.message}, path: {hdfs_path}') # pytype: disable=attribute-error
+        elif hdfs_error.status_code == 404: # pytype: disable=attribute-error
+            return FileNotFoundError(f'No match file: {hdfs_path}')
+    return hdfs_error
+
+
+@contextmanager
+def raise_hdfs_error(hdfs_path: PathLike):
+    try:
+        yield
+    except Exception as error:
+        raise translate_hdfs_error(error, hdfs_path)
```
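`raise_hdfs_error` is a context manager that funnels any exception through `translate_hdfs_error`, so `HdfsError` responses surface as the matching builtin exception types. A hedged sketch of how a caller would wrap an hdfs client call; the `client` object and its `read()` API are assumptions modelled on the hdfs library, not shown in this diff:

```python
from megfile.errors import raise_hdfs_error

def read_bytes(client, path: str) -> bytes:
    # Inside the context, an hdfs_api.HdfsError with status 404 re-raises as
    # FileNotFoundError, 401/403 as PermissionError, and so on (see above).
    with raise_hdfs_error(path):
        with client.read(path) as reader:  # assumed hdfs.Client.read()-style API
            return reader.read()
```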
megfile/fs.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 from typing import BinaryIO, Callable, Iterator, List, Optional, Tuple

-from megfile.fs_path import FSPath, StatResult, _make_stat, fs_cwd, fs_glob, fs_glob_stat, fs_home, fs_iglob, fs_makedirs, fs_move, fs_path_join, fs_readlink, fs_rename, fs_resolve, is_fs
+from megfile.fs_path import FSPath, StatResult, _make_stat, fs_cwd, fs_glob, fs_glob_stat, fs_home, fs_iglob, fs_lstat, fs_makedirs, fs_move, fs_path_join, fs_readlink, fs_rename, fs_resolve, is_fs
 from megfile.interfaces import Access, FileEntry, PathLike, StatResult

 __all__ = [
@@ -18,6 +18,7 @@ __all__ = [
     'fs_resolve',
     'fs_move',
     'fs_makedirs',
+    'fs_lstat',
     'fs_isabs',
     'fs_abspath',
     'fs_access',
@@ -36,7 +37,6 @@ __all__ = [
     'fs_scan_stat',
     'fs_scandir',
     'fs_stat',
-    'fs_lstat',
     'fs_unlink',
     'fs_walk',
     'fs_getmd5',
@@ -260,16 +260,6 @@ def fs_stat(path: PathLike, follow_symlinks=True) -> StatResult:
     return FSPath(path).stat(follow_symlinks)


-def fs_lstat(path: PathLike) -> StatResult:
-    '''
-    Like Path.stat() but, if the path points to a symbolic link, return the symbolic link’s information rather than its target’s.
-
-    :param path: Given path
-    :returns: StatResult
-    '''
-    return FSPath(path).lstat()
-
-
 def fs_unlink(path: PathLike, missing_ok: bool = False) -> None:
     '''
     Remove the file on fs
```
megfile/fs_path.py
CHANGED
```diff
@@ -37,6 +37,7 @@ __all__ = [
     'fs_resolve',
     'fs_move',
     'fs_makedirs',
+    'fs_lstat',
 ]


@@ -200,6 +201,16 @@ def fs_makedirs(path: PathLike, exist_ok: bool = False):
     return FSPath(path).mkdir(parents=True, exist_ok=exist_ok)


+def fs_lstat(path: PathLike) -> StatResult:
+    '''
+    Like Path.stat() but, if the path points to a symbolic link, return the symbolic link’s information rather than its target’s.
+
+    :param path: Given path
+    :returns: StatResult
+    '''
+    return FSPath(path).lstat()
+
+
 @SmartPath.register
 class FSPath(URIPath):
     """file protocol
@@ -601,14 +612,6 @@ class FSPath(URIPath):
         mtime = stat.st_mtime
         return result._replace(size=size, ctime=ctime, mtime=mtime)

-    def lstat(self) -> StatResult:
-        '''
-        Like Path.stat() but, if the path points to a symbolic link, return the symbolic link’s information rather than its target’s.
-
-        :returns: StatResult
-        '''
-        return self.stat(follow_symlinks=False)
-
     def unlink(self, missing_ok: bool = False) -> None:
         '''
         Remove the file on fs
```
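The `fs.py` and `fs_path.py` hunks mainly relocate the module-level `fs_lstat` helper from `megfile/fs.py` into `megfile/fs_path.py`, while `fs.py` keeps re-exporting it. A small sanity sketch of the import compatibility this preserves; the `/tmp` path is just an example:

```python
from megfile.fs import fs_lstat               # still exported, now re-imported from fs_path
from megfile.fs_path import fs_lstat as fs_lstat_direct

assert fs_lstat is fs_lstat_direct            # same function object after the move
print(fs_lstat('/tmp'))                       # StatResult of the path itself, without following links
```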
megfile/hdfs.py
ADDED
```python
from typing import IO, AnyStr, BinaryIO, Iterator, List, Optional, Tuple

from megfile.hdfs_path import HdfsPath, hdfs_glob, hdfs_glob_stat, hdfs_iglob, hdfs_makedirs, is_hdfs
from megfile.interfaces import FileEntry, PathLike, StatResult

__all__ = [
    'is_hdfs',
    'hdfs_glob',
    'hdfs_glob_stat',
    'hdfs_iglob',
    'hdfs_makedirs',
    'hdfs_exists',
    'hdfs_stat',
    'hdfs_getmtime',
    'hdfs_getsize',
    'hdfs_isdir',
    'hdfs_isfile',
    'hdfs_listdir',
    'hdfs_load_from',
    'hdfs_move',
    'hdfs_remove',
    'hdfs_scan',
    'hdfs_scan_stat',
    'hdfs_scandir',
    'hdfs_unlink',
    'hdfs_walk',
    'hdfs_getmd5',
    'hdfs_save_as',
    'hdfs_open',
]


def hdfs_exists(path: PathLike, followlinks: bool = False) -> bool:
    '''
    Test if path exists

    If the bucket of path are not permitted to read, return False

    :param path: Given path
    :returns: True if path eixsts, else False
    '''
    return HdfsPath(path).exists(followlinks)


def hdfs_stat(path: PathLike, follow_symlinks=True) -> StatResult:
    '''
    Get StatResult of path file, including file size and mtime, referring to hdfs_getsize and hdfs_getmtime

    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
    If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://', raise BucketNotFoundError

    :param path: Given path
    :returns: StatResult
    :raises: FileNotFoundError
    '''
    return HdfsPath(path).stat(follow_symlinks)


def hdfs_getmtime(path: PathLike, follow_symlinks: bool = False) -> float:
    '''
    Get last-modified time of the file on the given path path (in Unix timestamp format).
    If the path is an existent directory, return the latest modified time of all file in it. The mtime of empty directory is 1970-01-01 00:00:00

    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError

    :param path: Given path
    :returns: Last-modified time
    :raises: FileNotFoundError
    '''
    return HdfsPath(path).getmtime(follow_symlinks)


def hdfs_getsize(path: PathLike, follow_symlinks: bool = False) -> int:
    '''
    Get file size on the given path path (in bytes).
    If the path in a directory, return the sum of all file size in it, including file in subdirectories (if exist).
    The result excludes the size of directory itself. In other words, return 0 Byte on an empty directory path.

    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError

    :param path: Given path
    :returns: File size
    :raises: FileNotFoundError
    '''
    return HdfsPath(path).getsize(follow_symlinks)


def hdfs_isdir(path: PathLike, followlinks: bool = False) -> bool:
    '''
    Test if an hdfs url is directory
    Specific procedures are as follows:
    If there exists a suffix, of which ``os.path.join(path, suffix)`` is a file
    If the url is empty bucket or hdfs://

    :param path: Given path
    :param followlinks: whether followlinks is True or False, result is the same. Because hdfs symlink not support dir.
    :returns: True if path is hdfs directory, else False
    '''
    return HdfsPath(path).is_dir(followlinks)


def hdfs_isfile(path: PathLike, followlinks: bool = False) -> bool:
    '''
    Test if an path is file

    :param path: Given path
    :returns: True if path is hdfs file, else False
    '''
    return HdfsPath(path).is_file(followlinks)


def hdfs_listdir(path: PathLike, followlinks: bool = False) -> List[str]:
    '''
    Get all contents of given path.

    :param path: Given path
    :returns: All contents have prefix of path.
    :raises: FileNotFoundError, NotADirectoryError
    '''
    return HdfsPath(path).listdir(followlinks)


def hdfs_load_from(path: PathLike, followlinks: bool = False) -> BinaryIO:
    '''Read all content in binary on specified path and write into memory

    User should close the BinaryIO manually

    :param path: Given path
    :returns: BinaryIO
    '''
    return HdfsPath(path).load(followlinks)


def hdfs_move(src_path: PathLike, dst_path: PathLike) -> None:
    '''
    Move file/directory path from src_path to dst_path

    :param src_path: Given path
    :param dst_path: Given destination path
    '''
    return HdfsPath(src_path).move(dst_path)


def hdfs_remove(path: PathLike, missing_ok: bool = False) -> None:
    '''
    Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not permitted to remove

    :param path: Given path
    :param missing_ok: if False and target file/directory not exists, raise FileNotFoundError
    :raises: FileNotFoundError, UnsupportedError
    '''
    return HdfsPath(path).remove(missing_ok)


def hdfs_scan(
        path: PathLike, missing_ok: bool = True,
        followlinks: bool = False) -> Iterator[str]:
    '''
    Iteratively traverse only files in given hdfs directory.
    Every iteration on generator yields a path string.

    If path is a file path, yields the file only
    If path is a non-existent path, return an empty generator
    If path is a bucket path, return all file paths in the bucket
    If path is an empty bucket, return an empty generator
    If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile

    :param path: Given path
    :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
    :raises: UnsupportedError
    :returns: A file path generator
    '''
    return HdfsPath(path).scan(missing_ok, followlinks)


def hdfs_scan_stat(
        path: PathLike, missing_ok: bool = True,
        followlinks: bool = False) -> Iterator[FileEntry]:
    '''
    Iteratively traverse only files in given directory.
    Every iteration on generator yields a tuple of path string and file stat

    :param path: Given path
    :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
    :raises: UnsupportedError
    :returns: A file path generator
    '''
    return HdfsPath(path).scan_stat(missing_ok, followlinks)


def hdfs_scandir(path: PathLike,
                 followlinks: bool = False) -> Iterator[FileEntry]:
    '''
    Get all contents of given path, the order of result is not guaranteed.

    :param path: Given path
    :returns: All contents have prefix of path
    :raises: FileNotFoundError, NotADirectoryError
    '''
    return HdfsPath(path).scandir(followlinks)


def hdfs_unlink(path: PathLike, missing_ok: bool = False) -> None:
    '''
    Remove the file on hdfs

    :param path: Given path
    :param missing_ok: if False and target file not exists, raise FileNotFoundError
    :raises: FileNotFoundError, IsADirectoryError
    '''
    return HdfsPath(path).unlink(missing_ok)


def hdfs_walk(path: PathLike, followlinks: bool = False
             ) -> Iterator[Tuple[str, List[str], List[str]]]:
    '''
    Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories.
    Every iteration on generator yields a 3-tuple: (root, dirs, files)

    - root: Current hdfs path;
    - dirs: Name list of subdirectories in current directory.
    - files: Name list of files in current directory.

    If path is a file path, return an empty generator
    If path is a non-existent path, return an empty generator
    If path is a bucket path, bucket will be the top directory, and will be returned at first iteration of generator
    If path is an empty bucket, only yield one 3-tuple (notes: hdfs doesn't have empty directory)
    If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile

    :param path: Given path
    :param followlinks: whether followlinks is True or False, result is the same. Because hdfs not support symlink.
    :returns: A 3-tuple generator
    '''
    return HdfsPath(path).walk(followlinks)


def hdfs_getmd5(
        path: PathLike, recalculate: bool = False,
        followlinks: bool = False) -> str:
    '''
    Get checksum of the file or dir.

    :param path: Given path
    :param recalculate: Ignore this parameter, just for compatibility
    :param followlinks: Ignore this parameter, just for compatibility
    :returns: checksum
    '''
    return HdfsPath(path).md5(recalculate, followlinks)


def hdfs_save_as(file_object: BinaryIO, path: PathLike):
    '''Write the opened binary stream to specified path, but the stream won't be closed

    :param path: Given path
    :param file_object: Stream to be read
    '''
    return HdfsPath(path).save(file_object)


def hdfs_open(
        path: PathLike,
        mode: str = 'r',
        *,
        buffering: Optional[int] = None,
        encoding: Optional[str] = None,
        errors: Optional[str] = None,
        **kwargs) -> IO[AnyStr]: # pytype: disable=signature-mismatch
    return HdfsPath(path).open(
        mode, buffering=buffering, encoding=encoding, errors=errors)
```