megfile 2.2.7__py3-none-any.whl → 2.2.8.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/hdfs_path.py ADDED
@@ -0,0 +1,630 @@
+ import hashlib
+ import io
+ import os
+ import sys
+ from functools import lru_cache
+ from typing import IO, AnyStr, BinaryIO, Iterator, List, Optional, Tuple
+
+ from megfile.errors import _create_missing_ok_generator, raise_hdfs_error
+ from megfile.interfaces import FileEntry, StatResult
+ from megfile.lib.compat import fspath
+ from megfile.lib.glob import FSFunc, iglob
+ from megfile.lib.hdfs_prefetch_reader import HdfsPrefetchReader
+ from megfile.lib.hdfs_tools import hdfs_api
+ from megfile.lib.url import get_url_scheme
+ from megfile.pathlike import PathLike, URIPath
+ from megfile.smart_path import SmartPath
+ from megfile.utils import cachedproperty
+
+ __all__ = [
+     'HdfsPath',
+     'is_hdfs',
+     'hdfs_glob',
+     'hdfs_glob_stat',
+     'hdfs_iglob',
+     'hdfs_makedirs',
+ ]
+
+ HDFS_USER = "HDFS_USER"
+ HDFS_URL = "HDFS_URL"
+ HDFS_ROOT = "HDFS_ROOT"
+ HDFS_TIMEOUT = "HDFS_TIMEOUT"
+ HDFS_TOKEN = "HDFS_TOKEN"
+ HDFS_CONFIG_PATH = "HDFS_CONFIG_PATH"
+ MAX_RETRIES = 10
+ DEFAULT_HDFS_TIMEOUT = 10
+
+
+ def is_hdfs(path: PathLike) -> bool:  # pytype: disable=invalid-annotation
+     '''Test if a path is an hdfs path
+
+     :param path: Path to be tested
+     :returns: True if the path is an hdfs path, else False
+     '''
+     return fspath(path).startswith("hdfs://")
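+
+ # A doctest-style sketch of `is_hdfs` (paths are assumed examples). Only the
+ # plain "hdfs://" prefix is tested here; profile URLs such as
+ # "hdfs+profile://" are handled by HdfsPath below:
+ #
+ #     >>> is_hdfs('hdfs://namenode/user/data.txt')
+ #     True
+ #     >>> is_hdfs('/local/path')
+ #     False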
+
+
+ def get_hdfs_config(profile_name: Optional[str] = None):
+     env_profile = f"{profile_name.upper()}__" if profile_name else ""
+     config = {
+         'user': os.getenv(f"{env_profile}{HDFS_USER}"),
+         'url': os.getenv(f"{env_profile}{HDFS_URL}"),
+         'root': os.getenv(f"{env_profile}{HDFS_ROOT}"),
+         'timeout': DEFAULT_HDFS_TIMEOUT,
+         'token': os.getenv(f"{env_profile}{HDFS_TOKEN}"),
+     }
+     timeout_env = f"{env_profile}{HDFS_TIMEOUT}"
+     if os.getenv(timeout_env):
+         config['timeout'] = int(os.getenv(timeout_env))
+
+     config_path = os.getenv(HDFS_CONFIG_PATH) or os.path.expanduser(
+         '~/.hdfscli.cfg')
+     if os.path.exists(config_path):
+         all_config = hdfs_api.config.Config(path=config_path)
+         if not profile_name:
+             if (all_config.has_section(all_config.global_section) and
+                     all_config.has_option(all_config.global_section,
+                                           'default.alias')):
+                 profile_name = all_config.get(
+                     all_config.global_section, 'default.alias')
+         for suffix in ('.alias', '_alias'):
+             section = '{}{}'.format(profile_name, suffix)
+             if all_config.has_section(section):
+                 options = dict(all_config.items(section))
+                 for key, value in config.items():
+                     if not value and options.get(key):
+                         config[key] = options[key]
+                 break
+
+     if config['url']:
+         return config
+
+     raise hdfs_api.HdfsError(
+         'Config error, please set environment variables or use '
+         '"megfile config hdfs ..."')
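+
+ # Configuration sketch (names and values assumed for illustration): a profile
+ # called "backup" is resolved from `BACKUP__`-prefixed environment variables
+ # first, falling back to a `[backup.alias]` or `[backup_alias]` section of
+ # ~/.hdfscli.cfg:
+ #
+ #     export BACKUP__HDFS_URL=http://namenode:50070
+ #     export BACKUP__HDFS_USER=hadoop
+ #     export BACKUP__HDFS_TIMEOUT=30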
+
+
+ @lru_cache()
+ def get_hdfs_client(profile_name: Optional[str] = None):
+     if not hdfs_api:  # pragma: no cover
+         raise ImportError(
+             "hdfs not found, please `pip install 'megfile[hdfs]'`")
+
+     config = get_hdfs_config(profile_name)
+     if config['token']:
+         config.pop('user', None)
+         return hdfs_api.TokenClient(**config)
+     config.pop('token', None)
+     return hdfs_api.InsecureClient(**config)
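+
+ # Client selection sketch: a delegation token (HDFS_TOKEN) takes precedence
+ # over a user name, and clients are cached per profile by `lru_cache`:
+ #
+ #     >>> client = get_hdfs_client()            # same object on repeated calls
+ #     >>> client is get_hdfs_client()
+ #     True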
98
+
99
+
100
+ def hdfs_glob(
101
+ path: PathLike,
102
+ recursive: bool = True,
103
+ missing_ok: bool = True,
104
+ ) -> List[str]:
105
+ '''Return hdfs path list in ascending alphabetical order, in which path matches glob pattern
106
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
107
+
108
+ :param recursive: If False, `**` will not search directory recursively
109
+ :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
110
+ :raises: UnsupportedError, when bucket part contains wildcard characters
111
+ :returns: A list contains paths match `path`
112
+ '''
113
+ return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
114
+
115
+
116
+ def hdfs_glob_stat(
117
+ path: PathLike, recursive: bool = True,
118
+ missing_ok: bool = True) -> Iterator[FileEntry]:
119
+ '''Return a generator contains tuples of path and file stat, in ascending alphabetical order, in which path matches glob pattern
120
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
121
+
122
+ :param recursive: If False, `**` will not search directory recursively
123
+ :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
124
+ :raises: UnsupportedError, when bucket part contains wildcard characters
125
+ :returns: A generator contains tuples of path and file stat, in which paths match `path`
126
+ '''
127
+ return HdfsPath(path).glob_stat(
128
+ pattern="", recursive=recursive, missing_ok=missing_ok)
129
+
130
+
131
+ def hdfs_iglob(
132
+ path: PathLike,
133
+ recursive: bool = True,
134
+ missing_ok: bool = True,
135
+ ) -> Iterator[str]:
136
+ '''Return hdfs path iterator in ascending alphabetical order, in which path matches glob pattern
137
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
138
+
139
+ :param recursive: If False, `**` will not search directory recursively
140
+ :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
141
+ :raises: UnsupportedError, when bucket part contains wildcard characters
142
+ :returns: An iterator contains paths match `path`
143
+ '''
144
+ for path_obj in HdfsPath(path).iglob(pattern="", recursive=recursive,
145
+ missing_ok=missing_ok):
146
+ yield path_obj.path_with_protocol
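+
+ # Usage sketch for the glob family (paths are assumed examples):
+ #
+ #     >>> hdfs_glob('hdfs://user/logs/**/*.gz')        # list of matching paths
+ #     >>> for entry in hdfs_glob_stat('hdfs://user/logs/*.gz'):
+ #     ...     print(entry.path, entry.stat.size)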
+
+
+ def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
+     '''
+     Create an hdfs directory.
+
+     :param path: Given path
+     :param exist_ok: If False and target directory exists, raise FileExistsError
+     :raises: FileExistsError
+     '''
+     return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
+
+
+ @SmartPath.register
+ class HdfsPath(URIPath):
+     protocol = "hdfs"
+
+     def __init__(self, path: PathLike, *other_paths: PathLike):
+         super().__init__(path, *other_paths)
+         protocol = get_url_scheme(self.path)
+         self._protocol_with_profile = self.protocol
+         self._profile_name = None
+         if protocol.startswith('hdfs+'):
+             self._protocol_with_profile = protocol
+             self._profile_name = protocol[5:]
+
+     @property
+     def _client(self):
+         return get_hdfs_client(profile_name=self._profile_name)
+
+     @cachedproperty
+     def path_with_protocol(self) -> str:
+         '''Return path with protocol, like hdfs://path'''
+         path = self.path
+         protocol_prefix = self._protocol_with_profile + "://"
+         if path.startswith(protocol_prefix):
+             return path
+         return protocol_prefix + path.lstrip('/')
+
+     @cachedproperty
+     def path_without_protocol(self) -> str:
+         '''Return path without protocol, example: if path is hdfs://path, return path'''
+         path = self.path
+         protocol_prefix = self._protocol_with_profile + "://"
+         if path.startswith(protocol_prefix):
+             path = path[len(protocol_prefix):]
+         return path
+
+     @cachedproperty
+     def parts(self) -> Tuple[str, ...]:
+         '''A tuple giving access to the path's various components'''
+         parts = [f"{self._protocol_with_profile}://"]
+         path = self.path_without_protocol
+         path = path.lstrip('/')
+         if path != '':
+             parts.extend(path.split('/'))
+         return tuple(parts)
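+
+     # Profile-URL sketch (values assumed): an "hdfs+backup://" path selects
+     # the "backup" profile and keeps the profile in its protocol prefix:
+     #
+     #     >>> p = HdfsPath('hdfs+backup://user/data/file.txt')
+     #     >>> p.parts
+     #     ('hdfs+backup://', 'user', 'data', 'file.txt')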
+
+     def exists(self, followlinks: bool = False) -> bool:
+         '''
+         Test if path exists
+
+         If the path is not permitted to read, return False
+
+         :returns: True if path exists, else False
+         '''
+         return bool(
+             self._client.status(self.path_without_protocol, strict=False))
+
+     def stat(self, follow_symlinks=True) -> StatResult:
+         '''
+         Get StatResult of path file, including file size and mtime, referring to getsize and getmtime
+
+         If path is not an existent path, which means exists() returns False, then raise FileNotFoundError
+
+         :returns: StatResult
+         :raises: FileNotFoundError
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             stat_data = self._client.status(self.path_without_protocol)
+             return StatResult(
+                 size=stat_data['length'],
+                 mtime=stat_data['modificationTime'] / 1000,
+                 isdir=stat_data['type'] == 'DIRECTORY',
+                 islnk=False,
+                 extra=stat_data,
+             )
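+
+     # Mapping sketch: StatResult fields come straight from the WebHDFS
+     # FileStatus payload, e.g. (values assumed):
+     #
+     #     {'length': 1024, 'modificationTime': 1700000000000, 'type': 'FILE'}
+     #     -> StatResult(size=1024, mtime=1700000000.0, isdir=False, islnk=False)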
+
+     def getmtime(self, follow_symlinks: bool = False) -> float:
+         '''
+         Get last-modified time of the file on the given path (in Unix timestamp format).
+         If the path is an existent directory, return the latest modified time of all files in it. The mtime of an empty directory is 1970-01-01 00:00:00
+
+         If path is not an existent path, which means exists() returns False, then raise FileNotFoundError
+
+         :returns: Last-modified time
+         :raises: FileNotFoundError
+         '''
+         return self.stat(follow_symlinks=follow_symlinks).mtime
+
+     def getsize(self, follow_symlinks: bool = False) -> int:
+         '''
+         Get file size on the given path (in bytes).
+         If the path is a directory, return the sum of all file sizes in it, including files in subdirectories (if exist).
+         The result excludes the size of the directory itself. In other words, return 0 Byte on an empty directory path.
+
+         If path is not an existent path, which means exists() returns False, then raise FileNotFoundError
+
+         :returns: File size
+         :raises: FileNotFoundError
+         '''
+         return self.stat(follow_symlinks=follow_symlinks).size
+
+     def glob(
+             self,
+             pattern,
+             recursive: bool = True,
+             missing_ok: bool = True,
+     ) -> List['HdfsPath']:
+         '''Return hdfs path list, in which path matches glob pattern
+         Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+
+         :param pattern: Glob the given relative pattern in the directory represented by this path
+         :param recursive: If False, `**` will not search directory recursively
+         :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+         :raises: UnsupportedError, when bucket part contains wildcard characters
+         :returns: A list of paths that match the pattern
+         '''
+         return list(
+             self.iglob(
+                 pattern=pattern, recursive=recursive, missing_ok=missing_ok))
+
+     def glob_stat(
+             self, pattern, recursive: bool = True,
+             missing_ok: bool = True) -> Iterator[FileEntry]:
+         '''Return a generator of tuples of path and file stat, in which path matches glob pattern
+         Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+
+         :param pattern: Glob the given relative pattern in the directory represented by this path
+         :param recursive: If False, `**` will not search directory recursively
+         :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+         :raises: UnsupportedError, when bucket part contains wildcard characters
+         :returns: A generator of tuples of path and file stat, in which paths match the pattern
+         '''
+         for path_obj in self.iglob(pattern=pattern, recursive=recursive,
+                                    missing_ok=missing_ok):
+             yield FileEntry(path_obj.name, path_obj.path, path_obj.stat())
+
+     def iglob(
+             self,
+             pattern,
+             recursive: bool = True,
+             missing_ok: bool = True,
+     ) -> Iterator['HdfsPath']:
+         '''Return hdfs path iterator, in which path matches glob pattern
+         Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+
+         :param pattern: Glob the given relative pattern in the directory represented by this path
+         :param recursive: If False, `**` will not search directory recursively
+         :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+         :raises: UnsupportedError, when bucket part contains wildcard characters
+         :returns: An iterator of paths that match the pattern
+         '''
+         glob_path = self.path_with_protocol
+         if pattern:
+             glob_path = self.joinpath(pattern).path_with_protocol
+
+         def _scandir(dirname: str) -> Iterator[Tuple[str, bool]]:
+             for entry in self.from_path(dirname).scandir():
+                 yield entry.name, entry.is_dir()
+
+         def _exist(path: PathLike, followlinks: bool = False):
+             return self.from_path(path).exists(followlinks=followlinks)
+
+         def _is_dir(path: PathLike, followlinks: bool = False):
+             return self.from_path(path).is_dir(followlinks=followlinks)
+
+         fs_func = FSFunc(_exist, _is_dir, _scandir)
+         for real_path in _create_missing_ok_generator(
+                 iglob(fspath(glob_path), recursive=recursive, fs=fs_func),
+                 missing_ok, FileNotFoundError('No match file: %r' % glob_path)):
+             yield self.from_path(real_path)
+
+     def is_dir(self, followlinks: bool = False) -> bool:
+         '''
+         Test if an hdfs url is a directory
+
+         :param followlinks: whether followlinks is True or False, result is the same, because hdfs symlinks do not support directories
+         :returns: True if path is an hdfs directory, else False
+         '''
+         return self.stat().is_dir()
+
+     def is_file(self, followlinks: bool = False) -> bool:
+         '''
+         Test if a path is a file
+
+         :returns: True if path is an hdfs file, else False
+         '''
+         return self.stat().is_file()
+
+     def listdir(self, followlinks: bool = False) -> List[str]:
+         '''
+         Get all content names of the given path.
+
+         :returns: All content names under the given path.
+         :raises: FileNotFoundError, NotADirectoryError
+         '''
+         if not self.is_dir():
+             raise NotADirectoryError('Not a directory: %r' % self.path)
+         with raise_hdfs_error(self.path_with_protocol):
+             return self._client.list(self.path_without_protocol)
+
+     def iterdir(self, followlinks: bool = False) -> Iterator['HdfsPath']:
+         '''
+         Iterate over all contents of the given path.
+
+         :returns: An iterator of paths under the given path.
+         :raises: FileNotFoundError, NotADirectoryError
+         '''
+         for filename in self.listdir(followlinks=followlinks):
+             yield self.joinpath(filename)  # pytype: disable=bad-return-type
+
+     def load(self, followlinks: bool = False) -> BinaryIO:
+         '''Read all content in binary on specified path and write into memory
+
+         User should close the BinaryIO manually
+
+         :returns: BinaryIO
+         '''
+         buffer = io.BytesIO()
+         with self.open('rb') as f:
+             buffer.write(f.read())
+         buffer.seek(0)
+         return buffer
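+
+     # `load` sketch: the whole object is buffered into memory, so it suits
+     # small files only (path assumed):
+     #
+     #     >>> buffer = HdfsPath('hdfs://user/data/small.json').load()
+     #     >>> content = buffer.read()
+     #     >>> buffer.close()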
+
+     def mkdir(self, mode=0o777, parents: bool = False, exist_ok: bool = False):
+         '''
+         Create an hdfs directory.
+
+         :param mode: Octal permission to set on the newly created directory.
+             These permissions will only be set on directories that do not already exist.
+         :param parents: parents is ignored, only kept for compatibility with pathlib.Path
+         :param exist_ok: If False and target directory exists, raise FileExistsError
+         :raises: FileExistsError
+         '''
+         if not exist_ok and self.exists():
+             raise FileExistsError('File exists: %r' % self.path)
+         with raise_hdfs_error(self.path_with_protocol):
+             self._client.makedirs(self.path_without_protocol, permission=mode)
+
+     def rename(self, dst_path: PathLike) -> 'HdfsPath':
+         '''
+         Move hdfs file path from src_path to dst_path
+
+         :param dst_path: Given destination path
+         '''
+         dst_path = self.from_path(dst_path)
+         with raise_hdfs_error(self.path_with_protocol):
+             self._client.rename(
+                 self.path_without_protocol, dst_path.path_without_protocol)
+         return dst_path
+
+     def move(self, dst_path: PathLike) -> None:
+         '''
+         Move file/directory path from src_path to dst_path
+
+         :param dst_path: Given destination path
+         '''
+         self.rename(dst_path=dst_path)
+
+     def remove(self, missing_ok: bool = False) -> None:
+         '''
+         Remove the file or directory on hdfs; `hdfs://` itself is not permitted to remove
+
+         :param missing_ok: if False and target file/directory not exists, raise FileNotFoundError
+         :raises: FileNotFoundError, UnsupportedError
+         '''
+         try:
+             with raise_hdfs_error(self.path_with_protocol):
+                 self._client.delete(self.path_without_protocol, recursive=True)
+         except Exception as e:
+             if not missing_ok or not isinstance(e, FileNotFoundError):
+                 raise
+
+     def scan(self, missing_ok: bool = True,
+              followlinks: bool = False) -> Iterator[str]:
+         '''
+         Iteratively traverse only files in given hdfs directory.
+         Every iteration on generator yields a path string.
+
+         If path is a file path, yields the file only
+         If path is a non-existent path, return an empty generator
+         If path is a directory, return all file paths under it, including subdirectories
+         If path is an empty directory, return an empty generator
+         If path is 'hdfs://' itself, raise UnsupportedError. scan() on a complete hdfs cluster is not supported in megfile
+
+         :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+         :raises: UnsupportedError
+         :returns: A file path generator
+         '''
+         for file_entry in self.scan_stat(missing_ok=missing_ok,
+                                          followlinks=followlinks):
+             yield file_entry.path
+
+     def scan_stat(self, missing_ok: bool = True,
+                   followlinks: bool = False) -> Iterator[FileEntry]:
+         '''
+         Iteratively traverse only files in given directory.
+         Every iteration on generator yields a tuple of path string and file stat
+
+         :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+         :raises: UnsupportedError
+         :returns: A file entry generator
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             for (root,
+                  _root_status), _dir_infos, file_infos in self._client.walk(
+                      self.path_without_protocol, status=True,
+                      ignore_missing=missing_ok):
+                 for filename, stat_data in file_infos:
+                     yield FileEntry(
+                         name=filename,
+                         path=self.from_path(
+                             f"{self._protocol_with_profile}://{root.lstrip('/')}"
+                         ).joinpath(filename).path_with_protocol,
+                         stat=StatResult(
+                             size=stat_data['length'],
+                             mtime=stat_data['modificationTime'] / 1000,
+                             isdir=False,
+                             islnk=False,
+                             extra=stat_data,
+                         ))
+
+     def scandir(self, followlinks: bool = False) -> Iterator[FileEntry]:
+         '''
+         Get all contents of given path, the order of result is not guaranteed.
+
+         :returns: All file entries under the given path
+         :raises: FileNotFoundError, NotADirectoryError
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             for filename, stat_data in self._client.list(
+                     self.path_without_protocol, status=True):
+                 yield FileEntry(
+                     name=filename,
+                     path=self.joinpath(filename).path_with_protocol,
+                     stat=StatResult(
+                         size=stat_data['length'],
+                         mtime=stat_data['modificationTime'] / 1000,
+                         isdir=stat_data['type'] == 'DIRECTORY',
+                         islnk=False,
+                         extra=stat_data,
+                     ))
+
+     def unlink(self, missing_ok: bool = False) -> None:
+         '''
+         Remove the file on hdfs
+
+         :param missing_ok: if False and target file not exists, raise FileNotFoundError
+         :raises: FileNotFoundError, IsADirectoryError
+         '''
+         if self.is_dir():
+             raise IsADirectoryError('Path is a directory: %r' % self.path)
+         self.remove(missing_ok=missing_ok)
+
+     def walk(self, followlinks: bool = False
+              ) -> Iterator[Tuple[str, List[str], List[str]]]:
+         '''
+         Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse the parent directory, then its subdirectories if any exist.
+         Every iteration on generator yields a 3-tuple: (root, dirs, files)
+
+         - root: Current hdfs path;
+         - dirs: Name list of subdirectories in current directory.
+         - files: Name list of files in current directory.
+
+         If path is a file path, return an empty generator
+         If path is a non-existent path, return an empty generator
+         If path is a directory, it will be the top directory, and will be returned at the first iteration of the generator
+         If path is an empty directory, only yield one 3-tuple
+         If path is 'hdfs://' itself, raise UnsupportedError. walk() on a complete hdfs cluster is not supported in megfile
+
+         :param followlinks: whether followlinks is True or False, result is the same, because hdfs symlinks are not supported
+         :returns: A 3-tuple generator
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             for path, dirs, files in self._client.walk(
+                     self.path_without_protocol, ignore_missing=True,
+                     allow_dir_changes=True):
+                 yield f"{self._protocol_with_profile}://{path.lstrip('/')}", dirs, files
+
+     def md5(self, recalculate: bool = False, followlinks: bool = False) -> str:
+         '''
+         Get checksum of the file or dir.
+
+         :param recalculate: Ignore this parameter, just for compatibility
+         :param followlinks: Ignore this parameter, just for compatibility
+         :returns: checksum
+         '''
+         if self.is_dir(followlinks=followlinks):
+             hash_md5 = hashlib.md5()  # nosec
+             for file_name in self.listdir():
+                 chunk = self.joinpath(file_name).md5(  # pytype: disable=attribute-error
+                     recalculate=recalculate).encode()
+                 hash_md5.update(chunk)
+             return hash_md5.hexdigest()
+         with raise_hdfs_error(self.path_with_protocol):
+             return self._client.checksum(self.path_without_protocol)['bytes']
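+
+     # Directory checksum sketch: a directory's md5 is the md5 over its
+     # children's checksum strings, in listdir() order:
+     #
+     #     md5(dir) = md5(md5(child_1) + md5(child_2) + ...)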
+
+     def save(self, file_object: BinaryIO):
+         '''Write the opened binary stream to specified path, but the stream won't be closed
+
+         :param file_object: Stream to be read
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             self._client.write(
+                 self.path_without_protocol, overwrite=True, data=file_object)
+
+     def open(
+             self,
+             mode: str = 'r',
+             *,
+             buffering: Optional[int] = None,
+             encoding: Optional[str] = None,
+             errors: Optional[str] = None,
+             **kwargs) -> IO[AnyStr]:  # pytype: disable=signature-mismatch
+         if '+' in mode:
+             raise ValueError('unacceptable mode: %r' % mode)
+
+         if 'b' in mode:
+             encoding = None
+         elif not encoding:
+             encoding = sys.getdefaultencoding()
+
+         with raise_hdfs_error(self.path_with_protocol):
+             if mode in ('r', 'rb'):
+                 keys = [
+                     'block_size', 'block_capacity', 'block_forward',
+                     'max_retries', 'max_workers'
+                 ]
+                 input_kwargs = {}
+                 for key in keys:
+                     if key in kwargs:
+                         input_kwargs[key] = kwargs[key]
+                 file_obj = HdfsPrefetchReader(
+                     hdfs_path=self.path_without_protocol,
+                     client=self._client,
+                     profile_name=self._profile_name,
+                     **input_kwargs)
+                 if 'b' not in mode:
+                     file_obj = io.TextIOWrapper(
+                         file_obj, encoding=encoding, errors=errors)  # pytype: disable=wrong-arg-types
+                 file_obj.mode = mode
+                 return file_obj  # pytype: disable=bad-return-type
+             elif mode in ('w', 'wb'):
+                 return self._client.write(  # pytype: disable=bad-return-type
+                     self.path_without_protocol,
+                     overwrite=True,
+                     buffersize=buffering,
+                     encoding=encoding)
+             elif mode in ('a', 'ab'):
+                 return self._client.write(  # pytype: disable=bad-return-type
+                     self.path_without_protocol,
+                     append=True,
+                     buffersize=buffering,
+                     encoding=encoding)
+             raise ValueError('unacceptable mode: %r' % mode)
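+
+     # `open` mode sketch: 'r'/'rb' return an HdfsPrefetchReader (wrapped in
+     # TextIOWrapper for text mode); 'w'/'wb' and 'a'/'ab' delegate to the hdfs
+     # client's write() context (paths assumed):
+     #
+     #     >>> with HdfsPath('hdfs://tmp/out.txt').open('w') as f:
+     #     ...     f.write('hello')
+     #     >>> HdfsPath('hdfs://tmp/out.txt').open('rb').read()
+     #     b'hello'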
+
+     def absolute(self) -> 'HdfsPath':
+         '''
+         Make the path absolute, without normalization or resolving symlinks. Returns a new path object
+         '''
+         with raise_hdfs_error(self.path_with_protocol):
+             real_path = self._client.resolve(self.path_without_protocol)
+             return self.from_path(
+                 f"{self._protocol_with_profile}:///{real_path.lstrip('/')}")