megfile 4.2.5__py3-none-any.whl → 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/hdfs.py DELETED
@@ -1,408 +0,0 @@
1
- from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
2
-
3
- from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
4
- from megfile.hdfs_path import (
5
- HdfsPath,
6
- is_hdfs,
7
- )
8
- from megfile.interfaces import FileEntry, PathLike, StatResult
9
-
10
- __all__ = [
11
- "is_hdfs",
12
- "hdfs_glob",
13
- "hdfs_glob_stat",
14
- "hdfs_iglob",
15
- "hdfs_makedirs",
16
- "hdfs_exists",
17
- "hdfs_stat",
18
- "hdfs_getmtime",
19
- "hdfs_getsize",
20
- "hdfs_isdir",
21
- "hdfs_isfile",
22
- "hdfs_listdir",
23
- "hdfs_load_from",
24
- "hdfs_move",
25
- "hdfs_remove",
26
- "hdfs_scan",
27
- "hdfs_scan_stat",
28
- "hdfs_scandir",
29
- "hdfs_unlink",
30
- "hdfs_walk",
31
- "hdfs_getmd5",
32
- "hdfs_save_as",
33
- "hdfs_open",
34
- ]
35
-
36
-
37
- def hdfs_exists(path: PathLike, followlinks: bool = False) -> bool:
38
- """
39
- Test if path exists
40
-
41
- If the bucket of path is not permitted to be read, return False
42
-
43
- :param path: Given path
44
- :returns: True if path exists, else False
45
- """
46
- return HdfsPath(path).exists(followlinks)
47
-
48
-
49
- def hdfs_stat(path: PathLike, follow_symlinks=True) -> StatResult:
50
- """
51
- Get StatResult of path file, including file size and mtime,
52
- referring to hdfs_getsize and hdfs_getmtime
53
-
54
- If path is not an existent path, which means hdfs_exists(path) returns False,
55
- then raise FileNotFoundError
56
-
57
- If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://',
58
- raise BucketNotFoundError
59
-
60
- :param path: Given path
61
- :returns: StatResult
62
- :raises: FileNotFoundError
63
- """
64
- return HdfsPath(path).stat(follow_symlinks)
65
-
66
-
67
- def hdfs_getmtime(path: PathLike, follow_symlinks: bool = False) -> float:
68
- """
69
- Get last-modified time of the file on the given path (in Unix timestamp
70
- format).
71
- If the path is an existent directory, return the latest modified time of all
72
- file in it. The mtime of empty directory is 1970-01-01 00:00:00
73
-
74
- If path is not an existent path, which means hdfs_exists(path) returns False,
75
- then raise FileNotFoundError
76
-
77
- :param path: Given path
78
- :returns: Last-modified time
79
- :raises: FileNotFoundError
80
- """
81
- return HdfsPath(path).getmtime(follow_symlinks)
82
-
83
-
84
- def hdfs_getsize(path: PathLike, follow_symlinks: bool = False) -> int:
85
- """
86
- Get file size on the given path (in bytes).
87
- If the path is a directory, return the sum of all file sizes in it,
88
- including file in subdirectories (if exist).
89
-
90
- The result excludes the size of directory itself. In other words,
91
- return 0 Byte on an empty directory path.
92
-
93
- If path is not an existent path, which means hdfs_exists(path) returns False,
94
- then raise FileNotFoundError
95
-
96
- :param path: Given path
97
- :returns: File size
98
- :raises: FileNotFoundError
99
- """
100
- return HdfsPath(path).getsize(follow_symlinks)
101
-
102
-
103
- def hdfs_isdir(path: PathLike, followlinks: bool = False) -> bool:
104
- """
105
- Test if an hdfs url is directory
106
- Specific procedures are as follows:
107
- If there exists a suffix, of which ``os.path.join(path, suffix)`` is a file
108
- If the url is empty bucket or hdfs://
109
-
110
- :param path: Given path
111
- :param followlinks: whether followlinks is True or False, result is the same.
112
- Because hdfs symlink not support dir.
113
- :returns: True if path is hdfs directory, else False
114
- """
115
- return HdfsPath(path).is_dir(followlinks)
116
-
117
-
118
- def hdfs_isfile(path: PathLike, followlinks: bool = False) -> bool:
119
- """
120
- Test if a path is a file
121
-
122
- :param path: Given path
123
- :returns: True if path is hdfs file, else False
124
- """
125
- return HdfsPath(path).is_file(followlinks)
126
-
127
-
128
- def hdfs_listdir(path: PathLike) -> List[str]:
129
- """
130
- Get all contents of given path.
131
-
132
- :param path: Given path
133
- :returns: All contents have prefix of path.
134
- :raises: FileNotFoundError, NotADirectoryError
135
- """
136
- return HdfsPath(path).listdir()
137
-
138
-
139
- def hdfs_load_from(path: PathLike) -> BinaryIO:
140
- """Read all content in binary on specified path and write into memory
141
-
142
- User should close the BinaryIO manually
143
-
144
- :param path: Given path
145
- :returns: BinaryIO
146
- """
147
- return HdfsPath(path).load()
148
-
149
-
150
- def hdfs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
151
- """
152
- Move file/directory path from src_path to dst_path
153
-
154
- :param src_path: Given path
155
- :param dst_path: Given destination path
156
- """
157
- return HdfsPath(src_path).move(dst_path, overwrite)
158
-
159
-
160
- def hdfs_remove(path: PathLike, missing_ok: bool = False) -> None:
161
- """
162
- Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not
163
- permitted to remove
164
-
165
- :param path: Given path
166
- :param missing_ok: if False and target file/directory not exists,
167
- raise FileNotFoundError
168
- :raises: FileNotFoundError, UnsupportedError
169
- """
170
- return HdfsPath(path).remove(missing_ok)
171
-
172
-
173
- def hdfs_scan(
174
- path: PathLike, missing_ok: bool = True, followlinks: bool = False
175
- ) -> Iterator[str]:
176
- """
177
- Iteratively traverse only files in given hdfs directory.
178
- Every iteration on generator yields a path string.
179
-
180
- If path is a file path, yields the file only
181
- If path is a non-existent path, return an empty generator
182
- If path is a bucket path, return all file paths in the bucket
183
- If path is an empty bucket, return an empty generator
184
- If path doesn't contain any bucket, which is path == 'hdfs://',
185
- raise UnsupportedError. walk() on complete hdfs is not supported in megfile
186
-
187
- :param path: Given path
188
- :param missing_ok: If False and there's no file in the directory,
189
- raise FileNotFoundError
190
- :raises: UnsupportedError
191
- :returns: A file path generator
192
- """
193
- return HdfsPath(path).scan(missing_ok, followlinks)
194
-
195
-
196
- def hdfs_scan_stat(
197
- path: PathLike, missing_ok: bool = True, followlinks: bool = False
198
- ) -> Iterator[FileEntry]:
199
- """
200
- Iteratively traverse only files in given directory.
201
- Every iteration on generator yields a tuple of path string and file stat
202
-
203
- :param path: Given path
204
- :param missing_ok: If False and there's no file in the directory,
205
- raise FileNotFoundError
206
- :raises: UnsupportedError
207
- :returns: A generator of FileEntry (file path and file stat)
208
- """
209
- return HdfsPath(path).scan_stat(missing_ok, followlinks)
210
-
211
-
212
- def hdfs_scandir(path: PathLike) -> Iterator[FileEntry]:
213
- """
214
- Get all contents of given path, the order of result is in arbitrary order.
215
-
216
- :param path: Given path
217
- :returns: All contents have prefix of path
218
- :raises: FileNotFoundError, NotADirectoryError
219
- """
220
- return HdfsPath(path).scandir()
221
-
222
-
223
- def hdfs_unlink(path: PathLike, missing_ok: bool = False) -> None:
224
- """
225
- Remove the file on hdfs
226
-
227
- :param path: Given path
228
- :param missing_ok: if False and target file not exists, raise FileNotFoundError
229
- :raises: FileNotFoundError, IsADirectoryError
230
- """
231
- return HdfsPath(path).unlink(missing_ok)
232
-
233
-
234
- def hdfs_walk(
235
- path: PathLike, followlinks: bool = False
236
- ) -> Iterator[Tuple[str, List[str], List[str]]]:
237
- """
238
- Iteratively traverse the given hdfs directory, in top-bottom order.
239
- In other words, firstly traverse parent directory, if subdirectories exist,
240
- traverse the subdirectories.
241
-
242
- Every iteration on generator yields a 3-tuple: (root, dirs, files)
243
-
244
- - root: Current hdfs path;
245
- - dirs: Name list of subdirectories in current directory.
246
- - files: Name list of files in current directory.
247
-
248
- If path is a file path, return an empty generator
249
-
250
- If path is a non-existent path, return an empty generator
251
-
252
- If path is a bucket path, bucket will be the top directory,
253
- and will be returned at first iteration of generator
254
-
255
- If path is an empty bucket, only yield one 3-tuple
256
- (notes: hdfs doesn't have empty directory)
257
-
258
- If path doesn't contain any bucket, which is path == 'hdfs://',
259
- raise UnsupportedError. walk() on complete hdfs is not supported in megfile
260
-
261
- :param path: Given path
262
- :param followlinks: whether followlinks is True or False, result is the same.
263
- Because hdfs not support symlink.
264
- :returns: A 3-tuple generator
265
- """
266
- return HdfsPath(path).walk(followlinks)
267
-
268
-
269
- def hdfs_getmd5(
270
- path: PathLike, recalculate: bool = False, followlinks: bool = False
271
- ) -> str:
272
- """
273
- Get checksum of the file or dir.
274
-
275
- :param path: Given path
276
- :param recalculate: Ignore this parameter, just for compatibility
277
- :param followlinks: Ignore this parameter, just for compatibility
278
- :returns: checksum
279
- """
280
- return HdfsPath(path).md5(recalculate, followlinks)
281
-
282
-
283
- def hdfs_save_as(file_object: BinaryIO, path: PathLike):
284
- """Write the opened binary stream to specified path,
285
- but the stream won't be closed
286
-
287
- :param path: Given path
288
- :param file_object: Stream to be read
289
- """
290
- return HdfsPath(path).save(file_object)
291
-
292
-
293
- def hdfs_open(
294
- path: PathLike,
295
- mode: str = "r",
296
- *,
297
- buffering: Optional[int] = None,
298
- encoding: Optional[str] = None,
299
- errors: Optional[str] = None,
300
- max_workers: Optional[int] = None,
301
- max_buffer_size: int = READER_MAX_BUFFER_SIZE,
302
- block_forward: Optional[int] = None,
303
- block_size: int = READER_BLOCK_SIZE,
304
- **kwargs,
305
- ) -> IO:
306
- """
307
- Open a file on the specified path.
308
-
309
- :param path: Given path
310
- :param mode: Mode to open the file. Supports 'r', 'rb', 'w', 'wb', 'a', 'ab'.
311
- :param buffering: Optional integer used to set the buffering policy.
312
- :param encoding: Name of the encoding used to decode or encode the file.
313
- Should only be used in text mode.
314
- :param errors: Optional string specifying how encoding and decoding errors are
315
- to be handled. Cannot be used in binary mode.
316
- :param max_workers: Max download thread number, `None` by default,
317
- will use global thread pool with 8 threads.
318
- :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
319
- Set to `0` will disable cache.
320
- :param block_forward: Number of blocks of data for reader cached from the
321
- offset position.
322
- :param block_size: Size of a single block for reader, default is 8MB.
323
- :returns: A file-like object.
324
- :raises ValueError: If an unacceptable mode is provided.
325
- """
326
- return HdfsPath(path).open(
327
- mode,
328
- buffering=buffering,
329
- encoding=encoding,
330
- errors=errors,
331
- max_workers=max_workers,
332
- max_buffer_size=max_buffer_size,
333
- block_forward=block_forward,
334
- block_size=block_size,
335
- )
336
-
337
-
338
- def hdfs_glob(
339
- path: PathLike, recursive: bool = True, missing_ok: bool = True
340
- ) -> List[str]:
341
- """Return hdfs path list in ascending alphabetical order,
342
- in which path matches glob pattern
343
-
344
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
345
- raise UnsupportedError
346
-
347
- :param recursive: If False, `**` will not search directory recursively
348
- :param missing_ok: If False and target path doesn't match any file,
349
- raise FileNotFoundError
350
- :raises: UnsupportedError, when bucket part contains wildcard characters
351
- :returns: A list contains paths match `path`
352
- """
353
- return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
354
-
355
-
356
- def hdfs_glob_stat(
357
- path: PathLike, recursive: bool = True, missing_ok: bool = True
358
- ) -> Iterator[FileEntry]:
359
- """Return a generator contains tuples of path and file stat,
360
- in ascending alphabetical order, in which path matches glob pattern
361
-
362
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
363
- raise UnsupportedError
364
-
365
- :param recursive: If False, `**` will not search directory recursively
366
- :param missing_ok: If False and target path doesn't match any file,
367
- raise FileNotFoundError
368
- :raises: UnsupportedError, when bucket part contains wildcard characters
369
- :returns: A generator contains tuples of path and file stat,
370
- in which paths match `path`
371
- """
372
- return HdfsPath(path).glob_stat(
373
- pattern="", recursive=recursive, missing_ok=missing_ok
374
- )
375
-
376
-
377
- def hdfs_iglob(
378
- path: PathLike, recursive: bool = True, missing_ok: bool = True
379
- ) -> Iterator[str]:
380
- """Return hdfs path iterator in ascending alphabetical order,
381
- in which path matches glob pattern
382
-
383
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
384
- raise UnsupportedError
385
-
386
- :param recursive: If False, `**` will not search directory recursively
387
- :param missing_ok: If False and target path doesn't match any file,
388
- raise FileNotFoundError
389
- :raises: UnsupportedError, when bucket part contains wildcard characters
390
- :returns: An iterator contains paths match `path`
391
- """
392
- for path_obj in HdfsPath(path).iglob(
393
- pattern="", recursive=recursive, missing_ok=missing_ok
394
- ):
395
- yield path_obj.path_with_protocol
396
-
397
-
398
- def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
399
- """
400
- Create an hdfs directory.
401
- Purely creating directory is invalid because it's unavailable on OSS.
402
- This function is to test the target bucket have WRITE access.
403
-
404
- :param path: Given path
405
- :param exist_ok: If False and target directory exists, raise FileExistsError
406
- :raises: FileExistsError
407
- """
408
- return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
megfile/http.py DELETED
@@ -1,114 +0,0 @@
1
- from io import BufferedReader
2
- from typing import Optional, Union
3
-
4
- from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
5
- from megfile.http_path import HttpPath, HttpPrefetchReader, get_http_session, is_http
6
- from megfile.interfaces import PathLike, StatResult
7
-
8
- __all__ = [
9
- "get_http_session",
10
- "is_http",
11
- "http_open",
12
- "http_stat",
13
- "http_getsize",
14
- "http_getmtime",
15
- "http_exists",
16
- ]
17
-
18
-
19
- def http_open(
20
- path: PathLike,
21
- mode: str = "rb",
22
- *,
23
- encoding: Optional[str] = None,
24
- errors: Optional[str] = None,
25
- max_workers: Optional[int] = None,
26
- max_buffer_size: int = READER_MAX_BUFFER_SIZE,
27
- block_forward: Optional[int] = None,
28
- block_size: int = READER_BLOCK_SIZE,
29
- **kwargs,
30
- ) -> Union[BufferedReader, HttpPrefetchReader]:
31
- """Open a BytesIO to read binary data of given http(s) url
32
-
33
- .. note ::
34
-
35
- Essentially, it reads data of http(s) url to memory by requests,
36
- and then return BytesIO to user.
37
-
38
- :param path: Given path
39
- :param mode: Only supports 'r' or 'rb' mode now
40
- :param encoding: encoding is the name of the encoding used to decode or encode
41
- the file. This should only be used in text mode.
42
- :param errors: errors is an optional string that specifies how encoding and decoding
43
- errors are to be handled—this cannot be used in binary mode.
44
- :param max_workers: Max download thread number, `None` by default,
45
- will use global thread pool with 8 threads.
46
- :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
47
- Set to `0` will disable cache.
48
- :param block_forward: How many blocks of data cached from offset position
49
- :param block_size: Size of single block, 8MB by default. Each block will be uploaded
50
- or downloaded by single thread.
51
- :return: A file-like object with http(s) data
52
- """
53
- return HttpPath(path).open(
54
- mode,
55
- encoding=encoding,
56
- errors=errors,
57
- max_workers=max_workers,
58
- max_buffer_size=max_buffer_size,
59
- block_forward=block_forward,
60
- block_size=block_size,
61
- )
62
-
63
-
64
- def http_stat(path: PathLike, follow_symlinks=True) -> StatResult:
65
- """
66
- Get StatResult of http_url response, including size and mtime,
67
- referring to http_getsize and http_getmtime
68
-
69
- :param path: Given path
70
- :param follow_symlinks: Ignore this parameter, just for compatibility
71
- :returns: StatResult
72
- :raises: HttpPermissionError, HttpFileNotFoundError
73
- """
74
- return HttpPath(path).stat(follow_symlinks)
75
-
76
-
77
- def http_getsize(path: PathLike, follow_symlinks: bool = False) -> int:
78
- """
79
- Get file size on the given http_url path.
80
-
81
- If the http response headers don't include Content-Length, will return None
82
-
83
- :param path: Given path
84
- :param follow_symlinks: Ignore this parameter, just for compatibility
85
- :returns: File size (in bytes)
86
- :raises: HttpPermissionError, HttpFileNotFoundError
87
- """
88
- return HttpPath(path).getsize(follow_symlinks)
89
-
90
-
91
- def http_getmtime(path: PathLike, follow_symlinks: bool = False) -> float:
92
- """
93
- Get Last-Modified time of the http request on the given http_url path.
94
-
95
- If the http response headers don't include Last-Modified, will return None
96
-
97
- :param path: Given path
98
- :param follow_symlinks: Ignore this parameter, just for compatibility
99
- :returns: Last-Modified time (in Unix timestamp format)
100
- :raises: HttpPermissionError, HttpFileNotFoundError
101
- """
102
- return HttpPath(path).getmtime(follow_symlinks)
103
-
104
-
105
- def http_exists(path: PathLike, followlinks: bool = False) -> bool:
106
- """Test if http path exists
107
-
108
- :param path: Given path
109
- :param followlinks: ignore this parameter, just for compatibility
110
- :type followlinks: bool, optional
111
- :return: return True if exists
112
- :rtype: bool
113
- """
114
- return HttpPath(path).exists(followlinks)