megfile 3.1.0.post2-py3-none-any.whl → 3.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +94 -69
- megfile/lib/combine_reader.py +13 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +54 -55
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +61 -52
- megfile/lib/s3_cached_handler.py +14 -13
- megfile/lib/s3_limited_seekable_writer.py +38 -28
- megfile/lib/s3_memory_handler.py +35 -29
- megfile/lib/s3_pipe_handler.py +25 -24
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +8 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +75 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.0.post2.dist-info/RECORD +0 -55
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/hdfs.py
CHANGED
@@ -1,163 +1,182 @@
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple

-from megfile.hdfs_path import
+from megfile.hdfs_path import (
+    HdfsPath,
+    hdfs_glob,
+    hdfs_glob_stat,
+    hdfs_iglob,
+    hdfs_makedirs,
+    is_hdfs,
+)
 from megfile.interfaces import FileEntry, PathLike, StatResult

 __all__ = [
-    'is_hdfs',
-    'hdfs_glob',
-    'hdfs_glob_stat',
-    'hdfs_iglob',
-    'hdfs_makedirs',
-    'hdfs_exists',
-    'hdfs_stat',
-    'hdfs_getmtime',
-    'hdfs_getsize',
-    'hdfs_isdir',
-    'hdfs_isfile',
-    'hdfs_listdir',
-    'hdfs_load_from',
-    'hdfs_move',
-    'hdfs_remove',
-    'hdfs_scan',
-    'hdfs_scan_stat',
-    'hdfs_scandir',
-    'hdfs_unlink',
-    'hdfs_walk',
-    'hdfs_getmd5',
-    'hdfs_save_as',
-    'hdfs_open',
+    "is_hdfs",
+    "hdfs_glob",
+    "hdfs_glob_stat",
+    "hdfs_iglob",
+    "hdfs_makedirs",
+    "hdfs_exists",
+    "hdfs_stat",
+    "hdfs_getmtime",
+    "hdfs_getsize",
+    "hdfs_isdir",
+    "hdfs_isfile",
+    "hdfs_listdir",
+    "hdfs_load_from",
+    "hdfs_move",
+    "hdfs_remove",
+    "hdfs_scan",
+    "hdfs_scan_stat",
+    "hdfs_scandir",
+    "hdfs_unlink",
+    "hdfs_walk",
+    "hdfs_getmd5",
+    "hdfs_save_as",
+    "hdfs_open",
 ]


 def hdfs_exists(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if path exists

     If the bucket of path are not permitted to read, return False

     :param path: Given path
     :returns: True if path exists, else False
-    '''
+    """
     return HdfsPath(path).exists(followlinks)


 def hdfs_stat(path: PathLike, follow_symlinks=True) -> StatResult:
-    '''
-    Get StatResult of path file, including file size and mtime,
+    """
+    Get StatResult of path file, including file size and mtime,
+    referring to hdfs_getsize and hdfs_getmtime

-    If path is not an existent path, which means hdfs_exist(path) returns False,
-
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError
+
+    If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://',
+    raise BucketNotFoundError

     :param path: Given path
     :returns: StatResult
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).stat(follow_symlinks)


 def hdfs_getmtime(path: PathLike, follow_symlinks: bool = False) -> float:
-    '''
-    Get last-modified time of the file on the given path path (in Unix timestamp
-
+    """
+    Get last-modified time of the file on the given path path (in Unix timestamp
+    format).
+    If the path is an existent directory, return the latest modified time of all
+    file in it. The mtime of empty directory is 1970-01-01 00:00:00

-    If path is not an existent path, which means hdfs_exist(path) returns False,
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError

     :param path: Given path
     :returns: Last-modified time
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).getmtime(follow_symlinks)


 def hdfs_getsize(path: PathLike, follow_symlinks: bool = False) -> int:
-    '''
+    """
     Get file size on the given path path (in bytes).
-    If the path in a directory, return the sum of all file size in it,
-
+    If the path in a directory, return the sum of all file size in it,
+    including file in subdirectories (if exist).
+
+    The result excludes the size of directory itself. In other words,
+    return 0 Byte on an empty directory path.

-    If path is not an existent path, which means hdfs_exist(path) returns False,
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError

     :param path: Given path
     :returns: File size
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).getsize(follow_symlinks)


 def hdfs_isdir(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if an hdfs url is directory
     Specific procedures are as follows:
     If there exists a suffix, of which ``os.path.join(path, suffix)`` is a file
     If the url is empty bucket or hdfs://

     :param path: Given path
-    :param followlinks: whether followlinks is True or False, result is the same.
+    :param followlinks: whether followlinks is True or False, result is the same.
+        Because hdfs symlink not support dir.
     :returns: True if path is hdfs directory, else False
-    '''
+    """
     return HdfsPath(path).is_dir(followlinks)


 def hdfs_isfile(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if an path is file

     :param path: Given path
     :returns: True if path is hdfs file, else False
-    '''
+    """
     return HdfsPath(path).is_file(followlinks)


 def hdfs_listdir(path: PathLike, followlinks: bool = False) -> List[str]:
-    '''
+    """
     Get all contents of given path.

     :param path: Given path
     :returns: All contents have prefix of path.
     :raises: FileNotFoundError, NotADirectoryError
-    '''
+    """
     return HdfsPath(path).listdir(followlinks)


 def hdfs_load_from(path: PathLike, followlinks: bool = False) -> BinaryIO:
-
+    """Read all content in binary on specified path and write into memory

     User should close the BinaryIO manually

     :param path: Given path
     :returns: BinaryIO
-    '''
+    """
     return HdfsPath(path).load(followlinks)


-def hdfs_move(
-
-    '''
+def hdfs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
+    """
     Move file/directory path from src_path to dst_path

     :param src_path: Given path
     :param dst_path: Given destination path
-    '''
+    """
     return HdfsPath(src_path).move(dst_path, overwrite)


 def hdfs_remove(path: PathLike, missing_ok: bool = False) -> None:
-    '''
-    Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not
+    """
+    Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not
+    permitted to remove

     :param path: Given path
-    :param missing_ok: if False and target file/directory not exists,
+    :param missing_ok: if False and target file/directory not exists,
+        raise FileNotFoundError
     :raises: FileNotFoundError, UnsupportedError
-    '''
+    """
     return HdfsPath(path).remove(missing_ok)


 def hdfs_scan(
-
-
-
-    '''
+    path: PathLike, missing_ok: bool = True, followlinks: bool = False
+) -> Iterator[str]:
+    """
     Iteratively traverse only files in given hdfs directory.
     Every iteration on generator yields a path string.

@@ -165,61 +184,64 @@ def hdfs_scan(
     If path is a non-existent path, return an empty generator
     If path is a bucket path, return all file paths in the bucket
     If path is an empty bucket, return an empty generator
-    If path doesn't contain any bucket, which is path == 'hdfs://',
+    If path doesn't contain any bucket, which is path == 'hdfs://',
+    raise UnsupportedError. walk() on complete hdfs is not supported in megfile

     :param path: Given path
-    :param missing_ok: If False and there's no file in the directory,
+    :param missing_ok: If False and there's no file in the directory,
+        raise FileNotFoundError
     :raises: UnsupportedError
     :returns: A file path generator
-    '''
+    """
     return HdfsPath(path).scan(missing_ok, followlinks)


 def hdfs_scan_stat(
-
-
-
-    '''
+    path: PathLike, missing_ok: bool = True, followlinks: bool = False
+) -> Iterator[FileEntry]:
+    """
     Iteratively traverse only files in given directory.
     Every iteration on generator yields a tuple of path string and file stat

     :param path: Given path
-    :param missing_ok: If False and there's no file in the directory,
+    :param missing_ok: If False and there's no file in the directory,
+        raise FileNotFoundError
     :raises: UnsupportedError
     :returns: A file path generator
-    '''
+    """
     return HdfsPath(path).scan_stat(missing_ok, followlinks)


-def hdfs_scandir(path: PathLike,
-
-    '''
+def hdfs_scandir(path: PathLike, followlinks: bool = False) -> Iterator[FileEntry]:
+    """
     Get all contents of given path, the order of result is not guaranteed.

     :param path: Given path
     :returns: All contents have prefix of path
     :raises: FileNotFoundError, NotADirectoryError
-    '''
+    """
     return HdfsPath(path).scandir(followlinks)


 def hdfs_unlink(path: PathLike, missing_ok: bool = False) -> None:
-    '''
+    """
     Remove the file on hdfs

     :param path: Given path
     :param missing_ok: if False and target file not exists, raise FileNotFoundError
     :raises: FileNotFoundError, IsADirectoryError
-    '''
+    """
     return HdfsPath(path).unlink(missing_ok)


 def hdfs_walk(
-
-    followlinks: bool = False
+    path: PathLike, followlinks: bool = False
 ) -> Iterator[Tuple[str, List[str], List[str]]]:
-    '''
-    Iteratively traverse the given hdfs directory, in top-bottom order.
+    """
+    Iteratively traverse the given hdfs directory, in top-bottom order.
+    In other words, firstly traverse parent directory, if subdirectories exist,
+    traverse the subdirectories.
+
     Every iteration on generator yields a 3-tuple: (root, dirs, files)

     - root: Current hdfs path;
@@ -227,49 +249,59 @@ def hdfs_walk(
     - files: Name list of files in current directory.

     If path is a file path, return an empty generator
+
     If path is a non-existent path, return an empty generator
-
-    If path is
-
+
+    If path is a bucket path, bucket will be the top directory,
+    and will be returned at first iteration of generator
+
+    If path is an empty bucket, only yield one 3-tuple
+    (notes: hdfs doesn't have empty directory)
+
+    If path doesn't contain any bucket, which is path == 'hdfs://',
+    raise UnsupportedError. walk() on complete hdfs is not supported in megfile

     :param path: Given path
-    :param followlinks: whether followlinks is True or False, result is the same.
+    :param followlinks: whether followlinks is True or False, result is the same.
+        Because hdfs not support symlink.
     :returns: A 3-tuple generator
-    '''
+    """
     return HdfsPath(path).walk(followlinks)


 def hdfs_getmd5(
-
-
-
-    '''
+    path: PathLike, recalculate: bool = False, followlinks: bool = False
+) -> str:
+    """
     Get checksum of the file or dir.

     :param path: Given path
     :param recalculate: Ignore this parameter, just for compatibility
     :param followlinks: Ignore this parameter, just for compatibility
     :returns: checksum
-    '''
+    """
     return HdfsPath(path).md5(recalculate, followlinks)


 def hdfs_save_as(file_object: BinaryIO, path: PathLike):
-
+    """Write the opened binary stream to specified path,
+    but the stream won't be closed

     :param path: Given path
     :param file_object: Stream to be read
-    '''
+    """
     return HdfsPath(path).save(file_object)


 def hdfs_open(
-
-
-
-
-
-
-
+    path: PathLike,
+    mode: str = "r",
+    *,
+    buffering: Optional[int] = None,
+    encoding: Optional[str] = None,
+    errors: Optional[str] = None,
+    **kwargs,
+) -> IO:
     return HdfsPath(path).open(
-        mode, buffering=buffering, encoding=encoding, errors=errors
+        mode, buffering=buffering, encoding=encoding, errors=errors
+    )
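
For orientation, below is a minimal usage sketch of the module-level API whose signatures appear in the diff above, including the keyword-only parameters of hdfs_open(). The hdfs://bucket/... paths are hypothetical, and the calls assume an HDFS connection already configured for megfile; this is an illustration of the documented signatures, not part of the package diff.

from megfile.hdfs import hdfs_exists, hdfs_listdir, hdfs_open, hdfs_walk

# Hypothetical HDFS URL; a configured HDFS connection is required.
url = "hdfs://bucket/dir/example.txt"

# Text-mode write and read via the keyword-only parameters of hdfs_open().
with hdfs_open(url, "w", encoding="utf-8") as f:
    f.write("hello hdfs\n")

if hdfs_exists(url):
    with hdfs_open(url, "r", encoding="utf-8") as f:
        print(f.read())

# List directory contents, then traverse top-down; each iteration of
# hdfs_walk() yields a (root, dirs, files) 3-tuple as documented above.
print(hdfs_listdir("hdfs://bucket/dir"))
for root, dirs, files in hdfs_walk("hdfs://bucket/dir"):
    print(root, dirs, files)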