megfile 3.1.1__py3-none-any.whl → 3.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. docs/conf.py +2 -4
  2. megfile/__init__.py +394 -203
  3. megfile/cli.py +258 -238
  4. megfile/config.py +25 -21
  5. megfile/errors.py +126 -114
  6. megfile/fs.py +174 -140
  7. megfile/fs_path.py +462 -354
  8. megfile/hdfs.py +133 -101
  9. megfile/hdfs_path.py +290 -236
  10. megfile/http.py +15 -14
  11. megfile/http_path.py +111 -107
  12. megfile/interfaces.py +70 -65
  13. megfile/lib/base_prefetch_reader.py +84 -65
  14. megfile/lib/combine_reader.py +12 -12
  15. megfile/lib/compare.py +17 -13
  16. megfile/lib/compat.py +1 -5
  17. megfile/lib/fnmatch.py +29 -30
  18. megfile/lib/glob.py +46 -54
  19. megfile/lib/hdfs_prefetch_reader.py +40 -25
  20. megfile/lib/hdfs_tools.py +1 -3
  21. megfile/lib/http_prefetch_reader.py +69 -46
  22. megfile/lib/joinpath.py +5 -5
  23. megfile/lib/lazy_handler.py +7 -3
  24. megfile/lib/s3_buffered_writer.py +58 -51
  25. megfile/lib/s3_cached_handler.py +13 -14
  26. megfile/lib/s3_limited_seekable_writer.py +37 -28
  27. megfile/lib/s3_memory_handler.py +34 -30
  28. megfile/lib/s3_pipe_handler.py +24 -25
  29. megfile/lib/s3_prefetch_reader.py +71 -52
  30. megfile/lib/s3_share_cache_reader.py +37 -24
  31. megfile/lib/shadow_handler.py +7 -3
  32. megfile/lib/stdio_handler.py +9 -8
  33. megfile/lib/url.py +3 -3
  34. megfile/pathlike.py +259 -228
  35. megfile/s3.py +220 -153
  36. megfile/s3_path.py +977 -802
  37. megfile/sftp.py +190 -156
  38. megfile/sftp_path.py +540 -450
  39. megfile/smart.py +397 -330
  40. megfile/smart_path.py +100 -105
  41. megfile/stdio.py +10 -9
  42. megfile/stdio_path.py +32 -35
  43. megfile/utils/__init__.py +73 -54
  44. megfile/utils/mutex.py +11 -14
  45. megfile/version.py +1 -1
  46. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/METADATA +5 -8
  47. megfile-3.1.3.dist-info/RECORD +55 -0
  48. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/WHEEL +1 -1
  49. scripts/convert_results_to_sarif.py +45 -78
  50. scripts/generate_file.py +140 -64
  51. megfile-3.1.1.dist-info/RECORD +0 -55
  52. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/LICENSE +0 -0
  53. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/LICENSE.pyre +0 -0
  54. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/entry_points.txt +0 -0
  55. {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/top_level.txt +0 -0
megfile/hdfs.py CHANGED
@@ -1,163 +1,182 @@
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple

-from megfile.hdfs_path import HdfsPath, hdfs_glob, hdfs_glob_stat, hdfs_iglob, hdfs_makedirs, is_hdfs
+from megfile.hdfs_path import (
+    HdfsPath,
+    hdfs_glob,
+    hdfs_glob_stat,
+    hdfs_iglob,
+    hdfs_makedirs,
+    is_hdfs,
+)
 from megfile.interfaces import FileEntry, PathLike, StatResult

 __all__ = [
-    'is_hdfs',
-    'hdfs_glob',
-    'hdfs_glob_stat',
-    'hdfs_iglob',
-    'hdfs_makedirs',
-    'hdfs_exists',
-    'hdfs_stat',
-    'hdfs_getmtime',
-    'hdfs_getsize',
-    'hdfs_isdir',
-    'hdfs_isfile',
-    'hdfs_listdir',
-    'hdfs_load_from',
-    'hdfs_move',
-    'hdfs_remove',
-    'hdfs_scan',
-    'hdfs_scan_stat',
-    'hdfs_scandir',
-    'hdfs_unlink',
-    'hdfs_walk',
-    'hdfs_getmd5',
-    'hdfs_save_as',
-    'hdfs_open',
+    "is_hdfs",
+    "hdfs_glob",
+    "hdfs_glob_stat",
+    "hdfs_iglob",
+    "hdfs_makedirs",
+    "hdfs_exists",
+    "hdfs_stat",
+    "hdfs_getmtime",
+    "hdfs_getsize",
+    "hdfs_isdir",
+    "hdfs_isfile",
+    "hdfs_listdir",
+    "hdfs_load_from",
+    "hdfs_move",
+    "hdfs_remove",
+    "hdfs_scan",
+    "hdfs_scan_stat",
+    "hdfs_scandir",
+    "hdfs_unlink",
+    "hdfs_walk",
+    "hdfs_getmd5",
+    "hdfs_save_as",
+    "hdfs_open",
 ]


 def hdfs_exists(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if path exists

     If the bucket of path are not permitted to read, return False

     :param path: Given path
     :returns: True if path exists, else False
-    '''
+    """
     return HdfsPath(path).exists(followlinks)


 def hdfs_stat(path: PathLike, follow_symlinks=True) -> StatResult:
-    '''
-    Get StatResult of path file, including file size and mtime, referring to hdfs_getsize and hdfs_getmtime
+    """
+    Get StatResult of path file, including file size and mtime,
+    referring to hdfs_getsize and hdfs_getmtime

-    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
-    If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://', raise BucketNotFoundError
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError
+
+    If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://',
+    raise BucketNotFoundError

     :param path: Given path
     :returns: StatResult
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).stat(follow_symlinks)


 def hdfs_getmtime(path: PathLike, follow_symlinks: bool = False) -> float:
-    '''
-    Get last-modified time of the file on the given path path (in Unix timestamp format).
-    If the path is an existent directory, return the latest modified time of all file in it. The mtime of empty directory is 1970-01-01 00:00:00
+    """
+    Get last-modified time of the file on the given path path (in Unix timestamp
+    format).
+    If the path is an existent directory, return the latest modified time of all
+    file in it. The mtime of empty directory is 1970-01-01 00:00:00

-    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError

     :param path: Given path
     :returns: Last-modified time
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).getmtime(follow_symlinks)


 def hdfs_getsize(path: PathLike, follow_symlinks: bool = False) -> int:
-    '''
+    """
     Get file size on the given path path (in bytes).
-    If the path in a directory, return the sum of all file size in it, including file in subdirectories (if exist).
-    The result excludes the size of directory itself. In other words, return 0 Byte on an empty directory path.
+    If the path in a directory, return the sum of all file size in it,
+    including file in subdirectories (if exist).
+
+    The result excludes the size of directory itself. In other words,
+    return 0 Byte on an empty directory path.

-    If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
+    If path is not an existent path, which means hdfs_exist(path) returns False,
+    then raise FileNotFoundError

     :param path: Given path
     :returns: File size
     :raises: FileNotFoundError
-    '''
+    """
     return HdfsPath(path).getsize(follow_symlinks)


 def hdfs_isdir(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if an hdfs url is directory
     Specific procedures are as follows:
     If there exists a suffix, of which ``os.path.join(path, suffix)`` is a file
     If the url is empty bucket or hdfs://

     :param path: Given path
-    :param followlinks: whether followlinks is True or False, result is the same. Because hdfs symlink not support dir.
+    :param followlinks: whether followlinks is True or False, result is the same.
+        Because hdfs symlink not support dir.
     :returns: True if path is hdfs directory, else False
-    '''
+    """
     return HdfsPath(path).is_dir(followlinks)


 def hdfs_isfile(path: PathLike, followlinks: bool = False) -> bool:
-    '''
+    """
     Test if an path is file

     :param path: Given path
     :returns: True if path is hdfs file, else False
-    '''
+    """
     return HdfsPath(path).is_file(followlinks)


 def hdfs_listdir(path: PathLike, followlinks: bool = False) -> List[str]:
-    '''
+    """
     Get all contents of given path.

     :param path: Given path
     :returns: All contents have prefix of path.
     :raises: FileNotFoundError, NotADirectoryError
-    '''
+    """
     return HdfsPath(path).listdir(followlinks)


 def hdfs_load_from(path: PathLike, followlinks: bool = False) -> BinaryIO:
-    '''Read all content in binary on specified path and write into memory
+    """Read all content in binary on specified path and write into memory

     User should close the BinaryIO manually

     :param path: Given path
     :returns: BinaryIO
-    '''
+    """
     return HdfsPath(path).load(followlinks)


-def hdfs_move(
-        src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
-    '''
+def hdfs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> None:
+    """
     Move file/directory path from src_path to dst_path

     :param src_path: Given path
     :param dst_path: Given destination path
-    '''
+    """
     return HdfsPath(src_path).move(dst_path, overwrite)


 def hdfs_remove(path: PathLike, missing_ok: bool = False) -> None:
-    '''
-    Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not permitted to remove
+    """
+    Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not
+    permitted to remove

     :param path: Given path
-    :param missing_ok: if False and target file/directory not exists, raise FileNotFoundError
+    :param missing_ok: if False and target file/directory not exists,
+        raise FileNotFoundError
     :raises: FileNotFoundError, UnsupportedError
-    '''
+    """
     return HdfsPath(path).remove(missing_ok)


 def hdfs_scan(
-        path: PathLike,
-        missing_ok: bool = True,
-        followlinks: bool = False) -> Iterator[str]:
-    '''
+    path: PathLike, missing_ok: bool = True, followlinks: bool = False
+) -> Iterator[str]:
+    """
     Iteratively traverse only files in given hdfs directory.
     Every iteration on generator yields a path string.

@@ -165,61 +184,64 @@ def hdfs_scan(
     If path is a non-existent path, return an empty generator
     If path is a bucket path, return all file paths in the bucket
     If path is an empty bucket, return an empty generator
-    If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile
+    If path doesn't contain any bucket, which is path == 'hdfs://',
+    raise UnsupportedError. walk() on complete hdfs is not supported in megfile

     :param path: Given path
-    :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+    :param missing_ok: If False and there's no file in the directory,
+        raise FileNotFoundError
     :raises: UnsupportedError
     :returns: A file path generator
-    '''
+    """
     return HdfsPath(path).scan(missing_ok, followlinks)


 def hdfs_scan_stat(
-        path: PathLike,
-        missing_ok: bool = True,
-        followlinks: bool = False) -> Iterator[FileEntry]:
-    '''
+    path: PathLike, missing_ok: bool = True, followlinks: bool = False
+) -> Iterator[FileEntry]:
+    """
     Iteratively traverse only files in given directory.
     Every iteration on generator yields a tuple of path string and file stat

     :param path: Given path
-    :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+    :param missing_ok: If False and there's no file in the directory,
+        raise FileNotFoundError
     :raises: UnsupportedError
     :returns: A file path generator
-    '''
+    """
     return HdfsPath(path).scan_stat(missing_ok, followlinks)


-def hdfs_scandir(path: PathLike,
-                 followlinks: bool = False) -> Iterator[FileEntry]:
-    '''
+def hdfs_scandir(path: PathLike, followlinks: bool = False) -> Iterator[FileEntry]:
+    """
     Get all contents of given path, the order of result is not guaranteed.

     :param path: Given path
     :returns: All contents have prefix of path
     :raises: FileNotFoundError, NotADirectoryError
-    '''
+    """
     return HdfsPath(path).scandir(followlinks)


 def hdfs_unlink(path: PathLike, missing_ok: bool = False) -> None:
-    '''
+    """
     Remove the file on hdfs

     :param path: Given path
     :param missing_ok: if False and target file not exists, raise FileNotFoundError
     :raises: FileNotFoundError, IsADirectoryError
-    '''
+    """
     return HdfsPath(path).unlink(missing_ok)


 def hdfs_walk(
-    path: PathLike,
-    followlinks: bool = False
+    path: PathLike, followlinks: bool = False
 ) -> Iterator[Tuple[str, List[str], List[str]]]:
-    '''
-    Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories.
+    """
+    Iteratively traverse the given hdfs directory, in top-bottom order.
+    In other words, firstly traverse parent directory, if subdirectories exist,
+    traverse the subdirectories.
+
     Every iteration on generator yields a 3-tuple: (root, dirs, files)

     - root: Current hdfs path;
@@ -227,49 +249,59 @@ def hdfs_walk(
     - files: Name list of files in current directory.

     If path is a file path, return an empty generator
+
     If path is a non-existent path, return an empty generator
-    If path is a bucket path, bucket will be the top directory, and will be returned at first iteration of generator
-    If path is an empty bucket, only yield one 3-tuple (notes: hdfs doesn't have empty directory)
-    If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile
+
+    If path is a bucket path, bucket will be the top directory,
+    and will be returned at first iteration of generator
+
+    If path is an empty bucket, only yield one 3-tuple
+    (notes: hdfs doesn't have empty directory)
+
+    If path doesn't contain any bucket, which is path == 'hdfs://',
+    raise UnsupportedError. walk() on complete hdfs is not supported in megfile

     :param path: Given path
-    :param followlinks: whether followlinks is True or False, result is the same. Because hdfs not support symlink.
+    :param followlinks: whether followlinks is True or False, result is the same.
+        Because hdfs not support symlink.
     :returns: A 3-tuple generator
-    '''
+    """
     return HdfsPath(path).walk(followlinks)


 def hdfs_getmd5(
-        path: PathLike,
-        recalculate: bool = False,
-        followlinks: bool = False) -> str:
-    '''
+    path: PathLike, recalculate: bool = False, followlinks: bool = False
+) -> str:
+    """
     Get checksum of the file or dir.

     :param path: Given path
     :param recalculate: Ignore this parameter, just for compatibility
     :param followlinks: Ignore this parameter, just for compatibility
     :returns: checksum
-    '''
+    """
     return HdfsPath(path).md5(recalculate, followlinks)


 def hdfs_save_as(file_object: BinaryIO, path: PathLike):
-    '''Write the opened binary stream to specified path, but the stream won't be closed
+    """Write the opened binary stream to specified path,
+    but the stream won't be closed

     :param path: Given path
     :param file_object: Stream to be read
-    '''
+    """
     return HdfsPath(path).save(file_object)


 def hdfs_open(
-        path: PathLike,
-        mode: str = 'r',
-        *,
-        buffering: Optional[int] = None,
-        encoding: Optional[str] = None,
-        errors: Optional[str] = None,
-        **kwargs) -> IO:
+    path: PathLike,
+    mode: str = "r",
+    *,
+    buffering: Optional[int] = None,
+    encoding: Optional[str] = None,
+    errors: Optional[str] = None,
+    **kwargs,
+) -> IO:
     return HdfsPath(path).open(
-        mode, buffering=buffering, encoding=encoding, errors=errors)
+        mode, buffering=buffering, encoding=encoding, errors=errors
+    )
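The hunks above are essentially a formatting pass: single-quoted docstrings become double-quoted, long docstring lines are re-wrapped, and multi-line signatures are collapsed into Black-style layout, while the parameters and return types of the public helpers in this file are unchanged. For orientation only, here is a minimal usage sketch exercising a few of the helpers exactly as declared above; hdfs://bucket/data and example.txt are placeholder names, and a working HDFS configuration for megfile is assumed.

from megfile.hdfs import hdfs_exists, hdfs_listdir, hdfs_open, hdfs_walk

path = "hdfs://bucket/data"  # placeholder path, not taken from the diff

if hdfs_exists(path):
    # hdfs_walk yields (root, dirs, files) 3-tuples, parent directory first
    for root, dirs, files in hdfs_walk(path):
        print(root, len(dirs), len(files))

    # hdfs_open mirrors built-in open(); buffering/encoding/errors are keyword-only
    with hdfs_open(path + "/example.txt", "r", encoding="utf-8") as f:
        print(f.read())

    # hdfs_listdir returns the entries under the given directory
    print(hdfs_listdir(path))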