megfile 3.1.6__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +34 -44
- megfile/fs.py +169 -11
- megfile/fs_path.py +183 -259
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +67 -64
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +15 -20
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +150 -224
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +11 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/METADATA +7 -7
- megfile-4.0.0.dist-info/RECORD +52 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/entry_points.txt +0 -0
megfile/sftp_path.py
CHANGED
|
@@ -6,22 +6,21 @@ import os
|
|
|
6
6
|
import random
|
|
7
7
|
import shlex
|
|
8
8
|
import socket
|
|
9
|
-
import subprocess
|
|
9
|
+
import subprocess # nosec B404
|
|
10
10
|
from functools import cached_property
|
|
11
11
|
from logging import getLogger as get_logger
|
|
12
12
|
from stat import S_ISDIR, S_ISLNK, S_ISREG
|
|
13
|
-
from typing import IO, BinaryIO, Callable, Iterator, List, Optional, Tuple, Union
|
|
13
|
+
from typing import IO, BinaryIO, Callable, Iterator, List, Optional, Tuple, Type, Union
|
|
14
14
|
from urllib.parse import urlsplit, urlunsplit
|
|
15
15
|
|
|
16
16
|
import paramiko
|
|
17
17
|
|
|
18
|
-
from megfile.config import SFTP_MAX_RETRY_TIMES
|
|
18
|
+
from megfile.config import SFTP_HOST_KEY_POLICY, SFTP_MAX_RETRY_TIMES
|
|
19
19
|
from megfile.errors import SameFileError, _create_missing_ok_generator, patch_method
|
|
20
20
|
from megfile.interfaces import ContextIterator, FileEntry, PathLike, StatResult
|
|
21
21
|
from megfile.lib.compare import is_same_file
|
|
22
22
|
from megfile.lib.compat import fspath
|
|
23
23
|
from megfile.lib.glob import FSFunc, iglob
|
|
24
|
-
from megfile.lib.joinpath import uri_join
|
|
25
24
|
from megfile.pathlike import URIPath
|
|
26
25
|
from megfile.smart_path import SmartPath
|
|
27
26
|
from megfile.utils import calculate_md5, thread_local
|
|
@@ -31,23 +30,13 @@ _logger = get_logger(__name__)
|
|
|
31
30
|
__all__ = [
|
|
32
31
|
"SftpPath",
|
|
33
32
|
"is_sftp",
|
|
34
|
-
"sftp_readlink",
|
|
35
|
-
"sftp_glob",
|
|
36
|
-
"sftp_iglob",
|
|
37
|
-
"sftp_glob_stat",
|
|
38
|
-
"sftp_resolve",
|
|
39
|
-
"sftp_download",
|
|
40
|
-
"sftp_upload",
|
|
41
|
-
"sftp_path_join",
|
|
42
|
-
"sftp_concat",
|
|
43
|
-
"sftp_lstat",
|
|
44
33
|
]
|
|
45
34
|
|
|
46
35
|
SFTP_USERNAME = "SFTP_USERNAME"
|
|
47
|
-
SFTP_PASSWORD = "SFTP_PASSWORD"
|
|
36
|
+
SFTP_PASSWORD = "SFTP_PASSWORD" # nosec B105
|
|
48
37
|
SFTP_PRIVATE_KEY_PATH = "SFTP_PRIVATE_KEY_PATH"
|
|
49
38
|
SFTP_PRIVATE_KEY_TYPE = "SFTP_PRIVATE_KEY_TYPE"
|
|
50
|
-
SFTP_PRIVATE_KEY_PASSWORD = "SFTP_PRIVATE_KEY_PASSWORD"
|
|
39
|
+
SFTP_PRIVATE_KEY_PASSWORD = "SFTP_PRIVATE_KEY_PASSWORD" # nosec B105
|
|
51
40
|
SFTP_MAX_UNAUTH_CONN = "SFTP_MAX_UNAUTH_CONN"
|
|
52
41
|
MAX_RETRIES = SFTP_MAX_RETRY_TIMES
|
|
53
42
|
DEFAULT_SSH_CONNECT_TIMEOUT = 5
|
|
@@ -120,10 +109,11 @@ def _patch_sftp_client_request(
|
|
|
120
109
|
port: Optional[int] = None,
|
|
121
110
|
username: Optional[str] = None,
|
|
122
111
|
password: Optional[str] = None,
|
|
112
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
123
113
|
):
|
|
124
114
|
def retry_callback(error, *args, **kwargs):
|
|
125
115
|
client.close()
|
|
126
|
-
ssh_client = get_ssh_client(hostname, port, username, password)
|
|
116
|
+
ssh_client = get_ssh_client(hostname, port, username, password, default_policy)
|
|
127
117
|
ssh_client.close()
|
|
128
118
|
atexit.unregister(ssh_client.close)
|
|
129
119
|
ssh_key = f"ssh_client:{hostname},{port},{username},{password}"
|
|
@@ -134,7 +124,11 @@ def _patch_sftp_client_request(
|
|
|
134
124
|
del thread_local[sftp_key]
|
|
135
125
|
|
|
136
126
|
new_sftp_client = get_sftp_client(
|
|
137
|
-
hostname=hostname,
|
|
127
|
+
hostname=hostname,
|
|
128
|
+
port=port,
|
|
129
|
+
username=username,
|
|
130
|
+
password=password,
|
|
131
|
+
default_policy=default_policy,
|
|
138
132
|
)
|
|
139
133
|
client.sock = new_sftp_client.sock
|
|
140
134
|
|
|
@@ -152,17 +146,24 @@ def _get_sftp_client(
|
|
|
152
146
|
port: Optional[int] = None,
|
|
153
147
|
username: Optional[str] = None,
|
|
154
148
|
password: Optional[str] = None,
|
|
149
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
155
150
|
) -> paramiko.SFTPClient:
|
|
156
151
|
"""Get sftp client
|
|
157
152
|
|
|
158
153
|
:returns: sftp client
|
|
159
154
|
"""
|
|
160
155
|
session = get_ssh_session(
|
|
161
|
-
hostname=hostname,
|
|
156
|
+
hostname=hostname,
|
|
157
|
+
port=port,
|
|
158
|
+
username=username,
|
|
159
|
+
password=password,
|
|
160
|
+
default_policy=default_policy,
|
|
162
161
|
)
|
|
163
162
|
session.invoke_subsystem("sftp")
|
|
164
163
|
sftp_client = paramiko.SFTPClient(session)
|
|
165
|
-
_patch_sftp_client_request(
|
|
164
|
+
_patch_sftp_client_request(
|
|
165
|
+
sftp_client, hostname, port, username, password, default_policy
|
|
166
|
+
)
|
|
166
167
|
return sftp_client
|
|
167
168
|
|
|
168
169
|
|
|
@@ -171,6 +172,7 @@ def get_sftp_client(
|
|
|
171
172
|
port: Optional[int] = None,
|
|
172
173
|
username: Optional[str] = None,
|
|
173
174
|
password: Optional[str] = None,
|
|
175
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
174
176
|
) -> paramiko.SFTPClient:
|
|
175
177
|
"""Get sftp client
|
|
176
178
|
|
|
@@ -183,6 +185,7 @@ def get_sftp_client(
|
|
|
183
185
|
port,
|
|
184
186
|
username,
|
|
185
187
|
password,
|
|
188
|
+
default_policy,
|
|
186
189
|
)
|
|
187
190
|
|
|
188
191
|
|
|
@@ -191,19 +194,27 @@ def _get_ssh_client(
|
|
|
191
194
|
port: Optional[int] = None,
|
|
192
195
|
username: Optional[str] = None,
|
|
193
196
|
password: Optional[str] = None,
|
|
197
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
194
198
|
) -> paramiko.SSHClient:
|
|
195
199
|
hostname, port, username, password, private_key = provide_connect_info(
|
|
196
200
|
hostname=hostname, port=port, username=username, password=password
|
|
197
201
|
)
|
|
198
202
|
|
|
203
|
+
policies = {
|
|
204
|
+
"auto": paramiko.AutoAddPolicy,
|
|
205
|
+
"reject": paramiko.RejectPolicy,
|
|
206
|
+
"warning": paramiko.WarningPolicy,
|
|
207
|
+
}
|
|
208
|
+
policy = policies.get(SFTP_HOST_KEY_POLICY, default_policy)() # pyre-ignore[29]
|
|
209
|
+
|
|
199
210
|
ssh_client = paramiko.SSHClient()
|
|
200
|
-
ssh_client.set_missing_host_key_policy(
|
|
211
|
+
ssh_client.set_missing_host_key_policy(policy)
|
|
201
212
|
max_unauth_connections = int(os.getenv(SFTP_MAX_UNAUTH_CONN, 10))
|
|
202
213
|
try:
|
|
203
214
|
fd = os.open(
|
|
204
215
|
os.path.join(
|
|
205
|
-
"/tmp",
|
|
206
|
-
f"megfile-sftp-{hostname}-{random.randint(1, max_unauth_connections)}",
|
|
216
|
+
"/tmp", # nosec B108
|
|
217
|
+
f"megfile-sftp-{hostname}-{random.randint(1, max_unauth_connections)}", # nosec B311
|
|
207
218
|
),
|
|
208
219
|
os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
|
|
209
220
|
)
|
|
@@ -237,6 +248,7 @@ def get_ssh_client(
|
|
|
237
248
|
port: Optional[int] = None,
|
|
238
249
|
username: Optional[str] = None,
|
|
239
250
|
password: Optional[str] = None,
|
|
251
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
240
252
|
) -> paramiko.SSHClient:
|
|
241
253
|
return thread_local(
|
|
242
254
|
f"ssh_client:{hostname},{port},{username},{password}",
|
|
@@ -245,6 +257,7 @@ def get_ssh_client(
|
|
|
245
257
|
port,
|
|
246
258
|
username,
|
|
247
259
|
password,
|
|
260
|
+
default_policy,
|
|
248
261
|
)
|
|
249
262
|
|
|
250
263
|
|
|
@@ -253,9 +266,10 @@ def get_ssh_session(
|
|
|
253
266
|
port: Optional[int] = None,
|
|
254
267
|
username: Optional[str] = None,
|
|
255
268
|
password: Optional[str] = None,
|
|
269
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
256
270
|
) -> paramiko.Channel:
|
|
257
271
|
def retry_callback(error, *args, **kwargs):
|
|
258
|
-
ssh_client = get_ssh_client(hostname, port, username, password)
|
|
272
|
+
ssh_client = get_ssh_client(hostname, port, username, password, default_policy)
|
|
259
273
|
ssh_client.close()
|
|
260
274
|
atexit.unregister(ssh_client.close)
|
|
261
275
|
ssh_key = f"ssh_client:{hostname},{port},{username},{password}"
|
|
@@ -270,7 +284,7 @@ def get_ssh_session(
|
|
|
270
284
|
max_retries=MAX_RETRIES,
|
|
271
285
|
should_retry=sftp_should_retry,
|
|
272
286
|
retry_callback=retry_callback,
|
|
273
|
-
)(hostname, port, username, password)
|
|
287
|
+
)(hostname, port, username, password, default_policy)
|
|
274
288
|
|
|
275
289
|
|
|
276
290
|
def _open_session(
|
|
@@ -278,8 +292,9 @@ def _open_session(
|
|
|
278
292
|
port: Optional[int] = None,
|
|
279
293
|
username: Optional[str] = None,
|
|
280
294
|
password: Optional[str] = None,
|
|
295
|
+
default_policy: Type[paramiko.MissingHostKeyPolicy] = paramiko.RejectPolicy,
|
|
281
296
|
) -> paramiko.Channel:
|
|
282
|
-
ssh_client = get_ssh_client(hostname, port, username, password)
|
|
297
|
+
ssh_client = get_ssh_client(hostname, port, username, password, default_policy)
|
|
283
298
|
transport = ssh_client.get_transport()
|
|
284
299
|
if not transport:
|
|
285
300
|
raise paramiko.SSHException("Get transport error")
|
|
@@ -302,136 +317,11 @@ def is_sftp(path: PathLike) -> bool:
|
|
|
302
317
|
return parts.scheme == "sftp"
|
|
303
318
|
|
|
304
319
|
|
|
305
|
-
def sftp_readlink(path: PathLike) -> "str":
|
|
306
|
-
"""
|
|
307
|
-
Return a SftpPath instance representing the path to which the symbolic link points.
|
|
308
|
-
|
|
309
|
-
:param path: Given path
|
|
310
|
-
:returns: Return a SftpPath instance representing the path to
|
|
311
|
-
which the symbolic link points.
|
|
312
|
-
"""
|
|
313
|
-
return SftpPath(path).readlink().path_with_protocol
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
def sftp_glob(
|
|
317
|
-
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
318
|
-
) -> List[str]:
|
|
319
|
-
"""Return path list in ascending alphabetical order,
|
|
320
|
-
in which path matches glob pattern
|
|
321
|
-
|
|
322
|
-
1. If doesn't match any path, return empty list
|
|
323
|
-
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
324
|
-
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
325
|
-
fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
326
|
-
2. No guarantee that each path in result is different, which means:
|
|
327
|
-
Assume there exists a path `/a/b/c/b/d.txt`
|
|
328
|
-
use path pattern like `/**/b/**/*.txt` to glob,
|
|
329
|
-
the path above will be returned twice
|
|
330
|
-
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
331
|
-
when recursive is `True`
|
|
332
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True)
|
|
333
|
-
in ascending alphabetical order.
|
|
334
|
-
5. Hidden files (filename stars with '.') will not be found in the result
|
|
335
|
-
|
|
336
|
-
:param path: Given path
|
|
337
|
-
:param pattern: Glob the given relative pattern in the directory represented
|
|
338
|
-
by this path
|
|
339
|
-
:param recursive: If False, `**` will not search directory recursively
|
|
340
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
341
|
-
raise FileNotFoundError
|
|
342
|
-
:returns: A list contains paths match `pathname`
|
|
343
|
-
"""
|
|
344
|
-
return list(sftp_iglob(path=path, recursive=recursive, missing_ok=missing_ok))
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
def sftp_glob_stat(
|
|
348
|
-
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
349
|
-
) -> Iterator[FileEntry]:
|
|
350
|
-
"""Return a list contains tuples of path and file stat, in ascending alphabetical
|
|
351
|
-
order, in which path matches glob pattern
|
|
352
|
-
|
|
353
|
-
1. If doesn't match any path, return empty list
|
|
354
|
-
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
355
|
-
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
356
|
-
sftp_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
357
|
-
2. No guarantee that each path in result is different, which means:
|
|
358
|
-
Assume there exists a path `/a/b/c/b/d.txt`
|
|
359
|
-
use path pattern like `/**/b/**/*.txt` to glob,
|
|
360
|
-
the path above will be returned twice
|
|
361
|
-
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
362
|
-
when recursive is `True`
|
|
363
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
364
|
-
ascending alphabetical order.
|
|
365
|
-
5. Hidden files (filename stars with '.') will not be found in the result
|
|
366
|
-
|
|
367
|
-
:param path: Given path
|
|
368
|
-
:param pattern: Glob the given relative pattern in the directory represented
|
|
369
|
-
by this path
|
|
370
|
-
:param recursive: If False, `**` will not search directory recursively
|
|
371
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
372
|
-
raise FileNotFoundError
|
|
373
|
-
:returns: A list contains tuples of path and file stat,
|
|
374
|
-
in which paths match `pathname`
|
|
375
|
-
"""
|
|
376
|
-
for path in sftp_iglob(path=path, recursive=recursive, missing_ok=missing_ok):
|
|
377
|
-
path_object = SftpPath(path)
|
|
378
|
-
yield FileEntry(
|
|
379
|
-
path_object.name, path_object.path_with_protocol, path_object.lstat()
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
def sftp_iglob(
|
|
384
|
-
path: PathLike, recursive: bool = True, missing_ok: bool = True
|
|
385
|
-
) -> Iterator[str]:
|
|
386
|
-
"""Return path iterator in ascending alphabetical order,
|
|
387
|
-
in which path matches glob pattern
|
|
388
|
-
|
|
389
|
-
1. If doesn't match any path, return empty list
|
|
390
|
-
Notice: ``glob.glob`` in standard library returns ['a/'] instead of empty list
|
|
391
|
-
when pathname is like `a/**`, recursive is True and directory 'a' doesn't exist.
|
|
392
|
-
fs_glob behaves like ``glob.glob`` in standard library under such circumstance.
|
|
393
|
-
2. No guarantee that each path in result is different, which means:
|
|
394
|
-
Assume there exists a path `/a/b/c/b/d.txt`
|
|
395
|
-
use path pattern like `/**/b/**/*.txt` to glob,
|
|
396
|
-
the path above will be returned twice
|
|
397
|
-
3. `**` will match any matched file, directory, symlink and '' by default,
|
|
398
|
-
when recursive is `True`
|
|
399
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
400
|
-
ascending alphabetical order.
|
|
401
|
-
5. Hidden files (filename stars with '.') will not be found in the result
|
|
402
|
-
|
|
403
|
-
:param path: Given path
|
|
404
|
-
:param pattern: Glob the given relative pattern in the directory represented
|
|
405
|
-
by this path
|
|
406
|
-
:param recursive: If False, `**` will not search directory recursively
|
|
407
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
408
|
-
raise FileNotFoundError
|
|
409
|
-
:returns: An iterator contains paths match `pathname`
|
|
410
|
-
"""
|
|
411
|
-
|
|
412
|
-
for path in SftpPath(path).iglob(
|
|
413
|
-
pattern="", recursive=recursive, missing_ok=missing_ok
|
|
414
|
-
):
|
|
415
|
-
yield path.path_with_protocol
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
def sftp_resolve(path: PathLike, strict=False) -> "str":
|
|
419
|
-
"""Equal to fs_realpath
|
|
420
|
-
|
|
421
|
-
:param path: Given path
|
|
422
|
-
:param strict: Ignore this parameter, just for compatibility
|
|
423
|
-
:return: Return the canonical path of the specified filename,
|
|
424
|
-
eliminating any symbolic links encountered in the path.
|
|
425
|
-
:rtype: SftpPath
|
|
426
|
-
"""
|
|
427
|
-
return SftpPath(path).resolve(strict).path_with_protocol
|
|
428
|
-
|
|
429
|
-
|
|
430
320
|
def _sftp_scan_pairs(
|
|
431
321
|
src_url: PathLike, dst_url: PathLike
|
|
432
322
|
) -> Iterator[Tuple[PathLike, PathLike]]:
|
|
433
323
|
for src_file_path in SftpPath(src_url).scan():
|
|
434
|
-
content_path = src_file_path[len(src_url) :]
|
|
324
|
+
content_path = src_file_path[len(fspath(src_url)) :]
|
|
435
325
|
if len(content_path) > 0:
|
|
436
326
|
dst_file_path = SftpPath(dst_url).joinpath(content_path).path_with_protocol
|
|
437
327
|
else:
|
|
@@ -439,175 +329,6 @@ def _sftp_scan_pairs(
|
|
|
439
329
|
yield src_file_path, dst_file_path
|
|
440
330
|
|
|
441
331
|
|
|
442
|
-
def sftp_download(
|
|
443
|
-
src_url: PathLike,
|
|
444
|
-
dst_url: PathLike,
|
|
445
|
-
callback: Optional[Callable[[int], None]] = None,
|
|
446
|
-
followlinks: bool = False,
|
|
447
|
-
overwrite: bool = True,
|
|
448
|
-
):
|
|
449
|
-
"""
|
|
450
|
-
Downloads a file from sftp to local filesystem.
|
|
451
|
-
|
|
452
|
-
:param src_url: source sftp path
|
|
453
|
-
:param dst_url: target fs path
|
|
454
|
-
:param callback: Called periodically during copy, and the input parameter is
|
|
455
|
-
the data size (in bytes) of copy since the last call
|
|
456
|
-
:param followlinks: False if regard symlink as file, else True
|
|
457
|
-
:param overwrite: whether or not overwrite file when exists, default is True
|
|
458
|
-
"""
|
|
459
|
-
from megfile.fs import is_fs
|
|
460
|
-
from megfile.fs_path import FSPath
|
|
461
|
-
|
|
462
|
-
if not is_fs(dst_url):
|
|
463
|
-
raise OSError(f"dst_url is not fs path: {dst_url}")
|
|
464
|
-
if not is_sftp(src_url) and not isinstance(src_url, SftpPath):
|
|
465
|
-
raise OSError(f"src_url is not sftp path: {src_url}")
|
|
466
|
-
|
|
467
|
-
dst_path = FSPath(dst_url)
|
|
468
|
-
if not overwrite and dst_path.exists():
|
|
469
|
-
return
|
|
470
|
-
|
|
471
|
-
if isinstance(src_url, SftpPath):
|
|
472
|
-
src_path = src_url
|
|
473
|
-
else:
|
|
474
|
-
src_path = SftpPath(src_url)
|
|
475
|
-
|
|
476
|
-
if followlinks and src_path.is_symlink():
|
|
477
|
-
src_path = src_path.readlink()
|
|
478
|
-
if src_path.is_dir():
|
|
479
|
-
raise IsADirectoryError("Is a directory: %r" % src_url)
|
|
480
|
-
if str(dst_url).endswith("/"):
|
|
481
|
-
raise IsADirectoryError("Is a directory: %r" % dst_url)
|
|
482
|
-
|
|
483
|
-
dst_path.parent.makedirs(exist_ok=True)
|
|
484
|
-
|
|
485
|
-
sftp_callback = None
|
|
486
|
-
if callback:
|
|
487
|
-
bytes_transferred_before = 0
|
|
488
|
-
|
|
489
|
-
def sftp_callback(bytes_transferred: int, _total_bytes: int):
|
|
490
|
-
nonlocal bytes_transferred_before
|
|
491
|
-
callback(bytes_transferred - bytes_transferred_before) # pyre-ignore[29]
|
|
492
|
-
bytes_transferred_before = bytes_transferred
|
|
493
|
-
|
|
494
|
-
src_path._client.get(
|
|
495
|
-
src_path._real_path, dst_path.path_without_protocol, callback=sftp_callback
|
|
496
|
-
)
|
|
497
|
-
|
|
498
|
-
src_stat = src_path.stat()
|
|
499
|
-
dst_path.utime(src_stat.st_atime, src_stat.st_mtime)
|
|
500
|
-
dst_path.chmod(src_stat.st_mode)
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
def sftp_upload(
|
|
504
|
-
src_url: PathLike,
|
|
505
|
-
dst_url: PathLike,
|
|
506
|
-
callback: Optional[Callable[[int], None]] = None,
|
|
507
|
-
followlinks: bool = False,
|
|
508
|
-
overwrite: bool = True,
|
|
509
|
-
):
|
|
510
|
-
"""
|
|
511
|
-
Uploads a file from local filesystem to sftp server.
|
|
512
|
-
|
|
513
|
-
:param src_url: source fs path
|
|
514
|
-
:param dst_url: target sftp path
|
|
515
|
-
:param callback: Called periodically during copy, and the input parameter is
|
|
516
|
-
the data size (in bytes) of copy since the last call
|
|
517
|
-
:param overwrite: whether or not overwrite file when exists, default is True
|
|
518
|
-
"""
|
|
519
|
-
from megfile.fs import is_fs
|
|
520
|
-
from megfile.fs_path import FSPath
|
|
521
|
-
|
|
522
|
-
if not is_fs(src_url):
|
|
523
|
-
raise OSError(f"src_url is not fs path: {src_url}")
|
|
524
|
-
if not is_sftp(dst_url) and not isinstance(dst_url, SftpPath):
|
|
525
|
-
raise OSError(f"dst_url is not sftp path: {dst_url}")
|
|
526
|
-
|
|
527
|
-
if followlinks and os.path.islink(src_url):
|
|
528
|
-
src_url = os.readlink(src_url)
|
|
529
|
-
if os.path.isdir(src_url):
|
|
530
|
-
raise IsADirectoryError("Is a directory: %r" % src_url)
|
|
531
|
-
if str(dst_url).endswith("/"):
|
|
532
|
-
raise IsADirectoryError("Is a directory: %r" % dst_url)
|
|
533
|
-
|
|
534
|
-
src_path = FSPath(src_url)
|
|
535
|
-
if isinstance(dst_url, SftpPath):
|
|
536
|
-
dst_path = dst_url
|
|
537
|
-
else:
|
|
538
|
-
dst_path = SftpPath(dst_url)
|
|
539
|
-
if not overwrite and dst_path.exists():
|
|
540
|
-
return
|
|
541
|
-
|
|
542
|
-
dst_path.parent.makedirs(exist_ok=True)
|
|
543
|
-
|
|
544
|
-
sftp_callback = None
|
|
545
|
-
if callback:
|
|
546
|
-
bytes_transferred_before = 0
|
|
547
|
-
|
|
548
|
-
def sftp_callback(bytes_transferred: int, _total_bytes: int):
|
|
549
|
-
nonlocal bytes_transferred_before
|
|
550
|
-
callback(bytes_transferred - bytes_transferred_before) # pyre-ignore[29]
|
|
551
|
-
bytes_transferred_before = bytes_transferred
|
|
552
|
-
|
|
553
|
-
dst_path._client.put(
|
|
554
|
-
src_path.path_without_protocol, dst_path._real_path, callback=sftp_callback
|
|
555
|
-
)
|
|
556
|
-
|
|
557
|
-
src_stat = src_path.stat()
|
|
558
|
-
dst_path.utime(src_stat.st_atime, src_stat.st_mtime)
|
|
559
|
-
dst_path.chmod(src_stat.st_mode)
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
def sftp_path_join(path: PathLike, *other_paths: PathLike) -> str:
|
|
563
|
-
"""
|
|
564
|
-
Concat 2 or more path to a complete path
|
|
565
|
-
|
|
566
|
-
:param path: Given path
|
|
567
|
-
:param other_paths: Paths to be concatenated
|
|
568
|
-
:returns: Concatenated complete path
|
|
569
|
-
|
|
570
|
-
.. note ::
|
|
571
|
-
|
|
572
|
-
The difference between this function and ``os.path.join`` is that this function
|
|
573
|
-
ignores left side slash (which indicates absolute path) in ``other_paths``
|
|
574
|
-
and will directly concat.
|
|
575
|
-
|
|
576
|
-
e.g. os.path.join('/path', 'to', '/file') => '/file',
|
|
577
|
-
but sftp_path_join('/path', 'to', '/file') => '/path/to/file'
|
|
578
|
-
"""
|
|
579
|
-
return uri_join(fspath(path), *map(fspath, other_paths))
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
def sftp_concat(src_paths: List[PathLike], dst_path: PathLike) -> None:
|
|
583
|
-
"""Concatenate sftp files to one file.
|
|
584
|
-
|
|
585
|
-
:param src_paths: Given source paths
|
|
586
|
-
:param dst_path: Given destination path
|
|
587
|
-
"""
|
|
588
|
-
dst_path_obj = SftpPath(dst_path)
|
|
589
|
-
|
|
590
|
-
def get_real_path(path: PathLike) -> str:
|
|
591
|
-
return SftpPath(path)._real_path
|
|
592
|
-
|
|
593
|
-
command = ["cat", *map(get_real_path, src_paths), ">", get_real_path(dst_path)]
|
|
594
|
-
exec_result = dst_path_obj._exec_command(command)
|
|
595
|
-
if exec_result.returncode != 0:
|
|
596
|
-
_logger.error(exec_result.stderr)
|
|
597
|
-
raise OSError(f"Failed to concat {src_paths} to {dst_path}")
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
def sftp_lstat(path: PathLike) -> StatResult:
|
|
601
|
-
"""
|
|
602
|
-
Get StatResult of file on sftp, including file size and mtime,
|
|
603
|
-
referring to fs_getsize and fs_getmtime
|
|
604
|
-
|
|
605
|
-
:param path: Given path
|
|
606
|
-
:returns: StatResult
|
|
607
|
-
"""
|
|
608
|
-
return SftpPath(path).lstat()
|
|
609
|
-
|
|
610
|
-
|
|
611
332
|
@SmartPath.register
|
|
612
333
|
class SftpPath(URIPath):
|
|
613
334
|
"""sftp protocol
|
|
@@ -620,6 +341,7 @@ class SftpPath(URIPath):
|
|
|
620
341
|
"""
|
|
621
342
|
|
|
622
343
|
protocol = "sftp"
|
|
344
|
+
default_policy = paramiko.RejectPolicy
|
|
623
345
|
|
|
624
346
|
def __init__(self, path: "PathLike", *other_paths: "PathLike"):
|
|
625
347
|
super().__init__(path, *other_paths)
|
|
@@ -652,6 +374,7 @@ class SftpPath(URIPath):
|
|
|
652
374
|
port=self._urlsplit_parts.port,
|
|
653
375
|
username=self._urlsplit_parts.username,
|
|
654
376
|
password=self._urlsplit_parts.password,
|
|
377
|
+
default_policy=self.default_policy,
|
|
655
378
|
)
|
|
656
379
|
|
|
657
380
|
def _generate_path_object(self, sftp_local_path: str, resolve: bool = False):
|
|
@@ -1334,11 +1057,12 @@ class SftpPath(URIPath):
|
|
|
1334
1057
|
port=self._urlsplit_parts.port,
|
|
1335
1058
|
username=self._urlsplit_parts.username,
|
|
1336
1059
|
password=self._urlsplit_parts.password,
|
|
1060
|
+
default_policy=self.default_policy,
|
|
1337
1061
|
) as chan:
|
|
1338
1062
|
chan.settimeout(timeout)
|
|
1339
1063
|
if environment:
|
|
1340
1064
|
chan.update_environment(environment)
|
|
1341
|
-
chan.exec_command(" ".join([shlex.quote(arg) for arg in command]))
|
|
1065
|
+
chan.exec_command(" ".join([shlex.quote(arg) for arg in command])) # nosec B601
|
|
1342
1066
|
stdout = (
|
|
1343
1067
|
chan.makefile("r", bufsize).read().decode(errors="backslashreplace")
|
|
1344
1068
|
)
|
megfile/smart.py
CHANGED
|
@@ -397,7 +397,9 @@ def smart_copy(
|
|
|
397
397
|
def _smart_sync_single_file(items: dict):
|
|
398
398
|
src_root_path = items["src_root_path"]
|
|
399
399
|
dst_root_path = items["dst_root_path"]
|
|
400
|
-
|
|
400
|
+
src_file_entry = items["src_file_entry"]
|
|
401
|
+
src_file_path = src_file_entry.path
|
|
402
|
+
src_file_stat = src_file_entry.stat
|
|
401
403
|
callback = items["callback"]
|
|
402
404
|
followlinks = items["followlinks"]
|
|
403
405
|
callback_after_copy_file = items["callback_after_copy_file"]
|
|
@@ -417,17 +419,17 @@ def _smart_sync_single_file(items: dict):
|
|
|
417
419
|
dst_protocol, _ = SmartPath._extract_protocol(dst_abs_file_path)
|
|
418
420
|
should_sync = True
|
|
419
421
|
try:
|
|
420
|
-
if force:
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
except NotImplementedError:
|
|
422
|
+
if not force:
|
|
423
|
+
dst_file_stat = smart_stat(dst_abs_file_path, follow_symlinks=followlinks)
|
|
424
|
+
if not overwrite:
|
|
425
|
+
should_sync = False
|
|
426
|
+
elif is_same_file(
|
|
427
|
+
src_file_stat,
|
|
428
|
+
dst_file_stat,
|
|
429
|
+
get_sync_type(src_protocol, dst_protocol),
|
|
430
|
+
):
|
|
431
|
+
should_sync = False
|
|
432
|
+
except (NotImplementedError, FileNotFoundError):
|
|
431
433
|
pass
|
|
432
434
|
|
|
433
435
|
if should_sync:
|
|
@@ -513,15 +515,16 @@ def smart_sync(
|
|
|
513
515
|
src_path, dst_path = get_traditional_path(src_path), get_traditional_path(dst_path)
|
|
514
516
|
if not src_file_stats:
|
|
515
517
|
src_file_stats = smart_scan_stat(src_path, followlinks=followlinks)
|
|
518
|
+
if not smart_exists(dst_path):
|
|
519
|
+
force = True
|
|
516
520
|
|
|
517
521
|
def create_generator():
|
|
518
522
|
for src_file_entry in src_file_stats:
|
|
519
523
|
if src_file_entry.name:
|
|
520
|
-
src_file_path = src_file_entry.path
|
|
521
524
|
yield dict(
|
|
522
525
|
src_root_path=src_path,
|
|
523
526
|
dst_root_path=dst_path,
|
|
524
|
-
|
|
527
|
+
src_file_entry=src_file_entry,
|
|
525
528
|
callback=callback,
|
|
526
529
|
followlinks=followlinks,
|
|
527
530
|
callback_after_copy_file=callback_after_copy_file,
|
|
@@ -671,9 +674,10 @@ def smart_makedirs(path: PathLike, exist_ok: bool = False) -> None:
|
|
|
671
674
|
def smart_open(
|
|
672
675
|
path: PathLike,
|
|
673
676
|
mode: str = "r",
|
|
674
|
-
s3_open_func: Callable[[str, str], BinaryIO] = s3_open,
|
|
675
677
|
encoding: Optional[str] = None,
|
|
676
678
|
errors: Optional[str] = None,
|
|
679
|
+
*,
|
|
680
|
+
s3_open_func: Callable[[str, str], BinaryIO] = s3_open,
|
|
677
681
|
**options,
|
|
678
682
|
) -> IO:
|
|
679
683
|
r"""
|
|
@@ -685,16 +689,6 @@ def smart_open(
|
|
|
685
689
|
this function create directories automatically, instead of
|
|
686
690
|
raising FileNotFoundError
|
|
687
691
|
|
|
688
|
-
Currently, supported protocols are:
|
|
689
|
-
|
|
690
|
-
1. s3: "s3://<bucket>/<key>"
|
|
691
|
-
|
|
692
|
-
2. http(s): http(s) url
|
|
693
|
-
|
|
694
|
-
3. stdio: "stdio://-"
|
|
695
|
-
|
|
696
|
-
4. FS file: Besides above mentioned protocols, other path are considered fs path
|
|
697
|
-
|
|
698
692
|
Here are a few examples: ::
|
|
699
693
|
|
|
700
694
|
>>> import cv2
|
|
@@ -708,12 +702,24 @@ def smart_open(
|
|
|
708
702
|
|
|
709
703
|
:param path: Given path
|
|
710
704
|
:param mode: Mode to open file, supports r'[rwa][tb]?\+?'
|
|
711
|
-
:param s3_open_func: Function used to open s3_url. Require the function includes 2
|
|
712
|
-
necessary parameters, file path and mode
|
|
713
705
|
:param encoding: encoding is the name of the encoding used to decode or encode
|
|
714
706
|
the file. This should only be used in text mode.
|
|
715
707
|
:param errors: errors is an optional string that specifies how encoding and decoding
|
|
716
708
|
errors are to be handled—this cannot be used in binary mode.
|
|
709
|
+
:param buffering: buffering is an optional integer used to
|
|
710
|
+
set the buffering policy. Only be used when support.
|
|
711
|
+
:param followlinks: follow symbolic link, default `False`. Only be used when support
|
|
712
|
+
:param s3_open_func: Function used to open s3_url. Require the function includes
|
|
713
|
+
2 necessary parameters, file path and mode. only be used in s3 path.
|
|
714
|
+
:param max_workers: Max download / upload thread number, `None` by default,
|
|
715
|
+
will use global thread pool with 8 threads. Only be used in s3, http, hdfs.
|
|
716
|
+
:param max_buffer_size: Max cached buffer size in memory, 128MB by default.
|
|
717
|
+
Set to `0` will disable cache. Only be used in s3, http, hdfs.
|
|
718
|
+
:param block_forward: How many blocks of data cached from offset position, only for
|
|
719
|
+
read mode. Only be used in s3, http, hdfs.
|
|
720
|
+
:param block_size: Size of single block. Each block will be uploaded by single
|
|
721
|
+
thread. Only be used in s3, http, hdfs.
|
|
722
|
+
|
|
717
723
|
:returns: File-Like object
|
|
718
724
|
:raises: FileNotFoundError, IsADirectoryError, ValueError
|
|
719
725
|
"""
|