python-misc-utils 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_misc_utils/__init__.py +0 -0
- py_misc_utils/abs_timeout.py +12 -0
- py_misc_utils/alog.py +311 -0
- py_misc_utils/app_main.py +179 -0
- py_misc_utils/archive_streamer.py +112 -0
- py_misc_utils/assert_checks.py +118 -0
- py_misc_utils/ast_utils.py +121 -0
- py_misc_utils/async_manager.py +189 -0
- py_misc_utils/break_control.py +63 -0
- py_misc_utils/buffered_iterator.py +35 -0
- py_misc_utils/cached_file.py +507 -0
- py_misc_utils/call_limiter.py +26 -0
- py_misc_utils/call_result_selector.py +13 -0
- py_misc_utils/cleanups.py +85 -0
- py_misc_utils/cmd.py +97 -0
- py_misc_utils/compression.py +116 -0
- py_misc_utils/cond_waiter.py +13 -0
- py_misc_utils/context_base.py +18 -0
- py_misc_utils/context_managers.py +67 -0
- py_misc_utils/core_utils.py +577 -0
- py_misc_utils/daemon_process.py +252 -0
- py_misc_utils/data_cache.py +46 -0
- py_misc_utils/date_utils.py +90 -0
- py_misc_utils/debug.py +24 -0
- py_misc_utils/dyn_modules.py +50 -0
- py_misc_utils/dynamod.py +103 -0
- py_misc_utils/env_config.py +35 -0
- py_misc_utils/executor.py +239 -0
- py_misc_utils/file_overwrite.py +29 -0
- py_misc_utils/fin_wrap.py +77 -0
- py_misc_utils/fp_utils.py +47 -0
- py_misc_utils/fs/__init__.py +0 -0
- py_misc_utils/fs/file_fs.py +127 -0
- py_misc_utils/fs/ftp_fs.py +242 -0
- py_misc_utils/fs/gcs_fs.py +196 -0
- py_misc_utils/fs/http_fs.py +241 -0
- py_misc_utils/fs/s3_fs.py +417 -0
- py_misc_utils/fs_base.py +133 -0
- py_misc_utils/fs_utils.py +207 -0
- py_misc_utils/gcs_fs.py +169 -0
- py_misc_utils/gen_indices.py +54 -0
- py_misc_utils/gfs.py +371 -0
- py_misc_utils/git_repo.py +77 -0
- py_misc_utils/global_namespace.py +110 -0
- py_misc_utils/http_async_fetcher.py +139 -0
- py_misc_utils/http_server.py +196 -0
- py_misc_utils/http_utils.py +143 -0
- py_misc_utils/img_utils.py +20 -0
- py_misc_utils/infix_op.py +20 -0
- py_misc_utils/inspect_utils.py +205 -0
- py_misc_utils/iostream.py +21 -0
- py_misc_utils/iter_file.py +117 -0
- py_misc_utils/key_wrap.py +46 -0
- py_misc_utils/lazy_import.py +25 -0
- py_misc_utils/lockfile.py +164 -0
- py_misc_utils/mem_size.py +64 -0
- py_misc_utils/mirror_from.py +72 -0
- py_misc_utils/mmap.py +16 -0
- py_misc_utils/module_utils.py +196 -0
- py_misc_utils/moving_average.py +19 -0
- py_misc_utils/msgpack_streamer.py +26 -0
- py_misc_utils/multi_wait.py +24 -0
- py_misc_utils/multiprocessing.py +102 -0
- py_misc_utils/named_array.py +224 -0
- py_misc_utils/no_break.py +46 -0
- py_misc_utils/no_except.py +32 -0
- py_misc_utils/np_ml_framework.py +184 -0
- py_misc_utils/np_utils.py +346 -0
- py_misc_utils/ntuple_utils.py +38 -0
- py_misc_utils/num_utils.py +54 -0
- py_misc_utils/obj.py +73 -0
- py_misc_utils/object_cache.py +100 -0
- py_misc_utils/object_tracker.py +88 -0
- py_misc_utils/ordered_set.py +71 -0
- py_misc_utils/osfd.py +27 -0
- py_misc_utils/packet.py +22 -0
- py_misc_utils/parquet_streamer.py +69 -0
- py_misc_utils/pd_utils.py +254 -0
- py_misc_utils/periodic_task.py +61 -0
- py_misc_utils/pickle_wrap.py +121 -0
- py_misc_utils/pipeline.py +98 -0
- py_misc_utils/remap_pickle.py +50 -0
- py_misc_utils/resource_manager.py +155 -0
- py_misc_utils/rnd_utils.py +56 -0
- py_misc_utils/run_once.py +19 -0
- py_misc_utils/scheduler.py +135 -0
- py_misc_utils/select_params.py +300 -0
- py_misc_utils/signal.py +141 -0
- py_misc_utils/skl_utils.py +270 -0
- py_misc_utils/split.py +147 -0
- py_misc_utils/state.py +53 -0
- py_misc_utils/std_module.py +56 -0
- py_misc_utils/stream_dataframe.py +176 -0
- py_misc_utils/streamed_file.py +144 -0
- py_misc_utils/tempdir.py +79 -0
- py_misc_utils/template_replace.py +51 -0
- py_misc_utils/tensor_stream.py +269 -0
- py_misc_utils/thread_context.py +33 -0
- py_misc_utils/throttle.py +30 -0
- py_misc_utils/time_trigger.py +18 -0
- py_misc_utils/timegen.py +11 -0
- py_misc_utils/traceback.py +49 -0
- py_misc_utils/tracking_executor.py +91 -0
- py_misc_utils/transform_array.py +42 -0
- py_misc_utils/uncompress.py +35 -0
- py_misc_utils/url_fetcher.py +157 -0
- py_misc_utils/utils.py +538 -0
- py_misc_utils/varint.py +50 -0
- py_misc_utils/virt_array.py +52 -0
- py_misc_utils/weak_call.py +33 -0
- py_misc_utils/work_results.py +100 -0
- py_misc_utils/writeback_file.py +43 -0
- python_misc_utils-0.2.dist-info/METADATA +36 -0
- python_misc_utils-0.2.dist-info/RECORD +117 -0
- python_misc_utils-0.2.dist-info/WHEEL +5 -0
- python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
- python_misc_utils-0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
import ftplib
|
|
2
|
+
import functools
|
|
3
|
+
import hashlib
|
|
4
|
+
import io
|
|
5
|
+
import os
|
|
6
|
+
import tempfile
|
|
7
|
+
import urllib.parse as uparse
|
|
8
|
+
|
|
9
|
+
import ftputil
|
|
10
|
+
|
|
11
|
+
from .. import alog
|
|
12
|
+
from .. import assert_checks as tas
|
|
13
|
+
from .. import context_managers as cm
|
|
14
|
+
from .. import fs_base as fsb
|
|
15
|
+
from .. import fs_utils as fsu
|
|
16
|
+
from .. import cached_file as chf
|
|
17
|
+
from .. import object_cache as objc
|
|
18
|
+
from .. import writeback_file as wbf
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CacheHandler(objc.Handler):
  """Object-cache factory/lifecycle handler for ftputil.FTPHost connections.

  The positional/keyword arguments given at construction time are forwarded
  verbatim to ftputil.FTPHost() whenever the object cache needs to create a
  new connection.
  """

  def __init__(self, *args, **kwargs):
    super().__init__()
    self._args = args
    self._kwargs = kwargs

  def create(self):
    # Called by the object cache when no (alive) cached connection exists.
    return ftputil.FTPHost(*self._args, **self._kwargs)

  def is_alive(self, obj):
    try:
      # Probe the connection; any failure means it went stale.
      obj.keep_alive()

      return True
    except Exception:
      # Bug fix: this used to be a bare `except:`, which also swallowed
      # KeyboardInterrupt/SystemExit while probing the connection.
      return False

  def close(self, obj):
    obj.close()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FtpReader:
  """Whole-file reader handed to the cached-file layer for FTP content."""

  def __init__(self, conn, path):
    self._conn = conn
    self._path = path

  @classmethod
  def tag(cls, sres):
    # FTP offers no ETag, so a change-tag is derived from size and mtime.
    return chf.make_tag(size=sres.st_size, mtime=sres.st_mtime)

  def support_blocks(self):
    # Partial (ranged) reads are not available; only whole-file fetches.
    return False

  def read_block(self, bpath, offset, size):
    tas.check_eq(offset, chf.CachedBlockFile.WHOLE_OFFSET,
                 msg=f'Wrong offset for whole content read: {offset}')

    # The block file is created read-only (0o440); the O_WRONLY descriptor
    # obtained at creation time remains writable.
    fd = os.open(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440)
    with open(fd, mode='wb') as dest_fd, \
         self._conn.open(self._path, mode='rb') as src_fd:
      self._conn.copyfileobj(src_fd, dest_fd)

    return os.path.getsize(bpath)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# https://docs.python.org/3/library/ftplib.html
# https://ftputil.sschwarzer.net/
class FtpSession(ftplib.FTP):
  """ftplib.FTP subclass used as the ftputil `session_factory`.

  The factory is invoked with (host, userid, passwd, port), so this adapter
  connects and logs in immediately at construction time.
  """

  def __init__(self, host, userid, passwd, port):
    super().__init__()
    # Connect first, then authenticate (order required by the FTP protocol).
    self.connect(host, port=port)
    self.login(userid, passwd)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class FtpFs(fsb.FsBase):
  """FTP-backed filesystem implementation for ftp:// URLs.

  Connections are ftputil.FTPHost objects obtained through the process-wide
  object cache (see CacheHandler above), so they are reused across calls.
  Reads are served through the cached-file interface; writes are staged to a
  local temporary file and uploaded on writeback (wbf.WritebackFile).
  """

  # URL scheme identifiers this filesystem handles.
  ID = 'ftp'
  IDS = (ID,)

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)

  def _get_connection(self, host, port, user, passwd):
    # Connections are cached under a per-(host, port, user) key; the handler
    # knows how to create/validate/close ftputil.FTPHost instances.
    handler = CacheHandler(host, user, passwd,
                           port=port,
                           session_factory=FtpSession)
    name = ('FTPFS', host, port, user)

    return objc.cache().get(name, handler)

  def _netloc(self, purl):
    # Normalized (host, port) pair; 21 is the default FTP port.
    return (purl.hostname.lower(), purl.port or 21)

  def _parse_url(self, url):
    # Returns a (connection, parsed-url) pair for the given ftp:// URL,
    # defaulting to anonymous login when no credentials are in the URL.
    purl = uparse.urlparse(url)

    host, port = self._netloc(purl)
    user = purl.username or 'anonymous'
    passwd = purl.password or ''

    conn = self._get_connection(host, port, user, passwd)

    return conn, purl

  def _make_reader(self, conn, purl):
    # Builds the (reader, meta) pair consumed by the cached-file layer.
    sres = self._stat(conn, purl.path)

    tag = FtpReader.tag(sres)
    meta = chf.Meta(size=sres.st_size, mtime=sres.st_mtime, tag=tag)
    reader = FtpReader(conn, purl.path)

    return reader, meta

  def remove(self, url):
    conn, purl = self._parse_url(url)
    conn.remove(purl.path)

  def rename(self, src_url, dest_url):
    src_conn, src_purl = self._parse_url(src_url)
    dest_conn, dest_purl = self._parse_url(dest_url)

    src_netloc, dest_netloc = self._netloc(src_purl), self._netloc(dest_purl)

    # A rename cannot span servers, so both URLs must resolve to the same
    # (host, port) pair.
    tas.check_eq(src_netloc, dest_netloc,
                 msg=f'Source and destination URL must be on the same host: ' \
                 f'{src_netloc} vs. {dest_netloc}')

    src_conn.rename(src_purl.path, dest_purl.path)

  def mkdir(self, url, mode=None):
    # NOTE(review): `mode` is accepted for interface parity but is not
    # forwarded to the server.
    conn, purl = self._parse_url(url)
    conn.mkdir(purl.path)

  def makedirs(self, url, mode=None, exist_ok=None):
    conn, purl = self._parse_url(url)

    conn.makedirs(purl.path, exist_ok=exist_ok or False)

  def rmdir(self, url):
    conn, purl = self._parse_url(url)
    conn.rmdir(purl.path)

  def rmtree(self, url, ignore_errors=None):
    conn, purl = self._parse_url(url)

    conn.rmtree(purl.path, ignore_errors=ignore_errors or False)

  def _stat(self, conn, path):
    # Wraps an ftputil stat result into the package DirEntry structure.
    sres = conn.stat(path)

    tag = FtpReader.tag(sres)

    return fsb.DirEntry(name=os.path.basename(path),
                        path=path,
                        etag=tag,
                        st_mode=sres.st_mode,
                        st_size=sres.st_size,
                        # Fall back to mtime when the server reports no ctime.
                        st_ctime=sres.st_ctime or sres.st_mtime,
                        st_mtime=sres.st_mtime)

  def stat(self, url):
    conn, purl = self._parse_url(url)

    return self._stat(conn, purl.path)

  def list(self, url):
    # Generator yielding a DirEntry for every name in the remote directory.
    conn, purl = self._parse_url(url)

    for name in conn.listdir(purl.path):
      path = os.path.join(purl.path, name)

      yield self._stat(conn, path)

  def open(self, url, mode, **kwargs):
    conn, purl = self._parse_url(url)

    if self.read_mode(mode):
      # Reads go through the caching layer.
      reader, meta = self._make_reader(conn, purl)
      cfile = self._cache_iface.open(url, meta, reader, **kwargs)

      return io.TextIOWrapper(cfile) if self.text_mode(mode) else cfile
    else:
      # Writes are staged to a local file; the WritebackFile calls
      # `writeback_fn` to push the staged content back to the server.
      writeback_fn = functools.partial(self._upload_file, url)
      if not self.truncate_mode(mode) and conn.path.exists(purl.path):
        # Append/update modes need the current remote content first.
        url_file = self._download_file(url)
        self.seek_stream(mode, url_file)
      else:
        url_file = tempfile.TemporaryFile()

      wbfile = wbf.WritebackFile(url_file, writeback_fn)

      return io.TextIOWrapper(wbfile) if self.text_mode(mode) else wbfile

  def _upload_file(self, url, stream):
    # Writeback hook: rewinds the staged stream and uploads it whole.
    conn, purl = self._parse_url(url)

    stream.seek(0)
    with conn.open(purl.path, mode='wb') as dest_fd:
      conn.copyfileobj(stream, dest_fd)

  def _download_file(self, url):
    # Fetches the remote file into an anonymous temporary file. The
    # cm.Wrapper/detach() pairing keeps the temp file owned by the wrapper
    # until the copy succeeds (see context_managers).
    conn, purl = self._parse_url(url)

    with cm.Wrapper(tempfile.TemporaryFile()) as ftmp:
      with conn.open(purl.path, mode='rb') as src_fd:
        conn.copyfileobj(src_fd, ftmp.v)

      return ftmp.detach()

  def put_file(self, url, data_gen):
    # Streams the chunks produced by `data_gen` to the remote path.
    conn, purl = self._parse_url(url)

    with conn.open(purl.path, mode='wb') as fd:
      for data in data_gen:
        fd.write(data)

  def get_file(self, url):
    # Generator yielding the remote file content in chunks.
    conn, purl = self._parse_url(url)

    with conn.open(purl.path, mode='rb') as fd:
      for data in fsu.enum_chunks(fd):
        yield data

  def as_local(self, url, **kwargs):
    # Exposes the remote file through the cache interface `as_local` path
    # (see fs_base/cached_file for the exact return contract).
    conn, purl = self._parse_url(url)
    reader, meta = self._make_reader(conn, purl)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    # There is no link support in FTP, so fall back to a full copy.
    self.copyfile(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    self.link(src_url, dest_url)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# Module export enumerating the filesystem classes implemented here.
FILE_SYSTEMS = (FtpFs,)
|
|
242
|
+
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import hashlib
|
|
3
|
+
import io
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
import urllib.parse as uparse
|
|
7
|
+
|
|
8
|
+
from .. import alog
|
|
9
|
+
from .. import assert_checks as tas
|
|
10
|
+
from .. import context_managers as cm
|
|
11
|
+
from .. import fs_base as fsb
|
|
12
|
+
from .. import fs_utils as fsu
|
|
13
|
+
from .. import cached_file as chf
|
|
14
|
+
from .. import gcs_fs as gcs
|
|
15
|
+
from .. import object_cache as objc
|
|
16
|
+
from .. import osfd
|
|
17
|
+
from .. import writeback_file as wbf
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CacheHandler(objc.Handler):
  """Object-cache factory producing bucket-bound gcs.GcsFs instances."""

  def __init__(self, *args, **kwargs):
    super().__init__()
    # Constructor arguments are captured and replayed on every create().
    self._ctor_args = args
    self._ctor_kwargs = kwargs

  def create(self):
    # Invoked by the object cache when no cached instance is available.
    return gcs.GcsFs(*self._ctor_args, **self._ctor_kwargs)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class GcsReader:
  """Reader bridging a GCS object into the cached-file layer."""

  def __init__(self, fs, path, sres):
    self._fs = fs
    self._path = path
    self._sres = sres

  @classmethod
  def tag(cls, sres):
    # Prefer the stored ETag; fall back to a size/mtime derived tag.
    return sres.etag or chf.make_tag(size=sres.st_size, mtime=sres.st_mtime)

  def support_blocks(self):
    # Ranged reads are available, so block-wise caching is enabled.
    return True

  def read_block(self, bpath, offset, size):
    create_flags = os.O_CREAT | os.O_TRUNC | os.O_WRONLY
    if offset == chf.CachedBlockFile.WHOLE_OFFSET:
      # Whole-content fetch, streamed to the block file in chunks.
      with osfd.OsFd(bpath, create_flags, mode=0o440) as wfd:
        for chunk in self._fs.download(self._path):
          os.write(wfd, chunk)

      return os.path.getsize(bpath)

    # Ranged read: clamp the size so we never read past end of object.
    nbytes = min(size, self._sres.st_size - offset)
    data = self._fs.pread(self._path, offset, nbytes)

    with osfd.OsFd(bpath, create_flags, mode=0o440) as wfd:
      os.write(wfd, data)

    return len(data)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class GcsFs(fsb.FsBase):
  """Google Cloud Storage backed filesystem for gcs:// URLs.

  Per-bucket gcs.GcsFs backends are obtained through the process-wide object
  cache (see CacheHandler above). Reads go through the cached-file interface;
  writes are staged to a local temporary file and uploaded on writeback.
  """

  # URL scheme identifiers this filesystem handles.
  ID = 'gcs'
  IDS = (ID,)

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)

  def _get_fs(self, bucket):
    # Bucket-bound backends are cached and shared across instances.
    handler = CacheHandler(bucket)
    name = ('GCSFS', bucket)

    return objc.cache().get(name, handler)

  def _parse_url(self, url):
    # The URL host is the bucket name, the path is the object name (with
    # the leading '/' stripped off).
    purl = uparse.urlparse(url)
    purl = purl._replace(path=purl.path.lstrip('/'))
    fs = self._get_fs(purl.hostname)

    return fs, purl

  def _make_reader(self, fs, purl):
    # Builds the (reader, meta) pair consumed by the cached-file layer.
    sres = fs.stat(purl.path)
    tas.check_is_not_none(sres, msg=f'File does not exist: {purl.geturl()}')

    tag = GcsReader.tag(sres)
    meta = chf.Meta(size=sres.st_size, mtime=sres.st_mtime, tag=tag)
    reader = GcsReader(fs, purl.path, sres)

    return reader, meta

  def _parse_samefs(self, src_url, dest_url):
    # Resolves both URLs and verifies they live in the same bucket, which
    # is required for the server-side copy/rename operations below.
    src_fs, src_purl = self._parse_url(src_url)
    dest_fs, dest_purl = self._parse_url(dest_url)

    tas.check_eq(src_fs.bucket, dest_fs.bucket,
                 msg=f'Source and destination URL must be on the same bucket: ' \
                 f'{src_url} vs. {dest_url}')

    return (src_fs, src_purl), (dest_fs, dest_purl)

  def _copy(self, src_url, dest_url):
    (src_fs, src_purl), (dest_fs, dest_purl) = self._parse_samefs(src_url, dest_url)

    src_fs.copy(src_purl.path, dest_purl.path)

  def remove(self, url):
    fs, purl = self._parse_url(url)
    fs.remove(purl.path)

  def rename(self, src_url, dest_url):
    (src_fs, src_purl), (dest_fs, dest_purl) = self._parse_samefs(src_url, dest_url)

    src_fs.rename(src_purl.path, dest_purl.path)

  def mkdir(self, url, mode=None):
    # Object stores have no real directories; the directory operations
    # below are deliberate no-ops (except rmtree, which removes objects).
    pass

  def makedirs(self, url, mode=None, exist_ok=None):
    pass

  def rmdir(self, url):
    pass

  def rmtree(self, url, ignore_errors=None):
    fs, purl = self._parse_url(url)

    fs.rmtree(purl.path, ignore_errors=ignore_errors or False)

  def stat(self, url):
    fs, purl = self._parse_url(url)

    return fs.stat(purl.path)

  def list(self, url):
    fs, purl = self._parse_url(url)

    return fs.listdir(purl.path)

  def open(self, url, mode, **kwargs):
    fs, purl = self._parse_url(url)

    if self.read_mode(mode):
      # Reads go through the caching layer.
      reader, meta = self._make_reader(fs, purl)
      cfile = self._cache_iface.open(url, meta, reader, **kwargs)

      return io.TextIOWrapper(cfile) if self.text_mode(mode) else cfile
    else:
      # Writes are staged locally and uploaded on writeback.
      writeback_fn = functools.partial(self._upload_file, url)
      if not self.truncate_mode(mode) and fs.exists(purl.path):
        # Append/update modes need the current remote content first.
        url_file = self._download_file(url)
        self.seek_stream(mode, url_file)
      else:
        url_file = tempfile.TemporaryFile()

      wbfile = wbf.WritebackFile(url_file, writeback_fn)

      return io.TextIOWrapper(wbfile) if self.text_mode(mode) else wbfile

  def _upload_file(self, url, stream):
    # Writeback hook: rewinds the staged stream and uploads it in chunks.
    stream.seek(0)
    self.put_file(url, fsu.enum_chunks(stream))

  def _download_file(self, url):
    # Downloads the remote object into a temporary file. The
    # cm.Wrapper/detach() pairing keeps the temp file owned by the wrapper
    # until the copy succeeds (see context_managers).
    with cm.Wrapper(tempfile.TemporaryFile()) as ftmp:
      for data in self.get_file(url):
        ftmp.v.write(data)

      return ftmp.detach()

  def put_file(self, url, data_gen):
    fs, purl = self._parse_url(url)

    fs.upload(purl.path, data_gen)

  def get_file(self, url):
    fs, purl = self._parse_url(url)

    return fs.download(purl.path)

  def as_local(self, url, **kwargs):
    # Exposes the remote object through the cache interface `as_local` path
    # (see fs_base/cached_file for the exact return contract).
    fs, purl = self._parse_url(url)
    reader, meta = self._make_reader(fs, purl)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    # There is no link concept here; emulate with a same-bucket copy.
    self._copy(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    self.link(src_url, dest_url)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Module export enumerating the filesystem classes implemented here.
FILE_SYSTEMS = (GcsFs,)
|
|
196
|
+
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import hashlib
|
|
3
|
+
import io
|
|
4
|
+
import mimetypes
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import requests
|
|
8
|
+
import stat as st
|
|
9
|
+
import tempfile
|
|
10
|
+
|
|
11
|
+
import bs4
|
|
12
|
+
|
|
13
|
+
from .. import alog
|
|
14
|
+
from .. import assert_checks as tas
|
|
15
|
+
from .. import context_managers as cm
|
|
16
|
+
from .. import fs_base as fsb
|
|
17
|
+
from .. import fs_utils as fsu
|
|
18
|
+
from .. import cached_file as chf
|
|
19
|
+
from .. import http_utils as hu
|
|
20
|
+
from .. import osfd
|
|
21
|
+
from .. import writeback_file as wbf
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HttpReader:
  """HTTP(S) content reader used by the cached-file layer.

  When the server advertises range support and a known content length,
  blocks are fetched individually via HTTP Range requests; otherwise the
  whole content is streamed in a single request.
  """

  def __init__(self, url, session=None, head=None, req_kwargs=None, chunk_size=None):
    # `head` is the result of a previous hu.info() probe; it is fetched
    # here when the caller did not supply one.
    session = session if session is not None else requests.Session()
    req_kwargs = req_kwargs or dict()
    if head is None:
      head = hu.info(url, mod=session, **req_kwargs)

    allow_ranges = hu.support_ranges(head.headers)

    self._url = url
    self._session = session
    self._req_kwargs = req_kwargs
    # Default streaming chunk size is 16 MiB.
    self._chunk_size = chunk_size or 16 * 1024**2
    self._size = hu.content_length(head.headers)
    # Ranged reads require both range support and a known content length.
    self._support_blocks = self._size is not None and allow_ranges

  @classmethod
  def tag(cls, head):
    # Prefer the server ETag; otherwise derive a tag from length/mtime.
    tag = hu.etag(head.headers)
    if tag is None:
      mtime = hu.last_modified(head.headers)
      length = hu.content_length(head.headers)
      tag = chf.make_tag(size=length, mtime=mtime)

    return tag

  def support_blocks(self):
    return self._support_blocks

  def read_block(self, bpath, offset, size):
    if self._support_blocks and offset != chf.CachedBlockFile.WHOLE_OFFSET:
      with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
        # Clamp so the range never extends past the known content length.
        size = min(size, self._size - offset)

        # NOTE(review): only the Range header is merged into the request;
        # other req_kwargs (auth, timeout, ...) are not forwarded here —
        # verify this is intentional.
        headers = self._req_kwargs.get('headers', dict()).copy()
        hu.add_range(headers, offset, offset + size)

        resp = self._session.get(self._url, headers=headers)
        resp.raise_for_status()
        # NOTE(review): hu.range_data() presumably extracts/validates the
        # [offset, offset+size-1] span from the response — see http_utils.
        data = hu.range_data(offset, offset + size - 1, resp.headers, resp.content)

        os.write(wfd, data)

      return len(data)
    else:
      # Whole-content fetch, streamed to the block file in chunks.
      resp = self._session.get(self._url, stream=True, **self._req_kwargs)
      resp.raise_for_status()
      with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
        for data in resp.iter_content(chunk_size=self._chunk_size):
          os.write(wfd, data)

      return os.path.getsize(bpath)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class HttpFs(fsb.FsBase):
  """HTTP(S) filesystem adapter.

  Reads go through the cached-file interface (with HTTP Range support when
  the server allows it); writes are staged to a local temporary file and
  PUT back to the server on writeback. Directory listing is emulated by
  scraping HREF links out of the fetched (HTML) page.
  """

  # Load the MIME registry once, for upload Content-Type guessing.
  mimetypes.init()

  # URL scheme identifiers this filesystem handles.
  ID = 'http'
  IDS = (ID, 'https')

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)
    # Keep only the kwargs which are valid `requests` arguments.
    self._req_kwargs = hu.filter_request_args(kwargs)
    self._session = requests.Session()

  def _exists(self, url):
    # Probe with hu.info(); an HTTP error status means "does not exist".
    try:
      hu.info(url, mod=self._session, **self._req_kwargs)

      return True
    except requests.exceptions.HTTPError:
      return False

  def _make_reader(self, url):
    # Builds the (reader, meta) pair consumed by the cached-file layer.
    head = hu.info(url, mod=self._session, **self._req_kwargs)

    tag = HttpReader.tag(head)
    size = hu.content_length(head.headers)
    mtime = hu.last_modified(head.headers)
    meta = chf.Meta(size=size, mtime=mtime, tag=tag)
    reader = HttpReader(url,
                        session=self._session,
                        head=head,
                        req_kwargs=self._req_kwargs)

    return reader, meta

  def stat(self, url):
    head = hu.info(url, mod=self._session, **self._req_kwargs)

    length = hu.content_length(head.headers)
    mtime = hu.last_modified(head.headers)
    tag = hu.etag(head.headers) or chf.make_tag(size=length, mtime=mtime)

    # HTML pages have content, but can also be listed (for HREF linked from it).
    # hence the weird st.S_IFREG | st.S_IFDIR.
    return fsb.DirEntry(name=os.path.basename(url.rstrip('/')),
                        path=url,
                        etag=tag,
                        st_mode=st.S_IFREG | st.S_IFDIR,
                        st_size=length,
                        st_ctime=mtime,
                        st_mtime=mtime)

  def open(self, url, mode, **kwargs):
    if self.read_mode(mode):
      # Reads go through the caching layer.
      reader, meta = self._make_reader(url)
      cfile = self._cache_iface.open(url, meta, reader, **kwargs)

      return io.TextIOWrapper(cfile) if self.text_mode(mode) else cfile
    else:
      # Writes are staged locally; the WritebackFile PUTs the staged
      # content back to the server via `writeback_fn`.
      writeback_fn = functools.partial(self._upload_file, url)
      if not self.truncate_mode(mode) and self._exists(url):
        # Append/update modes need the current remote content first.
        url_file = self._download_file(url)
        self.seek_stream(mode, url_file)
      else:
        url_file = tempfile.TemporaryFile()

      wbfile = wbf.WritebackFile(url_file, writeback_fn)

      return io.TextIOWrapper(wbfile) if self.text_mode(mode) else wbfile

  def remove(self, url):
    # NOTE(review): the DELETE response status is not checked here, so a
    # server refusing the delete goes unnoticed — confirm this best-effort
    # behavior is intended (rename() relies on remove()).
    self._session.delete(url, **self._req_kwargs)

  def rename(self, src_url, dest_url):
    # There is no "rename" in HTTP ...
    with self._download_file(src_url) as fd:
      self._upload_file(dest_url, fd)

    self.remove(src_url)

  def mkdir(self, url, mode=None):
    # HTTP has no directory operations; these are deliberate no-ops.
    pass

  def makedirs(self, url, mode=None, exist_ok=None):
    pass

  def rmdir(self, url):
    pass

  def rmtree(self, url, ignore_errors=None):
    pass

  def list(self, url):
    # Emulates directory listing by fetching the page and yielding a
    # DirEntry for every relative HREF link found in it.
    resp = self._session.get(url, **self._req_kwargs)
    resp.raise_for_status()

    html_parser = bs4.BeautifulSoup(resp.text, 'html.parser')

    for link in html_parser.find_all('a'):
      href = link.get('href')
      # Skip absolute links (scheme://...); only same-site relative links
      # are treated as listing entries.
      if href and not re.match(r'[a-zA-Z]+://', href):
        lurl = os.path.join(url, href)
        try:
          de = self.stat(lurl)

          yield de
        except GeneratorExit:
          raise
        except Exception as ex:
          # Links which cannot be stat-ed are skipped, not fatal.
          alog.debug(f'Unable to stat URL {lurl}: {ex}')

  def _upload_data_gen(self, url, data_gen):
    # Guess Content-Type/Content-Encoding from the URL extension, if any.
    ctype, cencoding = mimetypes.guess_type(url, strict=False)

    headers = self._req_kwargs.get('headers', dict()).copy()
    if ctype is not None:
      headers[hu.CONTENT_TYPE] = ctype
    if cencoding is not None:
      headers[hu.CONTENT_ENCODING] = cencoding

    resp = self._session.put(url, headers=headers, data=data_gen)
    # Bug fix: the PUT status used to be silently ignored, which hid failed
    # uploads. Surface HTTP errors like every other request path here does.
    resp.raise_for_status()

  def _upload_file(self, url, stream):
    # Writeback hook: rewinds the staged stream and uploads it in chunks.
    stream.seek(0)
    self._upload_data_gen(url, fsu.enum_chunks(stream))

  def _iterate_chunks(self, url, chunk_size=None):
    # Generator yielding the URL content in chunks (default 16 MiB).
    chunk_size = chunk_size or 16 * 1024**2

    resp = self._session.get(url, stream=True, **self._req_kwargs)
    resp.raise_for_status()

    for data in resp.iter_content(chunk_size=chunk_size):
      yield data

  def _download_file(self, url, chunk_size=None):
    # Downloads the URL content into a temporary file. The
    # cm.Wrapper/detach() pairing keeps the temp file owned by the wrapper
    # until the copy succeeds (see context_managers).
    with cm.Wrapper(tempfile.TemporaryFile()) as ftmp:
      for data in self._iterate_chunks(url, chunk_size=chunk_size):
        ftmp.v.write(data)

      return ftmp.detach()

  def put_file(self, url, data_gen):
    self._upload_data_gen(url, data_gen)

  def get_file(self, url):
    for data in self._iterate_chunks(url):
      yield data

  def as_local(self, url, **kwargs):
    # Exposes the URL content through the cache interface `as_local` path
    # (see fs_base/cached_file for the exact return contract).
    reader, meta = self._make_reader(url)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    # There is no link support in HTTP, so fall back to a full copy.
    self.copyfile(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    self.link(src_url, dest_url)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# Module export enumerating the filesystem classes implemented here.
FILE_SYSTEMS = (HttpFs,)
|
|
241
|
+
|