python-misc-utils 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. py_misc_utils/__init__.py +0 -0
  2. py_misc_utils/abs_timeout.py +12 -0
  3. py_misc_utils/alog.py +311 -0
  4. py_misc_utils/app_main.py +179 -0
  5. py_misc_utils/archive_streamer.py +112 -0
  6. py_misc_utils/assert_checks.py +118 -0
  7. py_misc_utils/ast_utils.py +121 -0
  8. py_misc_utils/async_manager.py +189 -0
  9. py_misc_utils/break_control.py +63 -0
  10. py_misc_utils/buffered_iterator.py +35 -0
  11. py_misc_utils/cached_file.py +507 -0
  12. py_misc_utils/call_limiter.py +26 -0
  13. py_misc_utils/call_result_selector.py +13 -0
  14. py_misc_utils/cleanups.py +85 -0
  15. py_misc_utils/cmd.py +97 -0
  16. py_misc_utils/compression.py +116 -0
  17. py_misc_utils/cond_waiter.py +13 -0
  18. py_misc_utils/context_base.py +18 -0
  19. py_misc_utils/context_managers.py +67 -0
  20. py_misc_utils/core_utils.py +577 -0
  21. py_misc_utils/daemon_process.py +252 -0
  22. py_misc_utils/data_cache.py +46 -0
  23. py_misc_utils/date_utils.py +90 -0
  24. py_misc_utils/debug.py +24 -0
  25. py_misc_utils/dyn_modules.py +50 -0
  26. py_misc_utils/dynamod.py +103 -0
  27. py_misc_utils/env_config.py +35 -0
  28. py_misc_utils/executor.py +239 -0
  29. py_misc_utils/file_overwrite.py +29 -0
  30. py_misc_utils/fin_wrap.py +77 -0
  31. py_misc_utils/fp_utils.py +47 -0
  32. py_misc_utils/fs/__init__.py +0 -0
  33. py_misc_utils/fs/file_fs.py +127 -0
  34. py_misc_utils/fs/ftp_fs.py +242 -0
  35. py_misc_utils/fs/gcs_fs.py +196 -0
  36. py_misc_utils/fs/http_fs.py +241 -0
  37. py_misc_utils/fs/s3_fs.py +417 -0
  38. py_misc_utils/fs_base.py +133 -0
  39. py_misc_utils/fs_utils.py +207 -0
  40. py_misc_utils/gcs_fs.py +169 -0
  41. py_misc_utils/gen_indices.py +54 -0
  42. py_misc_utils/gfs.py +371 -0
  43. py_misc_utils/git_repo.py +77 -0
  44. py_misc_utils/global_namespace.py +110 -0
  45. py_misc_utils/http_async_fetcher.py +139 -0
  46. py_misc_utils/http_server.py +196 -0
  47. py_misc_utils/http_utils.py +143 -0
  48. py_misc_utils/img_utils.py +20 -0
  49. py_misc_utils/infix_op.py +20 -0
  50. py_misc_utils/inspect_utils.py +205 -0
  51. py_misc_utils/iostream.py +21 -0
  52. py_misc_utils/iter_file.py +117 -0
  53. py_misc_utils/key_wrap.py +46 -0
  54. py_misc_utils/lazy_import.py +25 -0
  55. py_misc_utils/lockfile.py +164 -0
  56. py_misc_utils/mem_size.py +64 -0
  57. py_misc_utils/mirror_from.py +72 -0
  58. py_misc_utils/mmap.py +16 -0
  59. py_misc_utils/module_utils.py +196 -0
  60. py_misc_utils/moving_average.py +19 -0
  61. py_misc_utils/msgpack_streamer.py +26 -0
  62. py_misc_utils/multi_wait.py +24 -0
  63. py_misc_utils/multiprocessing.py +102 -0
  64. py_misc_utils/named_array.py +224 -0
  65. py_misc_utils/no_break.py +46 -0
  66. py_misc_utils/no_except.py +32 -0
  67. py_misc_utils/np_ml_framework.py +184 -0
  68. py_misc_utils/np_utils.py +346 -0
  69. py_misc_utils/ntuple_utils.py +38 -0
  70. py_misc_utils/num_utils.py +54 -0
  71. py_misc_utils/obj.py +73 -0
  72. py_misc_utils/object_cache.py +100 -0
  73. py_misc_utils/object_tracker.py +88 -0
  74. py_misc_utils/ordered_set.py +71 -0
  75. py_misc_utils/osfd.py +27 -0
  76. py_misc_utils/packet.py +22 -0
  77. py_misc_utils/parquet_streamer.py +69 -0
  78. py_misc_utils/pd_utils.py +254 -0
  79. py_misc_utils/periodic_task.py +61 -0
  80. py_misc_utils/pickle_wrap.py +121 -0
  81. py_misc_utils/pipeline.py +98 -0
  82. py_misc_utils/remap_pickle.py +50 -0
  83. py_misc_utils/resource_manager.py +155 -0
  84. py_misc_utils/rnd_utils.py +56 -0
  85. py_misc_utils/run_once.py +19 -0
  86. py_misc_utils/scheduler.py +135 -0
  87. py_misc_utils/select_params.py +300 -0
  88. py_misc_utils/signal.py +141 -0
  89. py_misc_utils/skl_utils.py +270 -0
  90. py_misc_utils/split.py +147 -0
  91. py_misc_utils/state.py +53 -0
  92. py_misc_utils/std_module.py +56 -0
  93. py_misc_utils/stream_dataframe.py +176 -0
  94. py_misc_utils/streamed_file.py +144 -0
  95. py_misc_utils/tempdir.py +79 -0
  96. py_misc_utils/template_replace.py +51 -0
  97. py_misc_utils/tensor_stream.py +269 -0
  98. py_misc_utils/thread_context.py +33 -0
  99. py_misc_utils/throttle.py +30 -0
  100. py_misc_utils/time_trigger.py +18 -0
  101. py_misc_utils/timegen.py +11 -0
  102. py_misc_utils/traceback.py +49 -0
  103. py_misc_utils/tracking_executor.py +91 -0
  104. py_misc_utils/transform_array.py +42 -0
  105. py_misc_utils/uncompress.py +35 -0
  106. py_misc_utils/url_fetcher.py +157 -0
  107. py_misc_utils/utils.py +538 -0
  108. py_misc_utils/varint.py +50 -0
  109. py_misc_utils/virt_array.py +52 -0
  110. py_misc_utils/weak_call.py +33 -0
  111. py_misc_utils/work_results.py +100 -0
  112. py_misc_utils/writeback_file.py +43 -0
  113. python_misc_utils-0.2.dist-info/METADATA +36 -0
  114. python_misc_utils-0.2.dist-info/RECORD +117 -0
  115. python_misc_utils-0.2.dist-info/WHEEL +5 -0
  116. python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
  117. python_misc_utils-0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,242 @@
1
+ import ftplib
2
+ import functools
3
+ import hashlib
4
+ import io
5
+ import os
6
+ import tempfile
7
+ import urllib.parse as uparse
8
+
9
+ import ftputil
10
+
11
+ from .. import alog
12
+ from .. import assert_checks as tas
13
+ from .. import context_managers as cm
14
+ from .. import fs_base as fsb
15
+ from .. import fs_utils as fsu
16
+ from .. import cached_file as chf
17
+ from .. import object_cache as objc
18
+ from .. import writeback_file as wbf
19
+
20
+
21
class CacheHandler(objc.Handler):
  """Object-cache handler that creates and supervises `ftputil.FTPHost` objects.

  The positional and keyword arguments received at construction time are
  stored and forwarded unchanged to `ftputil.FTPHost` by `create()`.
  """

  def __init__(self, *args, **kwargs):
    super().__init__()
    self._args = args
    self._kwargs = kwargs

  def create(self):
    """Build a new `ftputil.FTPHost` from the stored arguments."""
    return ftputil.FTPHost(*self._args, **self._kwargs)

  def is_alive(self, obj):
    """Return True when `obj.keep_alive()` completes without raising.

    Only `Exception` subclasses are interpreted as a dead connection, so that
    `KeyboardInterrupt` and `SystemExit` are not silently turned into a False
    result.
    """
    try:
      obj.keep_alive()
    except Exception:
      return False

    return True

  def close(self, obj):
    """Close the cached connection object."""
    obj.close()
41
+
42
+
43
class FtpReader:
  """Reader that copies a whole remote FTP file into a local block file.

  Partial (block based) reads are not supported, hence `read_block()` only
  accepts the whole-content offset.
  """

  def __init__(self, conn, path):
    self._conn = conn
    self._path = path

  @classmethod
  def tag(cls, sres):
    """Build the cache tag from the size and modification time of `sres`."""
    return chf.make_tag(size=sres.st_size, mtime=sres.st_mtime)

  def support_blocks(self):
    """Tell the cache layer that only whole-file reads are available."""
    return False

  def read_block(self, bpath, offset, size):
    """Download the remote file into `bpath` and return the resulting size.

    `offset` must be `chf.CachedBlockFile.WHOLE_OFFSET`; `size` is not used.
    """
    tas.check_eq(offset, chf.CachedBlockFile.WHOLE_OFFSET,
                 msg=f'Wrong offset for whole content read: {offset}')

    open_flags = os.O_CREAT | os.O_TRUNC | os.O_WRONLY
    local_fd = os.open(bpath, open_flags, mode=0o440)

    # The local file object takes ownership of the descriptor and is entered
    # first, exactly like the nested form, so it is also the last one closed.
    with open(local_fd, mode='wb') as sink, \
         self._conn.open(self._path, mode='rb') as source:
      self._conn.copyfileobj(source, sink)

    return os.path.getsize(bpath)
66
+
67
+
68
+ # https://docs.python.org/3/library/ftplib.html
69
+ # https://ftputil.sschwarzer.net/
70
class FtpSession(ftplib.FTP):
  """`ftplib.FTP` session which connects and logs in at construction time.

  Used as `session_factory` for `ftputil.FTPHost`, so that a non default
  port can be specified.
  """

  def __init__(self, host, userid, passwd, port):
    # The base constructor is called without a host, so that the connection
    # can be opened explicitly on the requested port.
    super().__init__()
    self.connect(host=host, port=port)
    self.login(user=userid, passwd=passwd)
76
+
77
+
78
class FtpFs(fsb.FsBase):
  """File system backend for `ftp://` URLs, implemented on top of `ftputil`.

  Connections are obtained from the object cache (`objc.cache()`), keyed by
  host, port and user name. Reads go through the cache interface handed to
  the constructor, while writes are staged in a local temporary file which is
  uploaded by a `wbf.WritebackFile`.
  """

  ID = 'ftp'
  IDS = (ID,)

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)

  def _get_connection(self, host, port, user, passwd):
    """Return the (possibly cached) FTP host object for the given endpoint."""
    handler = CacheHandler(host, user, passwd,
                           port=port,
                           session_factory=FtpSession)
    # Note that the password is not part of the cache key.
    name = ('FTPFS', host, port, user)

    return objc.cache().get(name, handler)

  def _netloc(self, purl):
    """Return the `(host, port)` tuple of a parsed URL (port defaults to 21)."""
    return (purl.hostname.lower(), purl.port or 21)

  def _parse_url(self, url):
    """Parse `url` and return `(connection, parsed_url)`.

    Missing credentials default to the `anonymous` user with empty password.
    """
    purl = uparse.urlparse(url)

    host, port = self._netloc(purl)
    user = purl.username or 'anonymous'
    passwd = purl.password or ''

    conn = self._get_connection(host, port, user, passwd)

    return conn, purl

  def _make_reader(self, conn, purl):
    """Return the `(reader, meta)` pair required by the cache interface."""
    sres = self._stat(conn, purl.path)

    tag = FtpReader.tag(sres)
    meta = chf.Meta(size=sres.st_size, mtime=sres.st_mtime, tag=tag)
    reader = FtpReader(conn, purl.path)

    return reader, meta

  def remove(self, url):
    """Remove the remote file pointed by `url`."""
    conn, purl = self._parse_url(url)
    conn.remove(purl.path)

  def rename(self, src_url, dest_url):
    """Rename `src_url` to `dest_url`; both must be on the same host and port.

    The URLs are validated before any connection is requested, so that no
    login is attempted on the destination when the check is going to fail.
    """
    src_purl = uparse.urlparse(src_url)
    dest_purl = uparse.urlparse(dest_url)

    src_netloc, dest_netloc = self._netloc(src_purl), self._netloc(dest_purl)

    tas.check_eq(src_netloc, dest_netloc,
                 msg=f'Source and destination URL must be on the same host: ' \
                 f'{src_netloc} vs. {dest_netloc}')

    # Only the source connection is needed to issue the rename.
    src_conn, _ = self._parse_url(src_url)

    src_conn.rename(src_purl.path, dest_purl.path)

  def mkdir(self, url, mode=None):
    """Create the remote directory (`mode` is accepted but not used)."""
    conn, purl = self._parse_url(url)
    conn.mkdir(purl.path)

  def makedirs(self, url, mode=None, exist_ok=None):
    """Create the remote directory and its parents (`mode` is not used)."""
    conn, purl = self._parse_url(url)

    conn.makedirs(purl.path, exist_ok=exist_ok or False)

  def rmdir(self, url):
    """Remove the remote directory pointed by `url`."""
    conn, purl = self._parse_url(url)
    conn.rmdir(purl.path)

  def rmtree(self, url, ignore_errors=None):
    """Recursively remove the remote directory tree pointed by `url`."""
    conn, purl = self._parse_url(url)

    conn.rmtree(purl.path, ignore_errors=ignore_errors or False)

  def _stat(self, conn, path):
    """Stat the remote `path` and convert the result to a `fsb.DirEntry`."""
    sres = conn.stat(path)

    tag = FtpReader.tag(sres)

    return fsb.DirEntry(name=os.path.basename(path),
                        path=path,
                        etag=tag,
                        st_mode=sres.st_mode,
                        st_size=sres.st_size,
                        # Fall back to the modification time when the stat
                        # result carries no creation time.
                        st_ctime=sres.st_ctime or sres.st_mtime,
                        st_mtime=sres.st_mtime)

  def stat(self, url):
    """Return the `fsb.DirEntry` describing `url`."""
    conn, purl = self._parse_url(url)

    return self._stat(conn, purl.path)

  def list(self, url):
    """Yield a `fsb.DirEntry` for every name listed in the `url` folder."""
    conn, purl = self._parse_url(url)

    for name in conn.listdir(purl.path):
      # Remote paths are joined with the FTP host path rules, instead of
      # the ones of the local platform (os.path).
      path = conn.path.join(purl.path, name)

      yield self._stat(conn, path)

  def open(self, url, mode, **kwargs):
    """Open `url` and return a file-like object.

    Read modes are served by the cache interface. Other modes return a
    `wbf.WritebackFile` over a local temporary file, pre-loaded with the
    remote content when the mode does not truncate and the file exists.
    The result is wrapped in `io.TextIOWrapper` for text modes.
    """
    conn, purl = self._parse_url(url)

    if self.read_mode(mode):
      reader, meta = self._make_reader(conn, purl)
      cfile = self._cache_iface.open(url, meta, reader, **kwargs)

      return io.TextIOWrapper(cfile) if self.text_mode(mode) else cfile
    else:
      writeback_fn = functools.partial(self._upload_file, url)
      if not self.truncate_mode(mode) and conn.path.exists(purl.path):
        url_file = self._download_file(url)
        self.seek_stream(mode, url_file)
      else:
        url_file = tempfile.TemporaryFile()

      wbfile = wbf.WritebackFile(url_file, writeback_fn)

      return io.TextIOWrapper(wbfile) if self.text_mode(mode) else wbfile

  def _upload_file(self, url, stream):
    """Upload the whole content of `stream` (rewound first) to `url`."""
    conn, purl = self._parse_url(url)

    stream.seek(0)
    with conn.open(purl.path, mode='wb') as dest_fd:
      conn.copyfileobj(stream, dest_fd)

  def _download_file(self, url):
    """Download `url` into a temporary file and return the open file."""
    conn, purl = self._parse_url(url)

    with cm.Wrapper(tempfile.TemporaryFile()) as ftmp:
      with conn.open(purl.path, mode='rb') as src_fd:
        conn.copyfileobj(src_fd, ftmp.v)

      return ftmp.detach()

  def put_file(self, url, data_gen):
    """Write every chunk produced by `data_gen` to the remote `url`."""
    conn, purl = self._parse_url(url)

    with conn.open(purl.path, mode='wb') as fd:
      for data in data_gen:
        fd.write(data)

  def get_file(self, url):
    """Yield the content of the remote `url` in chunks."""
    conn, purl = self._parse_url(url)

    with conn.open(purl.path, mode='rb') as fd:
      for data in fsu.enum_chunks(fd):
        yield data

  def as_local(self, url, **kwargs):
    """Delegate to the cache interface `as_local()` for `url`."""
    conn, purl = self._parse_url(url)
    reader, meta = self._make_reader(conn, purl)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    """Emulate a link with a copy."""
    # There is no link support in FTP.
    self.copyfile(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    """Same as `link()`."""
    self.link(src_url, dest_url)
239
+
240
+
241
+ FILE_SYSTEMS = (FtpFs,)
242
+
@@ -0,0 +1,196 @@
1
+ import functools
2
+ import hashlib
3
+ import io
4
+ import os
5
+ import tempfile
6
+ import urllib.parse as uparse
7
+
8
+ from .. import alog
9
+ from .. import assert_checks as tas
10
+ from .. import context_managers as cm
11
+ from .. import fs_base as fsb
12
+ from .. import fs_utils as fsu
13
+ from .. import cached_file as chf
14
+ from .. import gcs_fs as gcs
15
+ from .. import object_cache as objc
16
+ from .. import osfd
17
+ from .. import writeback_file as wbf
18
+
19
+
20
class CacheHandler(objc.Handler):
  """Object-cache handler creating `gcs.GcsFs` instances on demand.

  The constructor arguments are recorded as they are, and replayed to
  `gcs.GcsFs` each time `create()` is called.
  """

  def __init__(self, *args, **kwargs):
    super().__init__()
    self._args, self._kwargs = args, kwargs

  def create(self):
    """Instantiate a `gcs.GcsFs` with the recorded arguments."""
    args, kwargs = self._args, self._kwargs

    return gcs.GcsFs(*args, **kwargs)
29
+
30
+
31
class GcsReader:
  """Reader fetching GCS object data (whole, or in blocks) into local files."""

  def __init__(self, fs, path, sres):
    self._fs = fs
    self._path = path
    self._sres = sres

  @classmethod
  def tag(cls, sres):
    """Return `sres.etag`, or a tag built from size and mtime when missing."""
    return sres.etag or chf.make_tag(size=sres.st_size, mtime=sres.st_mtime)

  def support_blocks(self):
    """Tell the cache layer that partial (block) reads are available."""
    return True

  @staticmethod
  def _write_all(fd, data):
    """Write the whole `data` buffer to `fd`, and return the bytes written.

    `os.write()` is allowed to write fewer bytes than requested, so the call
    is repeated on the remaining part until everything has been written.
    """
    view = memoryview(data)
    written = 0
    while written < len(view):
      written += os.write(fd, view[written:])

    return written

  def read_block(self, bpath, offset, size):
    """Store the requested data into the `bpath` file and return its size.

    When `offset` is `chf.CachedBlockFile.WHOLE_OFFSET` the whole object is
    downloaded, otherwise up to `size` bytes starting at `offset` are read,
    clamped to the object size recorded in the stat result.
    """
    if offset != chf.CachedBlockFile.WHOLE_OFFSET:
      size = min(size, self._sres.st_size - offset)
      data = self._fs.pread(self._path, offset, size)

      with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
        self._write_all(wfd, data)

      return len(data)
    else:
      with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
        for data in self._fs.download(self._path):
          self._write_all(wfd, data)

      return os.path.getsize(bpath)
60
+
61
+
62
class GcsFs(fsb.FsBase):
  """File system backend for `gcs://` URLs.

  The URL host name selects the bucket, whose `gcs.GcsFs` object is obtained
  from the object cache, and the URL path (without leading slashes) is the
  path within the bucket.
  """

  ID = 'gcs'
  IDS = (ID,)

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)

  def _get_fs(self, bucket):
    """Fetch the per-bucket file system object from the object cache."""
    bucket_handler = CacheHandler(bucket)
    cache_key = ('GCSFS', bucket)

    return objc.cache().get(cache_key, bucket_handler)

  def _parse_url(self, url):
    """Split `url` into the bucket file system and the parsed URL.

    The returned parsed URL has the leading slashes dropped from its path.
    """
    parsed = uparse.urlparse(url)
    relative = parsed._replace(path=parsed.path.lstrip('/'))

    return self._get_fs(relative.hostname), relative

  def _make_reader(self, fs, purl):
    """Create the `(reader, meta)` pair used by the cache interface."""
    sres = fs.stat(purl.path)
    tas.check_is_not_none(sres, msg=f'File does not exist: {purl.geturl()}')

    meta = chf.Meta(size=sres.st_size,
                    mtime=sres.st_mtime,
                    tag=GcsReader.tag(sres))

    return GcsReader(fs, purl.path, sres), meta

  def _parse_samefs(self, src_url, dest_url):
    """Parse both URLs, checking that they refer to the same bucket."""
    src = self._parse_url(src_url)
    dest = self._parse_url(dest_url)

    tas.check_eq(src[0].bucket, dest[0].bucket,
                 msg=f'Source and destination URL must be on the same bucket: ' \
                 f'{src_url} vs. {dest_url}')

    return src, dest

  def _copy(self, src_url, dest_url):
    """Copy an object within the same bucket."""
    (fs, from_purl), (_, to_purl) = self._parse_samefs(src_url, dest_url)

    fs.copy(from_purl.path, to_purl.path)

  def remove(self, url):
    """Remove the object pointed by `url`."""
    fs, purl = self._parse_url(url)
    fs.remove(purl.path)

  def rename(self, src_url, dest_url):
    """Rename an object within the same bucket."""
    (fs, from_purl), (_, to_purl) = self._parse_samefs(src_url, dest_url)

    fs.rename(from_purl.path, to_purl.path)

  def mkdir(self, url, mode=None):
    """No operation for this backend."""
    pass

  def makedirs(self, url, mode=None, exist_ok=None):
    """No operation for this backend."""
    pass

  def rmdir(self, url):
    """No operation for this backend."""
    pass

  def rmtree(self, url, ignore_errors=None):
    """Recursively remove everything below the `url` path."""
    fs, purl = self._parse_url(url)

    fs.rmtree(purl.path, ignore_errors=ignore_errors or False)

  def stat(self, url):
    """Return the bucket file system stat result for `url`."""
    fs, purl = self._parse_url(url)

    return fs.stat(purl.path)

  def list(self, url):
    """Return the bucket file system listing for the `url` path."""
    fs, purl = self._parse_url(url)

    return fs.listdir(purl.path)

  def open(self, url, mode, **kwargs):
    """Open `url` and return a file-like object.

    Read modes are served by the cache interface. The other modes are backed
    by a local temporary file, uploaded by a `wbf.WritebackFile`, which holds
    the current object content when the mode does not truncate and the object
    exists. Text modes get an `io.TextIOWrapper` on top.
    """
    fs, purl = self._parse_url(url)

    if self.read_mode(mode):
      reader, meta = self._make_reader(fs, purl)
      stream = self._cache_iface.open(url, meta, reader, **kwargs)
    else:
      writeback_fn = functools.partial(self._upload_file, url)
      if self.truncate_mode(mode) or not fs.exists(purl.path):
        backing = tempfile.TemporaryFile()
      else:
        backing = self._download_file(url)
        self.seek_stream(mode, backing)

      stream = wbf.WritebackFile(backing, writeback_fn)

    if self.text_mode(mode):
      return io.TextIOWrapper(stream)

    return stream

  def _upload_file(self, url, stream):
    """Rewind `stream` and upload its content to `url`."""
    stream.seek(0)
    self.put_file(url, fsu.enum_chunks(stream))

  def _download_file(self, url):
    """Download `url` into a temporary file, and return the open file."""
    with cm.Wrapper(tempfile.TemporaryFile()) as tmp:
      for chunk in self.get_file(url):
        tmp.v.write(chunk)

      return tmp.detach()

  def put_file(self, url, data_gen):
    """Upload the chunks produced by `data_gen` to `url`."""
    fs, purl = self._parse_url(url)

    fs.upload(purl.path, data_gen)

  def get_file(self, url):
    """Return the bucket file system download result for `url`."""
    fs, purl = self._parse_url(url)

    return fs.download(purl.path)

  def as_local(self, url, **kwargs):
    """Delegate to the cache interface `as_local()` for `url`."""
    fs, purl = self._parse_url(url)
    reader, meta = self._make_reader(fs, purl)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    """Emulate a link with a same-bucket copy."""
    self._copy(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    """Same as `link()`."""
    self.link(src_url, dest_url)
193
+
194
+
195
+ FILE_SYSTEMS = (GcsFs,)
196
+
@@ -0,0 +1,241 @@
1
+ import functools
2
+ import hashlib
3
+ import io
4
+ import mimetypes
5
+ import os
6
+ import re
7
+ import requests
8
+ import stat as st
9
+ import tempfile
10
+
11
+ import bs4
12
+
13
+ from .. import alog
14
+ from .. import assert_checks as tas
15
+ from .. import context_managers as cm
16
+ from .. import fs_base as fsb
17
+ from .. import fs_utils as fsu
18
+ from .. import cached_file as chf
19
+ from .. import http_utils as hu
20
+ from .. import osfd
21
+ from .. import writeback_file as wbf
22
+
23
+
24
class HttpReader:
  """Reader fetching HTTP content (whole, or by byte ranges) into local files.

  Block reads are enabled only when the HEAD response reports a content
  length and `hu.support_ranges()` accepts its headers.
  """

  def __init__(self, url, session=None, head=None, req_kwargs=None, chunk_size=None):
    """Create the reader.

    Args:
      url: The URL to read from.
      session: Optional `requests` session (a new one is created if None).
      head: Optional HEAD response for `url` (fetched with `hu.info()` if None).
      req_kwargs: Optional keyword arguments added to the issued requests.
      chunk_size: Chunk size for streamed downloads (16 MiB if not specified).
    """
    session = session if session is not None else requests.Session()
    req_kwargs = req_kwargs or dict()
    if head is None:
      head = hu.info(url, mod=session, **req_kwargs)

    allow_ranges = hu.support_ranges(head.headers)

    self._url = url
    self._session = session
    self._req_kwargs = req_kwargs
    self._chunk_size = chunk_size or 16 * 1024**2
    self._size = hu.content_length(head.headers)
    self._support_blocks = self._size is not None and allow_ranges

  @classmethod
  def tag(cls, head):
    """Return the ETag of `head`, or a tag built from length and mtime."""
    tag = hu.etag(head.headers)
    if tag is None:
      mtime = hu.last_modified(head.headers)
      length = hu.content_length(head.headers)
      tag = chf.make_tag(size=length, mtime=mtime)

    return tag

  def support_blocks(self):
    """Return whether partial (block) reads are available for the URL."""
    return self._support_blocks

  @staticmethod
  def _write_all(fd, data):
    """Write the whole `data` buffer to `fd`, and return the bytes written.

    `os.write()` is allowed to write fewer bytes than requested, so the call
    is repeated on the remaining part until everything has been written.
    """
    view = memoryview(data)
    written = 0
    while written < len(view):
      written += os.write(fd, view[written:])

    return written

  def read_block(self, bpath, offset, size):
    """Store the requested data into the `bpath` file and return its size.

    A ranged request is issued when blocks are supported and `offset` is not
    `chf.CachedBlockFile.WHOLE_OFFSET`, otherwise the whole content is
    streamed.

    Raises:
      requests.exceptions.HTTPError: If the server replies with an error.
    """
    if self._support_blocks and offset != chf.CachedBlockFile.WHOLE_OFFSET:
      size = min(size, self._size - offset)

      # Start from all the configured request arguments (timeout, auth, ...)
      # and only override the headers with a copy carrying the range.
      req_kwargs = dict(self._req_kwargs)
      headers = dict(req_kwargs.get('headers') or ())
      hu.add_range(headers, offset, offset + size)
      req_kwargs['headers'] = headers

      resp = self._session.get(self._url, **req_kwargs)
      resp.raise_for_status()
      data = hu.range_data(offset, offset + size - 1, resp.headers, resp.content)

      # The read-only block file is created only once the data is available,
      # so that a failed request does not leave an empty file behind.
      with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
        self._write_all(wfd, data)

      return len(data)
    else:
      resp = self._session.get(self._url, stream=True, **self._req_kwargs)
      try:
        resp.raise_for_status()
        with osfd.OsFd(bpath, os.O_CREAT | os.O_TRUNC | os.O_WRONLY, mode=0o440) as wfd:
          for data in resp.iter_content(chunk_size=self._chunk_size):
            self._write_all(wfd, data)
      finally:
        # Release the streamed connection even if the download fails midway.
        resp.close()

      return os.path.getsize(bpath)
77
+
78
+
79
class HttpFs(fsb.FsBase):
  """File system backend for `http://` and `https://` URLs.

  All the requests are issued through a single `requests.Session`, using the
  request arguments extracted from the constructor keyword arguments by
  `hu.filter_request_args()`.
  """

  mimetypes.init()

  ID = 'http'
  IDS = (ID, 'https')

  def __init__(self, cache_iface=None, **kwargs):
    super().__init__(cache_iface=cache_iface, **kwargs)
    self._req_kwargs = hu.filter_request_args(kwargs)
    self._session = requests.Session()

  def _exists(self, url):
    """Return False if `hu.info()` raises an HTTP error for `url`."""
    try:
      hu.info(url, mod=self._session, **self._req_kwargs)
    except requests.exceptions.HTTPError:
      return False

    return True

  def _make_reader(self, url):
    """Create the `(reader, meta)` pair used by the cache interface."""
    head = hu.info(url, mod=self._session, **self._req_kwargs)

    tag = HttpReader.tag(head)
    size = hu.content_length(head.headers)
    mtime = hu.last_modified(head.headers)
    meta = chf.Meta(size=size, mtime=mtime, tag=tag)
    reader = HttpReader(url,
                        session=self._session,
                        head=head,
                        req_kwargs=self._req_kwargs)

    return reader, meta

  def stat(self, url):
    """Return a `fsb.DirEntry` built from the HEAD information of `url`."""
    head = hu.info(url, mod=self._session, **self._req_kwargs)

    length = hu.content_length(head.headers)
    mtime = hu.last_modified(head.headers)
    tag = hu.etag(head.headers) or chf.make_tag(size=length, mtime=mtime)

    # HTML pages have content, but can also be listed (for HREF linked from it).
    # hence the weird st.S_IFREG | st.S_IFDIR.
    return fsb.DirEntry(name=os.path.basename(url.rstrip('/')),
                        path=url,
                        etag=tag,
                        st_mode=st.S_IFREG | st.S_IFDIR,
                        st_size=length,
                        st_ctime=mtime,
                        st_mtime=mtime)

  def open(self, url, mode, **kwargs):
    """Open `url` and return a file-like object.

    Read modes are served by the cache interface. The other modes are backed
    by a local temporary file, uploaded by a `wbf.WritebackFile`, which holds
    the current URL content when the mode does not truncate and the URL
    exists. Text modes get an `io.TextIOWrapper` on top.
    """
    if self.read_mode(mode):
      reader, meta = self._make_reader(url)
      cfile = self._cache_iface.open(url, meta, reader, **kwargs)

      return io.TextIOWrapper(cfile) if self.text_mode(mode) else cfile
    else:
      writeback_fn = functools.partial(self._upload_file, url)
      if not self.truncate_mode(mode) and self._exists(url):
        url_file = self._download_file(url)
        self.seek_stream(mode, url_file)
      else:
        url_file = tempfile.TemporaryFile()

      wbfile = wbf.WritebackFile(url_file, writeback_fn)

      return io.TextIOWrapper(wbfile) if self.text_mode(mode) else wbfile

  def remove(self, url):
    """Issue an HTTP DELETE for `url`.

    Raises:
      requests.exceptions.HTTPError: If the server replies with an error.
    """
    resp = self._session.delete(url, **self._req_kwargs)
    resp.raise_for_status()

  def rename(self, src_url, dest_url):
    """Emulate a rename with download, upload and removal of the source.

    The source is removed only after the upload completed without errors.
    """
    # There is no "rename" in HTTP ...
    with self._download_file(src_url) as fd:
      self._upload_file(dest_url, fd)

    self.remove(src_url)

  def mkdir(self, url, mode=None):
    """No operation for this backend."""
    pass

  def makedirs(self, url, mode=None, exist_ok=None):
    """No operation for this backend."""
    pass

  def rmdir(self, url):
    """No operation for this backend."""
    pass

  def rmtree(self, url, ignore_errors=None):
    """No operation for this backend."""
    pass

  def list(self, url):
    """Yield a `fsb.DirEntry` for every stat-able link of the `url` page.

    The HREF of the page anchors are resolved using `url` as base folder,
    skipping the ones carrying an explicit `scheme://`. Links which cannot
    be stat-ed are logged at debug level and skipped.
    """
    import urllib.parse as uparse

    resp = self._session.get(url, **self._req_kwargs)
    resp.raise_for_status()

    html_parser = bs4.BeautifulSoup(resp.text, 'html.parser')

    # The URL is handled as a folder, so relative links end up below it,
    # while absolute path links get resolved against the URL host.
    base_url = url if url.endswith('/') else url + '/'

    for link in html_parser.find_all('a'):
      href = link.get('href')
      if not href or re.match(r'[a-zA-Z]+://', href):
        continue

      lurl = uparse.urljoin(base_url, href)
      try:
        de = self.stat(lurl)
      except Exception as ex:
        alog.debug(f'Unable to stat URL {lurl}: {ex}')
        continue

      # Yield outside of the try block, so that exceptions thrown into the
      # generator by its consumer are not swallowed as stat failures.
      yield de

  def _upload_data_gen(self, url, data_gen):
    """PUT the chunks produced by `data_gen` to `url`.

    Content type and encoding headers are guessed from the URL, when possible.

    Raises:
      requests.exceptions.HTTPError: If the server replies with an error.
    """
    ctype, cencoding = mimetypes.guess_type(url, strict=False)

    headers = self._req_kwargs.get('headers', dict()).copy()
    if ctype is not None:
      headers[hu.CONTENT_TYPE] = ctype
    if cencoding is not None:
      headers[hu.CONTENT_ENCODING] = cencoding

    resp = self._session.put(url, headers=headers, data=data_gen)
    # A rejected upload must not pass silently, as callers like rename()
    # remove the source right after it.
    resp.raise_for_status()

  def _upload_file(self, url, stream):
    """Rewind `stream` and upload its content to `url`."""
    stream.seek(0)
    self._upload_data_gen(url, fsu.enum_chunks(stream))

  def _iterate_chunks(self, url, chunk_size=None):
    """Yield the content of `url` in chunks (16 MiB if not specified)."""
    chunk_size = chunk_size or 16 * 1024**2

    resp = self._session.get(url, stream=True, **self._req_kwargs)
    resp.raise_for_status()

    for data in resp.iter_content(chunk_size=chunk_size):
      yield data

  def _download_file(self, url, chunk_size=None):
    """Download `url` into a temporary file, and return the open file."""
    with cm.Wrapper(tempfile.TemporaryFile()) as ftmp:
      for data in self._iterate_chunks(url, chunk_size=chunk_size):
        ftmp.v.write(data)

      return ftmp.detach()

  def put_file(self, url, data_gen):
    """Upload the chunks produced by `data_gen` to `url`."""
    self._upload_data_gen(url, data_gen)

  def get_file(self, url):
    """Yield the content of `url` in chunks."""
    for data in self._iterate_chunks(url):
      yield data

  def as_local(self, url, **kwargs):
    """Delegate to the cache interface `as_local()` for `url`."""
    reader, meta = self._make_reader(url)

    return self._cache_iface.as_local(url, meta, reader, **kwargs)

  def link(self, src_url, dest_url):
    """Emulate a link with a copy."""
    # There is no link support in HTTP.
    self.copyfile(src_url, dest_url)

  def symlink(self, src_url, dest_url):
    """Same as `link()`."""
    self.link(src_url, dest_url)
238
+
239
+
240
+ FILE_SYSTEMS = (HttpFs,)
241
+