python-misc-utils 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_misc_utils/__init__.py +0 -0
- py_misc_utils/abs_timeout.py +12 -0
- py_misc_utils/alog.py +311 -0
- py_misc_utils/app_main.py +179 -0
- py_misc_utils/archive_streamer.py +112 -0
- py_misc_utils/assert_checks.py +118 -0
- py_misc_utils/ast_utils.py +121 -0
- py_misc_utils/async_manager.py +189 -0
- py_misc_utils/break_control.py +63 -0
- py_misc_utils/buffered_iterator.py +35 -0
- py_misc_utils/cached_file.py +507 -0
- py_misc_utils/call_limiter.py +26 -0
- py_misc_utils/call_result_selector.py +13 -0
- py_misc_utils/cleanups.py +85 -0
- py_misc_utils/cmd.py +97 -0
- py_misc_utils/compression.py +116 -0
- py_misc_utils/cond_waiter.py +13 -0
- py_misc_utils/context_base.py +18 -0
- py_misc_utils/context_managers.py +67 -0
- py_misc_utils/core_utils.py +577 -0
- py_misc_utils/daemon_process.py +252 -0
- py_misc_utils/data_cache.py +46 -0
- py_misc_utils/date_utils.py +90 -0
- py_misc_utils/debug.py +24 -0
- py_misc_utils/dyn_modules.py +50 -0
- py_misc_utils/dynamod.py +103 -0
- py_misc_utils/env_config.py +35 -0
- py_misc_utils/executor.py +239 -0
- py_misc_utils/file_overwrite.py +29 -0
- py_misc_utils/fin_wrap.py +77 -0
- py_misc_utils/fp_utils.py +47 -0
- py_misc_utils/fs/__init__.py +0 -0
- py_misc_utils/fs/file_fs.py +127 -0
- py_misc_utils/fs/ftp_fs.py +242 -0
- py_misc_utils/fs/gcs_fs.py +196 -0
- py_misc_utils/fs/http_fs.py +241 -0
- py_misc_utils/fs/s3_fs.py +417 -0
- py_misc_utils/fs_base.py +133 -0
- py_misc_utils/fs_utils.py +207 -0
- py_misc_utils/gcs_fs.py +169 -0
- py_misc_utils/gen_indices.py +54 -0
- py_misc_utils/gfs.py +371 -0
- py_misc_utils/git_repo.py +77 -0
- py_misc_utils/global_namespace.py +110 -0
- py_misc_utils/http_async_fetcher.py +139 -0
- py_misc_utils/http_server.py +196 -0
- py_misc_utils/http_utils.py +143 -0
- py_misc_utils/img_utils.py +20 -0
- py_misc_utils/infix_op.py +20 -0
- py_misc_utils/inspect_utils.py +205 -0
- py_misc_utils/iostream.py +21 -0
- py_misc_utils/iter_file.py +117 -0
- py_misc_utils/key_wrap.py +46 -0
- py_misc_utils/lazy_import.py +25 -0
- py_misc_utils/lockfile.py +164 -0
- py_misc_utils/mem_size.py +64 -0
- py_misc_utils/mirror_from.py +72 -0
- py_misc_utils/mmap.py +16 -0
- py_misc_utils/module_utils.py +196 -0
- py_misc_utils/moving_average.py +19 -0
- py_misc_utils/msgpack_streamer.py +26 -0
- py_misc_utils/multi_wait.py +24 -0
- py_misc_utils/multiprocessing.py +102 -0
- py_misc_utils/named_array.py +224 -0
- py_misc_utils/no_break.py +46 -0
- py_misc_utils/no_except.py +32 -0
- py_misc_utils/np_ml_framework.py +184 -0
- py_misc_utils/np_utils.py +346 -0
- py_misc_utils/ntuple_utils.py +38 -0
- py_misc_utils/num_utils.py +54 -0
- py_misc_utils/obj.py +73 -0
- py_misc_utils/object_cache.py +100 -0
- py_misc_utils/object_tracker.py +88 -0
- py_misc_utils/ordered_set.py +71 -0
- py_misc_utils/osfd.py +27 -0
- py_misc_utils/packet.py +22 -0
- py_misc_utils/parquet_streamer.py +69 -0
- py_misc_utils/pd_utils.py +254 -0
- py_misc_utils/periodic_task.py +61 -0
- py_misc_utils/pickle_wrap.py +121 -0
- py_misc_utils/pipeline.py +98 -0
- py_misc_utils/remap_pickle.py +50 -0
- py_misc_utils/resource_manager.py +155 -0
- py_misc_utils/rnd_utils.py +56 -0
- py_misc_utils/run_once.py +19 -0
- py_misc_utils/scheduler.py +135 -0
- py_misc_utils/select_params.py +300 -0
- py_misc_utils/signal.py +141 -0
- py_misc_utils/skl_utils.py +270 -0
- py_misc_utils/split.py +147 -0
- py_misc_utils/state.py +53 -0
- py_misc_utils/std_module.py +56 -0
- py_misc_utils/stream_dataframe.py +176 -0
- py_misc_utils/streamed_file.py +144 -0
- py_misc_utils/tempdir.py +79 -0
- py_misc_utils/template_replace.py +51 -0
- py_misc_utils/tensor_stream.py +269 -0
- py_misc_utils/thread_context.py +33 -0
- py_misc_utils/throttle.py +30 -0
- py_misc_utils/time_trigger.py +18 -0
- py_misc_utils/timegen.py +11 -0
- py_misc_utils/traceback.py +49 -0
- py_misc_utils/tracking_executor.py +91 -0
- py_misc_utils/transform_array.py +42 -0
- py_misc_utils/uncompress.py +35 -0
- py_misc_utils/url_fetcher.py +157 -0
- py_misc_utils/utils.py +538 -0
- py_misc_utils/varint.py +50 -0
- py_misc_utils/virt_array.py +52 -0
- py_misc_utils/weak_call.py +33 -0
- py_misc_utils/work_results.py +100 -0
- py_misc_utils/writeback_file.py +43 -0
- python_misc_utils-0.2.dist-info/METADATA +36 -0
- python_misc_utils-0.2.dist-info/RECORD +117 -0
- python_misc_utils-0.2.dist-info/WHEEL +5 -0
- python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
- python_misc_utils-0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import datetime
|
|
3
|
+
import functools
|
|
4
|
+
import hashlib
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import time
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
from . import alog
|
|
12
|
+
from . import assert_checks as tas
|
|
13
|
+
from . import core_utils as cu
|
|
14
|
+
from . import file_overwrite as fow
|
|
15
|
+
from . import fin_wrap as fw
|
|
16
|
+
from . import fs_utils as fsu
|
|
17
|
+
from . import lockfile as lockf
|
|
18
|
+
from . import no_except as nox
|
|
19
|
+
from . import obj
|
|
20
|
+
from . import osfd
|
|
21
|
+
from . import tempdir as tmpd
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Book-keeping record for a block file slated for removal during purge_blocks():
# name is the block file name, sres its os.stat() result, cid the content ID
# parsed from the name, and offset the block offset (WHOLE_OFFSET for whole-file
# blocks).
_DroppedBlock = collections.namedtuple('DroppedBlock', 'name, sres, cid, offset')
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Meta(obj.Obj):
  """Attribute-bag (see obj.Obj) holding cache metadata.

  Fields used by this module include: url, size, block_size, tag and cid
  (see CachedBlockFile.default_meta() / prepare_meta()).
  """
  pass
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CachedBlockFile:
  """Disk-backed block cache for a (possibly remote) file.

  Layout under `path`:

    META      YAML dump of the cache metadata (url, size, cid, block_size, ...)
    blocks/   files named 'block-<cid>-<offset>', plus 'block-<cid>' holding
              the whole content when it was fetched in one shot
    links/    hard links exposing the whole-content block under the URL
              basename, for callers needing a plain local file path

  Missing blocks are fetched on demand through the `reader` object, which must
  provide read_block(path, offset, size) and support_blocks().
  """

  METAFILE = 'META'
  BLOCKSDIR = 'blocks'
  LINKSDIR = 'links'
  # Sentinel offset identifying the single block holding the whole content.
  WHOLE_OFFSET = -1
  # Number of hex digits of the SHA1(tag) digest used as content ID.
  CID_SIZE = 16
  # Default block size (32 MiB).
  BLOCKSIZE = 32 * 1024**2

  def __init__(self, path, reader, meta=None, close_fn=None):
    self._path = path
    self._reader = reader
    self._close_fn = close_fn
    self.meta = self.load_meta(path) if meta is None else meta

  @classmethod
  def default_meta(cls):
    """Return a fresh Meta populated with default values."""
    return Meta(url=None, size=None, block_size=cls.BLOCKSIZE)

  @classmethod
  def prepare_meta(cls, meta, **kwargs):
    """Merge `meta` and `kwargs` over the defaults and derive the content ID.

    The content ID (cid) is a truncated SHA1 of the meta `tag`, so caches
    built from the same tag share block files.
    """
    cmeta = cls.default_meta()
    cmeta.update_from(meta)
    cmeta.update(**kwargs)

    cid = hashlib.sha1(cmeta.tag.encode()).hexdigest()[: cls.CID_SIZE]
    cmeta.update(cid=cid)

    return cmeta

  @classmethod
  def remove(cls, path):
    """Best-effort removal of the cache tree at `path`; True on success."""
    try:
      fsu.safe_rmtree(path, ignore_errors=True)

      return True
    except Exception:
      # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
      # not swallowed.
      return False

  @classmethod
  def create(cls, path, meta):
    """Create a new cache tree at `path`.

    The tree is built inside a temporary folder and atomically renamed into
    place, so concurrent creators never observe a partially built cache.
    """
    tpath = fsu.temp_path(nspath=path)
    try:
      os.makedirs(tpath, exist_ok=True)
      os.mkdir(cls.blocks_dir(tpath))
      os.mkdir(cls.links_dir(tpath))

      cls.save_meta(tpath, meta)

      os.rename(tpath, path)
    except BaseException:
      # Leave no half-built temporary tree behind, then re-raise.
      shutil.rmtree(tpath, ignore_errors=True)
      raise

  def _fblock_path(self, offset):
    # Path of the block file holding data at `offset` for the current cid.
    return self.fblock_path(self._path, self.meta.cid, offset)

  def _fetch_block(self, offset):
    """Ensure the block at `offset` is on disk; return (size, path).

    A per-block lock file serializes fetches across processes; the block is
    downloaded into a temporary file and atomically moved into place.
    """
    bpath = self._fblock_path(offset)
    with lockf.LockFile(bpath):
      if (sres := fsu.stat(bpath)) is None:
        tpath = fsu.temp_path(nspath=bpath)
        try:
          rsize = self._reader.read_block(tpath, offset, self.meta.block_size)
          if rsize > 0:
            os.replace(tpath, bpath)
            if offset == self.WHOLE_OFFSET:
              self._make_link(bpath)
        except BaseException:
          fsu.maybe_remove(tpath)
          raise
      else:
        rsize = sres.st_size

    return rsize, bpath

  def _make_link(self, bpath):
    # Expose the whole-content block as links/<cid>/<basename(url)> so that
    # cacheall() can hand out a stable local file path.
    lpath = self.local_link()
    if not os.path.exists(lpath):
      try:
        os.makedirs(os.path.dirname(lpath), exist_ok=True)
        os.link(bpath, lpath)
        os.chmod(lpath, 0o444)
      except Exception as ex:
        # Best-effort: hard links can fail (e.g. across devices). The caught
        # exception is now included in the message (it was dropped before).
        alog.warning(f'Unable to create link: {bpath} -> {lpath} ({ex})')

  def _try_block(self, boffset, offset):
    """Read up to block_size bytes at `offset` within block `boffset`.

    Returns None when the block file does not exist yet.
    """
    bpath = self._fblock_path(boffset)
    try:
      with osfd.OsFd(bpath, os.O_RDONLY) as fd:
        sres = os.stat(fd)
        if sres.st_size >= offset:
          os.lseek(fd, offset, os.SEEK_SET)
          size = min(self.meta.block_size, sres.st_size - offset)

          return os.read(fd, size)
    except FileNotFoundError:
      pass

  def _translate_offset(self, offset):
    """Map a file offset to (block_offset, offset_within_block)."""
    has_whole_content = True
    if self._reader.support_blocks():
      # Even if the reader supports blocks, we might have cached the whole content
      # at once, so make sure we do not waste the cached whole content.
      bpath = self._fblock_path(self.WHOLE_OFFSET)
      has_whole_content = os.path.exists(bpath)

    if has_whole_content:
      boffset = self.WHOLE_OFFSET
    else:
      boffset, offset = offset, 0

    return boffset, offset

  def close(self):
    if self._close_fn is not None:
      self._close_fn()
      self._close_fn = None

  def cacheall(self):
    """Fetch the whole content as one block; return its local link path."""
    size, bpath = self._fetch_block(self.WHOLE_OFFSET)

    return self.local_link() if size > 0 else None

  def read_block(self, offset):
    """Return the cached data at `offset` (must be block-size aligned)."""
    tas.check_eq(offset % self.meta.block_size, 0,
                 msg=f'Block offset ({offset}) must be multiple of {self.meta.block_size}')

    boffset, offset = self._translate_offset(offset)

    data = self._try_block(boffset, offset)
    if data is None:
      # Block not on disk yet: fetch it and retry the local read.
      read_size, _ = self._fetch_block(boffset)
      if read_size > 0:
        data = self._try_block(boffset, offset)

    return data

  def size(self):
    """Return the content size, fetching (and persisting) it when unknown."""
    size = self.meta.size
    if size is None:
      size, _ = self._fetch_block(self.WHOLE_OFFSET)
      meta = self.meta.clone(size=size)
      self.save_meta(self._path, meta)
      self.meta = meta

    return size

  def locked(self):
    """Return a lock file object guarding this cache tree."""
    return lockf.LockFile(self._path)

  def local_link(self):
    """Return the links/ path of the whole-content hard link."""
    return self.flink_path(self._path, self.meta.cid, self.meta.url)

  @classmethod
  def blocks_dir(cls, path):
    return os.path.join(path, cls.BLOCKSDIR)

  @classmethod
  def links_dir(cls, path):
    return os.path.join(path, cls.LINKSDIR)

  @classmethod
  def fblock_path(cls, path, cid, offset):
    # Whole-content blocks (offset < 0) carry no offset suffix.
    block_id = f'block-{cid}-{offset}' if offset >= 0 else f'block-{cid}'

    return os.path.join(cls.blocks_dir(path), block_id)

  @classmethod
  def parse_block_file(cls, fname):
    """Parse a block file name into (cid, offset); None if not a block file."""
    m = re.match(r'block\-([^\-]+)(\-(\d+))?$', fname)
    if m:
      offset = m.group(3)
      offset = int(offset) if offset is not None else cls.WHOLE_OFFSET

      return m.group(1), offset

  @classmethod
  def flink_path(cls, path, cid, url):
    lpath = os.path.join(cls.links_dir(path), cid)

    return os.path.join(lpath, os.path.basename(url))

  @classmethod
  def purge_blocks(cls, path, max_age=None):
    """Drop block files whose cid no longer matches the current metadata.

    Stale blocks are removed only once older than `max_age` seconds (default
    from the GFS_CACHE_MAXAGE environment variable, 300s). Returns the Meta.
    """
    meta = cls.load_meta(path)

    bpath = cls.blocks_dir(path)
    dropped = []
    with os.scandir(bpath) as sdit:
      for dentry in sdit:
        if dentry.is_file():
          pbf = cls.parse_block_file(dentry.name)
          if pbf is not None:
            cid, offset = pbf
            if cid != meta.cid:
              dropped.append(_DroppedBlock(name=dentry.name,
                                           sres=dentry.stat(),
                                           cid=cid,
                                           offset=offset))

    max_age = max_age or int(os.getenv('GFS_CACHE_MAXAGE', 300))
    for dblock in dropped:
      if (time.time() - dblock.sres.st_mtime) > max_age:
        try:
          alog.info(f'Removing block file {dblock.name} from {path} ({meta})')
          os.remove(os.path.join(bpath, dblock.name))
        except Exception as ex:
          # Message fixed: used to read 'file from {name} from {path}'.
          alog.warning(f'Unable to purge block file {dblock.name} from {path}: {ex}')

        # Also drop the links/ folder of the stale cid (best-effort).
        lpath = cls.flink_path(path, dblock.cid, meta.url)
        nox.qno_except(fsu.safe_rmtree, os.path.dirname(lpath), ignore_errors=True)

    return meta

  @classmethod
  def fmeta_path(cls, path):
    return os.path.join(path, cls.METAFILE)

  @classmethod
  def save_meta(cls, path, meta):
    """Atomically write `meta` as YAML into the META file."""
    mpath = cls.fmeta_path(path)
    with fow.FileOverwrite(mpath) as fd:
      yaml.dump(meta.as_dict(), fd, default_flow_style=False)

  @classmethod
  def load_meta(cls, path):
    """Load and return the Meta stored in the META file at `path`."""
    mpath = cls.fmeta_path(path)
    with open(mpath, mode='r') as fd:
      meta = yaml.safe_load(fd)

    return Meta(**meta)

  @classmethod
  def validate(cls, path):
    """Return the Meta at `path`, or None when missing or unreadable."""
    try:
      return cls.load_meta(path)
    except Exception:
      # Narrowed from a bare `except:`; any load failure means "invalid".
      pass
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class CachedFile:
  """Read-only, seekable binary file object served from a CachedBlockFile.

  Data is read in cache-block units; a single block (as a memoryview) is kept
  as the current buffer and refilled on demand.
  """

  def __init__(self, cbf, block_size=None):
    # fin_wrap() ties the cbf lifetime to this object, calling cbf.close()
    # at finalization time.
    fw.fin_wrap(self, 'cbf', cbf, finfn=cbf.close)
    self._block_size = block_size or cbf.meta.block_size
    self._offset = 0
    self._block_start = 0
    self._block = None

  def close(self):
    cbf = self.cbf
    if cbf is not None:
      fw.fin_wrap(self, 'cbf', None, cleanup=True)

  @property
  def closed(self):
    return self.cbf is None

  def seek(self, pos, whence=os.SEEK_SET):
    """Move the read position; returns the new absolute offset."""
    if whence == os.SEEK_SET:
      offset = pos
    elif whence == os.SEEK_CUR:
      offset = self._offset + pos
    elif whence == os.SEEK_END:
      offset = self.cbf.size() + pos
    else:
      alog.xraise(ValueError, f'Invalid seek mode: {whence}')

    tas.check_le(offset, self.cbf.size(), msg=f'Offset out of range')
    tas.check_ge(offset, 0, msg=f'Offset out of range')

    self._offset = offset

    return offset

  def tell(self):
    return self._offset

  def _ensure_buffer(self, offset):
    # Make sure self._block covers `offset`; returns the position of `offset`
    # relative to the buffered block start.
    boffset = offset - self._block_start
    if self._block is None or boffset < 0 or boffset >= len(self._block):
      block_offset = (offset // self._block_size) * self._block_size

      self._block = memoryview(self.cbf.read_block(block_offset))
      self._block_start = block_offset
      boffset = offset - block_offset

    return boffset

  def _max_size(self, size):
    # Clamp a user-requested size (negative meaning "all") to what is left.
    available = self.cbf.size() - self._offset

    return available if size < 0 else min(size, available)

  def read(self, size=-1):
    """Read and return up to `size` bytes (all remaining when negative)."""
    rsize = self._max_size(size)

    parts = []
    while rsize > 0:
      boffset = self._ensure_buffer(self._offset)

      csize = min(rsize, len(self._block) - boffset)
      parts.append(self._block[boffset: boffset + csize])
      self._offset += csize
      rsize -= csize

    return b''.join(parts)

  def read1(self, size=-1):
    return self.read(size=size)

  def peek(self, size=0):
    """Return up to `size` buffered bytes without moving the read position."""
    if size > 0:
      boffset = self._ensure_buffer(self._offset)
      csize = min(size, len(self._block) - boffset)

      return self._block[boffset: boffset + csize].tobytes()

    return b''

  def readline(self, size=-1):
    """Read up to (and including) the next newline, or at most `size` bytes."""
    rsize = self._max_size(size)

    parts = []
    while rsize > 0:
      boffset = self._ensure_buffer(self._offset)

      csize = min(rsize, len(self._block) - boffset)
      cdata = self._block[boffset: boffset + csize]

      pos = bytes(cdata).find(b'\n')
      if pos >= 0:
        parts.append(cdata[: pos + 1])
        self._offset += pos + 1
        break
      else:
        # Bug fix: chunks without a newline must be accumulated too, or a
        # line spanning multiple blocks would lose its leading data.
        parts.append(cdata)
        self._offset += csize
        rsize -= csize

    return b''.join(parts)

  def flush(self):
    # Read-only stream: nothing to flush.
    pass

  def readable(self):
    return not self.closed

  def seekable(self):
    return not self.closed

  def writable(self):
    return False

  def __enter__(self):
    return self

  def __exit__(self, *exc):
    self.close()

    return False
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
class CacheInterface:
  """Factory building CachedFile objects rooted at a common cache directory."""

  def __init__(self, cache_dir):
    self._cache_dir = cache_dir

  def _open(self, cfpath, url, meta, reader, close_fn=None, **kwargs):
    # Serialize cache creation and metadata updates across processes.
    with lockf.LockFile(cfpath):
      meta = CachedBlockFile.prepare_meta(meta, url=url)
      xmeta = CachedBlockFile.validate(cfpath)
      if xmeta is None:
        CachedBlockFile.create(cfpath, meta)
      elif xmeta.cid != meta.cid:
        alog.debug(f'Updating meta of {cfpath}: {xmeta} -> {meta}')
        CachedBlockFile.save_meta(cfpath, meta)

    return CachedFile(CachedBlockFile(cfpath, reader, meta=meta, close_fn=close_fn))

  def open(self, url, meta, reader, **kwargs):
    """Open `url` through the cache, fetching blocks with `reader`."""
    uncached = kwargs.pop('uncached', False)
    if not uncached:
      cfpath = _get_cache_path(self._cache_dir, url)
      close_fn = None
    else:
      # Uncached mode stores blocks in a throw-away folder removed on close.
      tmp_path = tmpd.create()
      cfpath = _get_cache_path(tmp_path, url)
      close_fn = functools.partial(fsu.safe_rmtree, tmp_path, ignore_errors=True)

    return self._open(cfpath, url, meta, reader, close_fn=close_fn, **kwargs)

  def as_local(self, url, meta, reader, **kwargs):
    """Materialize `url` as a fully cached local file and return its path."""
    cfile = self.open(url, meta, reader, **kwargs)

    local_path = cfile.cbf.cacheall()
    tas.check_is_not_none(local_path, msg=f'Unable to materialize a local path: {url}')

    return local_path
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _get_cache_path(cache_dir, url):
|
|
433
|
+
uhash = hashlib.sha1(url.encode()).hexdigest()
|
|
434
|
+
|
|
435
|
+
return os.path.join(cache_dir, uhash)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# Per-cache-folder stats gathered by cleanup_cache(): folder path, META file
# mtime (used as a recency proxy), total disk usage of the folder, and the
# loaded Meta.
_CacheFileStats = collections.namedtuple(
    'CacheFileStats', 'path, mtime, size, meta',
)
|
|
441
|
+
|
|
442
|
+
def cleanup_cache(cache_dir, max_age=None, max_size=None):
  """Purge stale blocks from every cache folder and trim total disk usage.

  Args:
    cache_dir: Root folder containing one sub-folder per cached file.
    max_age: Minimum age (seconds) a stale block must have before removal
      (forwarded to CachedBlockFile.purge_blocks()).
    max_size: Maximum total cache size in bytes; defaults to the
      GFS_CACHE_MAXSIZE environment variable (16 GiB when unset).
  """
  alog.verbose(f'Cache cleanup running: {cache_dir}')

  if os.path.isdir(cache_dir):
    cache_files = []
    with os.scandir(cache_dir) as sdit:
      for dentry in sdit:
        if dentry.is_dir():
          cfpath = os.path.join(cache_dir, dentry.name)
          # Hold the per-cache lock so purging does not race with readers.
          with lockf.LockFile(cfpath):
            try:
              meta = CachedBlockFile.purge_blocks(cfpath, max_age=max_age)

              cfsize = fsu.du(cfpath)
              # META mtime acts as a last-use timestamp for this cache entry.
              sres = os.stat(CachedBlockFile.fmeta_path(cfpath))
              cache_files.append(_CacheFileStats(path=cfpath,
                                                 mtime=sres.st_mtime,
                                                 size=cfsize,
                                                 meta=meta))
            except Exception as ex:
              alog.warning(f'Unable to purge blocks from {cfpath}: {ex}')

    # Most recently used first, so the trim loop below drops the oldest
    # entries once the running total exceeds the budget.
    cache_files = sorted(cache_files, key=lambda cfs: cfs.mtime, reverse=True)
    max_size = max_size or int(os.getenv('GFS_CACHE_MAXSIZE', 16 * 1024**3))

    cache_size = 0
    for cfs in cache_files:
      cache_size += cfs.size
      if cache_size >= max_size:
        alog.info(f'Dropping cache for {cfs.meta.url} stored at {cfs.path}')
        with lockf.LockFile(cfs.path):
          CachedBlockFile.remove(cfs.path)

    alog.debug0(f'Cache size was {cu.size_str(cache_size)} (size will be trimmed ' \
                f'to {cu.size_str(max_size)})')
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def make_tag(**kwargs):
  """Build a stable SHA1 hex tag from the 'key=value' rendering of `kwargs`."""
  parts = [f'{key}={value}' for key, value in kwargs.items()]

  return hashlib.sha1(','.join(parts).encode()).hexdigest()
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
# Minimum number of seconds between automatic cache cleanups (default 8 hours).
_CLEANUP_PERIOD = int(os.getenv('GFS_CACHE_CLEANUP_PERIOD', 8 * 3600))
|
|
486
|
+
|
|
487
|
+
def _cleanup_check(path):
  """Run cleanup_cache(path) if the last cleanup is older than _CLEANUP_PERIOD.

  Returns `path` unchanged, so callers can use it as a pass-through.
  """
  # Timestamp file recording when the last cleanup ran for this cache root.
  lpath = os.path.join(path, '.last_cleanup')
  if (sres := fsu.stat(lpath)) is None:
    # No timestamp yet: clean up only if the cache folder already exists.
    do_cleanup = os.path.isdir(path)
  else:
    do_cleanup = time.time() > sres.st_mtime + _CLEANUP_PERIOD

  if do_cleanup:
    alog.debug(f'Triggering cache cleanup: {path}')
    cleanup_cache(path)
    with open(lpath, mode='w') as fd:
      fd.write(datetime.datetime.now().isoformat(timespec='microseconds'))

  return path
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def get_cache_dir(path):
  """Return the 'gfs' cache folder under `path`, triggering periodic cleanup."""
  return _cleanup_check(os.path.join(fsu.normpath(path), 'gfs'))
|
|
507
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
|
|
3
|
+
from . import traceback as tb
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Guards _TB across threads.
_LOCK = threading.Lock()
# Maps a call site (filename, lineno) to the number of times trigger() fired
# for it.
_TB = dict()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def trigger(filename, count):
  """Return True when the caller's call site has fired fewer than `count` times.

  The call site is the first frame after `filename` (see tb.get_frame_after());
  when no such frame can be resolved, the call is always allowed.
  """
  frame = tb.get_frame_after(filename)
  if frame is not None:
    # Bug fix: this local used to be named `tb`, which shadowed the traceback
    # module import and made the tb.get_frame_after() call above fail with
    # UnboundLocalError.
    site = frame.f_code.co_filename, frame.f_lineno
    with _LOCK:
      c = _TB.get(site, 0)
      _TB[site] = c + 1

    return count > c

  return True
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def limit_call(count, fn, *args, _filename=None, **kwargs):
  """Invoke `fn(*args, **kwargs)` only while its call site stays under `count`
  invocations; returns None once the limit is reached."""
  target = _filename or __file__
  if not trigger(target, count):
    return None

  return fn(*args, **kwargs)
|
|
26
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import collections
|
|
3
|
+
import threading
|
|
4
|
+
|
|
5
|
+
from . import alog
|
|
6
|
+
from . import global_namespace as gns
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
_Cleaner = collections.namedtuple('Cleaner', 'fn, args, kwargs')
|
|
10
|
+
|
|
11
|
+
class _Cleanups:
|
|
12
|
+
|
|
13
|
+
def __init__(self):
|
|
14
|
+
self._lock = threading.Lock()
|
|
15
|
+
self._nextid = 0
|
|
16
|
+
self._cleaners = dict()
|
|
17
|
+
|
|
18
|
+
# The run() API is called from a "finally" clause of the multiprocessing module,
|
|
19
|
+
# which is the preferred path since we know eveything is up at that time. But we
|
|
20
|
+
# also register an atexit callback for cases (like child prcesses) which do not
|
|
21
|
+
# end up going out the multiprocessing path (although every child process using
|
|
22
|
+
# this library should be created with the multiprocessing.create_process() API).
|
|
23
|
+
atexit.register(self.run)
|
|
24
|
+
|
|
25
|
+
def register(self, fn, *args, **kwargs):
|
|
26
|
+
with self._lock:
|
|
27
|
+
cid = self._nextid
|
|
28
|
+
self._cleaners[cid] = _Cleaner(fn=fn, args=args, kwargs=kwargs)
|
|
29
|
+
self._nextid += 1
|
|
30
|
+
|
|
31
|
+
return cid
|
|
32
|
+
|
|
33
|
+
def unregister(self, cid, run=False):
|
|
34
|
+
with self._lock:
|
|
35
|
+
cleaner = self._cleaners.pop(cid, None)
|
|
36
|
+
|
|
37
|
+
if cleaner is not None and run:
|
|
38
|
+
self._run_cleaner(fn, args, kwargs)
|
|
39
|
+
|
|
40
|
+
return cleaner
|
|
41
|
+
|
|
42
|
+
def _run_cleaner(self, cleaner):
|
|
43
|
+
try:
|
|
44
|
+
cleaner.fn(*cleaner.args, **cleaner.kwargs)
|
|
45
|
+
except Exception as ex:
|
|
46
|
+
alog.exception(ex, exmsg=f'Exception while running cleanups')
|
|
47
|
+
|
|
48
|
+
def run(self):
|
|
49
|
+
with self._lock:
|
|
50
|
+
cleaners = self._cleaners
|
|
51
|
+
self._cleaners = dict()
|
|
52
|
+
|
|
53
|
+
# Sort by reverse ID, which is reverse register order.
|
|
54
|
+
cids = sorted(cleaners.keys(), reverse=True)
|
|
55
|
+
|
|
56
|
+
for cleaner in (cleaners[cid] for cid in cids):
|
|
57
|
+
self._run_cleaner(cleaner)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Process-wide cleanups registry, stored in the global namespace. The
# fork_init=True flag suggests the registry is re-created in forked children
# rather than shared with the parent (see global_namespace) -- TODO confirm.
_CLEANUPS = gns.Var(f'{__name__}.CLEANUPS',
                    fork_init=True,
                    defval=lambda: _Cleanups())


def _cleanups():
  # Fetch (lazily creating) the per-process _Cleanups instance.
  return gns.get(_CLEANUPS)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def register(fn, *args, **kwargs):
  """Register `fn(*args, **kwargs)` with the global cleanups registry.

  Returns the registration ID, usable with unregister().
  """
  cleanups = _cleanups()

  return cleanups.register(fn, *args, **kwargs)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Decorator style registration.
def reg(fn):
  """Decorator registering `fn` (called with no arguments) as a cleanup."""
  register(fn)

  return fn
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def unregister(cid, run=False):
  """Drop the cleanup registered under `cid`, optionally running it first."""
  cleanups = _cleanups()

  return cleanups.unregister(cid, run=run)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def run():
  """Execute and clear every cleanup in the global registry."""
  cleanups = _cleanups()
  cleanups.run()
|
|
85
|
+
|