python-misc-utils 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_misc_utils/__init__.py +0 -0
- py_misc_utils/abs_timeout.py +12 -0
- py_misc_utils/alog.py +311 -0
- py_misc_utils/app_main.py +179 -0
- py_misc_utils/archive_streamer.py +112 -0
- py_misc_utils/assert_checks.py +118 -0
- py_misc_utils/ast_utils.py +121 -0
- py_misc_utils/async_manager.py +189 -0
- py_misc_utils/break_control.py +63 -0
- py_misc_utils/buffered_iterator.py +35 -0
- py_misc_utils/cached_file.py +507 -0
- py_misc_utils/call_limiter.py +26 -0
- py_misc_utils/call_result_selector.py +13 -0
- py_misc_utils/cleanups.py +85 -0
- py_misc_utils/cmd.py +97 -0
- py_misc_utils/compression.py +116 -0
- py_misc_utils/cond_waiter.py +13 -0
- py_misc_utils/context_base.py +18 -0
- py_misc_utils/context_managers.py +67 -0
- py_misc_utils/core_utils.py +577 -0
- py_misc_utils/daemon_process.py +252 -0
- py_misc_utils/data_cache.py +46 -0
- py_misc_utils/date_utils.py +90 -0
- py_misc_utils/debug.py +24 -0
- py_misc_utils/dyn_modules.py +50 -0
- py_misc_utils/dynamod.py +103 -0
- py_misc_utils/env_config.py +35 -0
- py_misc_utils/executor.py +239 -0
- py_misc_utils/file_overwrite.py +29 -0
- py_misc_utils/fin_wrap.py +77 -0
- py_misc_utils/fp_utils.py +47 -0
- py_misc_utils/fs/__init__.py +0 -0
- py_misc_utils/fs/file_fs.py +127 -0
- py_misc_utils/fs/ftp_fs.py +242 -0
- py_misc_utils/fs/gcs_fs.py +196 -0
- py_misc_utils/fs/http_fs.py +241 -0
- py_misc_utils/fs/s3_fs.py +417 -0
- py_misc_utils/fs_base.py +133 -0
- py_misc_utils/fs_utils.py +207 -0
- py_misc_utils/gcs_fs.py +169 -0
- py_misc_utils/gen_indices.py +54 -0
- py_misc_utils/gfs.py +371 -0
- py_misc_utils/git_repo.py +77 -0
- py_misc_utils/global_namespace.py +110 -0
- py_misc_utils/http_async_fetcher.py +139 -0
- py_misc_utils/http_server.py +196 -0
- py_misc_utils/http_utils.py +143 -0
- py_misc_utils/img_utils.py +20 -0
- py_misc_utils/infix_op.py +20 -0
- py_misc_utils/inspect_utils.py +205 -0
- py_misc_utils/iostream.py +21 -0
- py_misc_utils/iter_file.py +117 -0
- py_misc_utils/key_wrap.py +46 -0
- py_misc_utils/lazy_import.py +25 -0
- py_misc_utils/lockfile.py +164 -0
- py_misc_utils/mem_size.py +64 -0
- py_misc_utils/mirror_from.py +72 -0
- py_misc_utils/mmap.py +16 -0
- py_misc_utils/module_utils.py +196 -0
- py_misc_utils/moving_average.py +19 -0
- py_misc_utils/msgpack_streamer.py +26 -0
- py_misc_utils/multi_wait.py +24 -0
- py_misc_utils/multiprocessing.py +102 -0
- py_misc_utils/named_array.py +224 -0
- py_misc_utils/no_break.py +46 -0
- py_misc_utils/no_except.py +32 -0
- py_misc_utils/np_ml_framework.py +184 -0
- py_misc_utils/np_utils.py +346 -0
- py_misc_utils/ntuple_utils.py +38 -0
- py_misc_utils/num_utils.py +54 -0
- py_misc_utils/obj.py +73 -0
- py_misc_utils/object_cache.py +100 -0
- py_misc_utils/object_tracker.py +88 -0
- py_misc_utils/ordered_set.py +71 -0
- py_misc_utils/osfd.py +27 -0
- py_misc_utils/packet.py +22 -0
- py_misc_utils/parquet_streamer.py +69 -0
- py_misc_utils/pd_utils.py +254 -0
- py_misc_utils/periodic_task.py +61 -0
- py_misc_utils/pickle_wrap.py +121 -0
- py_misc_utils/pipeline.py +98 -0
- py_misc_utils/remap_pickle.py +50 -0
- py_misc_utils/resource_manager.py +155 -0
- py_misc_utils/rnd_utils.py +56 -0
- py_misc_utils/run_once.py +19 -0
- py_misc_utils/scheduler.py +135 -0
- py_misc_utils/select_params.py +300 -0
- py_misc_utils/signal.py +141 -0
- py_misc_utils/skl_utils.py +270 -0
- py_misc_utils/split.py +147 -0
- py_misc_utils/state.py +53 -0
- py_misc_utils/std_module.py +56 -0
- py_misc_utils/stream_dataframe.py +176 -0
- py_misc_utils/streamed_file.py +144 -0
- py_misc_utils/tempdir.py +79 -0
- py_misc_utils/template_replace.py +51 -0
- py_misc_utils/tensor_stream.py +269 -0
- py_misc_utils/thread_context.py +33 -0
- py_misc_utils/throttle.py +30 -0
- py_misc_utils/time_trigger.py +18 -0
- py_misc_utils/timegen.py +11 -0
- py_misc_utils/traceback.py +49 -0
- py_misc_utils/tracking_executor.py +91 -0
- py_misc_utils/transform_array.py +42 -0
- py_misc_utils/uncompress.py +35 -0
- py_misc_utils/url_fetcher.py +157 -0
- py_misc_utils/utils.py +538 -0
- py_misc_utils/varint.py +50 -0
- py_misc_utils/virt_array.py +52 -0
- py_misc_utils/weak_call.py +33 -0
- py_misc_utils/work_results.py +100 -0
- py_misc_utils/writeback_file.py +43 -0
- python_misc_utils-0.2.dist-info/METADATA +36 -0
- python_misc_utils-0.2.dist-info/RECORD +117 -0
- python_misc_utils-0.2.dist-info/WHEEL +5 -0
- python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
- python_misc_utils-0.2.dist-info/top_level.txt +1 -0
py_misc_utils/split.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from . import assert_checks as tas
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _Skipper:
|
|
8
|
+
|
|
9
|
+
def __init__(self, quote_rx):
|
|
10
|
+
self.quote_rx = quote_rx
|
|
11
|
+
self.next_pos = 0
|
|
12
|
+
|
|
13
|
+
def skip(self, data, pos):
|
|
14
|
+
next_pos = self.next_pos - pos
|
|
15
|
+
if next_pos <= 0:
|
|
16
|
+
m = re.search(self.quote_rx, data)
|
|
17
|
+
next_pos = m.start() if m else len(data)
|
|
18
|
+
self.next_pos = pos + next_pos
|
|
19
|
+
|
|
20
|
+
return next_pos
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _chars_regex(chars):
|
|
24
|
+
rexs = bytearray(b'[\\')
|
|
25
|
+
for c in sorted(chars):
|
|
26
|
+
rexs.extend((ord('\\'), c))
|
|
27
|
+
|
|
28
|
+
rexs.append(ord(']'))
|
|
29
|
+
|
|
30
|
+
return re.compile(bytes(rexs))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _specials_regex(qmap):
  """Regex matching any opening or closing quote character in `qmap`."""
  specials = set(qmap.keys()) | set(qmap.values())

  return _chars_regex(specials)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _split_forward(data, pos, split_rx, skipper, seq):
|
|
38
|
+
pdata = data[pos:]
|
|
39
|
+
|
|
40
|
+
xm = re.search(split_rx, pdata)
|
|
41
|
+
if xm:
|
|
42
|
+
seq_pos, next_pos = xm.start(), xm.end()
|
|
43
|
+
else:
|
|
44
|
+
seq_pos = next_pos = len(pdata)
|
|
45
|
+
|
|
46
|
+
skip_pos = skipper.skip(pdata, pos)
|
|
47
|
+
if skip_pos < seq_pos:
|
|
48
|
+
seq_pos = next_pos = skip_pos
|
|
49
|
+
xm = None
|
|
50
|
+
|
|
51
|
+
seq.extend(pdata[: seq_pos])
|
|
52
|
+
|
|
53
|
+
return pos + next_pos, xm is not None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Byte-level quote configuration used by split():
#   map        - open-quote byte -> close-quote byte
#   quote_rx   - regex matching any open-quote byte (plus backslash)
#   quote_sprx - regex matching any open or close quote byte (plus backslash)
SplitContext = collections.namedtuple('SplitContext', 'map, quote_rx, quote_sprx')


def make_context(quote_map):
  # Build a SplitContext from a str->str mapping of open->close quote chars.
  bmap = {ord(k): ord(v) for k, v in quote_map.items()}

  return SplitContext(map=bmap,
                      quote_rx=_chars_regex(bmap.keys()),
                      quote_sprx=_specials_regex(bmap))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _to_bytes(data, split_rx):
|
|
67
|
+
if isinstance(data, str):
|
|
68
|
+
data = data.encode()
|
|
69
|
+
if isinstance(split_rx, str):
|
|
70
|
+
split_rx = split_rx.encode()
|
|
71
|
+
|
|
72
|
+
split_rx = re.compile(split_rx) if isinstance(split_rx, bytes) else split_rx
|
|
73
|
+
|
|
74
|
+
return memoryview(data), split_rx
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Default open -> close quote pairs recognized by split()/unquote().
_QUOTE_MAP = {
  '"': '"',
  "'": "'",
  '`': '`',
  '(': ')',
  '{': '}',
  '[': ']',
  '<': '>',
}
# Pre-built byte-level context for the default quote map.
_QUOTE_CTX = make_context(_QUOTE_MAP)

# Stack entry for an open quote: its closing byte, the position where it was
# opened, and whether further quotes may nest inside it (asymmetric pairs
# only, since for symmetric quotes the next occurrence is the closer).
_Quote = collections.namedtuple('Quote', 'closec, pos, nest_ok')
|
|
89
|
+
|
|
90
|
+
def split(data, split_rx, quote_ctx=None):
  """Split `data` on `split_rx` matches, honoring (possibly nested) quotes.

  Separator matches inside quoted regions are ignored, and a backslash
  protects the byte that follows it. Both str and bytes inputs are
  supported; the returned tuple matches the input type.

  Args:
    data: The str/bytes data to be split.
    split_rx: The separator regex (str, bytes, or compiled bytes pattern).
    quote_ctx: Optional SplitContext from make_context() (defaults to the
      one built from _QUOTE_MAP).

  Returns:
    A tuple with the split parts (str parts for str input, bytearray parts
    for bytes input).
  """
  qctx = quote_ctx or _QUOTE_CTX

  bdata, bsplit_rx = _to_bytes(data, split_rx)
  skipper = _Skipper(qctx.quote_rx)

  sval = ord('\\')
  pos, qstack, parts, seq = 0, [], [], bytearray()
  while pos < len(bdata):
    if seq and seq[-1] == sval:
      # Previous byte was a backslash: consume this byte verbatim.
      seq.append(bdata[pos])
      pos += 1
    elif qstack:
      # Inside a quoted region: scan only for quote open/close characters.
      m = re.search(qctx.quote_sprx, bdata[pos:])
      if not m:
        break

      seq.extend(bdata[pos: pos + m.start()])
      pos += m.start()
      c = bdata[pos]
      tq = qstack[-1]
      if c == tq.closec:
        qstack.pop()
      elif tq.nest_ok and (cc := qctx.map.get(c)) is not None:
        # New open-quotes nest only inside asymmetric quote pairs.
        qstack.append(_Quote(cc, pos, c != cc))
      seq.append(c)
      pos += 1
    else:
      # Unquoted region: advance to the next separator or open-quote.
      kpos, is_split = _split_forward(bdata, pos, bsplit_rx, skipper, seq)
      if is_split:
        parts.append(seq)
        seq = bytearray()
      elif kpos < len(bdata):
        c = bdata[kpos]
        if (cc := qctx.map.get(c)) is not None:
          qstack.append(_Quote(cc, kpos, c != cc))
          seq.append(c)
          kpos += 1
      pos = max(kpos, pos + 1)

  tas.check_eq(len(qstack), 0, msg=f'Unmatched quotes during split: "{data}"\n {qstack}')
  if seq or parts:
    parts.append(seq)

  return tuple(p.decode() for p in parts) if isinstance(data, str) else tuple(parts)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def unquote(data, quote_map=None):
  """Strip one level of surrounding quotes from `data`, if present.

  When the first character is an opening quote from `quote_map` (default
  `_QUOTE_MAP`) and the last character is its matching closing quote, the
  pair is removed. For symmetric quotes (e.g. `"` or `'`) backslash
  escapes of that quote inside the string are also unescaped.

  Args:
    data: The string to unquote.
    quote_map: Optional open->close quote mapping (defaults to _QUOTE_MAP).

  Returns:
    The unquoted string, or `data` unchanged when it is not quoted.
  """
  if len(data) >= 2:
    quote_map = quote_map or _QUOTE_MAP
    cc = quote_map.get(data[0])
    if cc == data[-1]:
      udata = data[1: -1]

      # Unescape \Q -> Q only for symmetric quotes. A plain str.replace()
      # is used instead of re.sub() so that regex-special quote characters
      # coming from custom quote maps cannot break (or silently corrupt)
      # the substitution pattern.
      return udata.replace(f'\\{cc}', cc) if cc == data[0] else udata

  return data
|
|
147
|
+
|
py_misc_utils/state.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
|
|
3
|
+
from . import gfs
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
_STATE_KEY = '__SB_STATE__'
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _kname(cls, name):
|
|
10
|
+
return f'{cls.__name__}.{name}'
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StateBase:
  """Mixin giving objects hook points for pickle-friendly state snapshots.

  Subclasses may override _get_state()/_set_state() to massage the state
  dictionary on save/load, and use _store_state()/_load_state() to keep
  per-class values under class-namespaced keys.
  """

  def _get_state(self, state):
    # Hook: transform the state dict before it is serialized.
    return state

  def _set_state(self, state):
    # Hook: install a previously captured state dict onto the instance.
    self.__dict__.update(state)

  def _store_state(self, cls, **kwargs):
    # Record `kwargs` under class-namespaced keys in the instance side-store.
    sdict = getattr(self, _STATE_KEY, None)
    if sdict is None:
      sdict = {}
      setattr(self, _STATE_KEY, sdict)

    sdict.update((_kname(cls, k), v) for k, v in kwargs.items())

  def _load_state(self, cls, state, name, defval=None):
    # Fetch a class-namespaced value from a captured state dict.
    sdict = state.get(_STATE_KEY)
    if sdict is None:
      return defval

    return sdict.get(_kname(cls, name), defval)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_state(obj, path):
  """Pickle `obj`'s state (via its _get_state() hook) to `path`."""
  # Work on a copy: the _get_state() call chain may mutate the dict.
  snapshot = dict(obj.__dict__)
  state = obj._get_state(snapshot)
  with gfs.open(path, mode='wb') as sfd:
    pickle.dump(state, sfd)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def from_state(cls, path, **kwargs):
  """Unpickle a state dict from `path` and rebuild a `cls` instance.

  Extra keyword arguments override entries of the loaded state. The
  instance is created without running __init__; the state is installed
  through the _set_state() hook.
  """
  with gfs.open(path, mode='rb') as sfd:
    state = pickle.load(sfd)

  if kwargs:
    state.update(kwargs)

  obj = cls.__new__(cls)
  obj._set_state(state)

  return obj
|
|
53
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import importlib
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from . import core_utils as cu
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _module_origin(modname):
|
|
10
|
+
module = sys.modules.get(modname)
|
|
11
|
+
if module is None:
|
|
12
|
+
try:
|
|
13
|
+
module = importlib.import_module(modname)
|
|
14
|
+
except ModuleNotFoundError:
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
if module is not None:
|
|
18
|
+
path = getattr(module, '__file__', None)
|
|
19
|
+
if path is None:
|
|
20
|
+
spec = getattr(module, '__spec__', None)
|
|
21
|
+
path = spec.origin if spec is not None else None
|
|
22
|
+
|
|
23
|
+
return path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _module_libpath(modname):
  """Return the directory containing module `modname`, or None when the
  module is missing, built-in, or has no directory component."""
  origin = _module_origin(modname)
  if origin is None or origin == 'built-in':
    return None

  return os.path.dirname(origin) or None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Some of the standard modules. Should be enough to get coverage of the
# Python standard library path (there are more than one since some might
# turn "built-in" and not have a __file__ or __spec__).
_STDLIB_MODULES = (
  'abc',
  'copy',
  'io',
  'os',
  'pickle',
  'random',
  'string',
  'types',
)
# Directories holding the standard library, derived from the probe modules
# above (built-in modules yield None and are filtered out).
_STDLIB_PATHS = set(filter(lambda x: x is not None,
                           (_module_libpath(m) for m in _STDLIB_MODULES)))
|
|
49
|
+
|
|
50
|
+
@functools.cache
def is_std_module(modname):
  # True when `modname` (reduced to its root package) resolves inside one
  # of the detected standard-library directories, or has no file path at
  # all (built-in / unresolvable modules are treated as standard).
  modname = cu.root_module(modname)
  lib_path = _module_libpath(modname)

  return lib_path is None or lib_path in _STDLIB_PATHS
|
|
56
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import bisect
|
|
2
|
+
import collections
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from . import assert_checks as tas
|
|
8
|
+
from . import np_utils as npu
|
|
9
|
+
from . import tensor_stream as ts
|
|
10
|
+
from . import utils as ut
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Per-field write configuration; currently just the target NumPy dtype.
WriteField = collections.namedtuple('WriteField', ['dtype'])
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StreamDataWriter:
  """Writes named, typed columns to a tensor_stream file.

  `fields` is either a sequence of (name, dtype) pairs or a comma
  separated string of NAME=DTYPE entries.
  """

  def __init__(self, fields, path):
    self._writer = ts.Writer(path)
    self._fields = collections.OrderedDict()
    if isinstance(fields, str):
      parsed = tuple(tuple(ut.resplit(x, '=')) for x in ut.comma_split(fields))
    else:
      parsed = fields

    for name, dtype in parsed:
      self._fields[name] = WriteField(dtype=np.dtype(dtype))

  # Note that the tensors handed over to the write() API will become owned by
  # the StreamDataWriter object, and cannot be written over after the write
  # operation.
  def write(self, **kwargs):
    columns = []
    for field, wfield in self._fields.items():
      value = kwargs.get(field)
      tas.check_is_not_none(value, msg=f'Missing "{field}" data in write operation')

      # Coerce every column to the configured dtype.
      if isinstance(value, np.ndarray):
        if value.dtype != wfield.dtype:
          value = value.astype(wfield.dtype)
      else:
        value = np.array(value, dtype=wfield.dtype)

      columns.append(value)

    self._writer.write(*columns)

  def write_dataframe(self, df):
    # Pull each configured field out of the dataframe and forward to write().
    self.write(**{field: df[field].to_numpy() for field in self._fields})

  def flush(self):
    # The field layout is persisted as stream state so readers can recover it.
    self._writer.flush(state=dict(fields=self._fields))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class StreamDataReader:
  """Reads named, typed columns back from a tensor_stream file written by
  StreamDataWriter."""

  def __init__(self, path):
    self._reader = ts.Reader(path)
    # Field layout recovered from the stream state stored at flush() time.
    self._fields = self._reader.state['fields']
    self._fields_id = {name: i for i, name in enumerate(self._fields)}

  def __len__(self):
    # Number of records in the underlying stream.
    return len(self._reader)

  def fields(self):
    """Field names, in stream column order."""
    return tuple(self._fields)

  @property
  def dtype(self):
    """Per-field dtypes, in stream column order."""
    return tuple(f.dtype for f in self._fields.values())

  def get_slice(self, start, size=None):
    """Read `size` records starting at `start` for every field."""
    return collections.OrderedDict(
      (name, self._reader.get_slice(i, start, size=size))
      for i, name in enumerate(self._fields))

  def get_field_slice(self, field, start, size=None):
    """Read `size` records starting at `start` for a single field."""
    return self._reader.get_slice(self._fields_id[field], start, size=size)

  def typed_fields(self):
    """Tuple of (name, dtype) pairs, in stream column order."""
    return tuple((name, f.dtype) for name, f in self._fields.items())

  def empty_array(self, size):
    """Pre-allocate per-field buffers for `size` records (NumPy arrays for
    numeric dtypes, plain Python lists otherwise)."""
    rdata = collections.OrderedDict()
    for name, dtype in self.typed_fields():
      if npu.is_numeric(dtype):
        rdata[name] = np.empty(size, dtype=dtype)
      else:
        rdata[name] = [None] * size

    return rdata
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _compute_indices(reader, field, start=None, end=None, reverse=False):
  # Returns the record indices of `reader` ordered by the values of `field`,
  # optionally reversed, and optionally restricted to a start/end value range.
  fvalues = reader.get_field_slice(field, 0)
  indices = np.argsort(fvalues)
  if reverse:
    indices = np.flip(indices)

  if start is not None or end is not None:
    # Re-order the values to match `indices` so cut points can be found
    # with bisection.
    # NOTE(review): when reverse=True, fvalues[indices] is in descending
    # order while bisect assumes an ascending sequence, so the computed
    # cut points look suspect in that case (the swap below only papers
    # over inverted bounds) -- confirm intended semantics.
    fvalues = fvalues[indices]
    start_index = bisect.bisect(fvalues, start) if start is not None else 0
    end_index = bisect.bisect(fvalues, end) if end is not None else len(indices)
    if start_index > end_index:
      start_index, end_index = end_index, start_index

    indices = indices[start_index: end_index]

  return indices
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class StreamSortedScan:
  """Iterates a StreamDataReader in field-sorted order, fetching records
  through an LRU cache of fixed-size reader slices."""

  def __init__(self, reader, field,
               start=None,
               end=None,
               slice_size=None,
               max_slices=None,
               reverse=False):
    # Number of records per cached reader slice.
    self._slice_size = slice_size or 100000
    # Maximum number of slices kept in the LRU cache.
    self._max_slices = max_slices or 16
    self._reader = reader
    # sidx (slice start) -> slice data, in LRU order.
    self._slices = collections.OrderedDict()
    # Record indices in scan (value-sorted) order.
    self._indices = _compute_indices(reader, field, start=start, end=end, reverse=reverse)

  def _get_slice(self, idx):
    # Return (slice_data, offset) for record `idx`, loading and caching
    # the containing reader slice on miss.
    sidx = (idx // self._slice_size) * self._slice_size
    data = self._slices.get(sidx)
    if data is None:
      if len(self._slices) >= self._max_slices:
        # Evict the least recently used slice.
        self._slices.popitem(last=False)

      # NOTE(review): this clamps with len(self._indices), which is the
      # count of (possibly range-filtered) scan indices, not the reader
      # record count -- presumably len(self._reader) was intended; verify.
      slice_size = min(self._slice_size, len(self._indices) - sidx)

      data = self._reader.get_slice(sidx, size=slice_size)
      self._slices[sidx] = data
    else:
      self._slices.move_to_end(sidx)

    return data, idx - sidx

  def _as_numpy(self, rdata):
    # Convert every per-field buffer (NumPy array or list) to a NumPy array.
    return {field: np.array(data) for field, data in rdata.items()}

  def scan(self):
    # An empty array can contain fields which are Python lists, so _as_numpy()
    # is used when returning data to the caller.
    rdata = self._reader.empty_array(self._slice_size)
    widx = 0
    for idx in self._indices:
      if widx == self._slice_size:
        # Output buffer full: hand it out and start a fresh batch.
        yield widx, self._as_numpy(rdata)
        widx = 0

      sdata, sidx = self._get_slice(idx)
      for field, data in rdata.items():
        data[widx] = sdata[field][sidx]

      widx += 1

    if widx:
      # Flush the final, partially filled batch.
      frdata = collections.OrderedDict()
      for field, data in rdata.items():
        frdata[field] = data[: widx]

      yield widx, self._as_numpy(frdata)
|
|
176
|
+
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import threading
|
|
4
|
+
|
|
5
|
+
from . import alog
|
|
6
|
+
from . import assert_checks as tas
|
|
7
|
+
from . import fin_wrap as fw
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StreamedFile:
  """Read-only, seekable file-like view over a streaming (iterable) response.

  A background daemon thread drains `resp` into a spooled temporary file;
  readers block until enough data has arrived (or the stream completes).
  All shared state (_size, _completed, _closed, the temp file) is guarded
  by a single lock/condition pair.
  """

  def __init__(self, resp):
    self._resp = resp
    self._lock = threading.Lock()
    self._cond = threading.Condition(lock=self._lock)

    tmpfile = tempfile.TemporaryFile()
    # fin_wrap ties the temp file lifetime to this object, closing it at
    # finalization.
    fw.fin_wrap(self, '_tempfile', tmpfile, finfn=tmpfile.close)

    self._offset = 0
    self._size = 0
    self._completed = False
    self._closed = False
    self._thread = threading.Thread(target=self._stream, daemon=True)
    self._thread.start()

  def _stream(self):
    # Background thread: append every chunk of the response to the temp
    # file, waking any blocked readers after each write.
    for data in self._resp:
      with self._lock:
        self._tempfile.seek(self._size)
        self._tempfile.write(data)
        self._size += len(data)
        self._cond.notify_all()
        if self._closed:
          break

    with self._lock:
      self._completed = True
      self._cond.notify_all()

  def _wait_completed(self):
    # Block until the stream is fully drained (or the file is closed).
    with self._lock:
      while not (self._completed or self._closed):
        self._cond.wait()

  def close(self):
    # Signal the streamer to stop, wait for it to finish, then release the
    # temp file.
    with self._lock:
      self._closed = True
      while not self._completed:
        self._cond.wait()

    self._thread.join()

    with self._lock:
      # Note: the local name shadows the `tempfile` module inside this method.
      tempfile = self._tempfile
      if tempfile is not None:
        fw.fin_wrap(self, '_tempfile', None)

    if tempfile is not None:
      tempfile.close()

  @property
  def closed(self):
    return self._tempfile is None

  def seek(self, pos, whence=os.SEEK_SET):
    if whence == os.SEEK_SET:
      offset = pos
    elif whence == os.SEEK_CUR:
      offset = self._offset + pos
    elif whence == os.SEEK_END:
      # The end is only known once the stream has completed.
      self._wait_completed()
      offset = self._size + pos
    else:
      alog.xraise(ValueError, f'Invalid seek mode: {whence}')

    if offset > 0:
      # A positive target must lie within the final size, which requires
      # waiting for the stream to complete before validating.
      if whence != os.SEEK_END:
        self._wait_completed()
      tas.check_le(offset, self._size, msg=f'Offset out of range')

    tas.check_ge(offset, 0, msg=f'Offset out of range')

    self._offset = offset

    return offset

  def tell(self):
    return self._offset

  def _read(self, offset, size, adj_offset):
    # Must be called with self._lock held. Blocks until `size` bytes are
    # available at `offset` (or the stream completes/closes); a negative
    # size means "all remaining data".
    while not (self._completed or self._closed or
               (size >= 0 and self._size >= offset + size)):
      self._cond.wait()

    available = self._size - offset
    to_read = min(size, available) if size >= 0 else available
    if not self._closed and to_read > 0:
      self._tempfile.seek(offset)
      data = self._tempfile.read(to_read)
      if adj_offset:
        self._offset += len(data)
    else:
      data = b''

    return data

  def read(self, size=-1):
    with self._lock:
      return self._read(self._offset, size, True)

  def read1(self, size=-1):
    return self.read(size=size)

  def peek(self, size=0):
    # Return up to `size` buffered bytes without moving the file offset.
    with self._lock:
      size = min(size, max(1, self._size - self._offset))

      return self._read(self._offset, size, False) if size > 0 else b''

  def pread(self, offset, size):
    # Positional read: does not move the file offset.
    with self._lock:
      return self._read(offset, size, False)

  def flush(self):
    pass

  def readable(self):
    return not self.closed

  def seekable(self):
    return not self.closed

  def writable(self):
    return False

  def __enter__(self):
    return self

  def __exit__(self, *exc):
    self.close()

    return False
|
|
144
|
+
|
py_misc_utils/tempdir.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import tempfile
|
|
4
|
+
|
|
5
|
+
from . import cleanups
|
|
6
|
+
from . import global_namespace as gns
|
|
7
|
+
from . import rnd_utils as rngu
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _RootDir:
  """Owns a per-process temporary root directory, removed via cleanups."""

  def __init__(self):
    self._path = tempfile.mkdtemp()
    # Best-effort recursive removal of the whole tree at cleanup time.
    self._cid = cleanups.register(shutil.rmtree, self._path, ignore_errors=True)

  def create(self):
    # Create and return a fresh sub-directory under the managed root.
    return tempfile.mkdtemp(dir=self._path)

  def root(self):
    # Path of the managed root directory.
    return self._path
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Lazily-created root temporary directory, re-initialized on fork so child
# processes get their own root.
_ROOTDIR = gns.Var(f'{__name__}.ROOTDIR',
                   fork_init=True,
                   defval=lambda: _RootDir())

def _root_dir():
  # Fetch (creating on first use) the process-wide _RootDir instance.
  return gns.get(_ROOTDIR)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def create():
  """Create and return a new temporary directory under the managed root."""
  return _root_dir().create()


def get_temp_root():
  """Return the path of the managed temporary root directory."""
  return _root_dir().root()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _try_fastfs_dir(path):
|
|
40
|
+
if os.path.isdir(path):
|
|
41
|
+
fastfs_dir = os.path.join(path, 'fastfs')
|
|
42
|
+
try:
|
|
43
|
+
os.makedirs(fastfs_dir, exist_ok=True)
|
|
44
|
+
|
|
45
|
+
return fastfs_dir
|
|
46
|
+
except:
|
|
47
|
+
pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _find_fastfs_dir():
  """Pick the first usable fast-filesystem directory.

  Candidates, in priority order: $FASTFS_DIR, known POSIX tmpfs/ramfs
  mounts, the system temporary directory, and finally the current working
  directory.
  """
  candidates = []

  env_dir = os.getenv('FASTFS_DIR')
  if env_dir is not None:
    candidates.append(env_dir)

  if os.name == 'posix':
    # Try known tmpfs/ramfs places in case on Linux.
    candidates += [f'/run/user/{os.getuid()}', '/dev/shm']

  candidates += [tempfile.gettempdir(), os.getcwd()]

  for candidate in candidates:
    fastfs_dir = _try_fastfs_dir(candidate)
    if fastfs_dir is not None:
      return fastfs_dir
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Resolved at import time; directory used to host fast (tmpfs-like) folders.
_FASTFS_DIR = _find_fastfs_dir()
# Length of randomly generated folder names (override with $FASTFS_NAMELEN).
_NAMELEN = int(os.getenv('FASTFS_NAMELEN', 12))

def fastfs_dir(name=None, namelen=_NAMELEN):
  # Create (or reuse) a directory named `name` (random when omitted) inside
  # the fast filesystem root, returning its path.
  name = name or rngu.rand_string(namelen)

  path = os.path.join(_FASTFS_DIR, name)
  os.makedirs(path, exist_ok=True)

  return path
|
|
79
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import string
|
|
3
|
+
|
|
4
|
+
from . import alog
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _FnDict:
|
|
8
|
+
|
|
9
|
+
def __init__(self, lookup_fn):
|
|
10
|
+
self._lookup_fn = lookup_fn
|
|
11
|
+
|
|
12
|
+
def __getitem__(self, key):
|
|
13
|
+
m = re.match(r'([^:]+):(.*)', key)
|
|
14
|
+
if m:
|
|
15
|
+
lkey, defval = m.group(1), m.group(2)
|
|
16
|
+
else:
|
|
17
|
+
lkey, defval = key, None
|
|
18
|
+
|
|
19
|
+
return self._lookup_fn(lkey, defval=defval)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _dict_lookup_fn(vals, delim, misses_ok):
|
|
23
|
+
|
|
24
|
+
def lookup_fn(key, defval=None):
|
|
25
|
+
value = vals.get(key, defval)
|
|
26
|
+
if value is None:
|
|
27
|
+
if not misses_ok:
|
|
28
|
+
alog.xraise(KeyError, f'String template replace missing value for key: {key}')
|
|
29
|
+
else:
|
|
30
|
+
value = f'{delim}{key}'
|
|
31
|
+
|
|
32
|
+
return value
|
|
33
|
+
|
|
34
|
+
return lookup_fn
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def template_replace(st, vals=None, lookup_fn=None, delim=None, misses_ok=None):
  """Expand $-style template references within the string `st`.

  Args:
    st: The template string (string.Template syntax, extended so brace
      references may carry a default: ${NAME:DEFAULT}).
    vals: Optional dict of replacement values (used when `lookup_fn` is None).
    lookup_fn: Optional callable `(key, defval=...) -> value` resolving keys.
    delim: Template delimiter character (defaults to '$').
    misses_ok: When true, unresolved keys are left as-is instead of raising.

  Returns:
    The expanded string.
  """
  delim = delim or '$'
  misses_ok = False if misses_ok is None else misses_ok

  class Template(string.Template):

    # Allow for brace ID with the format ${ID:DEFAULT_VALUE}.
    braceidpattern = r'((?a:[_a-z][_a-z0-9]*)(:[^}]*)?)'
    delimiter = delim

  if lookup_fn is None:
    lookup_fn = _dict_lookup_fn(vals, delim, misses_ok)

  return Template(st).safe_substitute(_FnDict(lookup_fn))
|
|
51
|
+
|