python_misc_utils-0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_misc_utils/__init__.py +0 -0
- py_misc_utils/abs_timeout.py +12 -0
- py_misc_utils/alog.py +311 -0
- py_misc_utils/app_main.py +179 -0
- py_misc_utils/archive_streamer.py +112 -0
- py_misc_utils/assert_checks.py +118 -0
- py_misc_utils/ast_utils.py +121 -0
- py_misc_utils/async_manager.py +189 -0
- py_misc_utils/break_control.py +63 -0
- py_misc_utils/buffered_iterator.py +35 -0
- py_misc_utils/cached_file.py +507 -0
- py_misc_utils/call_limiter.py +26 -0
- py_misc_utils/call_result_selector.py +13 -0
- py_misc_utils/cleanups.py +85 -0
- py_misc_utils/cmd.py +97 -0
- py_misc_utils/compression.py +116 -0
- py_misc_utils/cond_waiter.py +13 -0
- py_misc_utils/context_base.py +18 -0
- py_misc_utils/context_managers.py +67 -0
- py_misc_utils/core_utils.py +577 -0
- py_misc_utils/daemon_process.py +252 -0
- py_misc_utils/data_cache.py +46 -0
- py_misc_utils/date_utils.py +90 -0
- py_misc_utils/debug.py +24 -0
- py_misc_utils/dyn_modules.py +50 -0
- py_misc_utils/dynamod.py +103 -0
- py_misc_utils/env_config.py +35 -0
- py_misc_utils/executor.py +239 -0
- py_misc_utils/file_overwrite.py +29 -0
- py_misc_utils/fin_wrap.py +77 -0
- py_misc_utils/fp_utils.py +47 -0
- py_misc_utils/fs/__init__.py +0 -0
- py_misc_utils/fs/file_fs.py +127 -0
- py_misc_utils/fs/ftp_fs.py +242 -0
- py_misc_utils/fs/gcs_fs.py +196 -0
- py_misc_utils/fs/http_fs.py +241 -0
- py_misc_utils/fs/s3_fs.py +417 -0
- py_misc_utils/fs_base.py +133 -0
- py_misc_utils/fs_utils.py +207 -0
- py_misc_utils/gcs_fs.py +169 -0
- py_misc_utils/gen_indices.py +54 -0
- py_misc_utils/gfs.py +371 -0
- py_misc_utils/git_repo.py +77 -0
- py_misc_utils/global_namespace.py +110 -0
- py_misc_utils/http_async_fetcher.py +139 -0
- py_misc_utils/http_server.py +196 -0
- py_misc_utils/http_utils.py +143 -0
- py_misc_utils/img_utils.py +20 -0
- py_misc_utils/infix_op.py +20 -0
- py_misc_utils/inspect_utils.py +205 -0
- py_misc_utils/iostream.py +21 -0
- py_misc_utils/iter_file.py +117 -0
- py_misc_utils/key_wrap.py +46 -0
- py_misc_utils/lazy_import.py +25 -0
- py_misc_utils/lockfile.py +164 -0
- py_misc_utils/mem_size.py +64 -0
- py_misc_utils/mirror_from.py +72 -0
- py_misc_utils/mmap.py +16 -0
- py_misc_utils/module_utils.py +196 -0
- py_misc_utils/moving_average.py +19 -0
- py_misc_utils/msgpack_streamer.py +26 -0
- py_misc_utils/multi_wait.py +24 -0
- py_misc_utils/multiprocessing.py +102 -0
- py_misc_utils/named_array.py +224 -0
- py_misc_utils/no_break.py +46 -0
- py_misc_utils/no_except.py +32 -0
- py_misc_utils/np_ml_framework.py +184 -0
- py_misc_utils/np_utils.py +346 -0
- py_misc_utils/ntuple_utils.py +38 -0
- py_misc_utils/num_utils.py +54 -0
- py_misc_utils/obj.py +73 -0
- py_misc_utils/object_cache.py +100 -0
- py_misc_utils/object_tracker.py +88 -0
- py_misc_utils/ordered_set.py +71 -0
- py_misc_utils/osfd.py +27 -0
- py_misc_utils/packet.py +22 -0
- py_misc_utils/parquet_streamer.py +69 -0
- py_misc_utils/pd_utils.py +254 -0
- py_misc_utils/periodic_task.py +61 -0
- py_misc_utils/pickle_wrap.py +121 -0
- py_misc_utils/pipeline.py +98 -0
- py_misc_utils/remap_pickle.py +50 -0
- py_misc_utils/resource_manager.py +155 -0
- py_misc_utils/rnd_utils.py +56 -0
- py_misc_utils/run_once.py +19 -0
- py_misc_utils/scheduler.py +135 -0
- py_misc_utils/select_params.py +300 -0
- py_misc_utils/signal.py +141 -0
- py_misc_utils/skl_utils.py +270 -0
- py_misc_utils/split.py +147 -0
- py_misc_utils/state.py +53 -0
- py_misc_utils/std_module.py +56 -0
- py_misc_utils/stream_dataframe.py +176 -0
- py_misc_utils/streamed_file.py +144 -0
- py_misc_utils/tempdir.py +79 -0
- py_misc_utils/template_replace.py +51 -0
- py_misc_utils/tensor_stream.py +269 -0
- py_misc_utils/thread_context.py +33 -0
- py_misc_utils/throttle.py +30 -0
- py_misc_utils/time_trigger.py +18 -0
- py_misc_utils/timegen.py +11 -0
- py_misc_utils/traceback.py +49 -0
- py_misc_utils/tracking_executor.py +91 -0
- py_misc_utils/transform_array.py +42 -0
- py_misc_utils/uncompress.py +35 -0
- py_misc_utils/url_fetcher.py +157 -0
- py_misc_utils/utils.py +538 -0
- py_misc_utils/varint.py +50 -0
- py_misc_utils/virt_array.py +52 -0
- py_misc_utils/weak_call.py +33 -0
- py_misc_utils/work_results.py +100 -0
- py_misc_utils/writeback_file.py +43 -0
- python_misc_utils-0.2.dist-info/METADATA +36 -0
- python_misc_utils-0.2.dist-info/RECORD +117 -0
- python_misc_utils-0.2.dist-info/WHEEL +5 -0
- python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
- python_misc_utils-0.2.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class AbsTimeout:
  """Tracks an absolute expiry deadline computed from a relative timeout.

  Args:
    timeout: Relative timeout in seconds, or None for "no timeout".
    timefn: Clock function returning seconds; defaults to time.monotonic.
      Bug fix: the previous default was time.time, a wall clock whose jumps
      (NTP adjustments, DST) could silently shrink or extend the deadline;
      a monotonic clock is the correct source for interval timing.
  """

  def __init__(self, timeout, timefn=None):
    self._timefn = time.monotonic if timefn is None else timefn
    # No deadline at all when timeout is None.
    self._expires = self._timefn() + timeout if timeout is not None else None

  def get(self):
    """Returns the remaining seconds (clamped at 0), or None if no timeout."""
    return max(0, self._expires - self._timefn()) if self._expires is not None else None
|
|
12
|
+
|
py_misc_utils/alog.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import math
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
import traceback
|
|
8
|
+
import types
|
|
9
|
+
|
|
10
|
+
from . import call_limiter as cl
|
|
11
|
+
from . import run_once as ro
|
|
12
|
+
from . import traceback as tb
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Standard logging levels re-exported for convenience.
DEBUG = logging.DEBUG
INFO = logging.INFO
WARNING = logging.WARNING
ERROR = logging.ERROR
CRITICAL = logging.CRITICAL

# Extra fine-grained levels: SPAM and VERBOSE sit below DEBUG, while
# DEBUG0..DEBUG3 sit between DEBUG and INFO (higher value == higher severity).
SPAM = DEBUG - 2
VERBOSE = DEBUG - 1
DEBUG0 = DEBUG + 1
DEBUG1 = DEBUG + 2
DEBUG2 = DEBUG + 3
DEBUG3 = DEBUG + 4
|
|
27
|
+
|
|
28
|
+
# Two-character level identifiers used in the log line header (see
# Formatter.make_header()); unknown levels fall back to the first two
# characters of the record's level name.
_SHORT_LEV = {
  SPAM: 'SP',
  VERBOSE: 'VB',
  DEBUG0: '0D',
  DEBUG1: '1D',
  DEBUG2: '2D',
  DEBUG3: '3D',
  DEBUG: 'DD',
  INFO: 'IN',
  WARNING: 'WA',
  ERROR: 'ER',
  CRITICAL: 'CR',
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Formatter(logging.Formatter):
  """Compact log formatter prefixing every line with the same header.

  Multi-line messages get the header (level id, timestamp, pid, module)
  repeated on each line, so grep-ing logs never loses context.
  """

  def __init__(self, emit_extra=None):
    super().__init__()
    # Optional sequence of record attribute names appended to the header.
    self.emit_extra = emit_extra

  def format(self, r):
    hdr = self.make_header(r)
    msg = (r.msg % r.args) if r.args else r.msg

    return '\n'.join([f'{hdr}: {ln}' for ln in msg.split('\n')])

  def formatTime(self, r, datefmt=None):
    if datefmt:
      # Bug fix: time.strftime() requires a struct_time, but r.created is a
      # float epoch timestamp; convert with time.localtime() first (the old
      # code raised TypeError whenever a datefmt was supplied).
      return time.strftime(datefmt, time.localtime(r.created))

    tstr = time.strftime('%Y%m%d %H:%M:%S', time.localtime(r.created))
    # Fractional part of the timestamp rendered as zero-padded microseconds.
    usecs = math.modf(r.created)[0] * 1e6

    return f'{tstr}.{usecs:06.0f}'

  def make_header(self, r):
    tstr = self.formatTime(r)
    lid = _SHORT_LEV.get(r.levelno, r.levelname[:2])
    hdr = f'{lid}{tstr};{os.getpid()};{r.module}'
    if self.emit_extra:
      # Missing extra attributes render as the string "None".
      extras = [str(getattr(r, name, None)) for name in self.emit_extra]
      hdr = f'{hdr};{";".join(extras)}'

    return hdr
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Fallback logging configuration, used by basic_setup() when no command line
# is parsed; the LOG_LEVEL/LOG_FILE environment variables override the
# hard-coded defaults.
_DEFAULT_ARGS = dict(
  log_level=os.getenv('LOG_LEVEL', 'INFO'),
  log_file=os.getenv('LOG_FILE', 'STDERR'),
  log_mod_levels=[],
  log_emit_extra=[],
)
|
|
81
|
+
|
|
82
|
+
def add_logging_options(parser):
  """Registers this module's command line arguments on `parser`."""
  parser.add_argument('--log_level', type=str, default=_DEFAULT_ARGS.get('log_level'),
                      choices={'SPAM', 'VERBOSE', 'DEBUG', 'DEBUG0', 'DEBUG1', 'DEBUG2',
                               'DEBUG3', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'},
                      help='The logging level')
  parser.add_argument('--log_file', type=str, default=_DEFAULT_ARGS.get('log_file'),
                      help='Comma separated list of target log files (STDOUT, STDERR ' \
                      'are also recognized)')
  # Consistency fix: wire the defaults already declared in _DEFAULT_ARGS so
  # these arguments are list-valued even when absent from the command line
  # (previously argparse left them as None).
  parser.add_argument('--log_mod_levels', nargs='*',
                      default=_DEFAULT_ARGS.get('log_mod_levels'),
                      help='Comma separated list of LOGGER_NAME,LEVEL to set the log level at')
  parser.add_argument('--log_emit_extra', nargs='*',
                      default=_DEFAULT_ARGS.get('log_emit_extra'),
                      help='Which other logging record fields should be emitted')
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@ro.run_once
def _add_levels():
  # Register the custom level names with the logging module; run_once makes
  # this a no-op after the first invocation.
  for level, name in ((SPAM, 'SPAM'),
                      (VERBOSE, 'VERBOSE'),
                      (DEBUG0, 'DEBUG0'),
                      (DEBUG1, 'DEBUG1'),
                      (DEBUG2, 'DEBUG2'),
                      (DEBUG3, 'DEBUG3')):
    logging.addLevelName(level, name)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _clear_logging_handlers():
|
|
107
|
+
# Python >= 3.8 has a force=True argument to logging.basicConfig() to force
|
|
108
|
+
# initialization, but since Colab is not there yet, we do it manually.
|
|
109
|
+
root_logger = logging.getLogger()
|
|
110
|
+
for handler in tuple(root_logger.handlers):
|
|
111
|
+
handler.flush()
|
|
112
|
+
root_logger.removeHandler(handler)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _set_logmod_levels(mlevels):
|
|
116
|
+
mlevels = list(mlevels) if mlevels else []
|
|
117
|
+
env_mlevels = os.getenv('LOGMOD_LEVELS', None)
|
|
118
|
+
if env_mlevels is not None:
|
|
119
|
+
mlevels.extend(env_mlevels.split(':'))
|
|
120
|
+
for mlev in mlevels:
|
|
121
|
+
mod, level = mlev.split(',')
|
|
122
|
+
logging.getLogger(mod).setLevel(logging.getLevelName(level.upper()))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def setup_logging(args):
  """Configures the root logger from parsed command line `args`.

  Expects args.log_level, args.log_file, args.log_mod_levels and
  args.log_emit_extra (see add_logging_options()). Replaces any existing
  root handlers.
  """
  _add_levels()
  _clear_logging_handlers()

  numeric_level = logging.getLevelName(args.log_level.upper())
  handlers = []
  if args.log_file:
    # One handler per comma-separated target; STDOUT/STDERR map to the
    # process streams, anything else is opened as a file in append mode.
    for fname in args.log_file.split(','):
      if fname == 'STDOUT':
        handler = logging.StreamHandler(sys.stdout)
      elif fname == 'STDERR':
        handler = logging.StreamHandler(sys.stderr)
      else:
        # NOTE(review): the file object is intentionally kept open for the
        # handler's lifetime; it is never explicitly closed here.
        handler = logging.StreamHandler(open(fname, mode='a'))

      handler.setLevel(numeric_level)
      handler.setFormatter(Formatter(emit_extra=args.log_emit_extra))
      handlers.append(handler)

  logging.basicConfig(level=numeric_level, handlers=handlers, force=True)

  # basicConfig() above already set the logger level; only refresh the
  # module-level fast-path gate here.
  set_current_level(numeric_level, set_logger=False)

  _set_logmod_levels(args.log_mod_levels)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def basic_setup(**kwargs):
  # Configure logging from the module defaults, with `kwargs` overrides.
  merged = {**_DEFAULT_ARGS, **kwargs}
  setup_logging(types.SimpleNamespace(**merged))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_main_config():
  # Expose this module through the app_main init-module protocol
  # (add_arguments + config_module pair).
  config = types.SimpleNamespace(
      add_arguments=add_logging_options,
      config_module=setup_logging,
  )

  return config
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Module-level gate mirroring the configured level (see set_current_level());
# read by level_active() and the spam()/debug()/... fast paths to skip message
# construction for disabled levels.
_LEVEL = DEBUG
|
|
163
|
+
|
|
164
|
+
def set_current_level(level, set_logger=True):
  """Updates the module fast-path gate and, optionally, the root logger."""
  global _LEVEL

  if set_logger:
    root = logging.getLogger()
    root.setLevel(level)
    for handler in root.handlers:
      handler.setLevel(level)

  _LEVEL = level
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def level_active(level):
  # True when messages at `level` pass the module fast-path gate.
  return level >= _LEVEL
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def level_run(level, fn):
  # Run `fn` (returning its result) only when `level` is active.
  if level_active(level):
    return fn()

  return None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Number of wrapper frames added to stacklevel so logging reports the user's
# call site; presumably one fewer frame is needed on Python >= 3.11 due to
# changes in the logging internals -- TODO confirm.
_LOGGING_FRAMES = 1 if sys.version_info >= (3, 11) else 2
|
|
185
|
+
|
|
186
|
+
def logging_args(kwargs):
  """Normalizes logging kwargs, applying the optional `limit` rate limiting.

  Returns the adjusted kwargs (with stacklevel bumped past the wrapper
  frames), or None when the per-call-site `limit` budget is exhausted;
  callers such as log() and exception() skip emission on None.
  """
  limit = kwargs.pop('limit', -1)
  stacklevel = kwargs.get('stacklevel', 1)
  if limit < 0 or cl.trigger(__file__, limit):
    kwargs['stacklevel'] = stacklevel + _LOGGING_FRAMES

    return kwargs

  # Bug fix: kwargs used to be returned unconditionally here, which made the
  # `limit` feature a no-op since every caller tests for None to suppress
  # the message.
  return None
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _nested_args(kwargs):
|
|
196
|
+
kwargs['stacklevel'] = kwargs.get('stacklevel', 1) + 1
|
|
197
|
+
|
|
198
|
+
return kwargs
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _dmsg(msg):
|
|
202
|
+
return msg() if callable(msg) else msg
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def log(level, msg, *args, **kwargs):
  # logging_args() returns None when the rate limiting suppressed this call.
  kwargs = logging_args(kwargs)
  if kwargs is None:
    return

  logging.log(level, _dmsg(msg), *args, **kwargs)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def spam(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(SPAM):
    log(SPAM, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def verbose(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(VERBOSE):
    log(VERBOSE, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def debug0(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(DEBUG0):
    log(DEBUG0, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def debug1(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(DEBUG1):
    log(DEBUG1, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def debug2(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(DEBUG2):
    log(DEBUG2, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def debug3(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(DEBUG3):
    log(DEBUG3, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def debug(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(DEBUG):
    log(DEBUG, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def info(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(INFO):
    log(INFO, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def warning(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(WARNING):
    log(WARNING, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def error(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(ERROR):
    log(ERROR, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def critical(msg, *args, **kwargs):
  # Cheap gate first so lazy messages are never built when disabled.
  if level_active(CRITICAL):
    log(CRITICAL, _dmsg(msg), *args, **_nested_args(kwargs))
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def exception(e, *args, **kwargs):
  """Logs exception `e` with its current traceback at ERROR level.

  Keyword Args:
    exmsg: Message prefix (defaults to 'Exception'); honors the same
      rate-limiting kwargs as log().
  """
  kwargs = logging_args(kwargs)
  if kwargs is None:
    return

  msg = kwargs.pop('exmsg', 'Exception')
  # Local renamed from `tb` to avoid shadowing the traceback module alias.
  trace = traceback.format_exc()
  error(f'{_dmsg(msg)}: {e}\n{trace}', *args, **_nested_args(kwargs))
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def xraise(e, msg, *args, **kwargs):
  """Raises exception type `e` constructed from `msg`.

  Keyword Args:
    logit: When True, also emit the message at ERROR level before raising.
  """
  if kwargs.pop('logit', False):
    error(msg, *args, **_nested_args(kwargs))

  raise e(_dmsg(msg))
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def async_log(level, msg, *args, **kwargs):
  """Async-signal-safe logging, writing directly to a stream.

  This one cannot use the logging module as it could be called from a signal
  handler asynchronously. The logging.getLevelName() is safe since it is
  simply a (lockless) dictionary lookup.
  Similarly, no other APIs taking locks can be called from this context.

  Keyword Args:
    file: Target stream, defaulting to sys.stderr.
  """
  if level >= _LEVEL:
    kwargs = logging_args(kwargs)
    if kwargs is not None:
      # Fake a logging record, filling up only the fields used by the Formatter.
      # Do not call logging APIs for that, for the same reasons cited above.
      frame = tb.get_frame(n=1)
      module = frame.f_globals.get('__name__', 'ASYNC').split('.')[-1]

      now = time.time()
      record = types.SimpleNamespace(
          msg=_dmsg(msg),
          args=args,
          created=now,
          msecs=math.modf(now)[0] * 1000,
          levelno=level,
          levelname=logging.getLevelName(level),
          module=module,
      )

      formatter = Formatter()

      logfd = kwargs.pop('file', sys.stderr)
      logfd.write(formatter.format(record))
      logfd.write('\n')
      logfd.flush()
|
|
311
|
+
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import functools
|
|
3
|
+
import inspect
|
|
4
|
+
import sys
|
|
5
|
+
import typing
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from . import alog
|
|
9
|
+
from . import core_utils as cu
|
|
10
|
+
from . import global_namespace as gns
|
|
11
|
+
from . import multiprocessing as mp
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _get_init_modules():
  # Here is the place to import (import here to avoid cyclic dependencies)
  # and call the get_main_config() API of modules which need to register
  # command line arguments and configure themselves once parsing is done.
  # Note that alog is imported at the top since it is used in other places
  # (and also has minimal dependencies which do not create issues).
  #
  # The objects returned by get_main_config() must provide:
  #   add_arguments(parser)  - register command line arguments;
  #   config_module(args)    - configure from the parsed arguments.
  #
  # Example:
  #
  #   from . import foo
  #   modules.append(foo.get_main_config())
  #
  return (alog.get_main_config(),)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _add_arguments(init_modules, parser):
|
|
35
|
+
for module in init_modules:
|
|
36
|
+
module.add_arguments(parser)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _config_modules(init_modules, args):
|
|
40
|
+
for module in init_modules:
|
|
41
|
+
module.config_module(args)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _child_setup_modules(args):
  # Re-run module configuration inside a child process, returning the args
  # unchanged so the global-namespace slot keeps the same value.
  _config_modules(_get_init_modules(), args)

  return args
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Global-namespace slot holding the parsed command line arguments; child
# processes re-configure the init modules from it via _child_setup_modules().
_ARGS = gns.Var(f'{__name__}.ARGS', child_fn=_child_setup_modules)
|
|
52
|
+
|
|
53
|
+
def _main(parser, mainfn, args, rem_args):
  """Parses the command line, configures the init modules, runs `mainfn`.

  Args:
    parser: The argparse.ArgumentParser to populate and parse with.
    mainfn: Callable invoked with the parsed arguments; a Main wrapper also
      gets the chance to register its own arguments first.
    args: Explicit argument list, or None to use sys.argv[1:].
    rem_args: When set, the attribute name on the parsed arguments that
      receives (as a tuple) everything found after a '--' separator.
  """
  if isinstance(mainfn, Main):
    mainfn.add_arguments(parser)

  init_modules = _get_init_modules()
  _add_arguments(init_modules, parser)

  if rem_args:
    xargs = args or sys.argv[1:]

    # Split at the first '--': the tail is kept aside and attached to the
    # parsed arguments under the `rem_args` attribute name.
    ddpos = cu.lindex(xargs, '--')
    if ddpos >= 0:
      rargs = xargs[ddpos + 1:]
      xargs = xargs[: ddpos]
    else:
      rargs = []

    parsed_args = parser.parse_args(args=xargs)
    setattr(parsed_args, rem_args, tuple(rargs))
  else:
    parsed_args = parser.parse_args(args=args)

  # Publish the arguments globally, then configure modules before handing
  # control to the user main function.
  gns.set(_ARGS, parsed_args)
  _config_modules(init_modules, parsed_args)

  mainfn(parsed_args)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def main(parser, mainfn, args=None, rem_args=None):
  """Public entry point: runs _main() under the multiprocessing proc wrapper."""
  mp.procfn_wrap(_main, parser, mainfn, args, rem_args)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def basic_main(mainfn, description='Basic Main'):
  # Convenience entry point building a default argument parser for `mainfn`.
  parser = argparse.ArgumentParser(description=description,
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  main(parser, mainfn)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _child_setup_functions(setup_functions):
|
|
94
|
+
for setupfn in setup_functions:
|
|
95
|
+
setupfn()
|
|
96
|
+
|
|
97
|
+
return setup_functions
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
_SETUP_FUNCTIONS = gns.Var(f'{__name__}.SETUP_FUNCTIONS',
|
|
101
|
+
child_fn=_child_setup_functions,
|
|
102
|
+
defval=[])
|
|
103
|
+
|
|
104
|
+
def add_setupfn(setupfn, run=True):
  """Registers `setupfn` for child-process replay, optionally running it now."""
  if run:
    setupfn()

  gns.get(_SETUP_FUNCTIONS).append(setupfn)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# This is similar to Fire but brings up the app_main infrastructure.
|
|
113
|
+
# Use as:
|
|
114
|
+
#
|
|
115
|
+
# @app_main.Main
|
|
116
|
+
# def my_main(arg, ..., kwarg=17, ...):
|
|
117
|
+
# ...
|
|
118
|
+
#
|
|
119
|
+
# if __name__ == '__main__':
|
|
120
|
+
# parser = argparse.ArgumentParser(...)
|
|
121
|
+
# ...
|
|
122
|
+
# app_main.main(parser, my_main, ...)
|
|
123
|
+
#
|
|
124
|
+
class Main:
  """Wraps a plain function into an app_main-compatible main callable.

  Similar to Fire, but brings up the app_main infrastructure: the wrapped
  function's signature is turned into argparse arguments (add_arguments()),
  and __call__ maps the parsed arguments back onto function parameters.
  """

  def __init__(self, func):
    self._func = func
    self._sig = inspect.signature(func)
    functools.update_wrapper(self, func)

  def __call__(self, parsed_args):
    args, kwargs = [], {}
    for n, p in self._sig.parameters.items():
      # Bug fix: *args/**kwargs parameters have no matching command line
      # argument; previously they were forwarded as bogus keyword arguments
      # (e.g. func(args=None)), raising TypeError at call time.
      if p.kind in (p.VAR_POSITIONAL, p.VAR_KEYWORD):
        continue

      pv = getattr(parsed_args, n, None)
      if p.kind == p.POSITIONAL_ONLY:
        args.append(pv)
      else:
        kwargs[n] = pv

    return self._func(*args, **kwargs)

  def add_arguments(self, parser):
    """Registers one argparse argument per wrapped-function parameter."""
    fname = self._func.__name__

    for n, p in self._sig.parameters.items():
      # Variadic parameters cannot be expressed as argparse arguments.
      if p.kind in (p.VAR_POSITIONAL, p.VAR_KEYWORD):
        continue

      choices = None
      defval = p.default if p.default is not p.empty else None
      if p.annotation is not p.empty:
        ptype = p.annotation
        # A typing.Literal annotation becomes the argparse choices set.
        if typing.get_origin(ptype) is typing.Literal:
          choices = typing.get_args(ptype)
          ptype = type(choices[0])

        type_cast = functools.partial(cu.to_type, vtype=ptype)
      elif defval is not None:
        # Infer the type from the default value.
        ptype = type(defval)
        type_cast = functools.partial(cu.to_type, vtype=ptype)
      else:
        # No annotation and no default: fall back to YAML parsing, which
        # handles numbers, booleans, lists, ... from their string form.
        ptype, type_cast = str, yaml.safe_load

      action = argparse.BooleanOptionalAction if ptype is bool else None

      help_str = f'Argument "{n}" (type={ptype.__name__}) of function {fname}(...)'
      if p.default is p.empty or p.kind == p.POSITIONAL_ONLY:
        # Required/positional-only parameters become positional arguments.
        parser.add_argument(n,
                            metavar=n.upper(),
                            action=action,
                            type=type_cast,
                            default=defval,
                            choices=choices,
                            help=help_str)
      else:
        parser.add_argument(f'--{n}',
                            action=action,
                            type=type_cast,
                            default=defval,
                            choices=choices,
                            help=help_str)
|
|
179
|
+
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import hashlib
|
|
3
|
+
import os
|
|
4
|
+
import tarfile
|
|
5
|
+
import zipfile
|
|
6
|
+
|
|
7
|
+
from . import alog
|
|
8
|
+
from . import assert_checks as tas
|
|
9
|
+
from . import gfs
|
|
10
|
+
from . import img_utils as imgu
|
|
11
|
+
from . import utils as ut
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Parsed description of an archive URL (see parse_specs()).
ArchiveSpecs = collections.namedtuple('ArchiveSpecs', 'kind, compression, base_path, purl')
# One streamed archive member: its name and its raw bytes payload.
ArchiveEntry = collections.namedtuple('ArchiveEntry', 'name, data')


# Maps URL compression extensions to the suffix used in the tarfile
# stream open-mode ('r|gz', 'r|xz', ...).
_EXT_COMPRESSION = {
  'gz': 'gz',
  'xz': 'xz',
  'bz2': 'bz2',
  'bzip2': 'bz2',
}
|
|
24
|
+
|
|
25
|
+
def parse_specs(url):
  """Infers the archive kind and compression scheme from `url`.

  Returns an ArchiveSpecs(kind, compression, base_path, purl) namedtuple.

  NOTE(review): `kind` is taken from gfs.splitext(url).ext, so for
  compressed archives this relies on gfs.splitext() returning the extension
  form expected by ArchiveStreamer.generate() ('zip', 'tar', ...) --
  confirm against the gfs implementation.
  """
  usplit = gfs.splitext(url)

  # When the outer extension names a compression scheme, strip it and use the
  # remaining base to look for the archive extension proper.
  compression = _EXT_COMPRESSION.get(usplit.ext)
  ubase = usplit.base if compression else usplit.purl.path

  base_path, ext = os.path.splitext(ubase)

  tas.check(ext, msg=f'Unable to infer archive type: {url}')

  return ArchiveSpecs(kind=usplit.ext.lower(),
                      compression=compression,
                      base_path=base_path,
                      purl=usplit.purl)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ArchiveStreamer:
  """Streams ArchiveEntry(name, data) items out of an archive URL.

  Supported kinds (inferred by parse_specs()): zip, tar (optionally
  compressed), parquet and msgpack. Extra keyword arguments are forwarded
  to the underlying gfs opener / record streamer.
  """

  def __init__(self, url, **kwargs):
    self._url = url
    self._kwargs = kwargs

  def _url_uid(self, url):
    # Stable short identifier (8 hex chars of SHA1) derived from the URL.
    return hashlib.sha1(url.encode()).hexdigest()[: 8]

  def _generate_zip(self, specs):
    # The ZIP format requires random access (specifically, the file list is at EOF)
    # so it is better to cache the file locally before opening.
    with gfs.open_local(self._url, mode='rb', **self._kwargs) as stream:
      # Fix: close the ZipFile deterministically instead of leaking it.
      with zipfile.ZipFile(stream, mode='r') as zfile:
        for zinfo in zfile.infolist():
          if not zinfo.is_dir():
            yield ArchiveEntry(name=zinfo.filename, data=zfile.read(zinfo))

  def _generate_tar(self, specs):
    with gfs.open(self._url, mode='rb', **self._kwargs) as stream:
      # Stream-mode tar ('r|...') reads sequentially; close it deterministically.
      with tarfile.open(mode=f'r|{specs.compression or ""}', fileobj=stream) as tfile:
        for tinfo in tfile:
          # Fix: extractfile() returns None for members without data
          # (directories, special files); the previous code crashed on them
          # with an AttributeError.
          fobj = tfile.extractfile(tinfo)
          if fobj is not None:
            yield ArchiveEntry(name=tinfo.name, data=fobj.read())

  def _stream_records(self, streamer):
    # Simulate a streaming similar to what a Web Dataset would expect, with a
    # UID.ENTITY naming, where the UID is constant for all the entities of a
    # record (which are streamed sequentially).
    uid = self._url_uid(self._url)
    for i, recd in enumerate(streamer):
      ruid = f'{uid}_{i}'
      for name, data in recd.items():
        yield ArchiveEntry(name=f'{ruid}.{name}', data=data)

  def _generate_parquet(self, specs):
    # Keep the import dependency local, to make it required only if parquet is used.
    from . import parquet_streamer as pqs

    yield from self._stream_records(pqs.ParquetStreamer(self._url, **self._kwargs))

  def _generate_msgpack(self, specs):
    # Keep the import dependency local, to make it required only if msgpack is used.
    from . import msgpack_streamer as mps

    yield from self._stream_records(mps.MsgPackStreamer(self._url, **self._kwargs))

  def generate(self):
    """Yields every ArchiveEntry of the archive at the configured URL."""
    specs = parse_specs(self._url)
    if specs.kind == 'zip':
      yield from self._generate_zip(specs)
    elif specs.kind == 'tar':
      yield from self._generate_tar(specs)
    elif specs.kind == 'parquet':
      yield from self._generate_parquet(specs)
    elif specs.kind == 'msgpack':
      yield from self._generate_msgpack(specs)
    else:
      alog.xraise(RuntimeError, f'Unknown archive type "{specs.kind}": {self._url}')

  def __iter__(self):
    return self.generate()
|
|
112
|
+
|