python-misc-utils 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_misc_utils/__init__.py +0 -0
- py_misc_utils/abs_timeout.py +12 -0
- py_misc_utils/alog.py +311 -0
- py_misc_utils/app_main.py +179 -0
- py_misc_utils/archive_streamer.py +112 -0
- py_misc_utils/assert_checks.py +118 -0
- py_misc_utils/ast_utils.py +121 -0
- py_misc_utils/async_manager.py +189 -0
- py_misc_utils/break_control.py +63 -0
- py_misc_utils/buffered_iterator.py +35 -0
- py_misc_utils/cached_file.py +507 -0
- py_misc_utils/call_limiter.py +26 -0
- py_misc_utils/call_result_selector.py +13 -0
- py_misc_utils/cleanups.py +85 -0
- py_misc_utils/cmd.py +97 -0
- py_misc_utils/compression.py +116 -0
- py_misc_utils/cond_waiter.py +13 -0
- py_misc_utils/context_base.py +18 -0
- py_misc_utils/context_managers.py +67 -0
- py_misc_utils/core_utils.py +577 -0
- py_misc_utils/daemon_process.py +252 -0
- py_misc_utils/data_cache.py +46 -0
- py_misc_utils/date_utils.py +90 -0
- py_misc_utils/debug.py +24 -0
- py_misc_utils/dyn_modules.py +50 -0
- py_misc_utils/dynamod.py +103 -0
- py_misc_utils/env_config.py +35 -0
- py_misc_utils/executor.py +239 -0
- py_misc_utils/file_overwrite.py +29 -0
- py_misc_utils/fin_wrap.py +77 -0
- py_misc_utils/fp_utils.py +47 -0
- py_misc_utils/fs/__init__.py +0 -0
- py_misc_utils/fs/file_fs.py +127 -0
- py_misc_utils/fs/ftp_fs.py +242 -0
- py_misc_utils/fs/gcs_fs.py +196 -0
- py_misc_utils/fs/http_fs.py +241 -0
- py_misc_utils/fs/s3_fs.py +417 -0
- py_misc_utils/fs_base.py +133 -0
- py_misc_utils/fs_utils.py +207 -0
- py_misc_utils/gcs_fs.py +169 -0
- py_misc_utils/gen_indices.py +54 -0
- py_misc_utils/gfs.py +371 -0
- py_misc_utils/git_repo.py +77 -0
- py_misc_utils/global_namespace.py +110 -0
- py_misc_utils/http_async_fetcher.py +139 -0
- py_misc_utils/http_server.py +196 -0
- py_misc_utils/http_utils.py +143 -0
- py_misc_utils/img_utils.py +20 -0
- py_misc_utils/infix_op.py +20 -0
- py_misc_utils/inspect_utils.py +205 -0
- py_misc_utils/iostream.py +21 -0
- py_misc_utils/iter_file.py +117 -0
- py_misc_utils/key_wrap.py +46 -0
- py_misc_utils/lazy_import.py +25 -0
- py_misc_utils/lockfile.py +164 -0
- py_misc_utils/mem_size.py +64 -0
- py_misc_utils/mirror_from.py +72 -0
- py_misc_utils/mmap.py +16 -0
- py_misc_utils/module_utils.py +196 -0
- py_misc_utils/moving_average.py +19 -0
- py_misc_utils/msgpack_streamer.py +26 -0
- py_misc_utils/multi_wait.py +24 -0
- py_misc_utils/multiprocessing.py +102 -0
- py_misc_utils/named_array.py +224 -0
- py_misc_utils/no_break.py +46 -0
- py_misc_utils/no_except.py +32 -0
- py_misc_utils/np_ml_framework.py +184 -0
- py_misc_utils/np_utils.py +346 -0
- py_misc_utils/ntuple_utils.py +38 -0
- py_misc_utils/num_utils.py +54 -0
- py_misc_utils/obj.py +73 -0
- py_misc_utils/object_cache.py +100 -0
- py_misc_utils/object_tracker.py +88 -0
- py_misc_utils/ordered_set.py +71 -0
- py_misc_utils/osfd.py +27 -0
- py_misc_utils/packet.py +22 -0
- py_misc_utils/parquet_streamer.py +69 -0
- py_misc_utils/pd_utils.py +254 -0
- py_misc_utils/periodic_task.py +61 -0
- py_misc_utils/pickle_wrap.py +121 -0
- py_misc_utils/pipeline.py +98 -0
- py_misc_utils/remap_pickle.py +50 -0
- py_misc_utils/resource_manager.py +155 -0
- py_misc_utils/rnd_utils.py +56 -0
- py_misc_utils/run_once.py +19 -0
- py_misc_utils/scheduler.py +135 -0
- py_misc_utils/select_params.py +300 -0
- py_misc_utils/signal.py +141 -0
- py_misc_utils/skl_utils.py +270 -0
- py_misc_utils/split.py +147 -0
- py_misc_utils/state.py +53 -0
- py_misc_utils/std_module.py +56 -0
- py_misc_utils/stream_dataframe.py +176 -0
- py_misc_utils/streamed_file.py +144 -0
- py_misc_utils/tempdir.py +79 -0
- py_misc_utils/template_replace.py +51 -0
- py_misc_utils/tensor_stream.py +269 -0
- py_misc_utils/thread_context.py +33 -0
- py_misc_utils/throttle.py +30 -0
- py_misc_utils/time_trigger.py +18 -0
- py_misc_utils/timegen.py +11 -0
- py_misc_utils/traceback.py +49 -0
- py_misc_utils/tracking_executor.py +91 -0
- py_misc_utils/transform_array.py +42 -0
- py_misc_utils/uncompress.py +35 -0
- py_misc_utils/url_fetcher.py +157 -0
- py_misc_utils/utils.py +538 -0
- py_misc_utils/varint.py +50 -0
- py_misc_utils/virt_array.py +52 -0
- py_misc_utils/weak_call.py +33 -0
- py_misc_utils/work_results.py +100 -0
- py_misc_utils/writeback_file.py +43 -0
- python_misc_utils-0.2.dist-info/METADATA +36 -0
- python_misc_utils-0.2.dist-info/RECORD +117 -0
- python_misc_utils-0.2.dist-info/WHEEL +5 -0
- python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
- python_misc_utils-0.2.dist-info/top_level.txt +1 -0
py_misc_utils/gfs.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import contextlib
|
|
3
|
+
import importlib
|
|
4
|
+
import os
|
|
5
|
+
import pkgutil
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
import urllib.parse as uparse
|
|
10
|
+
|
|
11
|
+
from . import alog
|
|
12
|
+
from . import assert_checks as tas
|
|
13
|
+
from . import cached_file as chf
|
|
14
|
+
from . import context_managers as cm
|
|
15
|
+
from . import fs_utils as fsu
|
|
16
|
+
from . import mirror_from as mrf
|
|
17
|
+
from . import run_once as ro
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TempFile:
  """Temporary file living on any gfs-supported file system.

  The file is created at a generated temporary path, and deleted on close()
  unless replace() has moved it to its final destination first.
  """

  def __init__(self, nsdir=None, nspath=None, **kwargs):
    # Namespace hints are only honoured for local paths; remote ones are dropped.
    if nsdir is not None and not is_local_path(nsdir):
      nsdir = None
    if nspath is not None and not is_local_path(nspath):
      nspath = None

    self._fs, self._path = resolve_fs(fsu.temp_path(nspath=nspath, nsdir=nsdir), **kwargs)
    self._kwargs = kwargs
    self._fd, self._delete = None, False

  def open(self):
    """Open the underlying file and mirror its API onto this object."""
    self._fd = self._fs.open(self._path, **self._kwargs)
    self._delete = True
    mrf.mirror_all(self._fd, self, name='fd')

    return self

  def _close_fd(self):
    # Close and un-mirror the wrapped file object, if one is open.
    if self._fd is not None:
      self._fd.close()
      mrf.unmirror(self, name='fd')
      self._fd = None

  def close(self):
    """Close the file and remove it, unless replace() already retired it."""
    self._close_fd()
    if self._delete:
      self._fs.remove(self._path)
      self._delete = False

  def replace(self, path):
    """Move the temporary file over `path`, disabling deletion on close."""
    # Closing first ensures buffered content reaches storage before the move.
    self._close_fd()
    replace(self._path, path, src_fs=self._fs)
    self._delete = False

  def __enter__(self):
    self.open()

    return self

  def __exit__(self, *exc):
    self.close()

    return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Symbolic names mapping to the process standard streams.
_STD_FILES = {
    'STDIN': sys.stdin,
    'STDOUT': sys.stdout,
    'STDERR': sys.stderr,
}

def std_open(path, **kwargs):
  """Like open(), but maps 'STDIN'/'STDOUT'/'STDERR' to the process streams.

  Standard streams are wrapped in a null context manager, so using them in
  a with statement does not close them.
  """
  sfd = _STD_FILES.get(path) if isinstance(path, str) else None
  if sfd is not None:
    return contextlib.nullcontext(sfd)

  return open(path, **kwargs)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def open(source, **kwargs):
  """Open `source`, which can be a path/URL or an already opened file object.

  Path-like inputs are resolved to their file system and opened there; any
  other object is assumed file-like and returned wrapped in a null context
  manager, so callers can always use a with statement.
  """
  path = path_of(source)
  if path is None:
    return contextlib.nullcontext(source)

  fs, fpath = resolve_fs(path, **kwargs)

  return fs.open(fpath, **kwargs)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def open_local(path, **kwargs):
  # Convenience alias of the gfs open() API (which shadows the builtin open).
  return open(path, **kwargs)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def maybe_open(path, **kwargs):
  """Open `path` with open(), returning None on failure instead of raising.

  Returns:
    The opened file object, or None if the open failed for any reason.
  """
  try:
    return open(path, **kwargs)
  except Exception:
    # Narrowed from a bare `except:` so that KeyboardInterrupt/SystemExit
    # still propagate; any ordinary failure maps to a None result.
    return None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def as_local(path, **kwargs):
  """Return a local file system path holding the content of `path`.

  Remote file systems materialize (download/cache) the content locally.
  """
  filesystem, resolved = resolve_fs(path, **kwargs)

  return filesystem.as_local(resolved, **kwargs)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def path_of(path):
  """Return the string form of a path-like object, or None for anything else."""
  if isinstance(path, (str, os.PathLike)):
    return os.fspath(path)

  return None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Result of splitext(): path base, extension (no leading dot), parsed URL.
PathSplit = collections.namedtuple('PathSplit', 'base, ext, purl')

def splitext(path):
  """Split a path/URL into base, extension and parsed-URL components."""
  parsed = uparse.urlparse(path)
  base, dotext = os.path.splitext(parsed.path)

  # The leading '.' (if any) is stripped from the extension.
  return PathSplit(base=base, ext=dotext[1:], purl=parsed)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def is_file(path):
  """Return True if `path` resolves to an existing regular file."""
  filesystem, resolved = resolve_fs(path)

  return filesystem.isfile(resolved)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def is_dir(path):
  """Return True if `path` resolves to an existing directory."""
  filesystem, resolved = resolve_fs(path)

  return filesystem.isdir(resolved)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def exists(path):
  """Return True if `path` exists on its file system."""
  filesystem, resolved = resolve_fs(path)

  return filesystem.exists(resolved)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def is_same_fs(*args):
  """Return True if all the FsPath arguments live on the same file system.

  Two entries match when they share both protocol and location (network
  host for remote protocols, containing mount point for local paths).
  """
  specs = []
  for fspath in args:
    purl = uparse.urlparse(fspath.path)
    if purl.scheme:
      spec = (purl.scheme, purl.netloc)
    else:
      # No scheme means a local path; compare those by mount point.
      spec = (_DEFAULT_LOCAL_PROTO, fsu.localfs_mount(purl.path))
    specs.append(spec)

  return all(spec == specs[0] for spec in specs[1:])
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Protocol (URL scheme) assigned to local file system paths.
_DEFAULT_LOCAL_PROTO = 'file'

def is_local_proto(proto):
  """Return True if `proto` names the local file system protocol."""
  return proto == _DEFAULT_LOCAL_PROTO
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def is_local_fs(fs):
  # True if the file system object `fs` is the local one (by protocol ID).
  return is_local_proto(fs.ID)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def is_local_path(path):
  # True if `path` has no protocol, or the explicit local ('file://') one.
  return is_local_proto(get_proto(path))
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def is_path(path):
  # It is a path if it contains a proto, or starts with '/', './' or '../'.
  return has_proto(path) or re.match(r'/|\.\.?/', path) is not None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def has_proto(path):
  """Return True if `path` carries an explicit protocol prefix ('proto://')."""
  m = re.match(r'\w+://', path)

  return m is not None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def get_proto(path):
  """Return the protocol of `path` (lower-cased), or the local one if absent."""
  m = re.match(r'(\w+)://', path)
  if m is None:
    return _DEFAULT_LOCAL_PROTO

  return m.group(1).lower()
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# A file system instance paired with a path valid within it.
FsPath = collections.namedtuple('FsPath', 'fs, path')

def resolve_paths(*paths):
  """Normalize path arguments into FsPath tuples.

  Each argument is either a bare path (whose file system gets resolved) or
  a (fs, path) pair, where a None fs also triggers resolution.
  """
  resolved = []
  for path_arg in paths:
    fs, path = path_arg if isinstance(path_arg, (list, tuple)) else (None, path_arg)
    if fs is None:
      fs, path = resolve_fs(path)

    resolved.append(FsPath(fs, path))

  return tuple(resolved)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def copy(src_path, dest_path, src_fs=None, dest_fs=None):
  """Copy a file, possibly across two different file systems."""
  source, target = resolve_paths((src_fs, src_path), (dest_fs, dest_path))

  source.fs.copyfile(source.path, target.fs, target.path)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def replace(src_path, dest_path, src_fs=None, dest_fs=None):
  """Move `src_path` over `dest_path`, replacing it.

  Same-file-system moves use the native replace; cross-file-system ones
  fall back to a copy followed by a source removal (not atomic).
  """
  source, target = resolve_paths((src_fs, src_path), (dest_fs, dest_path))

  if not is_same_fs(source, target):
    copy(source.path, target.path, src_fs=source.fs, dest_fs=target.fs)
    source.fs.remove(source.path)
  else:
    target.fs.replace(source.path, target.path)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def remove(path):
  """Delete the file at `path` on whichever file system it resolves to."""
  filesystem, resolved = resolve_fs(path)
  filesystem.remove(resolved)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def mkdir(path, **kwargs):
  """Create the directory `path` on its file system."""
  filesystem, resolved = resolve_fs(path)
  filesystem.mkdir(resolved, **kwargs)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def makedirs(path, **kwargs):
  """Recursively create the directory `path` on its file system."""
  filesystem, resolved = resolve_fs(path)
  filesystem.makedirs(resolved, **kwargs)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def rmdir(path):
  """Remove the (empty) directory `path` on its file system."""
  filesystem, resolved = resolve_fs(path)
  filesystem.rmdir(resolved)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def rmtree(path, **kwargs):
  """Recursively remove the directory tree rooted at `path`."""
  filesystem, resolved = resolve_fs(path)
  filesystem.rmtree(resolved, **kwargs)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def stat(path):
  """Return the stat information of `path` from its file system."""
  filesystem, resolved = resolve_fs(path)

  return filesystem.stat(resolved)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def link(src_path, dest_path):
  """Create a hard link; both paths must be on the same file system."""
  src, dest = resolve_paths(src_path, dest_path)

  same = is_same_fs(src, dest)
  tas.check(same,
            msg=f'Unable to link across file systems: {src_path} -> {dest_path}')

  src.fs.link(src.path, dest.path)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def symlink(src_path, dest_path):
  """Create a symbolic link; both paths must be on the same file system."""
  src, dest = resolve_paths(src_path, dest_path)

  same = is_same_fs(src, dest)
  tas.check(same,
            msg=f'Unable to symlink across file systems: {src_path} -> {dest_path}')

  src.fs.symlink(src.path, dest.path)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class RegexMatcher:
  """Callable predicate matching values against a compiled regex.

  The match object of the most recent call is stored in `match`, so callers
  can test first and then inspect the captured groups.
  """

  def __init__(self, rex):
    self._rex = re.compile(rex)
    self.match = None

  def __call__(self, value):
    m = self._rex.match(value)
    self.match = m

    return m is not None
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def enumerate_files(path, matcher=None, return_stats=False):
  """Yield directory entry names (optionally with stats) under `path`.

  Args:
    matcher: Optional predicate on the entry name; non-matching entries
      are skipped.
    return_stats: When True, yield (name, entry) tuples instead of names.
  """
  filesystem, resolved = resolve_fs(path)

  for entry in filesystem.list(resolved):
    if matcher is not None and not matcher(entry.name):
      continue
    yield (entry.name, entry) if return_stats else entry.name
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def normpath(path):
  """Return `path` normalized by its owning file system."""
  _, resolved = resolve_fs(path)

  return resolved
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# Root folder used to cache remote file content. Overridable via the
# CACHE_DIR environment variable; defaults to ~/.cache.
_CACHE_DIR = fsu.normpath(os.getenv('CACHE_DIR',
                                    os.path.join(fsu.home(), '.cache')))

def cache_dir():
  """Return the current cache root folder."""
  return _CACHE_DIR


def set_cache_dir(path):
  """Install a new (normalized) cache root folder."""
  global _CACHE_DIR

  _CACHE_DIR = fsu.normpath(path)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def find_mount(path):
  """Return the mount point of a local `path`, or None for remote paths."""
  filesystem, resolved = resolve_fs(path)
  if not is_local_fs(filesystem):
    return None

  return fsu.localfs_mount(resolved)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
# Maps protocol ID (URL scheme) to its file system class.
_FS_REGISTRY = dict()

def register_fs(cls):
  """Register a file system class under every protocol ID it declares."""
  for fsid in cls.IDS:
    alog.debug(f'Registering file system: {fsid}')
    _FS_REGISTRY[fsid] = cls
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def try_register(importer, modname, parent=None):
  """Import a file system module and register its FILE_SYSTEMS classes.

  ImportError is logged and swallowed (returning None), so that missing
  optional dependencies do not break file system discovery.
  """
  try:
    if parent is not None:
      module = importlib.import_module(f'{parent}.{modname}')
    else:
      spec = importer.find_spec(modname)
      module = importlib.util.module_from_spec(spec)
      spec.loader.exec_module(module)

    for cls in getattr(module, 'FILE_SYSTEMS', ()):
      register_fs(cls)

    return module
  except ImportError as ex:
    alog.verbose(f'Unable to import file system module "{modname}": {ex}')
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def register_fs_from_path(path, parent=None):
  """Scan `path` for '*_fs' modules and register their file systems."""
  for importer, modname, _ in pkgutil.iter_modules(path=path):
    if not modname.endswith('_fs'):
      continue
    try_register(importer, modname, parent=parent)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
@ro.run_once
def register_modules():
  """Discover and register the available file system implementations.

  Runs once per process: scans the bundled py_misc_utils.fs package, plus
  any extra folders listed in the colon-separated GFS_PATH environment
  variable.
  """
  import py_misc_utils.fs as pyfs

  register_fs_from_path(pyfs.__path__, parent='py_misc_utils.fs')

  gfs_path = os.getenv('GFS_PATH')
  if gfs_path:
    for path in gfs_path.split(':'):
      register_fs_from_path(path)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def get_proto_fs(proto, **kwargs):
  """Instantiate the file system class registered for protocol `proto`."""
  register_modules()

  fs_class = _FS_REGISTRY.get(proto)
  tas.check_is_not_none(fs_class, msg=f'Protocol "{proto}" not registered')

  return fs_class(**kwargs)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def resolve_fs(path, **kwargs):
  """Resolve `path` to its (file system, normalized path) pair.

  Recognized kwargs (popped before reaching the file system constructor):
    cache_dir: Overrides the global cache folder.
    cache_iface: Pre-built cache interface to be shared across file systems.
  """
  proto = get_proto(path)

  cachedir = chf.get_cache_dir(kwargs.pop('cache_dir', cache_dir()))

  cache_iface = kwargs.pop('cache_iface', None)
  if cache_iface is None:
    cache_iface = chf.CacheInterface(cachedir)

  fs = get_proto_fs(proto, cache_iface=cache_iface, cache_dir=cachedir, **kwargs)

  return fs, fs.norm_url(path)
|
|
371
|
+
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
from . import alog
|
|
6
|
+
from . import assert_checks as tas
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class GitRepo:
  """Thin wrapper driving the `git` command line tool on a local checkout."""

  def __init__(self, path):
    # Path of the (existing or to-be-cloned) local repository folder.
    self.path = path

  def _git(self, *cmd):
    # Build a git argv operating on this repository folder.
    git_cmd = ['git', '-C', self.path] + list(cmd)
    alog.debug(f'Running GIT: {git_cmd}')

    return git_cmd

  def _run(self, *cmd):
    # Flatten string/sequence arguments into a single argv list and run it.
    rcmd = []
    for arg in cmd:
      if isinstance(arg, (list, tuple)):
        rcmd.extend(arg)
      else:
        # Bug fix: this used to call the non-existent list.appen() method,
        # making any non-sequence argument raise AttributeError.
        rcmd.append(arg)

    subprocess.run(rcmd, capture_output=True, check=True)

  def _cmd(self, *cmd):
    self._run(self._git(*cmd))

  def _outcmd(self, *cmd, strip=False):
    # Run a git command and return its (decoded) standard output.
    output = subprocess.check_output(self._git(*cmd))
    if isinstance(output, bytes):
      output = output.decode()

    return output.strip() if strip else output

  def repo(self):
    """Return the URL of the `origin` remote."""
    return self._outcmd('config', '--get', 'remote.origin.url', strip=True)

  def clone(self, repo, force=False, shallow=False):
    """Clone `repo` into self.path, or refresh an existing checkout.

    An existing checkout is purged and re-cloned when `force` is set, or
    when its shallow-ness does not match `shallow`; otherwise it is pulled.
    """
    do_clone = True
    if os.path.isdir(self.path):
      tas.check_eq(repo, self.repo(), msg=f'Repo mismatch!')
      if force or shallow != self.is_shallow():
        alog.info(f'Purging old GIT folder: {self.path}')
        shutil.rmtree(self.path)
      else:
        self.pull()
        do_clone = False

    if do_clone:
      # Clone from the parent folder, with the checkout folder name as target.
      parent_path = os.path.dirname(self.path)
      os.makedirs(parent_path, exist_ok=True)
      git_cmd = ['git', '-C', parent_path, 'clone', '-q']
      if shallow:
        git_cmd += ['--depth', '1', repo, os.path.basename(self.path)]
      else:
        git_cmd += [repo, os.path.basename(self.path)]

      alog.debug(f'Running GIT: {git_cmd}')
      self._run(git_cmd)

  def current_commit(self):
    """Return the HEAD commit hash."""
    return self._outcmd('rev-parse', 'HEAD', strip=True)

  def is_shallow(self):
    """Return True if the checkout is a shallow clone."""
    return self._outcmd('rev-parse', '--is-shallow-repository', strip=True) == 'true'

  def pull(self):
    """Update the checkout from its remote."""
    self._cmd('pull', '-q')

  def checkout(self, commit):
    """Check out the given commit/branch/tag."""
    self._cmd('checkout', '-q', commit)
|
|
77
|
+
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# When using multiprocessing, there are two distinct behaviours if fork start method
|
|
2
|
+
# is used, WRT spawn/forkserver. In the latter case the global context
|
|
3
|
+
# accumulated by the running process is not pickled-through the child, so the new
|
|
4
|
+
# process start with wiped out global namespace.
|
|
5
|
+
# Using this API, together with the multiprocessing.create_process(), it is possible to
|
|
6
|
+
# have global data transfered to the child.
|
|
7
|
+
# All data stored in the global namespace must be pickle-able, unless fork_init is
|
|
8
|
+
# set to True.
|
|
9
|
+
# If the fork_init attribute is True, it means the variables data must be cleared
|
|
10
|
+
# within the child process, and not carried over (COW-ed) like it would happen when
|
|
11
|
+
# using the fork(2) system call.
|
|
12
|
+
# NOTE: This is a low level module which should have no explicit local dependencies.
|
|
13
|
+
|
|
14
|
+
import collections
|
|
15
|
+
import inspect
|
|
16
|
+
import multiprocessing
|
|
17
|
+
import os
|
|
18
|
+
import threading
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# The parent_fn function is called (if present) before the creation of a new process,
|
|
22
|
+
# within the parent, with the current value of the variable, and is supposed to be
|
|
23
|
+
# returning the "state" of such variable. The state must be pickle-able.
|
|
24
|
+
# The child_fn function is called (if present) after the creation of a new process,
|
|
25
|
+
# within the child, to restore a variable from its state (the value of the new variable
|
|
26
|
+
# should be returned).
|
|
27
|
+
# Per-variable descriptor of the process-global namespace. The parent_fn /
# child_fn hooks (see the comments above) snapshot and restore the variable
# data across process creation; fork_init marks variables that must be
# re-initialized in every process instead of being carried over.
Var = collections.namedtuple(
    'Var',
    'name, parent_fn, child_fn, data, fork_init, defval',
    defaults=(None, None, None, False, None))
|
|
31
|
+
|
|
32
|
+
# Process-wide variable namespace, and the lock guarding it.
_NS = dict()
_LOCK = threading.RLock()


def _child_fork():
  # After a fork(2): drop the fork_init variables (they must be re-created
  # in the child) and replace the lock, whose state is unreliable post-fork.
  global _NS, _LOCK

  _NS = {var.name: var for var in _NS.values() if not var.fork_init}
  _LOCK = threading.RLock()


if os.name == 'posix':
  os.register_at_fork(after_in_child=_child_fork)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def parent_switch(method):
  """Snapshot the namespace in the parent, before creating a child process.

  Returns the dict of Var entries (with parent_fn-processed data) to be
  transferred to the child and handed to child_switch().
  """
  assert method in multiprocessing.get_all_start_methods(), method

  pns = dict()
  with _LOCK:
    for var in _NS.values():
      # Variables with fork_init=True are the ones that are supposed to be
      # initialized in every process, and as such do not have to be carried over
      # from the parent context. Also, fork_init=True variables might contain data
      # which is not pickle-able, and carrying them over will fail.
      # NOTE(review): the exclusion below is reconstructed from the documented
      # intent above — fork_init variables are never added to the transferred
      # namespace (mirroring _child_fork() behavior).
      if not var.fork_init:
        if var.parent_fn is not None:
          data = var.parent_fn(var.data)
          if data is not var.data:
            # A None state from parent_fn drops the variable entirely.
            var = None if data is None else var._replace(data=data)

        if var is not None:
          pns[var.name] = var

  return pns
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def child_switch(method, ns):
  """Install, in the child process, the namespace captured by parent_switch()."""
  global _NS

  assert method in multiprocessing.get_all_start_methods(), method

  cns = dict()
  for var in ns.values():
    if var.child_fn is not None:
      restored = var.child_fn(var.data)
      if restored is not var.data:
        # A None restored state drops the variable in the child.
        if restored is None:
          continue
        var = var._replace(data=restored)

    cns[var.name] = var

  _NS = cns
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get(var, force=True):
  """Fetch the data of `var` from the global namespace.

  With force=True a missing variable is created from its default value
  (calling defval when it is a function).
  """
  with _LOCK:
    value = _NS.get(var.name)
    if value is None and force:
      default = var.defval() if inspect.isfunction(var.defval) else var.defval
      value = var._replace(data=default)
      _NS[value.name] = value

  return None if value is None else value.data
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def set(var, data):
  """Store `data` for `var`, returning the previously stored data (or None)."""
  with _LOCK:
    prev_value = _NS.get(var.name)
    _NS[var.name] = var._replace(data=data)

  return None if prev_value is None else prev_value.data
|
|
110
|
+
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
|
|
6
|
+
from . import async_manager as asym
|
|
7
|
+
from . import assert_checks as tas
|
|
8
|
+
from . import core_utils as cu
|
|
9
|
+
from . import file_overwrite as fow
|
|
10
|
+
from . import fin_wrap as fw
|
|
11
|
+
from . import gfs
|
|
12
|
+
from . import tempdir as tmpd
|
|
13
|
+
from . import utils as ut
|
|
14
|
+
from . import work_results as wres
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def http_fetch_url(url, context=None, path=None, http_args=None):
  """Fetch `url` and store the result (or the error) in the work file.

  Returns the work file path; fetch errors are recorded into the work file
  instead of being raised, so callers collect them via the work_results API.
  """
  wpath = wres.work_path(path, url)
  try:
    # A single AsyncClient instance is shared through the async context.
    client = await context.get('httpx.AsyncClient', httpx.AsyncClient)

    resp = await client.get(url, **http_args)
    resp.raise_for_status()

    with wres.write_result(wpath) as fd:
      fd.write(resp.content)
  except Exception as ex:
    wres.write_error(wpath, ex, workid=url)

  # Bug fix: this return used to live in a `finally` clause, which silently
  # swallowed BaseException (notably asyncio.CancelledError), preventing
  # task cancellation from propagating out of this coroutine.
  return wpath
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class HttpAsyncFetcher:
  """Parallel URL fetcher built on top of the AsyncManager work queue.

  Fetched content is staged as work-result files inside a (possibly
  temporary) folder, and consumed via wait() or iter_results().
  """

  def __init__(self,
               path=None,
               num_workers=None,
               http_args=None,
               mpctx=None):
    # Staging folder supplied by the caller; None selects a temporary one
    # created at start() time (and removed by the finalizer).
    self._ctor_path = path
    self._path = None
    self._num_workers = num_workers
    # HTTP arguments passed to every fetch, with a timeout fallback taken
    # from the FETCHER_TIMEO environment variable (10s default).
    self._http_args = ut.dict_setmissing(
      http_args or dict(),
      timeout=ut.getenv('FETCHER_TIMEO', dtype=float, defval=10.0),
    )
    self._mpctx = mpctx
    self._async_manager = None
    # URLs enqueued and whose result has not been retired yet.
    self._pending = set()

  @classmethod
  def _cleaner(cls, self):
    # Finalizer: shut the async manager down and, when the staging folder
    # was created by start() (not caller-supplied), remove it.
    self._async_manager.close()
    if self._path != self._ctor_path:
      gfs.rmtree(self._path, ignore_errors=True)

  def start(self):
    """Start the fetcher, creating its staging folder and async manager."""
    if self._ctor_path is None:
      self._path = tmpd.fastfs_dir()
    else:
      self._path = self._ctor_path

    async_manager = asym.AsyncManager(**cu.denone(num_workers=self._num_workers,
                                                  mpctx=self._mpctx))

    # Wrap the manager with a finalizer so resources are reclaimed even if
    # shutdown() is never called explicitly.
    finfn = functools.partial(self._cleaner,
                              cu.object_context(self, _async_manager=async_manager))
    fw.fin_wrap(self, '_async_manager', async_manager, finfn=finfn)

  def shutdown(self):
    """Stop the fetcher, triggering cleanup and dropping pending state."""
    async_manager = self._async_manager
    if async_manager is not None:
      fw.fin_wrap(self, '_async_manager', None, cleanup=True)
      self._path = None
      self._pending = set()

  def enqueue(self, *urls):
    """Queue URLs for fetching; returns a {url: work_hash} mapping.

    Falsy URLs are skipped.
    """
    wmap = dict()
    for url in urls:
      if url:
        work_ctor = functools.partial(http_fetch_url, url,
                                      path=self._path,
                                      http_args=self._http_args)
        self._async_manager.enqueue_work(url, work_ctor)
        self._pending.add(url)
        wmap[url] = wres.work_hash(url)

    return wmap

  def wait(self, url):
    """Block until `url` has been fetched, returning its work data.

    Raises if the fetch recorded an error; the work file is removed once
    consumed, so each URL can be waited on only once.
    """
    wpath = wres.work_path(self._path, url)
    if not os.path.isfile(wpath):
      tas.check(url in self._pending, msg=f'URL already retired: {url}')

      # Drain results until the requested URL shows up.
      while self._pending:
        (rurl, result) = self._async_manager.fetch_result()

        self._pending.discard(rurl)
        wres.raise_if_error(result)
        if rurl == url:
          break

    try:
      return wres.get_work(wpath)
    finally:
      os.remove(wpath)

  def iter_results(self, max_results=None, block=True, timeout=None):
    """Yield (url, work_data) tuples as fetches complete.

    Args:
      max_results: Optional cap on the number of results yielded.
      block: Whether to wait for results to become available.
      timeout: Optional wait timeout handed to the async manager.
    """
    count = 0
    while self._pending:
      if (fetchres := self._async_manager.fetch_result(block=block,
                                                       timeout=timeout)) is None:
        break

      rurl, result = fetchres

      self._pending.discard(rurl)
      wpath = wres.work_path(self._path, rurl)

      wdata = wres.load_work(wpath)

      # Work files are one-shot; remove once loaded.
      os.remove(wpath)

      yield rurl, wdata

      count += 1
      if max_results is not None and count >= max_results:
        break

  def __enter__(self):
    self.start()

    return self

  def __exit__(self, *exc):
    self.shutdown()

    return False
|
|
139
|
+
|