python-misc-utils 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. py_misc_utils/__init__.py +0 -0
  2. py_misc_utils/abs_timeout.py +12 -0
  3. py_misc_utils/alog.py +311 -0
  4. py_misc_utils/app_main.py +179 -0
  5. py_misc_utils/archive_streamer.py +112 -0
  6. py_misc_utils/assert_checks.py +118 -0
  7. py_misc_utils/ast_utils.py +121 -0
  8. py_misc_utils/async_manager.py +189 -0
  9. py_misc_utils/break_control.py +63 -0
  10. py_misc_utils/buffered_iterator.py +35 -0
  11. py_misc_utils/cached_file.py +507 -0
  12. py_misc_utils/call_limiter.py +26 -0
  13. py_misc_utils/call_result_selector.py +13 -0
  14. py_misc_utils/cleanups.py +85 -0
  15. py_misc_utils/cmd.py +97 -0
  16. py_misc_utils/compression.py +116 -0
  17. py_misc_utils/cond_waiter.py +13 -0
  18. py_misc_utils/context_base.py +18 -0
  19. py_misc_utils/context_managers.py +67 -0
  20. py_misc_utils/core_utils.py +577 -0
  21. py_misc_utils/daemon_process.py +252 -0
  22. py_misc_utils/data_cache.py +46 -0
  23. py_misc_utils/date_utils.py +90 -0
  24. py_misc_utils/debug.py +24 -0
  25. py_misc_utils/dyn_modules.py +50 -0
  26. py_misc_utils/dynamod.py +103 -0
  27. py_misc_utils/env_config.py +35 -0
  28. py_misc_utils/executor.py +239 -0
  29. py_misc_utils/file_overwrite.py +29 -0
  30. py_misc_utils/fin_wrap.py +77 -0
  31. py_misc_utils/fp_utils.py +47 -0
  32. py_misc_utils/fs/__init__.py +0 -0
  33. py_misc_utils/fs/file_fs.py +127 -0
  34. py_misc_utils/fs/ftp_fs.py +242 -0
  35. py_misc_utils/fs/gcs_fs.py +196 -0
  36. py_misc_utils/fs/http_fs.py +241 -0
  37. py_misc_utils/fs/s3_fs.py +417 -0
  38. py_misc_utils/fs_base.py +133 -0
  39. py_misc_utils/fs_utils.py +207 -0
  40. py_misc_utils/gcs_fs.py +169 -0
  41. py_misc_utils/gen_indices.py +54 -0
  42. py_misc_utils/gfs.py +371 -0
  43. py_misc_utils/git_repo.py +77 -0
  44. py_misc_utils/global_namespace.py +110 -0
  45. py_misc_utils/http_async_fetcher.py +139 -0
  46. py_misc_utils/http_server.py +196 -0
  47. py_misc_utils/http_utils.py +143 -0
  48. py_misc_utils/img_utils.py +20 -0
  49. py_misc_utils/infix_op.py +20 -0
  50. py_misc_utils/inspect_utils.py +205 -0
  51. py_misc_utils/iostream.py +21 -0
  52. py_misc_utils/iter_file.py +117 -0
  53. py_misc_utils/key_wrap.py +46 -0
  54. py_misc_utils/lazy_import.py +25 -0
  55. py_misc_utils/lockfile.py +164 -0
  56. py_misc_utils/mem_size.py +64 -0
  57. py_misc_utils/mirror_from.py +72 -0
  58. py_misc_utils/mmap.py +16 -0
  59. py_misc_utils/module_utils.py +196 -0
  60. py_misc_utils/moving_average.py +19 -0
  61. py_misc_utils/msgpack_streamer.py +26 -0
  62. py_misc_utils/multi_wait.py +24 -0
  63. py_misc_utils/multiprocessing.py +102 -0
  64. py_misc_utils/named_array.py +224 -0
  65. py_misc_utils/no_break.py +46 -0
  66. py_misc_utils/no_except.py +32 -0
  67. py_misc_utils/np_ml_framework.py +184 -0
  68. py_misc_utils/np_utils.py +346 -0
  69. py_misc_utils/ntuple_utils.py +38 -0
  70. py_misc_utils/num_utils.py +54 -0
  71. py_misc_utils/obj.py +73 -0
  72. py_misc_utils/object_cache.py +100 -0
  73. py_misc_utils/object_tracker.py +88 -0
  74. py_misc_utils/ordered_set.py +71 -0
  75. py_misc_utils/osfd.py +27 -0
  76. py_misc_utils/packet.py +22 -0
  77. py_misc_utils/parquet_streamer.py +69 -0
  78. py_misc_utils/pd_utils.py +254 -0
  79. py_misc_utils/periodic_task.py +61 -0
  80. py_misc_utils/pickle_wrap.py +121 -0
  81. py_misc_utils/pipeline.py +98 -0
  82. py_misc_utils/remap_pickle.py +50 -0
  83. py_misc_utils/resource_manager.py +155 -0
  84. py_misc_utils/rnd_utils.py +56 -0
  85. py_misc_utils/run_once.py +19 -0
  86. py_misc_utils/scheduler.py +135 -0
  87. py_misc_utils/select_params.py +300 -0
  88. py_misc_utils/signal.py +141 -0
  89. py_misc_utils/skl_utils.py +270 -0
  90. py_misc_utils/split.py +147 -0
  91. py_misc_utils/state.py +53 -0
  92. py_misc_utils/std_module.py +56 -0
  93. py_misc_utils/stream_dataframe.py +176 -0
  94. py_misc_utils/streamed_file.py +144 -0
  95. py_misc_utils/tempdir.py +79 -0
  96. py_misc_utils/template_replace.py +51 -0
  97. py_misc_utils/tensor_stream.py +269 -0
  98. py_misc_utils/thread_context.py +33 -0
  99. py_misc_utils/throttle.py +30 -0
  100. py_misc_utils/time_trigger.py +18 -0
  101. py_misc_utils/timegen.py +11 -0
  102. py_misc_utils/traceback.py +49 -0
  103. py_misc_utils/tracking_executor.py +91 -0
  104. py_misc_utils/transform_array.py +42 -0
  105. py_misc_utils/uncompress.py +35 -0
  106. py_misc_utils/url_fetcher.py +157 -0
  107. py_misc_utils/utils.py +538 -0
  108. py_misc_utils/varint.py +50 -0
  109. py_misc_utils/virt_array.py +52 -0
  110. py_misc_utils/weak_call.py +33 -0
  111. py_misc_utils/work_results.py +100 -0
  112. py_misc_utils/writeback_file.py +43 -0
  113. python_misc_utils-0.2.dist-info/METADATA +36 -0
  114. python_misc_utils-0.2.dist-info/RECORD +117 -0
  115. python_misc_utils-0.2.dist-info/WHEEL +5 -0
  116. python_misc_utils-0.2.dist-info/licenses/LICENSE +13 -0
  117. python_misc_utils-0.2.dist-info/top_level.txt +1 -0
py_misc_utils/gfs.py ADDED
@@ -0,0 +1,371 @@
1
+ import collections
2
+ import contextlib
3
+ import importlib
4
+ import os
5
+ import pkgutil
6
+ import re
7
+ import shutil
8
+ import sys
9
+ import urllib.parse as uparse
10
+
11
+ from . import alog
12
+ from . import assert_checks as tas
13
+ from . import cached_file as chf
14
+ from . import context_managers as cm
15
+ from . import fs_utils as fsu
16
+ from . import mirror_from as mrf
17
+ from . import run_once as ro
18
+
19
+
20
class TempFile:
  """Temporary file opened through the file system resolved for its path.

  The temporary path comes from `fsu.temp_path()`. Used as a context manager,
  the file is opened on enter and, on exit, closed and removed unless
  `replace()` has already moved it to its final destination.
  """

  def __init__(self, nsdir=None, nspath=None, **kwargs):
    # Location hints which are not local paths are dropped before asking for
    # the temporary path.
    if nsdir is not None and not is_local_path(nsdir):
      nsdir = None
    if nspath is not None and not is_local_path(nspath):
      nspath = None

    tmp_path = fsu.temp_path(nspath=nspath, nsdir=nsdir)
    self._fs, self._path = resolve_fs(tmp_path, **kwargs)
    # The same keyword arguments are forwarded again to fs.open().
    self._kwargs = kwargs
    self._fd = None
    self._delete = False

  def open(self):
    """Open the temporary file, arm its removal, and return `self`."""
    self._fd = self._fs.open(self._path, **self._kwargs)
    self._delete = True
    # Presumably exposes the file object API on this instance -- confirm in
    # the mirror_from module.
    mrf.mirror_all(self._fd, self, name='fd')

    return self

  def _close_fd(self):
    """Close the underlying file object (if open) and undo the mirroring."""
    if self._fd is None:
      return

    self._fd.close()
    mrf.unmirror(self, name='fd')
    self._fd = None

  def close(self):
    """Close the file and remove it, unless removal has been disarmed."""
    self._close_fd()
    if not self._delete:
      return

    self._fs.remove(self._path)
    self._delete = False

  def replace(self, path):
    """Close the file and move it over `path`, disarming the removal."""
    self._close_fd()
    replace(self._path, path, src_fs=self._fs)
    self._delete = False

  def __enter__(self):
    self.open()

    return self

  def __exit__(self, *exc):
    self.close()

    # Never suppress exceptions raised inside the with block.
    return False
63
+
64
+
65
+ _STD_FILES = {
66
+ 'STDIN': sys.stdin,
67
+ 'STDOUT': sys.stdout,
68
+ 'STDERR': sys.stderr,
69
+ }
70
+
71
+ def std_open(path, **kwargs):
72
+ if isinstance(path, str) and (sfd := _STD_FILES.get(path)) is not None:
73
+ return contextlib.nullcontext(sfd)
74
+
75
+ return open(path, **kwargs)
76
+
77
+
78
def open(source, **kwargs):
  """Open `source` through the file system which handles its path.

  When `source` is neither a string nor an `os.PathLike` object, it is
  returned wrapped in a null context, untouched.
  """
  path = path_of(source)
  if path is None:
    return contextlib.nullcontext(source)

  fs, fpath = resolve_fs(path, **kwargs)

  return fs.open(fpath, **kwargs)
85
+
86
+
87
def open_local(path, **kwargs):
  """Open `path` by forwarding every argument to this module's `open()`."""
  stream = open(path, **kwargs)

  return stream
89
+
90
+
91
def maybe_open(path, **kwargs):
  """Best-effort `open()`: return the opened object, or None on failure.

  Only `Exception` subclasses are swallowed. `KeyboardInterrupt`, `SystemExit`
  and other `BaseException`s propagate, so an interrupt or exit request is
  never silently turned into a None result.
  """
  try:
    return open(path, **kwargs)
  except Exception:
    return None
96
+
97
+
98
def as_local(path, **kwargs):
  """Return the result of `as_local()` on the file system resolved for `path`."""
  target_fs, target_path = resolve_fs(path, **kwargs)
  local = target_fs.as_local(target_path, **kwargs)

  return local
102
+
103
+
104
def path_of(path):
  """Return the `os.fspath()` of a str or `os.PathLike` object, otherwise None."""
  if isinstance(path, (str, os.PathLike)):
    return os.fspath(path)

  return None
106
+
107
+
108
PathSplit = collections.namedtuple('PathSplit', ['base', 'ext', 'purl'])


def splitext(path):
  """Split the path component of `path` (plain path or URL) at its extension.

  Returns a `PathSplit` holding the path without extension (`base`), the
  extension without the leading dot (`ext`, empty when there is none) and the
  full `urlparse()` result (`purl`).
  """
  parsed = uparse.urlparse(path)
  stem, dotted_ext = os.path.splitext(parsed.path)

  return PathSplit(stem, dotted_ext[1:], parsed)
115
+
116
+
117
def is_file(path):
  """Return `isfile()` for `path`, as answered by its resolved file system."""
  target_fs, target_path = resolve_fs(path)
  result = target_fs.isfile(target_path)

  return result
121
+
122
+
123
def is_dir(path):
  """Return `isdir()` for `path`, as answered by its resolved file system."""
  target_fs, target_path = resolve_fs(path)
  result = target_fs.isdir(target_path)

  return result
127
+
128
+
129
def exists(path):
  """Return `exists()` for `path`, as answered by its resolved file system."""
  target_fs, target_path = resolve_fs(path)
  result = target_fs.exists(target_path)

  return result
133
+
134
+
135
def is_same_fs(*args):
  """Tell whether all the given objects (exposing a `path` attribute) share a file system.

  Paths with a URL scheme are compared by (scheme, netloc). Paths without a
  scheme are compared by the local protocol plus the mount reported by
  `fsu.localfs_mount()`. Zero or one argument yields True.
  """

  def _fs_spec(fspath):
    purl = uparse.urlparse(fspath.path)
    if purl.scheme:
      return purl.scheme, purl.netloc

    return _DEFAULT_LOCAL_PROTO, fsu.localfs_mount(purl.path)

  specs = [_fs_spec(fspath) for fspath in args]

  return all(specs[0] == other for other in specs[1:])
145
+
146
+
147
+ _DEFAULT_LOCAL_PROTO = 'file'
148
+
149
+ def is_local_proto(proto):
150
+ return proto == _DEFAULT_LOCAL_PROTO
151
+
152
+
153
def is_local_fs(fs):
  """Tell whether the `ID` of the file system object is the local protocol."""
  fs_id = fs.ID

  return is_local_proto(fs_id)
155
+
156
+
157
def is_local_path(path):
  """Tell whether the protocol of `path` is the local file system one."""
  proto = get_proto(path)

  return is_local_proto(proto)
159
+
160
+
161
def is_path(path):
  """Tell whether the string looks like a path.

  It is a path if it contains a proto, or starts with '/', './' or '../'.
  """
  if has_proto(path):
    return True

  return re.match(r'/|\.\.?/', path) is not None
164
+
165
+
166
def has_proto(path):
  """Tell whether `path` starts with a "proto://" prefix."""
  found = re.match(r'\w+://', path)

  return found is not None
168
+
169
+
170
def get_proto(path):
  """Return the lowercased "proto://" prefix name of `path`.

  Paths without such a prefix get the default local protocol.
  """
  found = re.match(r'(\w+)://', path)
  if found is None:
    return _DEFAULT_LOCAL_PROTO

  return found.group(1).lower()
174
+
175
+
176
FsPath = collections.namedtuple('FsPath', ['fs', 'path'])


def resolve_paths(*paths):
  """Turn every argument into an `FsPath` and return them as a tuple.

  An argument is either a plain path or an (fs, path) list/tuple. When no file
  system is given (plain path, or fs is None), `resolve_fs()` provides both
  the file system and the path.
  """

  def _resolve_one(path_arg):
    if isinstance(path_arg, (list, tuple)):
      fs, path = path_arg
    else:
      fs, path = None, path_arg
    if fs is None:
      fs, path = resolve_fs(path)

    return FsPath(fs, path)

  return tuple(_resolve_one(path_arg) for path_arg in paths)
191
+
192
+
193
def copy(src_path, dest_path, src_fs=None, dest_fs=None):
  """Copy a file, possibly across file systems.

  File systems which are not passed explicitly are resolved from the paths.
  The actual copy is performed by the source file system `copyfile()`.
  """
  source, target = resolve_paths((src_fs, src_path), (dest_fs, dest_path))

  source.fs.copyfile(source.path, target.fs, target.path)
197
+
198
+
199
def replace(src_path, dest_path, src_fs=None, dest_fs=None):
  """Move `src_path` over `dest_path`.

  Within the same file system, the destination file system `replace()` is
  used. Otherwise the file is copied and the source is then removed.
  """
  src, dest = resolve_paths((src_fs, src_path), (dest_fs, dest_path))

  if not is_same_fs(src, dest):
    copy(src.path, dest.path, src_fs=src.fs, dest_fs=dest.fs)
    src.fs.remove(src.path)
    return

  dest.fs.replace(src.path, dest.path)
207
+
208
+
209
def remove(path):
  """Call `remove()` for `path` on its resolved file system."""
  target_fs, target_path = resolve_fs(path)

  target_fs.remove(target_path)
212
+
213
+
214
def mkdir(path, **kwargs):
  """Call `mkdir()` for `path` on its resolved file system, forwarding `kwargs`."""
  target_fs, target_path = resolve_fs(path)

  target_fs.mkdir(target_path, **kwargs)
217
+
218
+
219
def makedirs(path, **kwargs):
  """Call `makedirs()` for `path` on its resolved file system, forwarding `kwargs`."""
  target_fs, target_path = resolve_fs(path)

  target_fs.makedirs(target_path, **kwargs)
222
+
223
+
224
def rmdir(path):
  """Call `rmdir()` for `path` on its resolved file system."""
  target_fs, target_path = resolve_fs(path)

  target_fs.rmdir(target_path)
227
+
228
+
229
def rmtree(path, **kwargs):
  """Call `rmtree()` for `path` on its resolved file system, forwarding `kwargs`."""
  target_fs, target_path = resolve_fs(path)

  target_fs.rmtree(target_path, **kwargs)
232
+
233
+
234
def stat(path):
  """Return `stat()` for `path`, as answered by its resolved file system."""
  target_fs, target_path = resolve_fs(path)
  info = target_fs.stat(target_path)

  return info
238
+
239
+
240
def link(src_path, dest_path):
  """Create a link from `src_path` to `dest_path` via the source file system.

  The check fails when the two paths do not live on the same file system.
  """
  src, dest = resolve_paths(src_path, dest_path)
  same_fs = is_same_fs(src, dest)

  tas.check(same_fs,
            msg=f'Unable to link across file systems: {src_path} -> {dest_path}')

  src.fs.link(src.path, dest.path)
247
+
248
+
249
def symlink(src_path, dest_path):
  """Create a symlink from `src_path` to `dest_path` via the source file system.

  The check fails when the two paths do not live on the same file system.
  """
  src, dest = resolve_paths(src_path, dest_path)
  same_fs = is_same_fs(src, dest)

  tas.check(same_fs,
            msg=f'Unable to symlink across file systems: {src_path} -> {dest_path}')

  src.fs.symlink(src.path, dest.path)
256
+
257
+
258
class RegexMatcher:
  """Callable predicate matching values against a regular expression.

  The result of the most recent call is kept in the `match` attribute (a match
  object, or None when the last value did not match).
  """

  def __init__(self, rex):
    self._rex = re.compile(rex)
    self.match = None

  def __call__(self, value):
    # Anchored at the start of the value (re.match semantics).
    found = self._rex.match(value)
    self.match = found

    return found is not None
268
+
269
+
270
def enumerate_files(path, matcher=None, return_stats=False):
  """Generate the names of the entries listed by the file system under `path`.

  When `matcher` is given, only the entries whose name it accepts are
  produced. With `return_stats` set, (name, entry) pairs are yielded instead
  of bare names, where entry is the object returned by the file system
  `list()`.
  """
  fs, fpath = resolve_fs(path)

  for entry in fs.list(fpath):
    if matcher is not None and not matcher(entry.name):
      continue

    yield (entry.name, entry) if return_stats else entry.name
279
+
280
+
281
def normpath(path):
  """Return the path component produced by `resolve_fs()` for `path`."""
  resolved = resolve_fs(path)

  return resolved[1]
285
+
286
+
287
# Cache folder: the CACHE_DIR environment variable when set, otherwise the
# ".cache" folder under the home directory reported by fsu.home().
_CACHE_DIR = fsu.normpath(
  os.environ.get('CACHE_DIR', os.path.join(fsu.home(), '.cache'))
)
289
+
290
def cache_dir():
  """Return the currently configured cache directory."""
  current = _CACHE_DIR

  return current
292
+
293
+
294
def set_cache_dir(path):
  """Set the module-wide cache directory to the normalized form of `path`."""
  global _CACHE_DIR

  normalized = fsu.normpath(path)
  _CACHE_DIR = normalized
298
+
299
+
300
def find_mount(path):
  """Return `fsu.localfs_mount()` for a local `path`, or None for non-local ones."""
  fs, fpath = resolve_fs(path)
  if not is_local_fs(fs):
    return None

  return fsu.localfs_mount(fpath)
304
+
305
+
306
# Maps a file system ID (the protocol name looked up by get_proto_fs()) to
# the class implementing it.
_FS_REGISTRY = {}


def register_fs(cls):
  """Register `cls` under every ID listed in its `IDS` attribute.

  A later registration for the same ID replaces the previous one.
  """
  for fs_id in cls.IDS:
    alog.debug(f'Registering file system: {fs_id}')
    _FS_REGISTRY[fs_id] = cls
312
+
313
+
314
def try_register(importer, modname, parent=None):
  """Import a file system module and register the classes it exports.

  With `parent` set, the module is imported as `parent.modname`. Otherwise it
  is loaded from the spec found by `importer`. Every class listed in the
  module `FILE_SYSTEMS` attribute (if any) is registered.

  Returns the imported module, or None when an ImportError is raised (the
  failure is only logged).
  """
  try:
    if parent is not None:
      module = importlib.import_module(f'{parent}.{modname}')
    else:
      spec = importer.find_spec(modname)
      module = importlib.util.module_from_spec(spec)
      spec.loader.exec_module(module)

    for fs_cls in getattr(module, 'FILE_SYSTEMS', ()):
      register_fs(fs_cls)

    return module
  except ImportError as ex:
    alog.verbose(f'Unable to import file system module "{modname}": {ex}')

  return None
330
+
331
+
332
def register_fs_from_path(path, parent=None):
  """Try to register every module found under `path` whose name ends with '_fs'.

  `path` is handed as is to `pkgutil.iter_modules()`.
  """
  for importer, modname, _ in pkgutil.iter_modules(path=path):
    if not modname.endswith('_fs'):
      continue

    try_register(importer, modname, parent=parent)
336
+
337
+
338
@ro.run_once
def register_modules():
  """Register the bundled file systems plus the ones found along GFS_PATH.

  The bundled modules are the ones in the `py_misc_utils.fs` package. The
  GFS_PATH environment variable, when set, is a ':' separated list of extra
  folders to scan for '*_fs' modules.
  """
  import py_misc_utils.fs as pyfs

  register_fs_from_path(pyfs.__path__, parent='py_misc_utils.fs')

  gfs_path = os.getenv('GFS_PATH')
  if gfs_path:
    for path in gfs_path.split(':'):
      # Skip empty entries, such as the one produced by a trailing ':'.
      if not path:
        continue

      # pkgutil.iter_modules() only accepts a list of paths, and raises
      # ValueError when handed a bare string.
      register_fs_from_path([path])
348
+
349
+
350
def get_proto_fs(proto, **kwargs):
  """Instantiate the file system class registered for `proto`.

  File system modules are registered on demand before the lookup. The check
  fails when no class is registered for the protocol. `kwargs` are passed to
  the class constructor.
  """
  register_modules()

  fs_cls = _FS_REGISTRY.get(proto)
  tas.check_is_not_none(fs_cls, msg=f'Protocol "{proto}" not registered')

  instance = fs_cls(**kwargs)

  return instance
357
+
358
+
359
def resolve_fs(path, **kwargs):
  """Return the (file system, normalized path) pair which handles `path`.

  The `cache_dir` and `cache_iface` keyword arguments are consumed here: the
  former defaults to the module cache directory, the latter to a new
  `chf.CacheInterface` over the cache directory. The remaining keyword
  arguments are passed to the file system constructor.
  """
  proto = get_proto(path)

  requested_dir = kwargs.pop('cache_dir', cache_dir())
  cachedir = chf.get_cache_dir(requested_dir)

  cache_iface = kwargs.pop('cache_iface', None)
  if cache_iface is None:
    cache_iface = chf.CacheInterface(cachedir)

  fs = get_proto_fs(proto, cache_iface=cache_iface, cache_dir=cachedir, **kwargs)
  norm_path = fs.norm_url(path)

  return fs, norm_path
371
+
@@ -0,0 +1,77 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+
5
+ from . import alog
6
+ from . import assert_checks as tas
7
+
8
+
9
class GitRepo:
  """Thin wrapper running `git` commands against the checkout at `path`."""

  def __init__(self, path):
    self.path = path

  def _git(self, *cmd):
    """Build (and debug-log) the `git -C <path> ...` argument list for `cmd`."""
    git_cmd = ['git', '-C', self.path, *cmd]
    alog.debug(f'Running GIT: {git_cmd}')

    return git_cmd

  def _run(self, *cmd):
    """Run a command, flattening list/tuple arguments into a single argv.

    Output is captured and discarded. A non-zero exit status raises
    `subprocess.CalledProcessError`.
    """
    rcmd = []
    for arg in cmd:
      if isinstance(arg, (list, tuple)):
        rcmd.extend(arg)
      else:
        rcmd.append(arg)

    subprocess.run(rcmd, capture_output=True, check=True)

  def _cmd(self, *cmd):
    """Run a git sub-command within the repository, discarding its output."""
    self._run(self._git(*cmd))

  def _outcmd(self, *cmd, strip=False):
    """Run a git sub-command and return its decoded output, optionally stripped."""
    output = subprocess.check_output(self._git(*cmd))
    if isinstance(output, bytes):
      output = output.decode()

    return output.strip() if strip else output

  def repo(self):
    """Return the URL configured for the `origin` remote."""
    return self._outcmd('config', '--get', 'remote.origin.url', strip=True)

  def clone(self, repo, force=False, shallow=False):
    """Make sure `path` holds a clone of `repo`.

    An existing folder must already point to `repo` (checked). It is purged
    and cloned again when `force` is set or its shallowness differs from
    `shallow`, otherwise it is simply pulled.
    """
    do_clone = True
    if os.path.isdir(self.path):
      tas.check_eq(repo, self.repo(), msg='Repo mismatch!')
      if force or shallow != self.is_shallow():
        alog.info(f'Purging old GIT folder: {self.path}')
        shutil.rmtree(self.path)
      else:
        self.pull()
        do_clone = False

    if do_clone:
      # A path without a directory part (e.g. "myrepo") has an empty dirname,
      # which os.makedirs() rejects: clone within the current folder instead.
      parent_path = os.path.dirname(self.path) or os.curdir
      os.makedirs(parent_path, exist_ok=True)
      git_cmd = ['git', '-C', parent_path, 'clone', '-q']
      if shallow:
        git_cmd += ['--depth', '1']
      git_cmd += [repo, os.path.basename(self.path)]

      alog.debug(f'Running GIT: {git_cmd}')
      self._run(git_cmd)

  def current_commit(self):
    """Return the commit hash `HEAD` points to."""
    return self._outcmd('rev-parse', 'HEAD', strip=True)

  def is_shallow(self):
    """Tell whether the repository is a shallow clone."""
    return self._outcmd('rev-parse', '--is-shallow-repository', strip=True) == 'true'

  def pull(self):
    """Run `git pull -q` within the repository."""
    self._cmd('pull', '-q')

  def checkout(self, commit):
    """Run `git checkout -q <commit>` within the repository."""
    self._cmd('checkout', '-q', commit)
77
+
@@ -0,0 +1,110 @@
1
+ # When using multiprocessing, there are two distinct behaviours if fork start method
2
+ # is used, WRT spawn/forkserver. In the latter case the global context
3
+ # accumulated by the running process is not pickled-through the child, so the new
4
+ # process starts with a wiped out global namespace.
5
+ # Using this API, together with the multiprocessing.create_process(), it is possible to
6
+ # have global data transferred to the child.
7
+ # All data stored in the global namespace must be pickle-able, unless fork_init is
8
+ # set to True.
9
+ # If the fork_init attribute is True, it means the variables data must be cleared
10
+ # within the child process, and not carried over (COW-ed) like it would happen when
11
+ # using the fork(2) system call.
12
+ # NOTE: This is a low level module which should have no explicit local dependencies.
13
+
14
+ import collections
15
+ import inspect
16
+ import multiprocessing
17
+ import os
18
+ import threading
19
+
20
+
21
+ # The parent_fn function is called (if present) before the creation of a new process,
22
+ # within the parent, with the current value of the variable, and is supposed to be
23
+ # returning the "state" of such variable. The state must be pickle-able.
24
+ # The child_fn function is called (if present) after the creation of a new process,
25
+ # within the child, to restore a variable from its state (the value of the new variable
26
+ # should be returned).
27
# Descriptor of a global namespace variable. Every field but the name is
# optional: fork_init defaults to False, all the others to None.
Var = collections.namedtuple(
  'Var',
  ['name', 'parent_fn', 'child_fn', 'data', 'fork_init', 'defval'],
  defaults=(None, None, None, False, None),
)
31
+
32
+ _NS = dict()
33
+ _LOCK = threading.RLock()
34
+
35
+
36
def _child_fork():
  """At-fork hook run within the child process.

  Drops the variables flagged with fork_init (they must be initialized afresh
  in every process) and installs a new lock.
  """
  global _NS, _LOCK

  _NS = {var.name: var for var in _NS.values() if not var.fork_init}
  _LOCK = threading.RLock()


if os.name == 'posix':
  os.register_at_fork(after_in_child=_child_fork)
50
+
51
+
52
def parent_switch(method):
  """Build, within the parent, the namespace to hand over to a new process.

  Returns a dictionary (name -> Var) holding every variable not flagged with
  fork_init. When a variable has a parent_fn, it is called with the current
  data and its result becomes the data being carried over, or drops the
  variable altogether when the result is None (while the data was not).
  """
  assert method in multiprocessing.get_all_start_methods(), method

  carried = {}
  with _LOCK:
    for var in _NS.values():
      # Variables with fork_init=True are the ones that are supposed to be
      # initialized in every process, and as such do not have to be carried
      # over from the parent context. They might also contain data which is
      # not pickle-able, and carrying them over will fail.
      if var.fork_init:
        continue

      if var.parent_fn is not None:
        state = var.parent_fn(var.data)
        if state is not var.data:
          if state is None:
            continue
          var = var._replace(data=state)

      carried[var.name] = var

  return carried
72
+
73
+
74
def child_switch(method, ns):
  """Install, within the child, the namespace received from the parent.

  When a variable has a child_fn, it is called with the carried data and its
  result becomes the variable data, or drops the variable altogether when the
  result is None (while the carried data was not). The resulting dictionary
  replaces the module namespace.
  """
  global _NS

  assert method in multiprocessing.get_all_start_methods(), method

  restored = {}
  for var in ns.values():
    if var.child_fn is not None:
      value = var.child_fn(var.data)
      if value is not var.data:
        if value is None:
          continue
        var = var._replace(data=value)

    restored[var.name] = var

  _NS = restored
90
+
91
+
92
def get(var, force=True):
  """Return the data stored in the namespace for `var`.

  When the variable is missing and `force` is set, it is created from its
  default value (calling `var.defval` when it is a function) and stored.
  Returns None when the variable is missing and `force` is not set.
  """
  with _LOCK:
    entry = _NS.get(var.name)
    if entry is None and force:
      defval = var.defval
      entry = var._replace(data=defval() if inspect.isfunction(defval) else defval)
      _NS[entry.name] = entry

  if entry is None:
    return None

  return entry.data
101
+
102
+
103
def set(var, data):
  """Store `data` for `var`, returning the previously stored data (None if any)."""
  with _LOCK:
    previous = _NS.get(var.name)
    updated = var._replace(data=data)
    _NS[updated.name] = updated

  return None if previous is None else previous.data
110
+
@@ -0,0 +1,139 @@
1
+ import functools
2
+ import os
3
+
4
+ import httpx
5
+
6
+ from . import async_manager as asym
7
+ from . import assert_checks as tas
8
+ from . import core_utils as cu
9
+ from . import file_overwrite as fow
10
+ from . import fin_wrap as fw
11
+ from . import gfs
12
+ from . import tempdir as tmpd
13
+ from . import utils as ut
14
+ from . import work_results as wres
15
+
16
+
17
async def http_fetch_url(url, context=None, path=None, http_args=None):
  """Fetch `url` and store the outcome in its work file, whose path is returned.

  On success the response content is written as the work result. Any
  `Exception` raised while fetching or writing is recorded in the work file
  via `wres.write_error()` instead of being raised. `BaseException`s which
  are not `Exception`s (task cancellation, `KeyboardInterrupt`) propagate to
  the caller.

  Args:
    url: The URL to fetch.
    context: Object whose awaitable `get()` provides the shared HTTP client.
    path: Folder the work file path is derived from.
    http_args: Optional keyword arguments for the client `get()` call.
  """
  wpath = wres.work_path(path, url)
  try:
    client = await context.get('httpx.AsyncClient', httpx.AsyncClient)

    # http_args defaults to None, which cannot be **-expanded.
    resp = await client.get(url, **(http_args or {}))
    resp.raise_for_status()

    with wres.write_result(wpath) as fd:
      fd.write(resp.content)
  except Exception as ex:
    wres.write_error(wpath, ex, workid=url)

  # Returning from here, instead of from a finally clause, avoids silently
  # discarding in-flight exceptions such as asyncio.CancelledError.
  return wpath
31
+
32
+
33
class HttpAsyncFetcher:
  """Fetches URLs through an `AsyncManager`, storing every result in a work file.

  URLs are submitted with `enqueue()` and collected either one at a time with
  `wait()`, or in completion order with `iter_results()`. Work files live
  under `path` when given, otherwise under a folder obtained from
  `tmpd.fastfs_dir()`, which is removed at cleanup. The object can be used as
  a context manager (`start()` on enter, `shutdown()` on exit).
  """

  def __init__(self,
               path=None,
               num_workers=None,
               http_args=None,
               mpctx=None):
    self._ctor_path = path
    self._path = None
    self._num_workers = num_workers
    # The request timeout defaults to the FETCHER_TIMEO environment variable
    # (10 seconds when unset), unless the caller provided one.
    self._http_args = ut.dict_setmissing(
      http_args or dict(),
      timeout=ut.getenv('FETCHER_TIMEO', dtype=float, defval=10.0),
    )
    self._mpctx = mpctx
    self._async_manager = None
    # URLs enqueued whose result has not been fetched from the manager yet.
    self._pending = set()

  @classmethod
  def _cleaner(cls, self):
    # Finalizer: `self` is the object context built in start(), presumably a
    # detached stand-in for the fetcher instance -- confirm in core_utils.
    self._async_manager.close()
    # Only remove the work folder when it is the one created by start().
    if self._path != self._ctor_path:
      gfs.rmtree(self._path, ignore_errors=True)

  def start(self):
    """Select the work folder and create the async manager."""
    if self._ctor_path is None:
      self._path = tmpd.fastfs_dir()
    else:
      self._path = self._ctor_path

    async_manager = asym.AsyncManager(**cu.denone(num_workers=self._num_workers,
                                                  mpctx=self._mpctx))

    # NOTE(review): fin_wrap() looks like it stores the manager in the
    # _async_manager attribute and ties finfn to the object lifetime --
    # confirm in the fin_wrap module.
    finfn = functools.partial(self._cleaner,
                              cu.object_context(self, _async_manager=async_manager))
    fw.fin_wrap(self, '_async_manager', async_manager, finfn=finfn)

  def shutdown(self):
    """Release the async manager (if started) and reset the fetcher state."""
    async_manager = self._async_manager
    if async_manager is not None:
      fw.fin_wrap(self, '_async_manager', None, cleanup=True)
      self._path = None
      self._pending = set()

  def enqueue(self, *urls):
    """Queue the fetch of every non-empty URL, returning a url -> work hash map."""
    wmap = dict()
    for url in urls:
      if url:
        work_ctor = functools.partial(http_fetch_url, url,
                                      path=self._path,
                                      http_args=self._http_args)
        self._async_manager.enqueue_work(url, work_ctor)
        self._pending.add(url)
        wmap[url] = wres.work_hash(url)

    return wmap

  def wait(self, url):
    """Wait for the result of `url`, returning its work data.

    Results of other URLs completing in the meantime are retired too, their
    work files being left on disk for later wait() calls. The work file of
    `url` is removed before returning.
    """
    wpath = wres.work_path(self._path, url)
    if not os.path.isfile(wpath):
      # No work file and not pending means the result was already consumed.
      tas.check(url in self._pending, msg=f'URL already retired: {url}')

      while self._pending:
        (rurl, result) = self._async_manager.fetch_result()

        self._pending.discard(rurl)
        wres.raise_if_error(result)
        if rurl == url:
          break

    try:
      # NOTE(review): iter_results() reads work files with wres.load_work()
      # while wres.get_work() is used here -- confirm both exist and that the
      # difference is intended.
      return wres.get_work(wpath)
    finally:
      os.remove(wpath)

  def iter_results(self, max_results=None, block=True, timeout=None):
    """Yield (url, work data) pairs as results become available.

    Stops when nothing is pending anymore, when fetch_result() returns None,
    or after `max_results` results. Work files are removed once loaded.
    """
    count = 0
    while self._pending:
      if (fetchres := self._async_manager.fetch_result(block=block,
                                                       timeout=timeout)) is None:
        break

      rurl, result = fetchres

      self._pending.discard(rurl)
      wpath = wres.work_path(self._path, rurl)

      wdata = wres.load_work(wpath)

      os.remove(wpath)

      yield rurl, wdata

      count += 1
      if max_results is not None and count >= max_results:
        break

  def __enter__(self):
    self.start()

    return self

  def __exit__(self, *exc):
    self.shutdown()

    # Never suppress exceptions raised inside the with block.
    return False
139
+