cdxcore-0.1.5-py3-none-any.whl


cdxcore/jcpool.py ADDED
@@ -0,0 +1,411 @@
+ # -*- coding: utf-8 -*-
+ """
+ Simple multi-processing wrapper around the already great joblib.Parallel.
+ The minor additions are that parallel processing is a tad more convenient for dictionaries,
+ and that it supports routing cdxbasics.verbose.Context messaging via a Queue to a single thread.
+ """
+
+ from joblib import Parallel as joblib_Parallel, delayed as jl_delayed
+ from multiprocessing import Manager, Queue
+ from threading import Thread, get_ident as get_thread_id
+ import gc
+ from collections import OrderedDict
+ from collections.abc import Mapping, Callable, Sequence, Iterable
+ import functools
+
+ from .verbose import Context, Timer
+ from .subdir import SubDir
+
+ class ParallelContextChannel( Context ):
+     """
+     Lightweight channel for cdxbasics.verbose.Context which is pickleable.
+     Implements the trivial Context channel() protocol.
+     """
+     def __init__(self, *, cid, maintid, queue):
+         self._queue   = queue
+         self._cid     = cid
+         self._maintid = maintid
+     def __call__(self, msg : str, flush : bool ):
+         """ Context channel call (from outside the main thread/process) to send messages to 'report' """
+         if get_thread_id() == self._maintid:
+             print(msg, end='', flush=True)
+         else:
+             return self._queue.put( (msg, flush) )
+
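+ # Note: the channel above carries only the Manager queue proxy, the context id, and the
+ # main-thread id, which is what makes it pickleable and cheap to ship to worker processes.
+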
+ class _ParallelContextOperator( object ):
+     """
+     Queue-based channel backbone for ParallelContextChannel.
+     This object cannot be pickled; use self.mp_context as the object to pass to other processes.
+     """
+     def __init__(self, pool_verbose : Context,          # context to print pool progress to (in thread)
+                        f_verbose : Context,             # original function context (in thread)
+                        verbose_interval : float = None  # throttling for reporting
+                        ):
+         cid = id(f_verbose)
+         tid = get_thread_id()
+         with pool_verbose.write_t(f"Launching messaging queue '{cid}' using thread '{tid}'... ", end='') as tme:
+             self._cid          = cid
+             self._tid          = tid
+             self._pool_verbose = pool_verbose
+             self._mgr          = Manager()
+             self._queue        = self._mgr.Queue()
+             self._thread       = Thread(target=self.report, kwargs=dict(cid=cid, queue=self._queue, f_verbose=f_verbose, verbose_interval=verbose_interval), daemon=True)
+             self._mp_context   = Context( f_verbose,
+                                           channel=ParallelContextChannel( cid=self._cid, queue=self._queue, maintid=self._tid ) )
+             self._thread.start()
+             pool_verbose.write(f"done; this took {tme}.", head=False)
+
+     def __del__(self):
+         """ clean up; should not be necessary """
+         self.terminate()
+
+     def terminate(self):
+         """ stop all multi-thread/processing activity """
+         if self._queue is None:
+             return
+         tme = Timer()
+         self._queue.put( None )
+         self._thread.join(timeout=2)
+         if self._thread.is_alive():
+             raise RuntimeError("Failed to terminate thread")
+         self._thread = None
+         self._queue  = None
+         self._mgr    = None
+         gc.collect()
+         self._pool_verbose.write(f"Terminated message queue '{self.cid}'. This took {tme}.")
+
+     @property
+     def cid(self) -> str:
+         """ context ID; useful for debugging """
+         return self._cid
+
+     @property
+     def mp_context(self):
+         """ Return the actual channel as a pickleable object """
+         return self._mp_context
+
+     @staticmethod
+     def report( cid : str, queue : Queue, f_verbose : Context, verbose_interval : float ):
+         """ Thread program to keep reporting messages until None is received """
+         tme = f_verbose.timer()
+         while True:
+             r = queue.get()
+             if r is None:
+                 break
+             if isinstance(r, Exception):
+                 print(f"*** Messaging queue {cid} encountered an exception: {r}. Aborting.")
+                 raise r
+             msg, flush = r
+             if tme.interval_test(verbose_interval):
+                 print(msg, end='', flush=flush)
+
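+     # Queue protocol used by report(): ParallelContextChannel.__call__ puts (msg, flush)
+     # tuples; terminate() puts the None sentinel which stops the thread; if an Exception
+     # arrives on the queue it is printed and re-raised in this thread.
+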
+     def __enter__(self):
+         return self.mp_context
+
+     def __exit__(self, *kargs, **kwargs):
+         #self.terminate()
+         return False # do not suppress exceptions
+
+ class _DIF(object):
+     """ _DictIterator 'F': wraps a job function such that its dictionary key is prepended to its result """
+     def __init__(self, k : str, f : Callable, merge_tuple : bool ):
+         self._f = f
+         self._k = k
+         self._merge_tuple = merge_tuple
+     def __call__(self, *args, **kwargs):
+         r = self._f(*args, **kwargs)
+         if not self._merge_tuple or not isinstance(r, tuple):
+             return (self._k, r)
+         return ((self._k,) + r)
+
+ class _DictIterator(object):
+     """ Dictionary iterator """
+     def __init__(self, jobs : Mapping, merge_tuple : bool):
+         self._jobs = jobs
+         self._merge_tuple = merge_tuple
+     def __iter__(self):
+         for k, v in self._jobs.items():
+             f, args, kwargs = v
+             yield _DIF(k, f, self._merge_tuple), args, kwargs
+     def __len__(self): # not strictly needed, but good to have
+         return len(self._jobs)
+
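+ # Shape of 'jobs' consumed by _DictIterator: each value is the (f, args, kwargs) triple
+ # produced by JCPool.delayed(), e.g. { "A": (f, (1,), {'x': 2}) } for pool.delayed(f)(1, x=2);
+ # _DIF then prefixes each result with its key "A".
+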
+ def _parallel(pool, jobs : Iterable) -> Iterable:
+     """
+     Process 'jobs' in parallel using the current multiprocessing pool.
+     All (function) values of 'jobs' must be generated using self.delayed.
+     See help(JCPool) for usage patterns.
+
+     Parameters
+     ----------
+     jobs:
+         Can be a sequence, a generator, or a dictionary.
+         Each function value must have been generated using JCPool.delayed().
+
+     Returns
+     -------
+     An iterator which yields results as soon as they are available.
+     If 'jobs' is a dictionary, then the resulting iterator will generate tuples whose first
+     element equals the dictionary key of the respective function job.
+     """
+     if not isinstance(jobs, Mapping):
+         return pool( jobs )
+     return pool( _DictIterator(jobs, merge_tuple=True) )
+
+ def _parallel_to_dict(pool, jobs : Mapping) -> Mapping:
+     """
+     Process 'jobs' in parallel using the current multiprocessing pool.
+     All values of the dictionary 'jobs' must be generated using self.delayed.
+     This function awaits the calculation of all elements of 'jobs' and
+     returns a dictionary with the results.
+
+     See help(JCPool) for usage patterns.
+
+     Parameters
+     ----------
+     jobs:
+         A dictionary where all (function) values must have been generated using JCPool.delayed.
+
+     Returns
+     -------
+     A dictionary with results.
+     If 'jobs' is an OrderedDict, then this function will return an OrderedDict
+     with the same order as 'jobs'.
+     """
+     assert isinstance(jobs, Mapping), ("'jobs' must be a Mapping.", type(jobs))
+     r = dict( pool( _DictIterator(jobs, merge_tuple=False) ) )
+     if isinstance( jobs, OrderedDict ):
+         q = OrderedDict()
+         for k in jobs:
+             q[k] = r[k]
+         r = q
+     return r
+
+ def _parallel_to_list(pool, jobs : Sequence ) -> Sequence:
+     """
+     Call parallel() and convert the resulting generator into a list.
+
+     Parameters
+     ----------
+     jobs:
+         Can be a sequence or a generator (for dictionaries, use parallel_to_dict() instead).
+         Each function value must have been generated using JCPool.delayed().
+
+     Returns
+     -------
+     A list with the results in order of the input.
+     """
+     assert not isinstance( jobs, Mapping ), ("'jobs' is a Mapping. Use parallel_to_dict() instead.", type(jobs))
+     r = _parallel_to_dict( pool, { i: j for i, j in enumerate(jobs) } )
+     return list( r[i] for i in range(len(r)) ) # len(r), not len(jobs): 'jobs' may be a generator without len()
+
+ class JCPool( object ):
+     """
+     Parallel Job Context Pool
+
+     Simple wrapper around joblib.Parallel which allows using cdxbasics.verbose.Context objects seamlessly:
+     use of any such context from a different process will send messages via a Queue to the main process,
+     where a separate thread prints these messages out.
+     Using a fixed pool object also avoids relaunching processes.
+
+     Finally, the parallel pool allows working directly with dictionaries, which is useful for asynchronous
+     processing (which is the default).
+
+     Usage
+     -----
+     Assume we have a function such as:
+
+         def f( ticker, tdata, verbose : Context ):
+             #...
+             tx = 0.
+             ty = 1.
+             verbose.write(f"Result for {ticker}: {tx}, {ty}")
+             return tx, ty # tuple result for illustration
+
+     List/Generator
+     --------------
+     Use the pool.context() context manager to convert a Context 'verbose' object into a multi-processing channel.
+     Then pass a generator to pool.parallel:
+
+         pool    = JCPool( num_workers=4 )
+         verbose = Context("all")
+         with pool.context( verbose ) as verbose:
+             for tx, ty in pool.parallel( pool.delayed(f)( ticker=ticker, tdata=tdata, verbose=verbose ) for ticker, tdata in self.data.items() ):
+                 print(f"Returned {tx}, {ty}")
+         print("Done")
+
+     Dict
+     ----
+     Similar construct, but with a dictionary. Given the asynchronous nature of the returned data, it is often desirable
+     to keep track of a result identifier. This is automated with the dictionary usage pattern:
+
+         pool    = JCPool( num_workers=4 )
+         verbose = Context("all")
+         with pool.context( verbose ) as verbose:
+             for ticker, tx, ty in pool.parallel( { ticker: pool.delayed(f)( ticker=ticker, tdata=tdata, verbose=verbose ) for ticker, tdata in self.data.items() } ):
+                 print(f"Returned {tx}, {ty} for {ticker}")
+         print("Done")
+
+     Note that pool.parallel, when applied to a dictionary, does not return a dictionary but a sequence of tuples.
+     As in the example, this also works if the function being called returns tuples itself; in this case the
+     returned tuple is the dictionary key followed by the elements of the function's result tuple.
+
+     In order to retrieve a dictionary, use
+
+         pool    = JCPool( num_workers=4 )
+         verbose = Context("all")
+         with pool.context( verbose ) as verbose:
+             r = pool.parallel_to_dict( { ticker: pool.delayed(f)( ticker=ticker, tdata=tdata, verbose=verbose ) for ticker, tdata in self.data.items() } )
+         print("Done")
+
+     Note that in this case the function returns only after all items have been processed.
+     """
+     def __init__(self, num_workers : int = 1,
+                        threading : bool = False,
+                        tmp_dir : str = "!/.cdxmp", *,
+                        verbose : Context = Context.quiet,
+                        parallel_kwargs : dict = None ):
+         """
+         Initialize a multi-processing pool. Thin wrapper around joblib.Parallel for cdxbasics.verbose.Context() output.
+         """
+         num_workers     = int(num_workers)
+         parallel_kwargs = parallel_kwargs if parallel_kwargs is not None else {} # avoid a mutable default argument
+         self._tmp_dir   = SubDir(tmp_dir, ext='')
+         self._verbose   = verbose if verbose is not None else Context("quiet")
+         self._threading = threading
+         assert num_workers > 0, ("'num_workers' must be positive", num_workers)
+
+         with self._verbose.write_t(f"Launching {num_workers} processes with temporary path '{self.tmp_path}'... ", end='') as tme:
+             self._pool = joblib_Parallel( n_jobs=num_workers,
+                                           backend="loky" if not threading else "threading",
+                                           return_as="generator_unordered",
+                                           temp_folder=self.tmp_path, **parallel_kwargs)
+             self._verbose.write(f"done; this took {tme}.", head=False)
+
+     def __del__(self):
+         self.terminate()
+
+     @property
+     def tmp_path(self) -> str:
+         return self._tmp_dir.path
+     @property
+     def is_threading(self) -> bool:
+         return self._threading
+
+     def terminate(self):
+         """
+         Stop the current parallel pool, and delete any temporary files.
+         """
+         if self._pool is not None:
+             tme = Timer()
+             del self._pool
+             self._pool = None
+             self._verbose.write(f"Shut down parallel pool. This took {tme}.")
+             gc.collect()
+             self._tmp_dir.eraseEverything(keepDirectory=True)
+
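+     # Note: tmp_path is handed to joblib.Parallel as temp_folder, so eraseEverything()
+     # above also removes any temporary (e.g. memmap) files joblib may have left behind.
+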
+     def context( self, verbose : Context, verbose_interval : float = None ):
+         """
+         Return a cdxbasics.verbose.Context object whose 'channel' is a queue towards a parallel thread.
+         As a result, a worker process is able to use 'verbose' as if it were in-process.
+
+         See help(JCPool) for usage patterns.
+         """
+         if self._threading:
+             return verbose # threads share the process; no queue needed
+         return _ParallelContextOperator( pool_verbose=self._verbose,
+                                          f_verbose=verbose,
+                                          verbose_interval=verbose_interval )
+
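+     # Note: in process mode the returned _ParallelContextOperator implements __enter__/__exit__
+     # and yields its pickleable mp_context, hence the 'with pool.context( verbose ) as verbose:'
+     # pattern shown in the class docstring.
+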
+     @staticmethod
+     def validate( F : Callable, args : list, kwargs : Mapping ):
+         """ Check that 'args' and 'kwargs' do not contain Context objects without a ParallelContextChannel """
+         for k, v in enumerate(args):
+             if isinstance(v, Context) and not isinstance(v.channel, ParallelContextChannel):
+                 raise RuntimeError(f"Argument #{k} for {F.__qualname__} is a Context object, but its channel is not set to 'ParallelContextChannel'. Use JCPool.context().")
+         for k, v in kwargs.items():
+             if isinstance(v, Context) and not isinstance(v.channel, ParallelContextChannel):
+                 raise RuntimeError(f"Keyword argument '{k}' for {F.__qualname__} is a Context object, but its channel is not set to 'ParallelContextChannel'. Use JCPool.context().")
+
+     def delayed(self, F : Callable):
+         """
+         Decorate a function F for parallel execution.
+         Syntactic sugar around joblib.delayed().
+         Checks that there are no Context arguments without a ParallelContextChannel present.
+
+         Parameters
+         ----------
+         F : function.
+
+         Returns
+         -------
+         Decorated function.
+         """
+         if self._threading:
+             return jl_delayed(F)
+         def delayed_function( *args, **kwargs ):
+             JCPool.validate( F, args, kwargs )
+             return F, args, kwargs # mimic joblib.delayed()
+         try:
+             delayed_function = functools.wraps(F)(delayed_function)
+         except AttributeError:
+             pass # functools.wraps fails on some callable objects
+         return delayed_function
+
+     def parallel(self, jobs : Iterable) -> Iterable:
+         """
+         Process 'jobs' in parallel using the current multiprocessing pool.
+         All (function) values of 'jobs' must be generated using self.delayed.
+         See help(JCPool) for usage patterns.
+
+         Parameters
+         ----------
+         jobs:
+             Can be a sequence, a generator, or a dictionary.
+             Each function value must have been generated using JCPool.delayed().
+
+         Returns
+         -------
+         An iterator which yields results as soon as they are available.
+         If 'jobs' is a dictionary, then the resulting iterator will generate tuples whose first
+         element equals the dictionary key of the respective function job.
+         """
+         return _parallel( self._pool, jobs )
+
+     def parallel_to_dict(self, jobs : Mapping) -> Mapping:
+         """
+         Process 'jobs' in parallel using the current multiprocessing pool.
+         All values of the dictionary 'jobs' must be generated using self.delayed.
+         This function awaits the calculation of all elements of 'jobs' and
+         returns a dictionary with the results.
+
+         See help(JCPool) for usage patterns.
+
+         Parameters
+         ----------
+         jobs:
+             A dictionary where all (function) values must have been generated using JCPool.delayed.
+
+         Returns
+         -------
+         A dictionary with results.
+         If 'jobs' is an OrderedDict, then this function will return an OrderedDict
+         with the same order as 'jobs'.
+         """
+         return _parallel_to_dict( self._pool, jobs )
+
+     def parallel_to_list(self, jobs : Sequence ) -> Sequence:
+         """
+         Call parallel() and convert the resulting generator into a list.
+
+         Parameters
+         ----------
+         jobs:
+             Can be a sequence or a generator (for dictionaries, use parallel_to_dict() instead).
+             Each function value must have been generated using JCPool.delayed().
+
+         Returns
+         -------
+         A list with the results in order of the input.
+         """
+         return _parallel_to_list( self._pool, jobs )
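
For reference, a minimal end-to-end sketch of the dictionary pattern described in the JCPool docstring; the worker function work() and the inputs dictionary below are illustrative and not part of the package:

    from cdxcore.jcpool import JCPool
    from cdxcore.verbose import Context

    def work( key, x, verbose : Context ):
        verbose.write(f"processing {key}")   # routed via the queue to the main process
        return x * x

    inputs  = dict( a=1, b=2, c=3 )
    pool    = JCPool( num_workers=2 )
    verbose = Context("all")
    with pool.context( verbose ) as mp_verbose:
        results = pool.parallel_to_dict( { k: pool.delayed(work)( key=k, x=v, verbose=mp_verbose )
                                           for k, v in inputs.items() } )
    pool.terminate()
    # results == { 'a': 1, 'b': 4, 'c': 9 }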