gwaslab 3.4.42__py3-none-any.whl → 3.4.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

@@ -0,0 +1,687 @@
1
+ from pathlib import Path
2
+ import os
3
+ import pickle
4
+ import concurrent.futures
5
+ import threading
6
+ import multiprocessing as mp
7
+ import time
8
+ import h5py
9
+
10
+ from gwaslab.g_Log import Log
11
+
12
+ from platformdirs import user_cache_dir
13
+ from pysam import VariantFile
14
+
15
# Application name and author handed to platformdirs.user_cache_dir() when
# resolving the per-user cache directory.
APPNAME = "gwaspipe"
APPAUTHOR = "cloufield"

# Suffix appended to the stem of the reference file name to form the cache file name.
CACHE_EXT = '.cache'
19
+
20
+
21
+ ################################################# UTILS #################################################
22
+
23
def get_cache_path(base_path):
    '''
    Return the path of an existing cache file for `base_path`, or None.

    The cache file name is the stem of `base_path` followed by CACHE_EXT. It is looked up
    first in the directory of `base_path`, then in the per-user cache directory.
    '''
    name = f"{Path(base_path).stem}{CACHE_EXT}"

    sibling = os.path.join(os.path.dirname(base_path), name)
    if os.path.exists(sibling):
        return sibling

    # Only resolve the user cache directory when the sibling file is missing.
    fallback = os.path.join(user_cache_dir(APPNAME, APPAUTHOR), name)
    return fallback if os.path.exists(fallback) else None
35
+
36
def get_write_path(base_path):
    '''
    Return the path where the cache file for `base_path` should be written.

    The directory of `base_path` is preferred; when it is not writable, the per-user
    cache directory is used instead (and created if it does not exist yet).

    Raises:
        Exception: if neither directory is writable.
    '''
    cache_filename = str(Path(base_path).stem) + CACHE_EXT
    source_dir = os.path.dirname(base_path)

    # dirname() is '' for a bare file name; os.access('') is always False, so probe the
    # current directory in that case. The returned path stays relative, like the lookup
    # performed on the read side.
    if os.access(source_dir or os.curdir, os.W_OK):
        return os.path.join(source_dir, cache_filename)

    cache_dir = user_cache_dir(APPNAME, APPAUTHOR)
    try:
        # The user cache directory does not exist until something creates it; without
        # this, the access check below fails on first use.
        os.makedirs(cache_dir, exist_ok=True)
    except OSError:
        pass  # fall through: the access check below reports the failure

    if os.access(cache_dir, os.W_OK):
        return os.path.join(cache_dir, cache_filename)

    raise Exception('No write access to any cache directory')
48
+
49
def cache_exists(path, ref_alt_freq, category='all'):
    ''' Return whether the cache file at `path` holds data for `ref_alt_freq` / `category`; any error counts as "not found". '''
    try:
        return is_in_h5py(path, ref_alt_freq, category)
    except Exception:
        # Best-effort probe: a missing or unreadable file simply means "no cache".
        return False
57
+
58
def is_in_h5py(path, ref_alt_freq, category='all'):
    '''
    Tell whether the HDF5 cache file holds at least one entry under
    `ref_alt_freq` / `category`.

    Raises:
        Exception: if `path` is empty/None or does not exist.
    '''
    if not (path and os.path.exists(path)):
        raise Exception('Cache file not found')

    with h5py.File(path, 'r') as h5_file:
        if ref_alt_freq not in h5_file.keys():
            return False
        freq_group = h5_file[ref_alt_freq]
        if category not in freq_group.keys():
            return False
        # The category group must contain at least one child to count as populated.
        return len(freq_group[category].keys()) > 0
72
+
73
def load_h5py_cache(path, ref_alt_freq, category='all'):
    '''
    Load the cache stored under `ref_alt_freq` / `category` into a single dictionary.

    Every child group of the category is expected to hold a 'keys' and a 'values'
    dataset of equal length; all of them are merged into one dict.

    Raises:
        Exception: if the file is missing or does not contain the requested data.
    '''
    if not (path and os.path.exists(path)):
        raise Exception('Cache file not found')

    if not is_in_h5py(path, ref_alt_freq, category):
        raise Exception('Cache file does not contain the required data')

    merged = {}
    with h5py.File(path, 'r') as h5_file:
        # One child group per chromosome.
        for chrom_group in h5_file[ref_alt_freq][category].values():
            variant_keys = list(chrom_group['keys'].asstr()[:])
            frequencies = list(chrom_group['values'][:])
            merged.update(zip(variant_keys, frequencies))
    return merged
89
+
90
def build_cache(base_path, ref_alt_freq=None, n_cores=1, return_cache=False, filter_fn=None, category='all', log=Log(), verbose=True):
    '''
    Build the cache for `base_path` with a CacheBuilder.

    Returns the builder's in-memory cache when `return_cache` is true, otherwise None.
    '''
    builder = CacheBuilder(base_path, ref_alt_freq=ref_alt_freq, n_cores=n_cores, log=log, verbose=verbose)
    # start_building only returns once every worker process has finished.
    builder.start_building(filter_fn=filter_fn, category=category, set_cache=return_cache)
    return builder.get_cache() if return_cache else None
95
+
96
def is_palindromic(ref, alt):
    ''' Return whether (ref, alt) is one of the pairs G/C, C/G, A/T or T/A. '''
    # `&` and `|` are kept (instead of `and` / `or`) so the operators applied to the
    # comparison results are exactly those of the original expression.
    strong_pair = ((ref == "G") & (alt == "C")) | ((ref == "C") & (alt == "G"))
    weak_pair = ((ref == "A") & (alt == "T")) | ((ref == "T") & (alt == "A"))
    return strong_pair | weak_pair
103
+
104
def is_indel(ref, alt):
    ''' Return whether the two alleles have different lengths. '''
    ref_len, alt_len = len(ref), len(alt)
    return ref_len != alt_len
106
+
107
def filter_fn_pi(*, ref, alt):
    ''' Select variants that are palindromic or indels (keyword-only arguments). '''
    palindromic = is_palindromic(ref, alt)
    if palindromic:
        return palindromic
    return is_indel(ref, alt)
109
+
110
def filter_fn_np(*, ref, alt):
    ''' Select variants that are not palindromic (keyword-only arguments). '''
    palindromic = is_palindromic(ref, alt)
    return False if palindromic else True
112
+
113
# Names of the built-in cache categories.
PALINDROMIC_INDEL = 'pi'  # palindromic + indel
NON_PALINDROMIC = 'np'  # non-palindromic

# Built-in category name -> predicate selecting the variants of that category.
FILTER_FN = {
    PALINDROMIC_INDEL: filter_fn_pi,
    NON_PALINDROMIC: filter_fn_np,
}
120
+
121
+
122
+ ################################################# CACHE MANAGERs #################################################
123
+
124
class CacheMainManager:
    '''
    Base manager that keeps a cache dictionary in the current process.

    The cache is either built from `base_path` (build_cache) or loaded from an
    existing cache file (load_cache) and is then exposed through `cache`.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        ''' Path of the existing cache file for base_path, or None. '''
        return get_cache_path(self.base_path)

    def _get_write_path(self):
        ''' Path where the cache file should be written; requires base_path to be set. '''
        if self.base_path is None:
            raise Exception('base_path is None')
        return get_write_path(self.base_path)

    @property
    def cache_len(self):
        ''' Number of entries in the loaded cache. '''
        return len(self.cache)

    @property
    def cache(self):
        ''' The loaded cache; raises if nothing has been built or loaded yet. '''
        if hasattr(self, '_cache'):
            return self._cache
        raise Exception('Cache not loaded')

    def build_cache(self):
        ''' Build the cache and keep the result in memory. '''
        self._cache = build_cache(
            self.base_path,
            ref_alt_freq=self.ref_alt_freq,
            n_cores=self.n_cores,
            filter_fn=self.filter_fn,
            category=self.category,
            return_cache=True,
            log=self.log,
            verbose=self.verbose,
        )

    def load_cache(self, category=None):
        ''' Load the cache for `category` (defaults to the manager's category) from the cache file. '''
        selected = self.category if category is None else category
        cache_path = self._get_cache_path()
        self._cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=selected)
166
+
167
+
168
class CacheManager(CacheMainManager):
    '''
    Front-end giving uniform access to a cache that lives in one of three places:

    - in this process (no `cache_loader` / `cache_process` given): the cache is loaded
      from the cache file if it already contains the requested data, otherwise built;
    - in a `cache_loader` object exposing a `get_cache()` method;
    - in a `cache_process` (a CacheProcess), in which case functions are applied
      remotely through `apply_fn`.

    At most one of `cache_loader` and `cache_process` may be provided.
    '''
    def __init__(self, base_path=None, cache_loader=None, cache_process=None, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        none_value = sum([cache_loader is not None, cache_process is not None])
        assert none_value in [0, 1], 'Only one between cache_loader and cache_process should be provided'
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        if none_value == 1:
            self.base_path = None # unset base_path if cache_loader or cache_process is provided

        self.cache_loader = cache_loader
        self.cache_process = cache_process

        if cache_loader is not None:
            assert callable(getattr(cache_loader, 'get_cache', None)), 'cache_loader must have a get_cache method'
        elif cache_process is not None:
            assert isinstance(cache_process, CacheProcess), 'cache_process must be an instance of CacheProcess'
        else:
            cache_path = self._get_cache_path()
            # Check the file *content*, not just its existence: a cache file may exist
            # but hold a different ref_alt_freq / category, in which case loading would
            # raise. In that situation the missing data has to be built instead.
            if cache_exists(cache_path, self.ref_alt_freq, self.category):
                self.log.write(f'Start loading cache from {cache_path}...', verbose=self.verbose)
                self.load_cache()
                self.log.write('Finshed loading cache.', verbose=self.verbose)
            else:
                self.log.write(f'Start building cache from {base_path}...', verbose=self.verbose)
                self.build_cache()
                self.log.write('Finished building (and loading) cache.', verbose=self.verbose)

    @property
    def cache_len(self):
        ''' Number of cached entries (asked to the cache process when one is used). '''
        if self.cache_process is not None:
            return self.cache_process.cache_len()
        return len(self.cache)

    @property
    def cache(self):
        ''' The cache dictionary; not available when the cache lives in a cache process. '''
        if self.cache_loader is not None:
            return self.cache_loader.get_cache()
        if not hasattr(self, '_cache'):
            raise Exception('Cache not loaded or class not exposing cache')
        return self._cache

    def apply_fn(self, fn, *args, **kwargs):
        '''
        Call `fn(*args, cache=<cache>, **kwargs)` and return its result.

        With a cache process the call is delegated to it; otherwise `fn` runs in the
        current process. `cache` must not be passed in `kwargs`.
        '''
        assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
        if self.cache_process is not None:
            return self.cache_process.apply_fn(fn, *args, **kwargs)
        return fn(*args, cache=self.cache, **kwargs)

    def _get_cache_path(self):
        ''' Cache file path, or None when the cache is owned by a loader or a process. '''
        if self.cache_loader is None and self.cache_process is None:
            return super()._get_cache_path()
        return None
221
+
222
+
223
class CacheProcess(mp.Process):
    '''
    A class for managing a cache in a separate process. It is used to reduce memory consumption when the cache is very large.
    This class will load the cache in a separate process and provide methods to perform operations on the cache directly on the subprocess.
    In this way, the cache is not copied to the main process, but the operations are performed on the cache in the subprocess and only the
    input and output of the operations are communicated (i.e. copied) between the main and the subprocess.

    This is very useful when the cache is huge (e.g. 40GB in memory) and we want to perform operations on it based on a relatively small input
    (e.g. a "small" dataframe, where small is relative to the cache size) and the output is also relatively small.

    Protocol: the parent puts `(method, args, kwargs)` on `input_queue`; the subprocess answers every
    request with exactly one `(ok, payload)` tuple on `result_queue`, where `payload` is the result
    (ok is True) or the exception raised while serving the request (ok is False).
    '''

    # Seconds the parent blocks on the result queue before re-checking that the subprocess is alive.
    _POLL_INTERVAL = 1.0

    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__()
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.filter_fn = filter_fn
        self.category = category
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.daemon = True # When parent process exits, it will attempt to terminate all of its daemonic child processes.

        # NOTE(review): the manager is not used by this class; kept so the instance attributes stay unchanged.
        self.manager = mp.Manager()
        self.input_queue = mp.Queue() # Queue for communication between processes
        self.result_queue = mp.Queue()
        self.result_produced = mp.Value('b', True)

        # The cache file is built here, in the calling process, before the subprocess is started.
        cache_path = self._get_cache_path()
        if not cache_exists(cache_path, ref_alt_freq, category):
            self.build_cache()
        else:
            if n_cores > 1:
                self.log.warning('[CacheProcess: since the cache already exists, the parameter n_cores could be set to 1 without any performance loss]', verbose=self.verbose)

    def _get_cache_path(self):
        ''' Path of the existing cache file for base_path, or None. '''
        return get_cache_path(self.base_path)

    def build_cache(self):
        ''' Build the cache file without keeping the cache in this process. '''
        build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=False, log=self.log, verbose=self.verbose
        )

    def run(self):
        ''' Subprocess body: load the cache, then serve requests until "terminate" is received. '''
        cache_path = self._get_cache_path()
        self.log.write(f'[CacheProcess: Start loading cache from {cache_path}...]', verbose=self.verbose)
        cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
        self.log.write('[CacheProcess: Finshed loading cache.]', verbose=self.verbose)

        # Continuously listen for method calls
        while True:
            method, args, kwargs = self.input_queue.get()

            if method == "terminate":
                self.result_queue.put((True, None)) # acknowledge, so the caller stops waiting
                self.result_produced.value = True
                break

            try:
                if method == 'get_from_cache':
                    result = cache[args[0]]
                elif method == 'apply_fn':
                    assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
                    fn, *fn_args = args
                    result = fn(*fn_args, cache=cache, **kwargs)
                elif method == 'cache_len':
                    result = len(cache)
                else:
                    raise ValueError(f'Unknown CacheProcess method: {method!r}')
                outcome = (True, result)
            except Exception as exc:
                # Report the failure to the caller instead of dying: a dead subprocess
                # would leave the parent waiting forever and lose the loaded cache.
                try:
                    pickle.dumps(exc)
                except Exception:
                    # The exception cannot cross the process boundary as is.
                    exc = RuntimeError(f'{type(exc).__name__}: {exc}')
                outcome = (False, exc)

            self.result_queue.put(outcome)
            self.result_produced.value = True

    def _worker_is_dead(self):
        ''' Return True when the subprocess is known not to be running. '''
        try:
            return not self.is_alive()
        except AssertionError:
            # is_alive() can only be asked from the process that created this object;
            # elsewhere the liveness is unknown, so do not report the worker as dead.
            return False

    def _call_method(self, method, *args, **kwargs):
        '''
        Send a request to the subprocess and block until its answer arrives.

        Returns the result produced by the subprocess.

        Raises:
            RuntimeError: if the subprocess is not running or exits before answering.
            Exception: whatever exception the request raised inside the subprocess.
        '''
        from queue import Empty  # raised by multiprocessing queues on get() timeout

        if self._worker_is_dead():
            raise RuntimeError('CacheProcess is not running (not started, or already stopped)')

        self.result_produced.value = False
        self.input_queue.put((method, args, kwargs))

        # Block on the queue (no busy-waiting) and periodically make sure the
        # subprocess is still there, so a crashed worker cannot hang the caller.
        while True:
            try:
                ok, payload = self.result_queue.get(timeout=self._POLL_INTERVAL)
                break
            except Empty:
                if not self._worker_is_dead():
                    continue
                try:
                    # The answer may have been flushed right before the worker exited.
                    ok, payload = self.result_queue.get(timeout=self._POLL_INTERVAL)
                    break
                except Empty:
                    raise RuntimeError(f"CacheProcess exited before answering '{method}'") from None

        if not ok:
            raise payload
        return payload

    def get_from_cache(self, key):
        ''' Return the cached value stored under `key`. '''
        return self._call_method('get_from_cache', key)

    def apply_fn(self, fn, *args, **kwargs):
        '''
        Apply an arbitrary function to the cache, inside the subprocess, and return its result.
        The function is called as `fn(*args, cache=cache, **kwargs)`; `fn`, its arguments and
        its result are transferred between the processes.
        '''
        return self._call_method('apply_fn', fn, *args, **kwargs)

    def cache_len(self):
        ''' Return the number of entries of the cache held by the subprocess. '''
        return self._call_method('cache_len')

    def terminate(self):
        ''' Ask the subprocess to leave its serving loop; does nothing if it is not running. '''
        if self._worker_is_dead():
            return
        self._call_method("terminate")
319
+
320
+
321
+ ################################################# CACHE BUILDER #################################################
322
+
323
class CacheBuilderOld:
    '''
    Thread-based cache builder that keeps the cache in memory and can pickle it to disk.

    One task per contig of the reference VCF is submitted to a thread pool; every record is
    stored under the key "chrom:pos-1:pos" as [pos, ref, alts, first value of the
    `ref_alt_freq` INFO field].
    '''
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        self.ref_infer = ref_infer
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.cache = {}
        self.lock = threading.Lock() # For thread-safe cache access
        self.cancelled = False # Flag for cancelling the cache building process
        self.running = False
        self.executor = None # Thread pool executor
        self.futures = None # Stores Future objects

    def start_building(self):
        ''' Submit one building task per contig; returns immediately. '''
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        n_cores = self.n_cores
        contigs = self.get_contigs()

        self.cancelled = False
        self.running = True

        self.log.write(f" -Building cache on {n_cores} cores...", verbose=self.verbose)
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=n_cores)
        self.futures = [self.executor.submit(self.build_cache, chrom) for chrom in contigs]

    def get_contigs(self):
        ''' Return the contig names declared in the header of the reference VCF. '''
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        try:
            return [v.name for v in vcf_reader.header.contigs.values()]
        finally:
            vcf_reader.close()

    def build_cache(self, chrom):
        ''' Add every record of contig `chrom` to the cache (stops early once cancelled). '''
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        try:
            for record in vcf_reader.fetch(chrom):
                if self.cancelled:
                    # stop_building() was called: stop filling the cache.
                    break
                cache_key = f"{record.chrom}:{record.pos - 1}:{record.pos}"
                to_add = [record.pos, record.ref, record.alts, record.info[self.ref_alt_freq][0]]
                self.add_to_cache(cache_key, to_add)
        finally:
            # The reader was previously left open by every task.
            vcf_reader.close()

    def stop_building(self, wait=False, verbose=False):
        ''' Cancel pending tasks, signal running ones to stop and shut the executor down. '''
        if self.futures:
            self.cancelled = True
            for future in self.futures:
                future.cancel()
            self.executor.shutdown(wait=wait) # Whether to wait for threads to finish
            self.futures = None
            self.executor = None
            self.running = False

        if verbose:
            print(f"Cache contains {len(self.get_cache())} variants")

    def add_to_cache(self, key, value):
        ''' Append `value` to the list stored under `key` (thread-safe). '''
        with self.lock:
            self.cache.setdefault(key, []).append(value)

    def get_cache(self, complete=False):
        ''' Return the cache; with `complete=True`, first wait for the submitted tasks to finish. '''
        if complete and self.futures:
            # futures is None before start_building() and after stop_building().
            concurrent.futures.wait(self.futures)

        with self.lock:
            return self.cache

    def reset_cache(self):
        ''' Replace the cache with an empty dictionary. '''
        with self.lock:
            self.cache = {}

    def save_cache(self, save_path):
        ''' Wait for the building to complete and pickle the cache to `save_path`. '''
        cache = self.get_cache(complete=True)
        self.log.write(f' -Saving cache to {save_path}', verbose=self.verbose)
        with open(save_path, 'wb') as f:
            pickle.dump(cache, f, protocol=pickle.HIGHEST_PROTOCOL)
        self.log.write(' -Cache saved', verbose=self.verbose)
417
+
418
+
419
class CacheBuilder:
    '''
    Process-based cache builder writing the cache to an HDF5 file.

    One job per contig of the reference VCF is run in a process pool. Each job sends its
    partial cache to a watcher process, the only writer of the cache file, which stores it
    under <ref_alt_freq>/<category>/<chrom> as two datasets, 'keys' and 'values'.
    Keys have the form "chrom:pos:ref:alt"; values are the first entry of the
    `ref_alt_freq` INFO field of the record.
    '''
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        self.ref_infer = ref_infer
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.running = False
        self.cache = None

    def get_contigs(self):
        ''' Return the contig names declared in the header of the reference VCF. '''
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        try:
            return [v.name for v in vcf_reader.header.contigs.values()]
        finally:
            vcf_reader.close()

    def already_built(self, category):
        ''' Tell whether the cache file already holds data for this ref_alt_freq and `category`. '''
        cache_path = get_cache_path(self.ref_infer)
        return cache_exists(cache_path, self.ref_alt_freq, category)

    def start_building(self, filter_fn=None, category='all', set_cache=True):
        '''
        Build the cache file and block until it is complete.

        `filter_fn` is either a predicate called as `filter_fn(ref=..., alt=...)` or the name
        of a built-in category (a key of FILTER_FN), in which case it also sets `category`.
        With `set_cache`, the resulting cache is also kept in memory (see get_cache).
        '''
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        if isinstance(filter_fn, str) and filter_fn in FILTER_FN:
            # Keep the name as category *before* replacing it with the function:
            # assigning in the other order stored the function object as category.
            category = filter_fn
            filter_fn = FILTER_FN[category]
        elif category in FILTER_FN:
            self.log.write(f" -Using the built-in filter function for category '{category}'. filter_fn will be ignored if provided.", verbose=self.verbose)
            filter_fn = FILTER_FN[category]

        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"
        assert filter_fn is not None or category == 'all', "If category is not 'all', filter_fn must be provided"

        if self.already_built(category=category):
            # TODO: we should probably improve the checking logic, and maybe also allows to overwrite the cache
            self.log.write(f"Cache for category '{category}' and ref_alt_freq {self.ref_alt_freq} already exists. Skipping cache building", verbose=self.verbose)
            if set_cache:
                # The caller asked for the cache in memory: read it back from the file
                # instead of leaving it unset.
                self.cache = load_h5py_cache(get_cache_path(self.ref_infer), self.ref_alt_freq, category)
            return

        n_cores = max(self.n_cores-1, 1) # leave one core for the watcher process
        contigs = self.get_contigs()

        self.running = True
        try:
            self.log.write(f" -Building cache for category '{category}' on {n_cores} cores...", verbose=self.verbose)

            pool = mp.Pool(n_cores)
            # The manager (and its server process) is shut down when the block is left.
            with mp.Manager() as manager:
                queue = manager.Queue()

                # Start a watcher process to handle the output of each subprocess.
                # The watcher will write the cache to the file as soon as it receives the output from the subprocess, in a safe way.
                watcher = mp.Process(target=self.handle_output, args=(queue,))
                watcher.daemon = True
                watcher.start()

                jobs = [
                    pool.apply_async(self.build_cache, args=(chrom, queue), kwds={'filter_fn': filter_fn, 'category': category})
                    for chrom in contigs
                ]

                pool.close()
                pool.join() # wait for all processes to finish

                queue.put('kill') # send a signal to the watcher process to stop
                watcher.join()

            if set_cache:
                self.cache = {}
                for job in jobs:
                    self.cache.update(job.get()['cache'])
        finally:
            # Always clear the flag, otherwise a failed build blocks any further attempt.
            self.running = False

    def build_cache(self, chrom, queue, filter_fn=None, category='all'):
        '''
        Worker job: collect the cache entries of contig `chrom`, put the result on `queue`
        and return it. The result is a dict with keys 'chrom', 'ref_alt_freq', 'category'
        and 'cache'.
        '''
        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"

        inner_cache = {}
        ref_alt_freq = self.ref_alt_freq

        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        try:
            for record in vcf_reader.fetch(chrom):
                # record.alts is None for records without alternate alleles.
                for alt in record.alts or ():
                    if filter_fn is None or filter_fn(ref=record.ref, alt=alt):
                        key = f"{record.chrom}:{record.pos}:{record.ref}:{alt}"
                        # NOTE(review): every alt of a multi-allelic record gets the first
                        # value of the INFO field; confirm this is intended.
                        inner_cache[key] = record.info[ref_alt_freq][0]
        finally:
            vcf_reader.close()

        result = {
            'chrom': chrom,
            'ref_alt_freq': ref_alt_freq,
            'category': category,
            'cache': inner_cache,
        }
        queue.put(result)
        return result

    def handle_output(self, queue):
        ''' Function that monitors a queue and writes the cache to a file as soon as it receives the output from the subprocess.'''
        m = queue.get() # wait for the first message, to avoid creating an empty cache file
        if m == 'kill':
            return

        cache_path = get_write_path(self.ref_infer)
        with h5py.File(cache_path, mode='a') as f:
            while m != 'kill':
                cache = m['cache']
                if cache:
                    main_group = f.require_group(m['ref_alt_freq'])
                    sub_group = main_group.require_group(m['category'])
                    chrom_group = sub_group.require_group(str(m['chrom']))

                    keys_list = list(cache.keys())
                    # Fixed-width byte strings, wide enough for the longest key.
                    max_len = len(max(keys_list, key=len))
                    chrom_group.create_dataset('keys', data=keys_list, dtype=f'S{max_len}', compression="gzip", compression_opts=4)
                    chrom_group.create_dataset('values', data=list(cache.values()), dtype='f', compression="gzip", compression_opts=4)

                m = queue.get()

    def get_cache(self):
        ''' Return the in-memory cache (None if it was never set). '''
        return self.cache
554
+
555
+
556
+ ################################################# CACHE LOADERs #################################################
557
+ # Classes for loading the cache in a separate thread or process in the background while the main process is running.
558
+ # However, right now, the most efficient way to load the cache and perform operations on it is to use the CacheProcess class.
559
+
560
class CacheLoader:
    '''
    Common base of the background cache loaders. It cannot be instantiated directly;
    subclasses are expected to provide the `cache` attribute.
    '''
    def __new__(cls, *args, **kwargs):
        if cls is not CacheLoader:
            return super().__new__(cls)
        raise TypeError(f"You are trying to instantiate an abstract class {cls.__name__}. Please use a concrete subclass.")

    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        ''' Path of the existing cache file for base_path, or None. '''
        return get_cache_path(self.base_path)

    def build_cache(self):
        ''' Build the cache and keep the result in `self.cache`. '''
        self.cache = build_cache(
            self.base_path,
            ref_alt_freq=self.ref_alt_freq,
            n_cores=self.n_cores,
            filter_fn=self.filter_fn,
            category=self.category,
            return_cache=True,
            log=self.log,
            verbose=self.verbose,
        )

    def add_to_cache(self, key, value):
        ''' Store `value` under `key`. '''
        self.cache[key] = value

    def get_cache(self):
        ''' Return the cache object. '''
        return self.cache

    def reset_cache(self):
        ''' Replace the cache with an empty dictionary. '''
        self.cache = {}
593
+
594
+
595
class CacheLoaderThread(CacheLoader):
    '''
    A class for loading a cache in a separate thread. It is used to load the cache in the background while the main process is running.

    In theory, this should be the best and simplest approach to directly load the cache in the same process as the main process, without further
    copying the cache to the main process. However, due to the GIL (Global Interpreter Lock) in Python, this approach is not efficient and
    it slows down the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.cache = {}
        self.lock = threading.Lock() # For thread-safe cache access
        self.running = False
        self.executor = None # Thread pool executor
        self.future = None # Future of the loading task

    def start_loading(self):
        ''' Start loading the cache in a background thread, or build it right away if it does not exist. '''
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache() # this will also load the cache
        else:
            self.running = True
            self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            self.future = self.executor.submit(self.load_cache)

    def load_cache(self):
        ''' Load the cache from the cache file; also usable directly, outside the background thread. '''
        try:
            cache_path = self._get_cache_path()
            self.log.write(f'[Start loading cache from {cache_path}...]', verbose=self.verbose)
            self.cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
            self.log.write('[Finshed loading cache.]', verbose=self.verbose)
        finally:
            # Release the executor and clear the flag whether loading succeeded or not.
            # self.future is deliberately left untouched: get_cache() may be reading it
            # concurrently, and it carries the exception of a failed load.
            executor, self.executor = self.executor, None
            if executor is not None:
                executor.shutdown(wait=False)
            self.running = False

    def get_cache(self):
        ''' Return the cache, waiting for the background loading (and re-raising its error, if any). '''
        future = self.future # local reference: the attribute may be reassigned meanwhile
        if future is not None:
            future.result() # Ensure loading is finished before accessing the cache
        return self.cache
642
+
643
+
644
def _load_cache_process(path, ref_alt_freq, category, cache):
    ''' Process target: load the cache from `path` and copy it into the shared mapping `cache`. '''
    loaded = load_h5py_cache(path, ref_alt_freq=ref_alt_freq, category=category)
    cache.update(loaded)
    # Drop the local copy once it has been transferred to the shared mapping.
    del loaded
653
+
654
class CacheLoaderProcess(CacheLoader):
    '''
    A class for loading a cache in a separate process. It is used to load the cache in the background while the main process is running.

    Unlike CacheLoaderThread, this class is more efficient because it loads the cache in a separate process, which is not affected by the GIL.
    However, a lot of memory and time is wasted in copying the cache from the subprocess to the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.manager = mp.Manager()
        self.cache = self.manager.dict() # shared mapping filled by the loading process
        self.running = False
        self.process = None

    def start_loading(self):
        ''' Start loading the cache in a background process, or build it right away if it does not exist. '''
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache() # this will also load the cache
        else:
            self.running = True
            # The third argument of _load_cache_process is the category to load; the
            # filter function was passed here by mistake, so the lookup in the cache
            # file used the wrong key.
            self.process = mp.Process(target=_load_cache_process, args=(cache_path, self.ref_alt_freq, self.category, self.cache))
            self.process.start()

    def get_cache(self):
        ''' Return the cache, waiting for the loading process to finish first. '''
        if self.running:
            self.process.join() # Wait for cache loading process to finish
            self.running = False
            if self.process.exitcode != 0:
                # The loader failed: the returned cache is empty or incomplete.
                self.log.warning(f'[CacheLoaderProcess: the loading process exited with code {self.process.exitcode}; the cache may be empty or incomplete]', verbose=self.verbose)
        return self.cache