gwaslab 3.4.42-py3-none-any.whl → 3.4.44-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gwaslab/cache_manager.py +687 -0
- gwaslab/g_Sumstats.py +4 -2
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +227 -33
- gwaslab/qc_fix_sumstats.py +134 -35
- gwaslab/viz_plot_mqqplot.py +12 -11
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/METADATA +5 -3
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/RECORD +12 -11
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/top_level.txt +0 -0
gwaslab/cache_manager.py
ADDED
@@ -0,0 +1,687 @@
from pathlib import Path
import os
import pickle
import concurrent.futures
import threading
import multiprocessing as mp
import time
import h5py

from gwaslab.g_Log import Log

from platformdirs import user_cache_dir
from pysam import VariantFile

APPNAME = "gwaspipe"
APPAUTHOR = "cloufield"

CACHE_EXT = '.cache'


################################################# UTILS #################################################

def get_cache_path(base_path):
    cache_filename = str(Path(base_path).stem) + CACHE_EXT
    cache_path = os.path.join(os.path.dirname(base_path), cache_filename)
    if os.path.exists(cache_path):
        return cache_path
    else:
        cache_dir = user_cache_dir(APPNAME, APPAUTHOR)
        user_cache_path = os.path.join(cache_dir, cache_filename)
        if os.path.exists(user_cache_path):
            return user_cache_path

    return None

def get_write_path(base_path):
    cache_filename = str(Path(base_path).stem) + CACHE_EXT
    if os.access(os.path.dirname(base_path), os.W_OK):
        # if we have write access to the directory where the original input file is located
        return os.path.join(os.path.dirname(base_path), cache_filename)
    else:
        cache_dir = user_cache_dir(APPNAME, APPAUTHOR)
        if os.access(cache_dir, os.W_OK):
            # if we have write access to the user cache directory
            return os.path.join(cache_dir, cache_filename)

    raise Exception('No write access to any cache directory')

def cache_exists(path, ref_alt_freq, category='all'):
    ''' Check if the cache file exists and contains the required data '''
    found = False
    try:
        found = is_in_h5py(path, ref_alt_freq, category)
    except Exception as e:
        pass
    return found

def is_in_h5py(path, ref_alt_freq, category='all'):
    '''
    Check if the cache file exists and contains the required data.
    Raise an exception if the cache file does not exist.
    '''
    if not path or not os.path.exists(path):
        raise Exception('Cache file not found')

    with h5py.File(path, 'r') as f:
        if ref_alt_freq in f.keys():
            if category in f[ref_alt_freq].keys():
                if len(f[ref_alt_freq][category].keys()) > 0:
                    return True
    return False

def load_h5py_cache(path, ref_alt_freq, category='all'):
    if not path or not os.path.exists(path):
        raise Exception('Cache file not found')

    if not is_in_h5py(path, ref_alt_freq, category):
        raise Exception('Cache file does not contain the required data')

    _cache = {}
    with h5py.File(path, 'r') as f:
        for v in f[ref_alt_freq][category].values():
            # iterate over chromosomes
            keys = list(v['keys'].asstr()[:])
            values = list(v['values'][:])
            chrom_cache = dict(zip(keys, values))  # Combine keys and values into a dictionary
            _cache.update(chrom_cache)
    return _cache
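
Once loaded, the cache is a flat dict mapping `CHROM:POS:REF:ALT` keys to ALT allele frequencies (the key format comes from `CacheBuilder.build_cache` further down). A minimal lookup sketch, with hypothetical keys and values:

```python
# Minimal sketch of the in-memory shape produced by load_h5py_cache.
# The entries here are hypothetical; the key layout CHROM:POS:REF:ALT and
# float frequency values mirror CacheBuilder.build_cache below.
cache = {
    "1:10177:A:AC": 0.4253,
    "1:10352:T:TA": 0.4375,
}

def lookup_af(cache, chrom, pos, ref, alt):
    # Same key layout used when the cache is built.
    return cache.get(f"{chrom}:{pos}:{ref}:{alt}")

print(lookup_af(cache, "1", 10177, "A", "AC"))  # 0.4253
print(lookup_af(cache, "1", 99999, "G", "C"))   # None (not in cache)
```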

def build_cache(base_path, ref_alt_freq=None, n_cores=1, return_cache=False, filter_fn=None, category='all', log=Log(), verbose=True):
    cache_builder = CacheBuilder(base_path, ref_alt_freq=ref_alt_freq, n_cores=n_cores, log=log, verbose=verbose)
    cache_builder.start_building(filter_fn=filter_fn, category=category, set_cache=return_cache)  # start_building will wait for all processes to finish building cache
    if return_cache:
        return cache_builder.get_cache()

def is_palindromic(ref, alt):
    gc = (ref=="G") & (alt=="C")
    cg = (ref=="C") & (alt=="G")
    at = (ref=="A") & (alt=="T")
    ta = (ref=="T") & (alt=="A")
    palindromic = gc | cg | at | ta
    return palindromic

def is_indel(ref, alt):
    return len(ref) != len(alt)

def filter_fn_pi(*, ref, alt):
    return is_palindromic(ref, alt) or is_indel(ref, alt)

def filter_fn_np(*, ref, alt):
    return not is_palindromic(ref, alt)

PALINDROMIC_INDEL = 'pi'  # palindromic + indel
NON_PALINDROMIC = 'np'  # non-palindromic

FILTER_FN = {
    PALINDROMIC_INDEL: filter_fn_pi,
    NON_PALINDROMIC: filter_fn_np
}
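
The two built-in categories mirror the harmonization use cases: `'pi'` keeps palindromic SNPs plus indels, `'np'` keeps non-palindromic variants. Both filters take keyword-only `ref`/`alt` arguments. A small sketch, assuming this wheel is installed:

```python
# Sketch: how the built-in category filters classify single variants.
from gwaslab.cache_manager import FILTER_FN

keep_pi = FILTER_FN['pi']   # palindromic SNPs + indels
keep_np = FILTER_FN['np']   # non-palindromic variants

print(keep_pi(ref="A", alt="T"))   # True  (A/T is palindromic)
print(keep_pi(ref="A", alt="AC"))  # True  (lengths differ: indel)
print(keep_np(ref="A", alt="T"))   # False (palindromic, excluded)
print(keep_np(ref="A", alt="G"))   # True
```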


################################################# CACHE MANAGERs #################################################

class CacheMainManager:
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        return get_cache_path(self.base_path)

    def _get_write_path(self):
        if self.base_path is not None:
            return get_write_path(self.base_path)
        else:
            raise Exception('base_path is None')

    @property
    def cache_len(self):
        return len(self.cache)

    @property
    def cache(self):
        if not hasattr(self, '_cache'):
            raise Exception('Cache not loaded')
        return self._cache

    def build_cache(self):
        ''' Build and load the cache '''
        self._cache = build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=True, log=self.log, verbose=self.verbose
        )

    def load_cache(self, category=None):
        if category is None:
            category = self.category
        cache_path = self._get_cache_path()
        self._cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=category)


class CacheManager(CacheMainManager):
    def __init__(self, base_path=None, cache_loader=None, cache_process=None, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        none_value = sum([cache_loader is not None, cache_process is not None])
        assert none_value in [0, 1], 'Only one between cache_loader and cache_process should be provided'
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        if none_value == 1:
            self.base_path = None  # unset base_path if cache_loader or cache_process is provided

        self.cache_loader = cache_loader
        self.cache_process = cache_process

        if cache_loader is not None:
            assert callable(getattr(cache_loader, 'get_cache', None)), 'cache_loader must have a get_cache method'
        elif cache_process is not None:
            assert isinstance(cache_process, CacheProcess), 'cache_process must be an instance of CacheProcess'
        else:
            cache_path = self._get_cache_path()
            if cache_path is not None:
                self.log.write(f'Start loading cache from {cache_path}...', verbose=self.verbose)
                self.load_cache()
                self.log.write('Finished loading cache.', verbose=self.verbose)
            else:
                self.log.write(f'Start building cache from {base_path}...', verbose=self.verbose)
                self.build_cache()
                self.log.write('Finished building (and loading) cache.', verbose=self.verbose)

    @property
    def cache_len(self):
        if self.cache_process is not None:
            return self.cache_process.cache_len()
        else:
            return len(self.cache)

    @property
    def cache(self):
        if self.cache_loader is not None:
            return self.cache_loader.get_cache()
        else:
            if not hasattr(self, '_cache'):
                raise Exception('Cache not loaded or class not exposing cache')
            return self._cache

    def apply_fn(self, fn, *args, **kwargs):
        assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
        if self.cache_process is not None:
            return self.cache_process.apply_fn(fn, *args, **kwargs)
        else:
            return fn(*args, cache=self.cache, **kwargs)

    def _get_cache_path(self):
        if self.cache_loader is None and self.cache_process is None:
            return super()._get_cache_path()
        return None
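
`CacheManager` is the in-process entry point: given only `base_path` it loads an existing `.cache` file (or builds one from the reference VCF), and `apply_fn` injects the loaded cache as a keyword argument. A minimal sketch, with a hypothetical reference VCF path and an `AF` INFO field:

```python
# Sketch of the in-process path through CacheManager.
# The VCF path is hypothetical; ref_alt_freq names an INFO field in that VCF.
from gwaslab.cache_manager import CacheManager

manager = CacheManager(base_path="ref/ALL.chr22.vcf.gz",
                       ref_alt_freq="AF", category="all", n_cores=4)

def count_variants(cache):
    # apply_fn passes the cache as the keyword argument 'cache'.
    return len(cache)

print(manager.cache_len)                  # size via the cache property
print(manager.apply_fn(count_variants))   # same size, via apply_fn
```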


class CacheProcess(mp.Process):
    '''
    A class for managing a cache in a separate process. It is used to reduce memory consumption when the cache is very large.
    This class will load the cache in a separate process and provide methods to perform operations on the cache directly on the subprocess.
    In this way, the cache is not copied to the main process: the operations are performed on the cache in the subprocess, and only the
    input and output of the operations are communicated (i.e. copied) between the main process and the subprocess.

    This is very useful when the cache is huge (e.g. 40GB in memory) and we want to perform operations on it based on a relatively small input
    (e.g. a "small" dataframe, where small is relative to the cache size) and the output is also relatively small.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__()
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.filter_fn = filter_fn
        self.category = category
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.daemon = True  # When the parent process exits, it will attempt to terminate all of its daemonic child processes.

        self.manager = mp.Manager()
        self.input_queue = mp.Queue()  # Queue for communication between processes
        self.result_queue = mp.Queue()
        self.result_produced = mp.Value('b', True)

        cache_path = self._get_cache_path()
        if not cache_exists(cache_path, ref_alt_freq, category):
            self.build_cache()
        else:
            if n_cores > 1:
                self.log.warning('[CacheProcess: since the cache already exists, the parameter n_cores could be set to 1 without any performance loss]', verbose=self.verbose)

    def _get_cache_path(self):
        return get_cache_path(self.base_path)

    def build_cache(self):
        build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=False, log=self.log, verbose=self.verbose
        )

    def run(self):
        cache_path = self._get_cache_path()
        self.log.write(f'[CacheProcess: Start loading cache from {cache_path}...]', verbose=self.verbose)
        cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
        self.log.write('[CacheProcess: Finished loading cache.]', verbose=self.verbose)

        # Continuously listen for method calls
        while True:
            method, args, kwargs = self.input_queue.get()
            if method == 'get_from_cache':
                key = args[0]
                self.result_queue.put(cache[key])
                self.result_produced.value = True
            elif method == 'apply_fn':
                assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
                fn, *args = args
                result = fn(*args, cache=cache, **kwargs)
                self.result_queue.put(result)
                self.result_produced.value = True
            elif method == 'cache_len':
                self.result_queue.put(len(cache))
                self.result_produced.value = True
            elif method == "terminate":
                self.result_produced.value = True
                break

    def _call_method(self, method, *args, **kwargs):
        self.result_produced.value = False
        self.input_queue.put((method, args, kwargs))

        # wait until the result is produced
        while not self.result_produced.value:
            pass

    def get_from_cache(self, key):
        self._call_method('get_from_cache', key)
        return self.result_queue.get()

    def apply_fn(self, fn, **kwargs):
        '''
        Apply an arbitrary function to the cache. The function should take the cache as an argument,
        and all the arguments should be passed as named arguments.
        '''
        self._call_method('apply_fn', fn, **kwargs)
        return self.result_queue.get()

    def cache_len(self):
        self._call_method('cache_len')
        return self.result_queue.get()

    def terminate(self):
        self._call_method("terminate")


################################################# CACHE BUILDER #################################################

class CacheBuilderOld:
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        self.ref_infer = ref_infer
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.cache = {}
        self.lock = threading.Lock()  # For thread-safe cache access
        self.cancelled = False  # Flag for cancelling the cache building process
        self.running = False
        self.executor = None  # Thread pool executor
        self.futures = None  # Stores Future objects

    def start_building(self):
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        n_cores = self.n_cores
        contigs = self.get_contigs()

        self.cancelled = False
        self.running = True

        self.log.write(f" -Building cache on {n_cores} cores...", verbose=self.verbose)
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=n_cores)
        self.futures = [self.executor.submit(self.build_cache, chrom) for chrom in contigs]

    def get_contigs(self):
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        contigs = [v.name for v in vcf_reader.header.contigs.values()]
        vcf_reader.close()
        return contigs

    def build_cache(self, chrom):
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        #self.log.write(f" -Fetching contig '{chrom}'...")
        seq = vcf_reader.fetch(chrom)

        first = True
        for record in seq:
            if first:
                #self.log.write(f" -Found at least one record for contig '{chrom}'...")
                first = False
            chrom = record.chrom
            start = record.pos - 1
            end = record.pos
            cache_key = f"{chrom}:{start}:{end}"
            to_add = [record.pos, record.ref, record.alts, record.info[self.ref_alt_freq][0]]
            self.add_to_cache(cache_key, to_add)

    def stop_building(self, wait=False, verbose=False):
        if self.futures:
            self.cancelled = True
            for future in self.futures:
                future.cancel()
            self.executor.shutdown(wait=wait)  # Whether to wait for threads to finish
            self.futures = None
            self.executor = None
            self.running = False

        if verbose:
            print(f"Cache contains {len(self.get_cache())} variants")

    def add_to_cache(self, key, value):
        self.lock.acquire()
        if key in self.cache:
            self.cache[key].append(value)
        else:
            self.cache[key] = [value]
        self.lock.release()

    def get_cache(self, complete=False):
        if complete:
            concurrent.futures.wait(self.futures)

        self.lock.acquire()
        cache = self.cache
        self.lock.release()
        return cache

    def reset_cache(self):
        self.lock.acquire()
        self.cache = {}
        self.lock.release()

    def save_cache(self, save_path):
        cache = self.get_cache(complete=True)
        self.log.write(f' -Saving cache to {save_path}', verbose=self.verbose)
        with open(save_path, 'wb') as f:
            pickle.dump(cache, f, protocol=pickle.HIGHEST_PROTOCOL)
        self.log.write(' -Cache saved', verbose=self.verbose)


class CacheBuilder:
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        self.ref_infer = ref_infer
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.running = False
        self.cache = None

    def get_contigs(self):
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        contigs = [v.name for v in vcf_reader.header.contigs.values()]
        vcf_reader.close()
        return contigs

    def already_built(self, category):
        cache_path = get_cache_path(self.ref_infer)
        return cache_exists(cache_path, self.ref_alt_freq, category)

    def start_building(self, filter_fn=None, category='all', set_cache=True):
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        if isinstance(filter_fn, str) and filter_fn in FILTER_FN:
            category = filter_fn
            filter_fn = FILTER_FN[filter_fn]
        elif category in FILTER_FN:
            self.log.write(f" -Using the built-in filter function for category '{category}'. filter_fn will be ignored if provided.", verbose=self.verbose)
            filter_fn = FILTER_FN[category]

        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"
        assert filter_fn is not None or category == 'all', "If category is not 'all', filter_fn must be provided"

        if self.already_built(category=category):
            # TODO: we should probably improve the checking logic, and maybe also allow overwriting the cache
            self.log.write(f"Cache for category '{category}' and ref_alt_freq {self.ref_alt_freq} already exists. Skipping cache building", verbose=self.verbose)
            return

        n_cores = max(self.n_cores-1, 1)  # leave one core for the watcher process
        contigs = self.get_contigs()

        self.running = True

        self.log.write(f" -Building cache for category '{category}' on {n_cores} cores...", verbose=self.verbose)

        pool = mp.Pool(n_cores)
        manager = mp.Manager()
        queue = manager.Queue()
        jobs = []

        # Start a watcher process to handle the output of each subprocess.
        # The watcher will write the cache to the file as soon as it receives the output from the subprocess, in a safe way.
        watcher = mp.Process(target=self.handle_output, args=(queue,))
        watcher.daemon = True
        watcher.start()

        for chrom in contigs:
            job = pool.apply_async(self.build_cache, args=(chrom, queue), kwds={'filter_fn': filter_fn, 'category': category})
            jobs.append(job)

        pool.close()
        pool.join()  # wait for all processes to finish

        queue.put('kill')  # send a signal to the watcher process to stop
        watcher.join()

        if set_cache:
            self.cache = {}
            for job in jobs:
                self.cache.update(job.get()['cache'])

        self.running = False

    def build_cache(self, chrom, queue, filter_fn=None, category='all'):
        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"

        inner_cache = {}
        ref_alt_freq = self.ref_alt_freq

        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        #self.log.write(f" -Fetching contig '{chrom}'...", verbose=self.verbose)
        seq = vcf_reader.fetch(chrom)

        for record in seq:
            for alt in record.alts:
                if filter_fn is None or filter_fn(ref=record.ref, alt=alt):
                    key = f"{record.chrom}:{record.pos}:{record.ref}:{alt}"
                    value = record.info[ref_alt_freq][0]
                    inner_cache[key] = value

        vcf_reader.close()

        result = {}
        result['chrom'] = chrom
        result['ref_alt_freq'] = ref_alt_freq
        result['category'] = category
        result['cache'] = inner_cache
        queue.put(result)
        return result

    def handle_output(self, queue):
        ''' Function that monitors a queue and writes the cache to a file as soon as it receives the output from the subprocess. '''
        first = True
        m = queue.get()  # wait for the first message, to avoid creating an empty cache file

        if m != 'kill':
            cache_path = get_write_path(self.ref_infer)
            with h5py.File(cache_path, mode='a') as f:
                while True:
                    if first:
                        first = False
                    else:
                        m = queue.get()

                    if m == 'kill':
                        break

                    result = m
                    cache = result['cache']
                    if cache is not None and len(cache) > 0:
                        main_group = f.require_group(result['ref_alt_freq'])
                        sub_group = main_group.require_group(result['category'])
                        chrom_group = sub_group.require_group(str(result['chrom']))

                        keys_list = list(cache.keys())
                        max_len = len(max(keys_list, key=len))
                        #self.log.write(f"Writing {result['ref_alt_freq']}, {result['category']}, {str(result['chrom'])}\n")
                        keys_dataset = chrom_group.create_dataset('keys', data=keys_list, dtype=f'S{max_len}', compression="gzip", compression_opts=4)
                        values_dataset = chrom_group.create_dataset('values', data=list(cache.values()), dtype='f', compression="gzip", compression_opts=4)

    def get_cache(self):
        return self.cache
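
`handle_output` serializes each per-chromosome result into a single HDF5 `.cache` file, grouped as `/<ref_alt_freq>/<category>/<chrom>` with parallel `keys`/`values` datasets. A sketch of inspecting such a file directly (path and field name hypothetical):

```python
# Sketch: reading a built .cache file with h5py.
# Layout written by CacheBuilder.handle_output:
#   /<ref_alt_freq>/<category>/<chrom>/keys    byte strings "CHR:POS:REF:ALT"
#   /<ref_alt_freq>/<category>/<chrom>/values  float ALT frequencies
import h5py

with h5py.File("ref/ALL.chr22.cache", "r") as f:   # hypothetical path
    for chrom, grp in f["AF"]["all"].items():
        keys = grp["keys"].asstr()[:5]             # decode first 5 keys
        values = grp["values"][:5]
        print(chrom, dict(zip(keys, values)))
```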


################################################# CACHE LOADERs #################################################
# Classes for loading the cache in a separate thread or process in the background while the main process is running.
# However, right now, the most efficient way to load the cache and perform operations on it is to use the CacheProcess class.

class CacheLoader:
    def __new__(cls, *args, **kwargs):
        if cls is CacheLoader:
            raise TypeError(f"You are trying to instantiate an abstract class {cls.__name__}. Please use a concrete subclass.")
        return super().__new__(cls)

    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        return get_cache_path(self.base_path)

    def build_cache(self):
        self.cache = build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=True, log=self.log, verbose=self.verbose
        )

    def add_to_cache(self, key, value):
        self.cache[key] = value

    def get_cache(self):
        return self.cache

    def reset_cache(self):
        self.cache = {}


class CacheLoaderThread(CacheLoader):
    '''
    A class for loading a cache in a separate thread. It is used to load the cache in the background while the main process is running.

    In theory, this should be the best and simplest approach: it loads the cache directly in the same process as the main process, without
    further copying the cache to the main process. However, due to the GIL (Global Interpreter Lock) in Python, this approach is not efficient
    and it slows down the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.cache = {}
        self.lock = threading.Lock()  # For thread-safe cache access
        self.running = False
        self.executor = None  # Thread pool executor
        self.future = None  # Stores Future objects

    def start_loading(self):
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache()  # this will also load the cache
        else:
            self.running = True
            self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            self.future = self.executor.submit(self.load_cache)

    def load_cache(self):
        cache_path = self._get_cache_path()
        self.log.write(f'[Start loading cache from {cache_path}...]', verbose=self.verbose)
        self.cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
        self.log.write('[Finished loading cache.]', verbose=self.verbose)

        self.future.cancel()
        self.executor.shutdown(wait=False)
        self.executor = None
        self.future = None
        self.running = False

    def get_cache(self):
        if self.future is not None:
            self.future.result()  # Ensure loading is finished before accessing the cache
        return self.cache


def _load_cache_process(path, ref_alt_freq, category, cache):
    #start = time.time()
    local_cache = load_h5py_cache(path, ref_alt_freq=ref_alt_freq, category=category)
    #print(f" ********* DONE LOADING local in {time.time() - start} seconds *********")

    #start = time.time()
    cache.update(local_cache)
    #print(f" ********* DONE COPYING shared in {time.time() - start} seconds *********")
    del local_cache

class CacheLoaderProcess(CacheLoader):
    '''
    A class for loading a cache in a separate process. It is used to load the cache in the background while the main process is running.

    Unlike CacheLoaderThread, this class is more efficient because it loads the cache in a separate process, which is not affected by the GIL.
    However, a lot of memory and time is wasted in copying the cache from the subprocess to the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.manager = mp.Manager()
        self.cache = self.manager.dict()
        self.running = False
        self.process = None

    def start_loading(self):
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache()  # this will also load the cache
        else:
            self.running = True
            self.process = mp.Process(target=_load_cache_process, args=(cache_path, self.ref_alt_freq, self.category, self.cache))
            self.process.start()

    def get_cache(self):
        if self.running:
            self.process.join()  # Wait for cache loading process to finish
            self.running = False
        return self.cache
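
`CacheLoaderProcess` sidesteps the GIL by loading in a child process, then pays the copy cost once through the `mp.Manager().dict()` proxy. A usage sketch with a hypothetical path:

```python
# Sketch: background loading with CacheLoaderProcess. Loading happens in a
# child process; get_cache() joins it and returns the shared manager dict
# (see the class docstring above for the copy-cost trade-off).
from gwaslab.cache_manager import CacheLoaderProcess

loader = CacheLoaderProcess(base_path="ref/ALL.chr22.vcf.gz",  # hypothetical
                            ref_alt_freq="AF", category="all")
loader.start_loading()      # returns immediately; loading continues
# ... do other work in the parent process ...
cache = loader.get_cache()  # joins the loader process, then returns
print(len(cache))
```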