gwaslab 3.4.42__py3-none-any.whl → 3.4.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

@@ -0,0 +1,687 @@
1
+ from pathlib import Path
2
+ import os
3
+ import pickle
4
+ import concurrent.futures
5
+ import threading
6
+ import multiprocessing as mp
7
+ import time
8
+ import h5py
9
+
10
+ from gwaslab.g_Log import Log
11
+
12
+ from platformdirs import user_cache_dir
13
+ from pysam import VariantFile
14
+
15
+ APPNAME = "gwaspipe"
16
+ APPAUTHOR = "cloufield"
17
+
18
+ CACHE_EXT = '.cache'
19
+
20
+
21
+ ################################################# UTILS #################################################
22
+
23
def get_cache_path(base_path):
    """Return the path of an existing cache file for `base_path`, or None.

    Looks first next to the original input file, then in the per-user
    cache directory provided by platformdirs.
    """
    cache_filename = Path(base_path).stem + CACHE_EXT

    # 1) cache sitting alongside the original input file
    local_candidate = os.path.join(os.path.dirname(base_path), cache_filename)
    if os.path.exists(local_candidate):
        return local_candidate

    # 2) cache in the per-user cache directory
    user_candidate = os.path.join(user_cache_dir(APPNAME, APPAUTHOR), cache_filename)
    if os.path.exists(user_candidate):
        return user_candidate

    return None
35
+
36
def get_write_path(base_path):
    """Return a writable location for the cache file of `base_path`.

    Prefers the directory of the original input file; falls back to the
    per-user cache directory. Raises if neither is writable.
    """
    cache_filename = Path(base_path).stem + CACHE_EXT

    input_dir = os.path.dirname(base_path)
    if os.access(input_dir, os.W_OK):
        # we can write next to the original input file
        return os.path.join(input_dir, cache_filename)

    cache_dir = user_cache_dir(APPNAME, APPAUTHOR)
    if os.access(cache_dir, os.W_OK):
        # fall back to the per-user cache directory
        return os.path.join(cache_dir, cache_filename)

    raise Exception('No write access to any cache directory')
48
+
49
def cache_exists(path, ref_alt_freq, category='all'):
    ''' Check if the cache file exists and contains the required data '''
    try:
        return is_in_h5py(path, ref_alt_freq, category)
    except Exception:
        # a missing/unreadable cache file simply means "no cache"
        return False
57
+
58
def is_in_h5py(path, ref_alt_freq, category='all'):
    '''
    Return True if the HDF5 cache at `path` holds a non-empty
    `ref_alt_freq/category` group.
    Raise an exception if the cache file does not exist.
    '''
    if not path or not os.path.exists(path):
        raise Exception('Cache file not found')

    with h5py.File(path, 'r') as f:
        group = f.get(ref_alt_freq)
        if group is not None:
            sub_group = group.get(category)
            # non-empty means at least one chromosome sub-group was written
            if sub_group is not None and len(sub_group.keys()) > 0:
                return True
    return False
72
+
73
def load_h5py_cache(path, ref_alt_freq, category='all'):
    '''Load the `ref_alt_freq/category` section of the HDF5 cache into a dict.'''
    if not path or not os.path.exists(path):
        raise Exception('Cache file not found')

    if not is_in_h5py(path, ref_alt_freq, category):
        raise Exception('Cache file does not contain the required data')

    merged = {}
    with h5py.File(path, 'r') as f:
        # one sub-group per chromosome, each with parallel 'keys'/'values' datasets
        for chrom_group in f[ref_alt_freq][category].values():
            chrom_keys = chrom_group['keys'].asstr()[:]
            chrom_values = chrom_group['values'][:]
            merged.update(zip(chrom_keys, chrom_values))
    return merged
89
+
90
def build_cache(base_path, ref_alt_freq=None, n_cores=1, return_cache=False, filter_fn=None, category='all', log=Log(), verbose=True):
    """Build the on-disk cache for `base_path`; optionally return it in memory."""
    builder = CacheBuilder(base_path, ref_alt_freq=ref_alt_freq, n_cores=n_cores, log=log, verbose=verbose)
    # start_building blocks until every worker process has finished
    builder.start_building(filter_fn=filter_fn, category=category, set_cache=return_cache)
    if return_cache:
        return builder.get_cache()
95
+
96
def is_palindromic(ref, alt):
    """Return whether (ref, alt) is a palindromic allele pair (G/C or A/T).

    Uses elementwise `&`/`|` operators so array-like inputs also work.
    """
    strong = ((ref == "G") & (alt == "C")) | ((ref == "C") & (alt == "G"))
    weak = ((ref == "A") & (alt == "T")) | ((ref == "T") & (alt == "A"))
    return strong | weak
103
+
104
def is_indel(ref, alt):
    """Return True when the two alleles differ in length (insertion/deletion)."""
    ref_len = len(ref)
    alt_len = len(alt)
    return ref_len != alt_len
106
+
107
def filter_fn_pi(*, ref, alt):
    """Keep palindromic variants and indels."""
    if is_palindromic(ref, alt):
        return True
    return is_indel(ref, alt)
109
+
110
def filter_fn_np(*, ref, alt):
    """Keep only non-palindromic variants."""
    palindromic = is_palindromic(ref, alt)
    return not palindromic
112
+
113
+ PALINDROMIC_INDEL = 'pi' # palindromic + indel
114
+ NON_PALINDROMIC = 'np' # non-palindromic
115
+
116
+ FILTER_FN = {
117
+ PALINDROMIC_INDEL: filter_fn_pi,
118
+ NON_PALINDROMIC: filter_fn_np
119
+ }
120
+
121
+
122
+ ################################################# CACHE MANAGERs #################################################
123
+
124
class CacheMainManager:
    """Base manager that builds/loads the HDF5 allele-frequency cache for an input file."""
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        # base_path: path of the original input (e.g. VCF) the cache derives from
        self.base_path = base_path
        # ref_alt_freq: INFO field name holding the ALT allele frequency
        self.ref_alt_freq = ref_alt_freq
        # category: cache section ('all' or a filter category such as 'pi'/'np')
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        # Existing cache file next to the input or in the user cache dir (or None).
        return get_cache_path(self.base_path)

    def _get_write_path(self):
        # Writable location for a new cache file; requires base_path to be set.
        if self.base_path is not None:
            return get_write_path(self.base_path)
        else:
            raise Exception('base_path is None')

    @property
    def cache_len(self):
        # Number of variants currently held in the loaded cache.
        return len(self.cache)

    @property
    def cache(self):
        # In-memory cache dict; raises until build_cache()/load_cache() has run.
        if not hasattr(self, '_cache'):
            raise Exception('Cache not loaded')
        return self._cache

    def build_cache(self):
        ''' Build and load the cache '''
        self._cache = build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=True, log=self.log, verbose=self.verbose
        )

    def load_cache(self, category=None):
        # Load the cache from disk; defaults to the category set at construction.
        if category is None:
            category = self.category
        cache_path = self._get_cache_path()
        self._cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=category)
166
+
167
+
168
class CacheManager(CacheMainManager):
    '''
    Cache manager that can serve the cache from three sources:
      - in-process (loaded from disk, or built if absent) — the default;
      - a `cache_loader` object exposing a `get_cache()` method;
      - a `CacheProcess` holding the cache in a subprocess.
    At most one of cache_loader / cache_process may be provided.
    '''
    def __init__(self, base_path=None, cache_loader=None, cache_process=None, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        none_value = sum([cache_loader is not None, cache_process is not None])
        assert none_value in [0, 1], 'Only one between cache_loader and cache_process should be provided'
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        if none_value == 1:
            self.base_path = None # unset base_path if cache_loader or cache_process is provided

        self.cache_loader = cache_loader
        self.cache_process = cache_process

        if cache_loader is not None:
            assert callable(getattr(cache_loader, 'get_cache', None)), 'cache_loader must have a get_cache method'
        elif cache_process is not None:
            assert isinstance(cache_process, CacheProcess), 'cache_process must be an instance of CacheProcess'
        else:
            # No delegate provided: load the cache from disk, or build it if absent.
            cache_path = self._get_cache_path()
            if cache_path is not None:
                self.log.write(f'Start loading cache from {cache_path}...', verbose=self.verbose)
                self.load_cache()
                # BUGFIX: log message typo "Finshed" -> "Finished"
                self.log.write('Finished loading cache.', verbose=self.verbose)
            else:
                self.log.write(f'Start building cache from {base_path}...', verbose=self.verbose)
                self.build_cache()
                self.log.write('Finished building (and loading) cache.', verbose=self.verbose)

    @property
    def cache_len(self):
        # Delegate the count to the subprocess when the cache lives there.
        if self.cache_process is not None:
            return self.cache_process.cache_len()
        else:
            return len(self.cache)

    @property
    def cache(self):
        if self.cache_loader is not None:
            return self.cache_loader.get_cache()
        else:
            if not hasattr(self, '_cache'):
                raise Exception('Cache not loaded or class not exposing cache')
            return self._cache

    def apply_fn(self, fn, *args, **kwargs):
        '''Run fn(*args, cache=<cache>, **kwargs), locally or in the cache subprocess.'''
        assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
        if self.cache_process is not None:
            return self.cache_process.apply_fn(fn, *args, **kwargs)
        else:
            return fn(*args, cache=self.cache, **kwargs)

    def _get_cache_path(self):
        # Only meaningful when this manager owns the cache itself.
        if self.cache_loader is None and self.cache_process is None:
            return super()._get_cache_path()
        return None
221
+
222
+
223
class CacheProcess(mp.Process):
    '''
    A class for managing a cache in a separate process. It is used to reduce memory consumption when the cache is very large.
    This class will load the cache in a separate process and provide methods to perform operations on the cache directly on the subprocess.
    In this way, the cache is not copied to the main process, but the operations are performed on the cache in the subprocess and only the
    input and output of the operations are communicated (i.e. copied) between the main and the subprocess.

    This is very useful when the cache is huge (e.g. 40GB in memory) and we want to perform operations on it based on a relatively small input
    (e.g. a "small" dataframe, where small is relative to the cache size) and the output is also relatively small.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__()
        self.base_path = base_path
        self.ref_alt_freq = ref_alt_freq
        self.filter_fn = filter_fn
        self.category = category
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.daemon = True # When parent process exits, it will attempt to terminate all of its daemonic child processes.

        # NOTE(review): self.manager appears unused; kept for backward compatibility.
        self.manager = mp.Manager()
        self.input_queue = mp.Queue()    # requests: (method_name, args, kwargs)
        self.result_queue = mp.Queue()   # responses, one per request
        self.result_produced = mp.Value('b', True)  # flag: last request has been answered

        # Build the on-disk cache now (in the parent) if it does not exist yet.
        cache_path = self._get_cache_path()
        if not cache_exists(cache_path, ref_alt_freq, category):
            self.build_cache()
        else:
            if n_cores > 1:
                self.log.warning('[CacheProcess: since the cache already exists, the parameter n_cores could be set to 1 without any performance loss]', verbose=self.verbose)

    def _get_cache_path(self):
        return get_cache_path(self.base_path)

    def build_cache(self):
        # Build the cache on disk only; the subprocess loads it in run().
        build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=False, log=self.log, verbose=self.verbose
        )

    def run(self):
        # Entry point of the subprocess: load the cache, then serve requests.
        cache_path = self._get_cache_path()
        self.log.write(f'[CacheProcess: Start loading cache from {cache_path}...]', verbose=self.verbose)
        cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
        # BUGFIX: log message typo "Finshed" -> "Finished"
        self.log.write('[CacheProcess: Finished loading cache.]', verbose=self.verbose)

        # Continuously listen for method calls
        while True:
            method, args, kwargs = self.input_queue.get()
            if method == 'get_from_cache':
                key = args[0]
                self.result_queue.put(cache[key])
                self.result_produced.value = True
            elif method == 'apply_fn':
                assert 'cache' not in kwargs, "'cache' can't be inside kwargs"
                fn, *args = args
                result = fn(*args, cache=cache, **kwargs)
                self.result_queue.put(result)
                self.result_produced.value = True
            elif method == 'cache_len':
                self.result_queue.put(len(cache))
                self.result_produced.value = True
            elif method == "terminate":
                self.result_produced.value = True
                break

    def _call_method(self, method, *args, **kwargs):
        # Send a request to the subprocess and block until it is answered.
        self.result_produced.value = False
        self.input_queue.put((method, args, kwargs))

        # BUGFIX: sleep briefly instead of busy-spinning, which pegged a CPU core.
        while not self.result_produced.value:
            time.sleep(0.001)

    def get_from_cache(self, key):
        '''Return cache[key], fetched from the subprocess.'''
        self._call_method('get_from_cache', key)
        return self.result_queue.get()

    def apply_fn(self, fn, *args, **kwargs):
        '''
        Apply an arbitrary function to the cache. The function should take the cache as a
        keyword argument. Positional and named arguments are forwarded to fn.
        (Positional args were previously rejected even though run() supports them
        and CacheManager.apply_fn forwards them.)
        '''
        self._call_method('apply_fn', fn, *args, **kwargs)
        return self.result_queue.get()

    def cache_len(self):
        '''Return the number of entries in the subprocess-held cache.'''
        self._call_method('cache_len')
        return self.result_queue.get()

    def terminate(self):
        '''Ask the subprocess request loop to exit cleanly.'''
        self._call_method("terminate")
319
+
320
+
321
+ ################################################# CACHE BUILDER #################################################
322
+
323
class CacheBuilderOld:
    """Legacy thread-based cache builder (superseded by CacheBuilder).

    Builds an in-memory {"chrom:start:end": [[pos, ref, alts, freq], ...]}
    cache from a VCF using one thread per contig.
    """
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        # ref_infer: path to the reference VCF
        self.ref_infer = ref_infer
        # ref_alt_freq: INFO field holding the ALT allele frequency
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.cache = {}
        self.lock = threading.Lock() # For thread-safe cache access
        self.cancelled = False # Flag for cancelling the cache building process
        self.running = False
        self.executor = None # Thread pool executor
        self.futures = None # Stores Future objects

    def start_building(self):
        # Launch one worker thread per contig; returns immediately (non-blocking).
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        n_cores = self.n_cores
        contigs = self.get_contigs()

        self.cancelled = False
        self.running = True

        self.log.write(f" -Building cache on {n_cores} cores...", verbose=self.verbose)
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=n_cores)
        self.futures = [self.executor.submit(self.build_cache, chrom) for chrom in contigs]

    def get_contigs(self):
        # Contig names declared in the VCF header.
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        contigs = [v.name for v in vcf_reader.header.contigs.values()]
        vcf_reader.close()
        return contigs

    def build_cache(self, chrom):
        # Worker: scan one contig and add each record to the shared cache.
        # NOTE(review): self.cancelled is never checked here, so stop_building
        # cannot interrupt a worker that has already started.
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        #self.log.write(f" -Fetching contig '{chrom}'...")
        seq = vcf_reader.fetch(chrom)

        first = True
        for record in seq:
            if first:
                #self.log.write(f" -Found at least one record for contig '{chrom}'...")
                first = False
            chrom = record.chrom
            start = record.pos - 1
            end = record.pos
            cache_key = f"{chrom}:{start}:{end}"
            to_add = [record.pos, record.ref, record.alts, record.info[self.ref_alt_freq][0]]
            self.add_to_cache(cache_key, to_add)

    def stop_building(self, wait=False, verbose=False):
        # Cancel pending futures and shut the executor down.
        if self.futures:
            self.cancelled = True
            for future in self.futures:
                future.cancel()
            self.executor.shutdown(wait=wait) # Whether to wait for threads to finish
            self.futures = None
            self.executor = None
            self.running = False

        if verbose:
            print(f"Cache contains {len(self.get_cache())} variants")

    def add_to_cache(self, key, value):
        # Thread-safe append of a record under key.
        self.lock.acquire()
        if key in self.cache:
            self.cache[key].append(value)
        else:
            self.cache[key] = [value]
        self.lock.release()

    def get_cache(self, complete=False):
        # Return the cache; with complete=True, wait for all workers first.
        if complete:
            concurrent.futures.wait(self.futures)

        self.lock.acquire()
        cache = self.cache
        self.lock.release()
        return cache

    def reset_cache(self):
        # Drop all cached records (thread-safe).
        self.lock.acquire()
        self.cache = {}
        self.lock.release()

    def save_cache(self, save_path):
        # Pickle the complete cache to save_path (waits for workers to finish).
        cache = self.get_cache(complete=True)
        self.log.write(f' -Saving cache to {save_path}', verbose=self.verbose)
        with open(save_path, 'wb') as f:
            pickle.dump(cache, f, protocol=pickle.HIGHEST_PROTOCOL)
        self.log.write(' -Cache saved', verbose=self.verbose)
417
+
418
+
419
class CacheBuilder:
    '''Builds the on-disk HDF5 allele-frequency cache from a VCF, one chromosome per worker process.'''
    def __init__(self, ref_infer, ref_alt_freq=None, n_cores=1, log=Log(), verbose=True):
        # ref_infer: path to the reference VCF
        self.ref_infer = ref_infer
        # ref_alt_freq: INFO field holding the ALT allele frequency
        self.ref_alt_freq = ref_alt_freq
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

        self.running = False
        self.cache = None

    def get_contigs(self):
        '''Return the contig names declared in the VCF header.'''
        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        contigs = [v.name for v in vcf_reader.header.contigs.values()]
        vcf_reader.close()
        return contigs

    def already_built(self, category):
        '''True if a cache for this category/ref_alt_freq already exists on disk.'''
        cache_path = get_cache_path(self.ref_infer)
        return cache_exists(cache_path, self.ref_alt_freq, category)

    def start_building(self, filter_fn=None, category='all', set_cache=True):
        '''Build the cache across processes; blocks until every worker finishes.'''
        if self.running:
            print("Cache building is already running. If you want to restart, please stop the current process first.")
            return

        if isinstance(filter_fn, str) and filter_fn in FILTER_FN:
            # BUGFIX: record the string category BEFORE replacing filter_fn with the
            # callable. Previously `category = filter_fn` ran after the reassignment,
            # so category became the function object and was used as an HDF5 group
            # name and in cache_exists() checks.
            category = filter_fn
            filter_fn = FILTER_FN[filter_fn]
        elif category in FILTER_FN:
            self.log.write(f" -Using the built-in filter function for category '{category}'. filter_fn will be ignored if provided.", verbose=self.verbose)
            filter_fn = FILTER_FN[category]

        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"
        assert filter_fn is not None or category == 'all', "If category is not 'all', filter_fn must be provided"

        if self.already_built(category=category):
            # TODO: we should probably improve the checking logic, and maybe also allows to overwrite the cache
            self.log.write(f"Cache for category '{category}' and ref_alt_freq {self.ref_alt_freq} already exists. Skipping cache building", verbose=self.verbose)
            return

        n_cores = max(self.n_cores-1, 1) # leave one core for the watcher process
        contigs = self.get_contigs()

        self.running = True

        self.log.write(f" -Building cache for category '{category}' on {n_cores} cores...", verbose=self.verbose)

        pool = mp.Pool(n_cores)
        manager = mp.Manager()
        queue = manager.Queue()
        jobs = []

        # Start a watcher process to handle the output of each subprocess.
        # The watcher will write the cache to the file as soon as it receives the output from the subprocess, in a safe way.
        watcher = mp.Process(target=self.handle_output, args=(queue,))
        watcher.daemon = True
        watcher.start()

        for chrom in contigs:
            job = pool.apply_async(self.build_cache, args=(chrom, queue), kwds={'filter_fn': filter_fn, 'category': category})
            jobs.append(job)

        pool.close()
        pool.join() # wait for all processes to finish

        queue.put('kill') # send a signal to the watcher process to stop
        watcher.join()

        if set_cache:
            # merge the per-chromosome dicts returned by the workers
            self.cache = {}
            for job in jobs:
                self.cache.update(job.get()['cache'])

        self.running = False

    def build_cache(self, chrom, queue, filter_fn=None, category='all'):
        '''Worker: scan one chromosome and put its {key: freq} dict on the queue.'''
        assert filter_fn is None or category != 'all', "If filter_fn is not None, category cannot be 'all'"

        inner_cache = {}
        ref_alt_freq = self.ref_alt_freq

        vcf_reader = VariantFile(self.ref_infer, drop_samples=True)
        seq = vcf_reader.fetch(chrom)

        for record in seq:
            for alt in record.alts:
                if filter_fn is None or filter_fn(ref=record.ref, alt=alt):
                    key = f"{record.chrom}:{record.pos}:{record.ref}:{alt}"
                    value = record.info[ref_alt_freq][0]
                    inner_cache[key] = value

        vcf_reader.close()

        result = {}
        result['chrom'] = chrom
        result['ref_alt_freq'] = ref_alt_freq
        result['category'] = category
        result['cache'] = inner_cache
        queue.put(result)
        return result

    def handle_output(self, queue):
        ''' Function that monitors a queue and writes the cache to a file as soon as it receives the output from the subprocess.'''
        first = True
        m = queue.get() # wait for the first message, to avoid creating an empty cache file

        if m != 'kill':
            cache_path = get_write_path(self.ref_infer)
            with h5py.File(cache_path, mode='a') as f:
                while True:
                    if first:
                        first = False
                    else:
                        m = queue.get()

                    if m == 'kill':
                        break

                    result = m
                    cache = result['cache']
                    if cache is not None and len(cache) > 0:
                        main_group = f.require_group(result['ref_alt_freq'])
                        sub_group = main_group.require_group(result['category'])
                        chrom_group = sub_group.require_group(str(result['chrom']))

                        keys_list = list(cache.keys())
                        max_len = len(max(keys_list, key=len))
                        # BUGFIX: size the fixed-length string dtype from the longest
                        # key. The original passed the literal 'S69,892' (an f-string
                        # with no placeholder) and left max_len unused.
                        keys_dataset = chrom_group.create_dataset('keys', data=keys_list, dtype=f'S{max_len}', compression="gzip", compression_opts=4)
                        values_dataset = chrom_group.create_dataset('values', data=list(cache.values()), dtype='f', compression="gzip", compression_opts=4)

    def get_cache(self):
        '''Return the in-memory cache from the last start_building(set_cache=True).'''
        return self.cache
554
+
555
+
556
+ ################################################# CACHE LOADERs #################################################
557
+ # Classes for loading the cache in a separate thread or process in the background while the main process is running.
558
+ # However, right now, the most efficient way to load the cache and perform operations on it is to use the CacheProcess class.
559
+
560
class CacheLoader:
    """Abstract base for background cache loaders; instantiate a concrete subclass."""
    def __new__(cls, *args, **kwargs):
        # Prevent direct instantiation of the abstract base class.
        if cls is CacheLoader:
            raise TypeError(f"You are trying to instantiate an abstract class {cls.__name__}. Please use a concrete subclass.")
        return super().__new__(cls)

    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        # base_path: path of the original input the cache derives from
        self.base_path = base_path
        # ref_alt_freq: INFO field holding the ALT allele frequency
        self.ref_alt_freq = ref_alt_freq
        # category: cache section to load ('all', 'pi', 'np', ...)
        self.category = category
        self.filter_fn = filter_fn
        self.n_cores = n_cores
        self.log = log
        self.verbose = verbose

    def _get_cache_path(self):
        # Existing cache file for base_path, or None.
        return get_cache_path(self.base_path)

    def build_cache(self):
        # Build the cache (blocking) and keep it in memory on this instance.
        self.cache = build_cache(
            self.base_path, ref_alt_freq=self.ref_alt_freq, n_cores=self.n_cores,
            filter_fn=self.filter_fn, category=self.category,
            return_cache=True, log=self.log, verbose=self.verbose
        )

    def add_to_cache(self, key, value):
        # Insert/overwrite a single cache entry.
        self.cache[key] = value

    def get_cache(self):
        # Return the loaded cache (subclasses may block until loading completes).
        return self.cache

    def reset_cache(self):
        # Drop all cached entries.
        self.cache = {}
593
+
594
+
595
class CacheLoaderThread(CacheLoader):
    '''
    A class for loading a cache in a separate thread. It is used to load the cache in the background while the main process is running.

    In theory, this should be the best and simplest approach to directly load the cache in the same process as the main process, without further
    copying the cache to the main process. However, due to the GIL (Global Interpreter Lock) in Python, this approach is not efficient and
    it slows down the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.cache = {}
        self.lock = threading.Lock() # For thread-safe cache access
        self.running = False
        self.executor = None # Thread pool executor
        self.future = None # Stores Future objects

    def start_loading(self):
        # Kick off background loading; builds the cache synchronously if absent.
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache() # this will also load the cache
        else:
            self.running = True
            self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            self.future = self.executor.submit(self.load_cache)

    def load_cache(self):
        # Runs on the worker thread: load the cache, then tear the executor down.
        cache_path = self._get_cache_path()
        self.log.write(f'[Start loading cache from {cache_path}...]', verbose=self.verbose)
        self.cache = load_h5py_cache(cache_path, ref_alt_freq=self.ref_alt_freq, category=self.category)
        self.log.write('[Finshed loading cache.]', verbose=self.verbose)

        # NOTE(review): cancel() on an already-running future is a no-op; kept
        # as part of the original cleanup sequence.
        self.future.cancel()
        self.executor.shutdown(wait=False)
        self.executor = None
        self.future = None
        self.running = False

    def get_cache(self):
        # Block until background loading finishes, then return the cache.
        if self.future is not None:
            self.future.result() # Ensure loading is finished before accessing the cache
        return self.cache
642
+
643
+
644
def _load_cache_process(path, ref_alt_freq, category, cache):
    '''Worker target: load the HDF5 cache and copy it into the shared dict `cache`.

    `cache` is expected to be a multiprocessing.Manager().dict() owned by the
    parent, so update() copies the data across the process boundary.
    '''
    local_cache = load_h5py_cache(path, ref_alt_freq=ref_alt_freq, category=category)
    # copy into the manager-backed shared dict, then free the local copy
    cache.update(local_cache)
    del local_cache
653
+
654
class CacheLoaderProcess(CacheLoader):
    '''
    A class for loading a cache in a separate process. It is used to load the cache in the background while the main process is running.

    Unlike CacheLoaderThread, this class is more efficient because it loads the cache in a separate process, which is not affected by the GIL.
    However, a lot of memory and time is wasted in copying the cache from the subprocess to the main process.
    '''
    def __init__(self, base_path, ref_alt_freq=None, category='all', filter_fn=None, n_cores=1, log=Log(), verbose=True):
        super().__init__(base_path, ref_alt_freq=ref_alt_freq, category=category, filter_fn=filter_fn, n_cores=n_cores, log=log, verbose=verbose)
        self.manager = mp.Manager()
        self.cache = self.manager.dict() # shared dict filled by the loader process
        self.running = False
        self.process = None

    def start_loading(self):
        '''Start loading the cache in a background process (or build it if missing).'''
        if self.running:
            print("Cache loading is already running. If you want to restart, please stop the current process first.")
            return

        cache_path = self._get_cache_path()

        if not cache_exists(cache_path, self.ref_alt_freq, self.category):
            self.log.write("Cache does not exist. Start building (and loading) cache...", verbose=self.verbose)
            self.build_cache() # this will also load the cache
        else:
            self.running = True
            # BUGFIX: pass self.category (was self.filter_fn) as the `category`
            # argument of _load_cache_process — the callable is not a category
            # name and would have broken the HDF5 group lookup.
            self.process = mp.Process(target=_load_cache_process, args=(cache_path, self.ref_alt_freq, self.category, self.cache))
            self.process.start()

    def get_cache(self):
        '''Block until loading finishes, then return the shared cache dict.'''
        if self.running:
            self.process.join() # Wait for cache loading process to finish
            self.running = False
        return self.cache
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.42",
19
- "release_date":"20240328"
18
+ "version":"3.4.43",
19
+ "release_date":"20240403"
20
20
  }
21
21
  return dic
22
22
 
@@ -24,6 +24,7 @@ from gwaslab.bd_common_data import get_chr_to_number
24
24
  from gwaslab.bd_common_data import _maketrans
25
25
  from gwaslab.g_vchange_status import vchange_status
26
26
  from gwaslab.g_version import _get_version
27
+ from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
27
28
 
28
29
  #rsidtochrpos
29
30
  #checkref
@@ -912,6 +913,56 @@ def check_strand_status(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
912
913
  return status_pre+"5"+status_end
913
914
  return status_pre+"8"+status_end
914
915
 
916
def check_strand_status_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
    '''
    Determine strand status for each variant row using the allele-frequency cache.

    Each row of `data` is (chr, pos, ref, alt, eaf, status). The 7th status
    digit becomes "1" when cached frequency and EAF sit on the same side of
    0.5, "5" when they disagree, and "8" when unknown / not found. With
    trust_cache=False, cache misses fall back to reading the VCF at ref_infer.
    '''
    if not trust_cache:
        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")

    if ref_infer is not None and not trust_cache:
        vcf_reader = VariantFile(ref_infer)

    if isinstance(data, pd.DataFrame):
        data = data.values

    in_cache = 0
    new_statuses = []

    for row in data:
        _chrom, pos, ref, alt, eaf, status = row
        chrom = _chrom
        start = pos - 1
        end = pos

        # translate numeric chromosome codes to VCF contig names
        if chr_dict is not None:
            chrom = chr_dict[chrom]

        status_pre = status[:6]
        status_end = ""

        # default: unknown / not determinable
        new_status = status_pre + "8" + status_end

        cache_key = f"{chrom}:{pos}:{ref}:{alt}"
        if cache_key in cache:
            in_cache += 1
            record = cache[cache_key]
            if record is None:
                new_status = status_pre + "8" + status_end
            elif (record < 0.5) and (eaf < 0.5):
                new_status = status_pre + "1" + status_end
            elif (record > 0.5) and (eaf > 0.5):
                new_status = status_pre + "1" + status_end
            else:
                new_status = status_pre + "5" + status_end
        elif not trust_cache:
            # If we don't trust the cache as a not complete cache, we should perform the check reading from the VCF file
            new_status = check_strand_status(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict)

        new_statuses.append(new_status)

    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
    return new_statuses
965
+
915
966
 
916
967
  def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr_dict=get_number_to_chr(),daf_tolerance=0.2):
917
968
  ### input : unknown indel, both on genome (xx1[45]x)
@@ -939,6 +990,65 @@ def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
939
990
 
940
991
  return status_pre+"8"+status_end
941
992
 
993
+
994
def check_unkonwn_indel_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
    '''
    Resolve unknown indels for each variant row using the allele-frequency cache.

    Each row of `data` is (chr, pos, ref, alt, eaf, status). The 7th status
    digit becomes "3" when the ref/alt orientation matches within
    daf_tolerance, "6" when the swapped alt/ref orientation matches, and "8"
    otherwise. With trust_cache=False, cache misses fall back to the VCF.
    '''
    if not trust_cache:
        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")

    # CONSISTENCY FIX: open the VCF only when it can actually be used (the
    # reader is consulted only on a cache miss with trust_cache=False).
    # Previously it was opened whenever ref_infer was provided, matching
    # neither its usage here nor check_strand_status_cache.
    if ref_infer is not None and not trust_cache:
        vcf_reader = VariantFile(ref_infer)

    if isinstance(data, pd.DataFrame):
        data = data.values

    in_cache = 0
    new_statuses = []

    for i in range(data.shape[0]):
        _chrom, pos, ref, alt, eaf, status = data[i]
        chrom = _chrom

        # translate numeric chromosome codes to VCF contig names
        if chr_dict is not None: chrom=chr_dict[chrom]
        start = pos - 1
        end = pos

        status_pre=status[:6]
        status_end=""

        new_status = status_pre+"8"+status_end # default value

        cache_key_ref_alt = f"{chrom}:{pos}:{ref}:{alt}"
        cache_key_alt_ref = f"{chrom}:{pos}:{alt}:{ref}"

        if cache_key_ref_alt in cache:
            in_cache += 1
            record = cache[cache_key_ref_alt]
            # same orientation: frequencies must agree within daf_tolerance
            if record is not None and abs(record - eaf) < daf_tolerance:
                new_status = status_pre+"3"+status_end

        elif cache_key_alt_ref in cache:
            in_cache += 1
            record = cache[cache_key_alt_ref]
            # swapped orientation: compare against 1 - eaf
            if record is not None and abs(record - (1 - eaf)) < daf_tolerance:
                new_status = status_pre+"6"+status_end

        else:
            if not trust_cache:
                # If we don't trust the cache as a not complete cache, we should perform the check reading from the VCF file
                new_status = check_unkonwn_indel(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict, daf_tolerance)

        new_statuses.append(new_status)

    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
    return new_statuses
1051
+
942
1052
 
943
1053
  def get_reverse_complementary_allele(a):
944
1054
  dic = str.maketrans({
@@ -963,16 +1073,40 @@ def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="N
963
1073
  status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
964
1074
  return status_part
965
1075
 
1076
+ def check_strand_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
1077
+ assert cache is not None, "Cache must be provided"
1078
+ status_part = check_strand_status_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,trust_cache,log,verbose)
1079
+ return status_part
1080
+
966
1081
  def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
967
1082
  vcf_reader = VariantFile(ref_infer)
968
1083
  status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
969
1084
  return status_part
970
1085
 
1086
+ def check_indel_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
1087
+ assert cache is not None, "Cache must be provided"
1088
+ status_part = check_unkonwn_indel_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,daf_tolerance,trust_cache,log,verbose)
1089
+ return status_part
1090
+
971
1091
  ##################################################################################################################################################
972
1092
 
973
1093
  def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
974
1094
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
975
- chr_dict=None,verbose=True,log=Log()):
1095
+ chr_dict=None,cache_options={},verbose=True,log=Log()):
1096
+ '''
1097
+ Args:
1098
+ cache_options : A dictionary with the following keys:
1099
+ - cache_manager: CacheManager object or None. If either cache_loader or cache_process is not None, or use_cache is True, a CacheManager object will be created automatically.
1100
+ - trust_cache: bool (optional, default: True). Whether to completely trust the cache. When True, any key not found in the cache is treated as missing from the VCF file as well, and no VCF lookup is performed for it.
1101
+ - cache_loader: Object with a get_cache() method or None.
1102
+ - cache_process: Object with an apply_fn() method or None.
1103
+ - use_cache: bool (optional, default: False). If any of the cache_manager, cache_loader or cache_process is not None, this will be set to True automatically.
1104
+ If set to True and all between cache_manager, cache_loader and cache_process are None, the cache will be loaded (or built) on the spot.
1105
+
1106
+ The usefulness of a cache_loader or cache_process object is to pass a custom object which already has the cache loaded. This can be useful if the cache is loaded in background in another thread/process while other operations are performed.
1107
+ The cache_manager is a CacheManager object that exposes the API used to interact with the cache.
1108
+ '''
1109
+
976
1110
  ##start function with col checking##########################################################
977
1111
  _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
978
1112
  _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
@@ -995,6 +1129,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
995
1129
 
996
1130
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
997
1131
 
1132
+ # Setup cache variables
1133
+ cache_manager = cache_options.get("cache_manager", None)
1134
+ if cache_manager is not None:
1135
+ assert isinstance(cache_manager, CacheManager), "cache_manager must be a CacheManager object"
1136
+ trust_cache = cache_options.get("trust_cache", True)
1137
+ cache_loader = cache_options.get("cache_loader", None)
1138
+ cache_process = cache_options.get("cache_process", None)
1139
+ use_cache = any(c is not None for c in [cache_manager, cache_loader, cache_process]) or cache_options.get('use_cache', False)
1140
+ _n_cores = n_cores # backup n_cores
1141
+
998
1142
  log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
999
1143
 
1000
1144
  if "p" in mode:
@@ -1022,16 +1166,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
1022
1166
  #########################################################################################
1023
1167
  if sum(unknow_palindromic_to_check)>0:
1024
1168
  if sum(unknow_palindromic_to_check)<10000:
1025
- n_cores=1
1026
-
1027
- #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
1028
- df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
1029
- pool = Pool(n_cores)
1030
- map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
1031
- status_inferred = pd.concat(pool.map(map_func,df_split))
1032
- sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
1033
- pool.close()
1034
- pool.join()
1169
+ n_cores=1
1170
+
1171
+ if use_cache and cache_manager is None:
1172
+ cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
1173
+ ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
1174
+ n_cores=_n_cores, log=log, verbose=verbose)
1175
+
1176
+ log.write(" -Starting strand inference for palindromic SNPs...",verbose=verbose)
1177
+ df_to_check = sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]]
1178
+
1179
+ if use_cache and cache_manager.cache_len > 0:
1180
+ log.write(" -Using cache for strand inference",verbose=verbose)
1181
+ status_inferred = cache_manager.apply_fn(check_strand_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, trust_cache=trust_cache, log=log, verbose=verbose)
1182
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred
1183
+ else:
1184
+ #df_split = np.array_split(df_to_check, n_cores)
1185
+ df_split = _df_split(df_to_check, n_cores)
1186
+ pool = Pool(n_cores)
1187
+ map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
1188
+ status_inferred = pd.concat(pool.map(map_func,df_split))
1189
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
1190
+ pool.close()
1191
+ pool.join()
1192
+ log.write(" -Finished strand inference.",verbose=verbose)
1035
1193
  else:
1036
1194
  log.warning("No palindromic variants available for checking.")
1037
1195
  #########################################################################################
@@ -1082,15 +1240,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
1082
1240
 
1083
1241
  if sum(unknow_indel)>0:
1084
1242
  if sum(unknow_indel)<10000:
1085
- n_cores=1
1086
- #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
1087
- df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
1088
- pool = Pool(n_cores)
1089
- map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
1090
- status_inferred = pd.concat(pool.map(map_func,df_split))
1091
- sumstats.loc[unknow_indel,status] = status_inferred.values
1092
- pool.close()
1093
- pool.join()
1243
+ n_cores=1
1244
+
1245
+ if use_cache and cache_manager is None:
1246
+ cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
1247
+ ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
1248
+ n_cores=_n_cores, log=log, verbose=verbose)
1249
+
1250
+ log.write(" -Starting indistinguishable indel inference...",verbose=verbose)
1251
+ df_to_check = sumstats.loc[unknow_indel,[chr,pos,ref,alt,eaf,status]]
1252
+
1253
+ if use_cache and cache_manager.cache_len > 0:
1254
+ log.write(" -Using cache for indel inference",verbose=verbose)
1255
+ status_inferred = cache_manager.apply_fn(check_indel_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, daf_tolerance=daf_tolerance, trust_cache=trust_cache, log=log, verbose=verbose)
1256
+ sumstats.loc[unknow_indel,status] = status_inferred
1257
+ else:
1258
+ #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
1259
+ df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
1260
+ pool = Pool(n_cores)
1261
+ map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
1262
+ status_inferred = pd.concat(pool.map(map_func,df_split))
1263
+ sumstats.loc[unknow_indel,status] = status_inferred.values
1264
+ pool.close()
1265
+ pool.join()
1266
+ log.write(" -Finished indistinguishable indel inference.",verbose=verbose)
1094
1267
 
1095
1268
  #########################################################################################
1096
1269
 
@@ -1611,12 +1611,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
1611
1611
 
1612
1612
  ###############################################################################################################
1613
1613
  def _df_split(dataframe, n):
1614
- chunks = []
1615
- chunk_size = int(dataframe.shape[0] // n)+1
1616
-
1617
- for index in range(0, dataframe.shape[0], chunk_size):
1618
- chunks.append(
1619
- dataframe.iloc[index:index + chunk_size]
1620
- )
1621
-
1622
- return chunks
1614
+ k, m = divmod(len(dataframe), n)
1615
+ return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
@@ -1031,17 +1031,18 @@ def mqqplot(insumstats,
1031
1031
 
1032
1032
  def _add_pad_to_x_axis(ax1, xpad, xpadl, xpadr, sumstats):
1033
1033
 
1034
- xmin, xmax = ax1.get_xlim()
1035
-
1036
- if xpad is not None:
1037
- pad = xpad* sumstats["i"].max()
1038
- ax1.set_xlim([xmin - pad, xmin + pad])
1039
- if xpadl is not None:
1040
- pad = xpadl* sumstats["i"].max()
1041
- ax1.set_xlim([xmin - pad,xmax])
1042
- if xpadr is not None:
1043
- pad = xpadr* sumstats["i"].max()
1044
- ax1.set_xlim([xmin, xmax + pad])
1034
+ if ax1 is not None:
1035
+ xmin, xmax = ax1.get_xlim()
1036
+
1037
+ if xpad is not None:
1038
+ pad = xpad* sumstats["i"].max()
1039
+ ax1.set_xlim([xmin - pad, xmin + pad])
1040
+ if xpadl is not None:
1041
+ pad = xpadl* sumstats["i"].max()
1042
+ ax1.set_xlim([xmin - pad,xmax])
1043
+ if xpadr is not None:
1044
+ pad = xpadr* sumstats["i"].max()
1045
+ ax1.set_xlim([xmin, xmax + pad])
1045
1046
 
1046
1047
  return ax1
1047
1048
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.42
3
+ Version: 3.4.43
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -24,6 +24,7 @@ Requires-Dist: liftover >=1.1.13
24
24
  Requires-Dist: scikit-allel >=1.3.5
25
25
  Requires-Dist: pyensembl ==2.2.3
26
26
  Requires-Dist: gtfparse ==1.3.0
27
+ Requires-Dist: h5py >=3.10.0
27
28
 
28
29
  # GWASLab
29
30
 
@@ -193,6 +194,7 @@ dependencies:
193
194
  - adjustText==0.8
194
195
  - pysam==0.19
195
196
  - pyensembl==2.2.3
197
+ - h5py==3.10.0
196
198
  ```
197
199
 
198
200
  ## How to cite
@@ -3,6 +3,7 @@ gwaslab/bd_common_data.py,sha256=v98X3tdRNOVE2gCiSHkfyBb0pSIjTk5IFG8A725Oj3o,126
3
3
  gwaslab/bd_config.py,sha256=TP-r-DPhJD3XnRYZbw9bQHXaDIkiRgK8bG9HCt-UaLc,580
4
4
  gwaslab/bd_download.py,sha256=cDDk2C5IvjeAzvPvVYGTkI4Ss33DUtEDjGo8eAbQRvY,15663
5
5
  gwaslab/bd_get_hapmap3.py,sha256=asNjQYeGfQi8u3jnfenRvDdKMs5ptql5wpcUzqMlwUI,3937
6
+ gwaslab/cache_manager.py,sha256=HOTnSkCOyGEPLRl90WT8D_6pAdI8d8AzenMIDGuCeWc,28113
6
7
  gwaslab/g_Log.py,sha256=C3Zv-_6c3C9ms8bgQ-ytplz22sjk7euqXYkWr9zNeAs,1573
7
8
  gwaslab/g_Phenotypes.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
9
  gwaslab/g_Sumstats.py,sha256=GS0YUdvNYlwiR-mu6VJIv_JRqgBpHmTq9123XX5kiMI,35132
@@ -11,9 +12,9 @@ gwaslab/g_SumstatsT.py,sha256=u_DighLMnMxwTLnqm-B58pA0G6WXRj6pudPyKMVKjSU,2133
11
12
  gwaslab/g_Sumstats_summary.py,sha256=FECvvFXJVKaCX5dggBvvk9YvJ6AbdbcLfjltysX7wEE,6380
12
13
  gwaslab/g_meta.py,sha256=htWlgURWclm9R6UqFcX1a93WN27xny7lGUeyJZOtszQ,2583
13
14
  gwaslab/g_vchange_status.py,sha256=eX0jdIb6Spa07ZdpWNqUWqdVBWS0fuH2yrt4PDi3Res,1746
14
- gwaslab/g_version.py,sha256=sYIrfQwAxGSUHOGcP64nhQ71-Cgzax3Xs18GM0Os_9k,1818
15
+ gwaslab/g_version.py,sha256=79WGi9pB-TL4T-lRgKtkq1p5WXZOYfBG5KdKplTJxfs,1818
15
16
  gwaslab/hm_casting.py,sha256=FqP4EQl83Q2OKLw004OgLIvUH795TVCGwziLk5jsHqY,11368
16
- gwaslab/hm_harmonize_sumstats.py,sha256=n6aygO8V7MJaDgkNHozNzIsm_G8KcR70ukS-IOygw0E,67684
17
+ gwaslab/hm_harmonize_sumstats.py,sha256=Lu3UkNK6S9imwOgjK1ZBZTu2gDSFEDjBbgSwSOGfzcI,76705
17
18
  gwaslab/hm_rsid_to_chrpos.py,sha256=ODWREO0jPN0RAfNzL5fRzSRANfhiksOvUVPuEsFZQqA,6552
18
19
  gwaslab/io_preformat_input.py,sha256=w62JLAr16Ru0EgUtBCEV2eXRO89OqhidQxwf2IPAM38,20014
19
20
  gwaslab/io_read_ldsc.py,sha256=8S9n4imgl4d0WPms_GYld-6uUM5z7iWGiCA-M814kzY,12123
@@ -27,7 +28,7 @@ gwaslab/ldsc_parse.py,sha256=MBnfgcWlV4oHp9MoDRh1mpilaHhAR15Af77hMFn4-5k,10564
27
28
  gwaslab/ldsc_regressions.py,sha256=yzbGjgNV7u-SWXNPsh9S8y9mK97Bim_Nmad9G9V18ZU,30078
28
29
  gwaslab/ldsc_sumstats.py,sha256=O0olsDxKlh1MJ1gAuEN1t40rxhajOEwOQ20ak7xoDrI,26245
29
30
  gwaslab/qc_check_datatype.py,sha256=kW68uk4dTLOU2b1dHoVat6n0loundDysAjIqxsXW28Q,3379
30
- gwaslab/qc_fix_sumstats.py,sha256=Dp2HnVnqdO5aiXpLhnLsvL6XCKuC4Du2HJFEVIH2Ss0,87342
31
+ gwaslab/qc_fix_sumstats.py,sha256=YtuADrWFhT1kdRp9CmhWF9IQkkXwN8SLnmbF9DIIZ-Y,87231
31
32
  gwaslab/run_script.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
33
  gwaslab/util_ex_calculate_ldmatrix.py,sha256=LpE__LoYRHLgVKlCHo6lYWlz9LEUVUDqYPEAP-Svbm0,14598
33
34
  gwaslab/util_ex_calculate_prs.py,sha256=5l1eiZs8YwIpEgp7i3IurP8n5KwQM5awbG9fWSm4iT4,9053
@@ -58,7 +59,7 @@ gwaslab/viz_plot_compare_effect.py,sha256=8om3y6YQfnOk4FfkKSpKr2KqJcsMeCwQ6FRRKb
58
59
  gwaslab/viz_plot_forestplot.py,sha256=xgOnefh737CgdQxu5naVyRNBX1NQXPFKzf51fbh6afs,6771
59
60
  gwaslab/viz_plot_miamiplot.py,sha256=rCFEp7VNuVqeBBG3WRkmFAtFklbF79BvIQQYiSY70VY,31238
60
61
  gwaslab/viz_plot_miamiplot2.py,sha256=SWv82D8UBbREKsk8EoKth-2w68l6FbXyVLsb_E1hh8o,15882
61
- gwaslab/viz_plot_mqqplot.py,sha256=xIx-m8IP0GAAKuIoiAbzxl3fkUDEEunczo6dVEZ3KRY,61671
62
+ gwaslab/viz_plot_mqqplot.py,sha256=PzRWnm11whxww7ut-bzFkj1sbPc_c0OP7yRpIgYo2iQ,61739
62
63
  gwaslab/viz_plot_qqplot.py,sha256=psQgVpP29686CEZkzQz0iRbApzqy7aE3GGiBcazVvNw,7247
63
64
  gwaslab/viz_plot_regionalplot.py,sha256=PBIWkNj2fj-dRLKQJNpM8wor5jya2anqix0-UYLE0Is,37901
64
65
  gwaslab/viz_plot_rg_heatmap.py,sha256=PidUsgOiEVt6MfBPCF3_yDhOEytZ-I1q-ZD6_0pFrV4,13713
@@ -72,9 +73,9 @@ gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz,sha256=qD9RsC5S2h6l-OdpW
72
73
  gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz,sha256=Y8ZT2FIAhbhlgCJdE9qQVAiwnV_fcsPt72usBa7RSBM,10225828
73
74
  gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz,sha256=R7IkssKu0L4WwkU9SrS84xCMdrkkKL0gnTNO_OKbG0Y,219
74
75
  gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz,sha256=76CIU0pibDJ72Y6UY-TbIKE9gEPwTELAaIbCXyjm80Q,470
75
- gwaslab-3.4.42.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
76
- gwaslab-3.4.42.dist-info/LICENSE_before_v3.4.39,sha256=GhLOU_1UDEKeOacYhsRN_m9u-eIuVTazSndZPeNcTZA,1066
77
- gwaslab-3.4.42.dist-info/METADATA,sha256=iqArSw_x7yXovyF9D-z3gt5fzskQSOVObXPWCrYIcsg,7714
78
- gwaslab-3.4.42.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
79
- gwaslab-3.4.42.dist-info/top_level.txt,sha256=PyY6hWtrALpv2MAN3kjkIAzJNmmBTH5a2risz9KwH08,8
80
- gwaslab-3.4.42.dist-info/RECORD,,
76
+ gwaslab-3.4.43.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
77
+ gwaslab-3.4.43.dist-info/LICENSE_before_v3.4.39,sha256=GhLOU_1UDEKeOacYhsRN_m9u-eIuVTazSndZPeNcTZA,1066
78
+ gwaslab-3.4.43.dist-info/METADATA,sha256=bziEH7fBqmzBIWDEZQUaa9w_DinQxI2SbjaatoN-jYw,7764
79
+ gwaslab-3.4.43.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
80
+ gwaslab-3.4.43.dist-info/top_level.txt,sha256=PyY6hWtrALpv2MAN3kjkIAzJNmmBTH5a2risz9KwH08,8
81
+ gwaslab-3.4.43.dist-info/RECORD,,