gwaslab-3.4.42-py3-none-any.whl → gwaslab-3.4.44-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/cache_manager.py +687 -0
- gwaslab/g_Sumstats.py +4 -2
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +227 -33
- gwaslab/qc_fix_sumstats.py +134 -35
- gwaslab/viz_plot_mqqplot.py +12 -11
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/METADATA +5 -3
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/RECORD +12 -11
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.42.dist-info → gwaslab-3.4.44.dist-info}/top_level.txt +0 -0
gwaslab/g_Sumstats.py
CHANGED
```diff
@@ -356,8 +356,10 @@ class Sumstats():
         if ref_seq is not None:
             if ref_seq_mode=="v":
                 self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
-            else:
+            elif ref_seq_mode=="s":
                 self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
+            else:
+                raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
 
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
 
@@ -429,7 +431,7 @@ class Sumstats():
         if ref_seq_mode=="v":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
-        else:
+        elif ref_seq_mode=="s":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
     def infer_strand(self,ref_infer,**kwargs):
```
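The practical effect: `ref_seq_mode` is now validated instead of any non-"v" value silently falling through to the sequential checker. A minimal sketch of exercising the switch, assuming the usual `gl.Sumstats(...).check_ref(...)` entry point (file names are placeholders):

```python
import gwaslab as gl

# placeholder inputs; substitute a real sumstats file and reference FASTA
mysumstats = gl.Sumstats("sumstats.txt.gz", fmt="auto")

# "v" runs the new vectorized checkref(); "s" keeps the old per-row oldcheckref()
mysumstats.check_ref(ref_seq="hg38.fa", ref_seq_mode="v")

# anything else now fails fast with:
# ValueError: ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)
```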
gwaslab/g_version.py
CHANGED
gwaslab/hm_harmonize_sumstats.py
CHANGED
```diff
@@ -24,6 +24,7 @@ from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
+from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
 
 #rsidtochrpos
 #checkref
```
```diff
@@ -388,7 +389,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     return sumstats
 
 #20240320 check if non-effect allele is aligned with reference genome
-def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
+def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
+    # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+    # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
+
     # status
     #0 / -----> match
     #1 / -----> Flipped Fixed
```
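The new comment pins down an ordering contract: the caller must pass per-chromosome offsets and lengths sorted the same way `np.unique()` sorts the CHR column. A toy illustration with made-up values:

```python
import numpy as np
import pandas as pd

# made-up bookkeeping standing in for the real FASTA offsets/lengths
starting_positions_dict = {1: 0, 2: 1000, 3: 2500}
records_len_dict = {1: 1000, 2: 1500, 3: 800}

x = pd.DataFrame({"CHR": [3, 1, 3, 1]})

# np.unique() returns sorted chromosomes; index the dicts in that order
unique_chrom = np.unique(x["CHR"].values)   # array([1, 3])
starting_positions = np.array([starting_positions_dict[k] for k in unique_chrom])  # [0, 2500]
records_len = np.array([records_len_dict[k] for k in unique_chrom])                # [1000, 800]
```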
```diff
@@ -430,6 +434,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     max_len_nea = _nea.str.len().max()
     max_len_ea = _ea.str.len().max()
 
+    ########################################## mask for variants with out of range POS
+    mask_outlier = pos > records_len[chrom]
+    #########################################
 
     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
     # a numpy array of integers in a very fast way.
```
```diff
@@ -441,7 +448,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
     nea[nea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     # Create a mask holding True at the position of non-padding values
     mask_nea = nea != PADDING_VALUE
 
```
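This `str.translate` + `view('<u4')` step is the heart of the vectorized path: it converts a column of allele strings into a fixed-width uint8 matrix with no Python-level loop. A self-contained sketch of the trick, using a stand-in translation table and padding byte rather than the package's actual `TRANSLATE_TABLE`/`PADDING_VALUE`:

```python
import numpy as np
import pandas as pd

PADDING_VALUE = 99                               # stand-in padding byte
TRANSLATE_TABLE = str.maketrans("acgt", "ACGT")  # stand-in table (upper-cases bases)

alleles = pd.Series(["A", "ACG", "TT"])
max_len = alleles.str.len().max()

# astype('<U3') pads short strings with NUL codepoints; viewing the buffer as
# '<u4' exposes one uint32 per character, so the NULs show up as 0
arr = alleles.str.translate(TRANSLATE_TABLE).to_numpy().astype(f"<U{max_len}")
mat = arr.view("<u4").reshape(-1, max_len).astype(np.uint8)
mat[mat == 0] = PADDING_VALUE                    # replace NUL padding
# mat has shape (3, 3): one row per allele, one code per character
```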
```diff
@@ -457,7 +466,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
     ea[ea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     mask_ea = ea != PADDING_VALUE
 
     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
```
```diff
@@ -502,8 +513,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     # Index the record array using the computed indices.
     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
-    output_nea = np.take(record, indices)
-
+    output_nea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_nea[mask_outlier] = PADDING_VALUE
+    ##################################################################
+
     # Check if the NEA is equal to the reference sequence at the given position
     # In a non-matrix way, this is equivalent (for one single element) to:
     # nea == record[pos-1: pos+len(nea)-1]
```
```diff
@@ -526,7 +540,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     indices_range = np.arange(max_len_ea)
     indices = pos + indices_range
     indices = indices + modified_indices
-    output_ea = np.take(record, indices)
+    output_ea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_ea[mask_outlier] = PADDING_VALUE
+    ##################################################################
 
     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
```
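Both reference lookups now pass `mode="clip"`, so a POS past the end of the concatenated record array clamps instead of raising; the clipped rows are then blanked out with the padding byte via `mask_outlier`, so an out-of-range variant can never spuriously match the reference. A toy run of the pattern:

```python
import numpy as np

record = np.array([65, 67, 71, 84], dtype=np.uint8)  # a 4-base "reference"
records_len = 4
PADDING_VALUE = 99

pos = np.array([2, 10])                  # the second position is out of range
indices = pos[:, None] + np.arange(2)    # two bases per variant

# without mode="clip", index 10 would raise IndexError
out = np.take(record, indices, mode="clip")

# neutralize the clipped rows so they cannot match any allele
mask_outlier = pos > records_len
out[mask_outlier] = PADDING_VALUE
print(out)   # rows: [71 84] and [99 99]
```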
```diff
@@ -581,24 +598,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
     chrom,pos,ea,nea,status = sumstats.columns
 
     # First, convert the fasta records to a single numpy array of integers
-    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+    record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
 
     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
     # arrays are smaller) and save memory.
     max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
-    condition = (sumstats[nea].str.len() <= max_len)
+    condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
 
     log.write(f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
     sumstats_cond = sumstats[condition]
-
-
+    unique_chrom_cond = sumstats_cond[chrom].unique()
+    starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+    records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
 
     log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
     sumstats_not_cond = sumstats[~condition]
-
-
+    unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+    records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
 
     return sumstats[status].values
 
```
```diff
@@ -708,10 +729,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
     starting_positions = np.cumsum(records_len) - records_len
     if pos_as_dict:
         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+        records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
     record = np.concatenate(all_r)
     del all_r # free memory
 
-    return record, starting_positions
+    return record, starting_positions,records_len_dict
 
 #######################################################################################################################################
 
```
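`build_fasta_records()` flattens every chromosome into one array; each chromosome's offset is a cumulative sum of the lengths, and the new third return value keeps those lengths so `check_status()` can flag out-of-range positions. A toy version of the bookkeeping:

```python
import numpy as np

# toy records standing in for real FASTA sequences
fasta_records = {"1": np.zeros(1000, dtype=np.uint8),
                 "2": np.zeros(1500, dtype=np.uint8)}

records_len = np.array([len(r) for r in fasta_records.values()])
starting_positions = np.cumsum(records_len) - records_len        # [0, 1000]

starting_positions_dict = dict(zip(fasta_records.keys(), starting_positions))
records_len_dict = dict(zip(fasta_records.keys(), records_len))

record = np.concatenate(list(fasta_records.values()))
# chromosome "2", 1-based POS p lives at record[starting_positions_dict["2"] + p - 1]
```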
```diff
@@ -912,6 +934,56 @@ def check_strand_status(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
             return status_pre+"5"+status_end
     return status_pre+"8"+status_end
 
+def check_strand_status_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None and not trust_cache:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+        start = pos - 1
+        end = pos
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key = f"{chrom}:{pos}:{ref}:{alt}"
+        if cache_key in cache:
+            in_cache += 1
+            record = cache[cache_key]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if (record<0.5) and (eaf<0.5):
+                    new_status = status_pre+"1"+status_end
+                elif (record>0.5) and (eaf>0.5):
+                    new_status = status_pre+"1"+status_end
+                else:
+                    new_status = status_pre+"5"+status_end
+        else:
+            if not trust_cache:
+                # If we don't trust the cache as a not complete cache, we should perform the check reading from the VCF file
+                new_status = check_strand_status(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr_dict=get_number_to_chr(),daf_tolerance=0.2):
     ### input : unknown indel, both on genome (xx1[45]x)
```
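The cache maps `"chrom:pos:ref:alt"` keys to the reference panel's ALT allele frequency, with `None` for variants absent from the panel, so a palindromic SNP is resolved by comparing the cached frequency against the sumstats EAF rather than reading the VCF. A toy rendering of the decision rule (keys and frequencies are made up):

```python
cache = {
    "1:12345:A:T": 0.12,   # panel ALT allele frequency
    "1:67890:C:G": None,   # variant absent from the panel
}

def classify(record, eaf):
    # mirrors the branch in check_strand_status_cache()
    if record is None:
        return "8"                                # not checkable
    if (record < 0.5 and eaf < 0.5) or (record > 0.5 and eaf > 0.5):
        return "1"                                # minor alleles agree: forward strand
    return "5"                                    # disagree: likely reverse strand

print(classify(cache["1:12345:A:T"], eaf=0.10))   # 1
print(classify(cache["1:12345:A:T"], eaf=0.90))   # 5
```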
```diff
@@ -939,6 +1011,65 @@ def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
 
     return status_pre+"8"+status_end
 
+
+def check_unkonwn_indel_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+        start = pos - 1
+        end = pos
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key_ref_alt = f"{chrom}:{pos}:{ref}:{alt}"
+        cache_key_alt_ref = f"{chrom}:{pos}:{alt}:{ref}"
+
+        if cache_key_ref_alt in cache:
+            in_cache += 1
+            record = cache[cache_key_ref_alt]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - eaf)<daf_tolerance:
+                    new_status = status_pre+"3"+status_end
+
+        elif cache_key_alt_ref in cache:
+            in_cache += 1
+            record = cache[cache_key_alt_ref]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - (1 - eaf))<daf_tolerance:
+                    new_status = status_pre+"6"+status_end
+
+        else:
+            if not trust_cache:
+                # If we don't trust the cache as a not complete cache, we should perform the check reading from the VCF file
+                new_status = check_unkonwn_indel(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict, daf_tolerance)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def get_reverse_complementary_allele(a):
     dic = str.maketrans({
```
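For indistinguishable indels the cache is probed in both orientations: a hit on `ref:alt` keeps the variant (status digit "3") when the cached frequency matches EAF within `daf_tolerance`, while a hit on `alt:ref` marks it flipped (digit "6") when it matches `1 - eaf`. A compressed toy check of the two comparisons (values are made up):

```python
daf_tolerance = 0.2
eaf = 0.25

record_ref_alt = 0.18    # toy cached frequency under the ref:alt key
record_alt_ref = 0.72    # toy cached frequency under the alt:ref key

if abs(record_ref_alt - eaf) < daf_tolerance:
    print("aligned -> status digit 3")       # |0.18 - 0.25| = 0.07 < 0.2
if abs(record_alt_ref - (1 - eaf)) < daf_tolerance:
    print("flipped -> status digit 6")       # |0.72 - 0.75| = 0.03 < 0.2
```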
```diff
@@ -963,16 +1094,40 @@ def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="N
     status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
     return status_part
 
+def check_strand_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_strand_status_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,trust_cache,log,verbose)
+    return status_part
+
 def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
     vcf_reader = VariantFile(ref_infer)
     status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
     return status_part
 
+def check_indel_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_unkonwn_indel_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,daf_tolerance,trust_cache,log,verbose)
+    return status_part
+
 ##################################################################################################################################################
 
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                         chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
-                        chr_dict=None,verbose=True,log=Log()):
+                        chr_dict=None,cache_options={},verbose=True,log=Log()):
+    '''
+    Args:
+        cache_options : A dictionary with the following keys:
+            - cache_manager: CacheManager object or None. If any between cache_loader and cache_process is not None, or use_cache is True, a CacheManager object will be created automatically.
+            - trust_cache: bool (optional, default: True). Whether to completely trust the cache or not. Trusting the cache means that any key not found inside the cache will be considered as a missing value even in the VCF file.
+            - cache_loader: Object with a get_cache() method or None.
+            - cache_process: Object with an apply_fn() method or None.
+            - use_cache: bool (optional, default: False). If any of the cache_manager, cache_loader or cache_process is not None, this will be set to True automatically.
+              If set to True and all between cache_manager, cache_loader and cache_process are None, the cache will be loaded (or built) on the spot.
+
+    The usefulness of a cache_loader or cache_process object is to pass a custom object which already has the cache loaded. This can be useful if the cache is loaded in background in another thread/process while other operations are performed.
+    The cache_manager is a CacheManager object is used to expose the API to interact with the cache.
+    '''
+
     ##start function with col checking##########################################################
     _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
     _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
```
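A minimal sketch of driving the new cache path (in practice this is reached through the Sumstats harmonization API; the direct call and file names here are illustrative):

```python
# illustrative values only
cache_options = {
    "use_cache": True,    # build or load the cache on the spot
    "trust_cache": True,  # keys missing from the cache are treated as missing from the VCF too
}

sumstats = parallelinferstrand(
    sumstats,
    ref_infer="reference_panel.vcf.gz",
    ref_alt_freq="AF",
    n_cores=4,
    cache_options=cache_options,
)
```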
```diff
@@ -995,6 +1150,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
 
+    # Setup cache variables
+    cache_manager = cache_options.get("cache_manager", None)
+    if cache_manager is not None:
+        assert isinstance(cache_manager, CacheManager), "cache_manager must be a CacheManager object"
+    trust_cache = cache_options.get("trust_cache", True)
+    cache_loader = cache_options.get("cache_loader", None)
+    cache_process = cache_options.get("cache_process", None)
+    use_cache = any(c is not None for c in [cache_manager, cache_loader, cache_process]) or cache_options.get('use_cache', False)
+    _n_cores = n_cores # backup n_cores
+
     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
 
     if "p" in mode:
```
```diff
@@ -1022,16 +1187,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
     #########################################################################################
     if sum(unknow_palindromic_to_check)>0:
         if sum(unknow_palindromic_to_check)<10000:
-            n_cores=1
-
-
-
-
-
-
-
-
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting strand inference for palindromic SNPs...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for strand inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_strand_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred
+        else:
+            #df_split = np.array_split(df_to_check, n_cores)
+            df_split = _df_split(df_to_check, n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished strand inference.",verbose=verbose)
     else:
         log.warning("No palindromic variants available for checking.")
     #########################################################################################
```
```diff
@@ -1082,15 +1261,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     if sum(unknow_indel)>0:
         if sum(unknow_indel)<10000:
-            n_cores=1
-
-
-
-
-
-
-
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting indistinguishable indel inference...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_indel,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for indel inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_indel_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, daf_tolerance=daf_tolerance, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_indel,status] = status_inferred
+        else:
+            #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_indel,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished indistinguishable indel inference.",verbose=verbose)
 
     #########################################################################################
 
```
gwaslab/qc_fix_sumstats.py
CHANGED
```diff
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
 ###############################################################################################################
 # 20220721
 
-def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
+def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "normalize indels"
     _end_line = "normalizing indels"
```
```diff
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         log.write("Finished normalizing variants successfully!", verbose=verbose)
         return sumstats
     ###############################################################################################################
-    if
+    if mode=="v":
+        if sum(variants_to_check)<100000:
+            n_cores=1
+        if n_cores==1:
+            normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
+        else:
+            pool = Pool(n_cores)
+            map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
+            df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
+            results = pool.map(map_func,df_split)
+            normalized_pd = pd.concat([i[0] for i in results])
+            changed_index = np.concatenate([i[1] for i in results])
+            del results
+            pool.close()
+            pool.join()
+        gc.collect()
+        ###############################################################################################################
+        try:
+            example_sumstats = sumstats.loc[changed_index,:].head()
+            changed_num = len(changed_index)
+            if changed_num>0:
+                if snpid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
+                elif rsid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
+                else:
+                    before_normalize_id = example_sumstats.index
+
+                log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+                for i in before_normalize_id.values:
+                    log.write(i,end=" ",show_time=False)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+
+                log.write(" -Not normalized allele:",end="", verbose=verbose)
+                for i in example_sumstats[[ea,nea]].values:
+                    log.write(i,end="",show_time=False, verbose=verbose)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+            else:
+                log.write(" -All variants are already normalized..", verbose=verbose)
+        except:
+            pass
+
+    ##########################################################################################################################################################
+    elif mode=="s":
         if sum(variants_to_check)<10000:
             n_cores=1
         pool = Pool(n_cores)
```
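A sketch of how the new vectorized path would be selected (the direct call is illustrative; gwaslab normally reaches this function during harmonization):

```python
# illustrative direct call with the new arguments
sumstats = parallelnormalizeallele(
    sumstats,
    mode="v",        # new vectorized, chunked normalizer
    chunk=3000000,   # rows handed to normalizae_chunk() per pass
    n_cores=4,       # drops to 1 core below 100,000 variants to check
)
```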
```diff
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         normalized_pd = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
-
-
-
-
-
-
-
-
-
-
-
+
+        before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
+        changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
+        if changed_num>0:
+            if snpid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,snpid]
+            elif rsid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,rsid]
+            else:
+                before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+
+            log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+            for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
+                log.write(i,end=" ",show_time=False)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
 
-
-
-
-
-
-
-
-
-
-            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
-        else:
-            log.write(" -All variants are already normalized..", verbose=verbose)
-        ###################################################################################################################
+            log.write(" -Not normalized allele:",end="", verbose=verbose)
+            for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
+                log.write(i,end="",show_time=False, verbose=verbose)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
+            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+        else:
+            log.write(" -All variants are already normalized..", verbose=verbose)
+        ###################################################################################################################
+
     categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
     sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
     sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
     sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
+
     try:
         sumstats[pos] = sumstats[pos].astype('Int64')
     except:
```
```diff
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
     sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
     return sumstats
 
+def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
+    log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
+    log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
+    log.write(" -Processing in chunks:",end="", verbose=verbose)
+    changed_index = np.array([])
+    for part_n in range(len(insumstats)//chunk+1):
+        log.write(part_n, end=" ",show_time=False, verbose=verbose)
+        insumstats["NEA"] = insumstats["NEA"].astype("string")
+        insumstats["EA"] = insumstats["EA"].astype("string")
+        insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
+        changed_index = np.concatenate([changed_index,changed_index_single])
+        gc.collect()
+    log.write("\n",end="",show_time=False, verbose=verbose)
+    return insumstats, changed_index
+
+def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
+    # already normalized
+
+    is_same = sumstats["NEA"] == sumstats["EA"]
+    is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+
+    # a series to keep tracking of variants that are modified
+    changed = sumstats["NEA"] != sumstats["NEA"]
+
+    # right side
+    ea_len = sumstats["NEA"].str.len()
+    nea_len = sumstats["EA"].str.len()
+    max_length=max(ea_len.max(), nea_len.max())
+
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        nea_len[is_pop] = nea_len[is_pop] -1
+        ea_len[is_pop] = ea_len[is_pop] -1
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    # left side
+    max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
+        sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
+    sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
+    changed_index = sumstats[changed].index
+    return sumstats, changed_index.values
+
 def normalizevariant(pos,a,b,status):
     # single record
     # https://genome.sph.umich.edu/wiki/Variant_Normalization
```
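`normalizae_chunk()` trims shared trailing bases first, then shared leading bases while bumping POS, which is the standard parsimony/left-alignment rule. A tiny scalar reference implementation of the same rule, for illustration only (it ignores the equal-allele edge case the vectorized code also handles):

```python
def normalize(pos, nea, ea):
    # trim shared trailing bases
    while len(nea) > 1 and len(ea) > 1 and nea[-1] == ea[-1]:
        nea, ea = nea[:-1], ea[:-1]
    # trim shared leading bases, shifting POS right
    while len(nea) > 1 and len(ea) > 1 and nea[0] == ea[0]:
        nea, ea, pos = nea[1:], ea[1:], pos + 1
    return pos, nea, ea

print(normalize(100, "CTGG", "CCGG"))   # (101, 'T', 'C')
```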
```diff
@@ -1611,12 +1717,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 
 ###############################################################################################################
 def _df_split(dataframe, n):
-
-
-
-    for index in range(0, dataframe.shape[0], chunk_size):
-        chunks.append(
-            dataframe.iloc[index:index + chunk_size]
-        )
-
-    return chunks
+    k, m = divmod(len(dataframe), n)
+    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
```
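The rewritten `_df_split()` yields `n` nearly equal parts (sizes differ by at most one row) rather than fixed-size chunks, so every Pool worker gets a comparable share. A quick demonstration:

```python
import pandas as pd

def _df_split(dataframe, n):
    k, m = divmod(len(dataframe), n)
    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"x": range(10)})
print([len(part) for part in _df_split(df, 3)])   # [4, 3, 3]
```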