gwaslab 3.4.43__py3-none-any.whl → 3.4.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/g_Sumstats.py +4 -2
- gwaslab/g_SumstatsPair.py +1 -1
- gwaslab/g_vchange_status.py +4 -2
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +45 -17
- gwaslab/qc_fix_sumstats.py +132 -26
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/METADATA +5 -5
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/RECORD +12 -12
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/top_level.txt +0 -0
gwaslab/g_Sumstats.py
CHANGED
|
@@ -356,8 +356,10 @@ class Sumstats():
|
|
|
356
356
|
if ref_seq is not None:
|
|
357
357
|
if ref_seq_mode=="v":
|
|
358
358
|
self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
|
|
359
|
-
|
|
359
|
+
elif ref_seq_mode=="s":
|
|
360
360
|
self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
|
|
361
|
+
else:
|
|
362
|
+
raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
|
|
361
363
|
|
|
362
364
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
363
365
|
|
|
@@ -429,7 +431,7 @@ class Sumstats():
|
|
|
429
431
|
if ref_seq_mode=="v":
|
|
430
432
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
431
433
|
self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
|
|
432
|
-
|
|
434
|
+
elif ref_seq_mode=="s":
|
|
433
435
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
434
436
|
self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
|
|
435
437
|
def infer_strand(self,ref_infer,**kwargs):
|
gwaslab/g_SumstatsPair.py
CHANGED
|
@@ -139,7 +139,7 @@ class SumstatsPair( ):
|
|
|
139
139
|
self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
|
|
140
140
|
|
|
141
141
|
def to_coloc(self,**kwargs):
|
|
142
|
-
self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
|
|
142
|
+
self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
|
|
143
143
|
|
|
144
144
|
def run_coloc_susie(self,**kwargs):
|
|
145
145
|
|
gwaslab/g_vchange_status.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
|
|
3
|
+
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
4
|
+
|
|
3
5
|
def vchange_status(status,digit,before,after):
|
|
4
6
|
dic={}
|
|
5
7
|
for i in range(len(before)):
|
|
6
8
|
dic[before[i]]=after[i]
|
|
7
9
|
if digit>1:
|
|
8
|
-
return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
|
|
10
|
+
return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
|
|
9
11
|
else:
|
|
10
|
-
return status.str[digit-1].replace(dic)+status.str[digit:]
|
|
12
|
+
return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
|
|
11
13
|
|
|
12
14
|
def copy_status(from_status,to_status, digit):
|
|
13
15
|
if digit>1:
|
gwaslab/g_version.py
CHANGED
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
355
355
|
|
|
356
356
|
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
357
357
|
|
|
358
|
-
|
|
358
|
+
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
359
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
360
|
+
#sumstats[status] = sumstats[status].astype("string")
|
|
361
|
+
|
|
362
|
+
|
|
359
363
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
360
364
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
361
365
|
status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
|
|
@@ -389,7 +393,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
389
393
|
return sumstats
|
|
390
394
|
|
|
391
395
|
#20240320 check if non-effect allele is aligned with reference genome
|
|
392
|
-
def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
|
|
396
|
+
def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
|
|
397
|
+
# starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
|
|
398
|
+
# and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
|
|
399
|
+
|
|
393
400
|
# status
|
|
394
401
|
#0 / -----> match
|
|
395
402
|
#1 / -----> Flipped Fixed
|
|
@@ -431,6 +438,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
431
438
|
max_len_nea = _nea.str.len().max()
|
|
432
439
|
max_len_ea = _ea.str.len().max()
|
|
433
440
|
|
|
441
|
+
########################################## mask for variants with out of range POS
|
|
442
|
+
mask_outlier = pos > records_len[chrom]
|
|
443
|
+
#########################################
|
|
434
444
|
|
|
435
445
|
# Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
|
|
436
446
|
# a numpy array of integers in a very fast way.
|
|
@@ -442,7 +452,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
442
452
|
nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
|
|
443
453
|
nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
|
|
444
454
|
nea[nea == 0] = PADDING_VALUE # padding value
|
|
445
|
-
|
|
455
|
+
###########################################
|
|
456
|
+
|
|
457
|
+
###########################################
|
|
446
458
|
# Create a mask holding True at the position of non-padding values
|
|
447
459
|
mask_nea = nea != PADDING_VALUE
|
|
448
460
|
|
|
@@ -458,7 +470,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
458
470
|
ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
|
|
459
471
|
ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
|
|
460
472
|
ea[ea == 0] = PADDING_VALUE # padding value
|
|
461
|
-
|
|
473
|
+
###########################################
|
|
474
|
+
|
|
475
|
+
###########################################
|
|
462
476
|
mask_ea = ea != PADDING_VALUE
|
|
463
477
|
|
|
464
478
|
rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
|
|
@@ -503,8 +517,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
503
517
|
# Index the record array using the computed indices.
|
|
504
518
|
# Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
|
|
505
519
|
# and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
|
|
506
|
-
output_nea = np.take(record, indices)
|
|
507
|
-
|
|
520
|
+
output_nea = np.take(record, indices, mode="clip")
|
|
521
|
+
##################################################################
|
|
522
|
+
output_nea[mask_outlier] = PADDING_VALUE
|
|
523
|
+
##################################################################
|
|
524
|
+
|
|
508
525
|
# Check if the NEA is equal to the reference sequence at the given position
|
|
509
526
|
# In a non-matrix way, this is equivalent (for one single element) to:
|
|
510
527
|
# nea == record[pos-1: pos+len(nea)-1]
|
|
@@ -527,7 +544,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
|
|
|
527
544
|
indices_range = np.arange(max_len_ea)
|
|
528
545
|
indices = pos + indices_range
|
|
529
546
|
indices = indices + modified_indices
|
|
530
|
-
output_ea = np.take(record, indices)
|
|
547
|
+
output_ea = np.take(record, indices, mode="clip")
|
|
548
|
+
##################################################################
|
|
549
|
+
output_ea[mask_outlier] = PADDING_VALUE
|
|
550
|
+
##################################################################
|
|
531
551
|
|
|
532
552
|
ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
|
|
533
553
|
rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
|
|
@@ -582,24 +602,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
|
|
|
582
602
|
chrom,pos,ea,nea,status = sumstats.columns
|
|
583
603
|
|
|
584
604
|
# First, convert the fasta records to a single numpy array of integers
|
|
585
|
-
record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
|
|
605
|
+
record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
|
|
586
606
|
|
|
587
607
|
# In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
|
|
588
608
|
# Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
|
|
589
609
|
# and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
|
|
590
610
|
# arrays are smaller) and save memory.
|
|
591
611
|
max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
|
|
592
|
-
condition = (sumstats[nea].str.len() <= max_len)
|
|
612
|
+
condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
|
|
593
613
|
|
|
594
614
|
log.write(f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
|
|
595
615
|
sumstats_cond = sumstats[condition]
|
|
596
|
-
|
|
597
|
-
|
|
616
|
+
unique_chrom_cond = sumstats_cond[chrom].unique()
|
|
617
|
+
starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
|
|
618
|
+
records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
|
|
619
|
+
sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
|
|
598
620
|
|
|
599
621
|
log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
|
|
600
622
|
sumstats_not_cond = sumstats[~condition]
|
|
601
|
-
|
|
602
|
-
|
|
623
|
+
unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
|
|
624
|
+
starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
|
|
625
|
+
records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
|
|
626
|
+
sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
|
|
603
627
|
|
|
604
628
|
return sumstats[status].values
|
|
605
629
|
|
|
@@ -649,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
649
673
|
sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
|
|
650
674
|
sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
|
|
651
675
|
log.write(" -Finished checking records", verbose=verbose)
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
676
|
+
|
|
677
|
+
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
678
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
679
|
+
#sumstats[status] = sumstats[status].astype("string")
|
|
680
|
+
|
|
655
681
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
656
682
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
657
683
|
status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
|
|
@@ -680,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
680
706
|
if remove is True:
|
|
681
707
|
sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
|
|
682
708
|
log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
|
|
709
|
+
|
|
683
710
|
|
|
684
711
|
finished(log, verbose, _end_line)
|
|
685
712
|
return sumstats
|
|
@@ -709,10 +736,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
|
|
|
709
736
|
starting_positions = np.cumsum(records_len) - records_len
|
|
710
737
|
if pos_as_dict:
|
|
711
738
|
starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
|
|
739
|
+
records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
|
|
712
740
|
record = np.concatenate(all_r)
|
|
713
741
|
del all_r # free memory
|
|
714
742
|
|
|
715
|
-
return record, starting_positions
|
|
743
|
+
return record, starting_positions,records_len_dict
|
|
716
744
|
|
|
717
745
|
#######################################################################################################################################
|
|
718
746
|
|
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
|
|
|
792
792
|
###############################################################################################################
|
|
793
793
|
# 20220721
|
|
794
794
|
|
|
795
|
-
def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
|
|
795
|
+
def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
|
|
796
796
|
##start function with col checking##########################################################
|
|
797
797
|
_start_line = "normalize indels"
|
|
798
798
|
_end_line = "normalizing indels"
|
|
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
819
819
|
log.write("Finished normalizing variants successfully!", verbose=verbose)
|
|
820
820
|
return sumstats
|
|
821
821
|
###############################################################################################################
|
|
822
|
-
if
|
|
822
|
+
if mode=="v":
|
|
823
|
+
if sum(variants_to_check)<100000:
|
|
824
|
+
n_cores=1
|
|
825
|
+
if n_cores==1:
|
|
826
|
+
normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
|
|
827
|
+
else:
|
|
828
|
+
pool = Pool(n_cores)
|
|
829
|
+
map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
|
|
830
|
+
df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
|
|
831
|
+
results = pool.map(map_func,df_split)
|
|
832
|
+
normalized_pd = pd.concat([i[0] for i in results])
|
|
833
|
+
changed_index = np.concatenate([i[1] for i in results])
|
|
834
|
+
del results
|
|
835
|
+
pool.close()
|
|
836
|
+
pool.join()
|
|
837
|
+
gc.collect()
|
|
838
|
+
###############################################################################################################
|
|
839
|
+
try:
|
|
840
|
+
example_sumstats = sumstats.loc[changed_index,:].head()
|
|
841
|
+
changed_num = len(changed_index)
|
|
842
|
+
if changed_num>0:
|
|
843
|
+
if snpid in example_sumstats.columns:
|
|
844
|
+
before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
|
|
845
|
+
elif rsid in example_sumstats.columns:
|
|
846
|
+
before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
|
|
847
|
+
else:
|
|
848
|
+
before_normalize_id = example_sumstats.index
|
|
849
|
+
|
|
850
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
851
|
+
for i in before_normalize_id.values:
|
|
852
|
+
log.write(i,end=" ",show_time=False)
|
|
853
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
854
|
+
|
|
855
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
856
|
+
for i in example_sumstats[[ea,nea]].values:
|
|
857
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
858
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
859
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
860
|
+
else:
|
|
861
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
862
|
+
except:
|
|
863
|
+
pass
|
|
864
|
+
|
|
865
|
+
##########################################################################################################################################################
|
|
866
|
+
elif mode=="s":
|
|
823
867
|
if sum(variants_to_check)<10000:
|
|
824
868
|
n_cores=1
|
|
825
869
|
pool = Pool(n_cores)
|
|
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
|
|
|
829
873
|
normalized_pd = pd.concat(pool.map(map_func,df_split))
|
|
830
874
|
pool.close()
|
|
831
875
|
pool.join()
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
876
|
+
|
|
877
|
+
before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
|
|
878
|
+
changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
|
|
879
|
+
if changed_num>0:
|
|
880
|
+
if snpid in sumstats.columns:
|
|
881
|
+
before_normalize_id = sumstats.loc[variants_to_check,snpid]
|
|
882
|
+
elif rsid in sumstats.columns:
|
|
883
|
+
before_normalize_id = sumstats.loc[variants_to_check,rsid]
|
|
884
|
+
else:
|
|
885
|
+
before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
|
|
886
|
+
|
|
887
|
+
log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
|
|
888
|
+
for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
|
|
889
|
+
log.write(i,end=" ",show_time=False)
|
|
890
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
843
891
|
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
854
|
-
else:
|
|
855
|
-
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
856
|
-
###################################################################################################################
|
|
892
|
+
log.write(" -Not normalized allele:",end="", verbose=verbose)
|
|
893
|
+
for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
|
|
894
|
+
log.write(i,end="",show_time=False, verbose=verbose)
|
|
895
|
+
log.write("... \n",end="",show_time=False, verbose=verbose)
|
|
896
|
+
log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
|
|
897
|
+
else:
|
|
898
|
+
log.write(" -All variants are already normalized..", verbose=verbose)
|
|
899
|
+
###################################################################################################################
|
|
900
|
+
|
|
857
901
|
categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
|
|
858
902
|
sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
|
|
859
903
|
sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
|
|
860
904
|
sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
|
|
905
|
+
|
|
861
906
|
try:
|
|
862
907
|
sumstats[pos] = sumstats[pos].astype('Int64')
|
|
863
908
|
except:
|
|
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
|
873
918
|
sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
|
|
874
919
|
return sumstats
|
|
875
920
|
|
|
921
|
+
def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
|
|
922
|
+
log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
|
|
923
|
+
log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
|
|
924
|
+
log.write(" -Processing in chunks:",end="", verbose=verbose)
|
|
925
|
+
changed_index = np.array([])
|
|
926
|
+
for part_n in range(len(insumstats)//chunk+1):
|
|
927
|
+
log.write(part_n, end=" ",show_time=False, verbose=verbose)
|
|
928
|
+
insumstats["NEA"] = insumstats["NEA"].astype("string")
|
|
929
|
+
insumstats["EA"] = insumstats["EA"].astype("string")
|
|
930
|
+
insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
|
|
931
|
+
changed_index = np.concatenate([changed_index,changed_index_single])
|
|
932
|
+
gc.collect()
|
|
933
|
+
log.write("\n",end="",show_time=False, verbose=verbose)
|
|
934
|
+
return insumstats, changed_index
|
|
935
|
+
|
|
936
|
+
def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
|
|
937
|
+
# already normalized
|
|
938
|
+
|
|
939
|
+
is_same = sumstats["NEA"] == sumstats["EA"]
|
|
940
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
941
|
+
|
|
942
|
+
# a series to keep tracking of variants that are modified
|
|
943
|
+
changed = sumstats["NEA"] != sumstats["NEA"]
|
|
944
|
+
|
|
945
|
+
# right side
|
|
946
|
+
ea_len = sumstats["NEA"].str.len()
|
|
947
|
+
nea_len = sumstats["EA"].str.len()
|
|
948
|
+
max_length=max(ea_len.max(), nea_len.max())
|
|
949
|
+
|
|
950
|
+
for i in range(1, max_length):
|
|
951
|
+
is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
|
|
952
|
+
if sum(is_pop)==0:
|
|
953
|
+
break
|
|
954
|
+
if i ==1:
|
|
955
|
+
changed = changed | is_pop
|
|
956
|
+
nea_len[is_pop] = nea_len[is_pop] -1
|
|
957
|
+
ea_len[is_pop] = ea_len[is_pop] -1
|
|
958
|
+
sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
|
|
959
|
+
sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
|
|
960
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
961
|
+
gc.collect()
|
|
962
|
+
|
|
963
|
+
# left side
|
|
964
|
+
max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
|
|
965
|
+
for i in range(1, max_length):
|
|
966
|
+
is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
|
|
967
|
+
if sum(is_pop)==0:
|
|
968
|
+
break
|
|
969
|
+
if i ==1:
|
|
970
|
+
changed = changed | is_pop
|
|
971
|
+
sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
|
|
972
|
+
sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
|
|
973
|
+
sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
|
|
974
|
+
is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
|
|
975
|
+
gc.collect()
|
|
976
|
+
|
|
977
|
+
sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
|
|
978
|
+
sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
|
|
979
|
+
changed_index = sumstats[changed].index
|
|
980
|
+
return sumstats, changed_index.values
|
|
981
|
+
|
|
876
982
|
def normalizevariant(pos,a,b,status):
|
|
877
983
|
# single record
|
|
878
984
|
# https://genome.sph.umich.edu/wiki/Variant_Normalization
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gwaslab
|
|
3
|
-
Version: 3.4.
|
|
3
|
+
Version: 3.4.45
|
|
4
4
|
Summary: A collection of handy tools for GWAS SumStats
|
|
5
5
|
Author-email: Yunye <yunye@gwaslab.com>
|
|
6
6
|
Project-URL: Homepage, https://cloufield.github.io/gwaslab/
|
|
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
|
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python:
|
|
11
|
+
Requires-Python: <3.11,>=3.9
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
License-File: LICENSE_before_v3.4.39
|
|
@@ -17,7 +17,7 @@ Requires-Dist: numpy >=1.21.2
|
|
|
17
17
|
Requires-Dist: matplotlib !=3.7.2,>=3.5
|
|
18
18
|
Requires-Dist: seaborn >=0.12
|
|
19
19
|
Requires-Dist: scipy >=1.12
|
|
20
|
-
Requires-Dist: pySAM
|
|
20
|
+
Requires-Dist: pySAM ==0.22.1
|
|
21
21
|
Requires-Dist: Biopython >=1.79
|
|
22
22
|
Requires-Dist: adjustText <=0.8,>=0.7.3
|
|
23
23
|
Requires-Dist: liftover >=1.1.13
|
|
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
|
|
|
51
51
|
### install via pip
|
|
52
52
|
|
|
53
53
|
```
|
|
54
|
-
pip install gwaslab==3.4.
|
|
54
|
+
pip install gwaslab==3.4.43
|
|
55
55
|
```
|
|
56
56
|
|
|
57
57
|
```python
|
|
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
|
|
|
90
90
|
```
|
|
91
91
|
conda env create -n gwaslab_test -c conda-forge python=3.9
|
|
92
92
|
conda activate gwaslab
|
|
93
|
-
pip install gwaslab==3.4.
|
|
93
|
+
pip install gwaslab==3.4.43
|
|
94
94
|
```
|
|
95
95
|
|
|
96
96
|
or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
|
|
@@ -6,15 +6,15 @@ gwaslab/bd_get_hapmap3.py,sha256=asNjQYeGfQi8u3jnfenRvDdKMs5ptql5wpcUzqMlwUI,393
|
|
|
6
6
|
gwaslab/cache_manager.py,sha256=HOTnSkCOyGEPLRl90WT8D_6pAdI8d8AzenMIDGuCeWc,28113
|
|
7
7
|
gwaslab/g_Log.py,sha256=C3Zv-_6c3C9ms8bgQ-ytplz22sjk7euqXYkWr9zNeAs,1573
|
|
8
8
|
gwaslab/g_Phenotypes.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
-
gwaslab/g_Sumstats.py,sha256=
|
|
10
|
-
gwaslab/g_SumstatsPair.py,sha256=
|
|
9
|
+
gwaslab/g_Sumstats.py,sha256=NOEQd00guGch_GIt5bHv1wcrAvETfChqzmtgm-nIx_I,35298
|
|
10
|
+
gwaslab/g_SumstatsPair.py,sha256=20snPb4SlI6ftMGVjgxAuyxsxYRQF-GzzlBSnoB-3Lo,8851
|
|
11
11
|
gwaslab/g_SumstatsT.py,sha256=u_DighLMnMxwTLnqm-B58pA0G6WXRj6pudPyKMVKjSU,2133
|
|
12
12
|
gwaslab/g_Sumstats_summary.py,sha256=FECvvFXJVKaCX5dggBvvk9YvJ6AbdbcLfjltysX7wEE,6380
|
|
13
13
|
gwaslab/g_meta.py,sha256=htWlgURWclm9R6UqFcX1a93WN27xny7lGUeyJZOtszQ,2583
|
|
14
|
-
gwaslab/g_vchange_status.py,sha256=
|
|
15
|
-
gwaslab/g_version.py,sha256=
|
|
14
|
+
gwaslab/g_vchange_status.py,sha256=jLoVzMJFhB5k_cJKzHuBNc2HZGBWydAunCNa0n_d54g,1923
|
|
15
|
+
gwaslab/g_version.py,sha256=49_gR8lEQ_jgmfO9XJszEzuzDIESj5dHj6gta3Ilkmw,1818
|
|
16
16
|
gwaslab/hm_casting.py,sha256=FqP4EQl83Q2OKLw004OgLIvUH795TVCGwziLk5jsHqY,11368
|
|
17
|
-
gwaslab/hm_harmonize_sumstats.py,sha256=
|
|
17
|
+
gwaslab/hm_harmonize_sumstats.py,sha256=ympk2MZkbb0MnZ1n2ajkV36L8EAm7nBEaYhjqjI38tU,78548
|
|
18
18
|
gwaslab/hm_rsid_to_chrpos.py,sha256=ODWREO0jPN0RAfNzL5fRzSRANfhiksOvUVPuEsFZQqA,6552
|
|
19
19
|
gwaslab/io_preformat_input.py,sha256=w62JLAr16Ru0EgUtBCEV2eXRO89OqhidQxwf2IPAM38,20014
|
|
20
20
|
gwaslab/io_read_ldsc.py,sha256=8S9n4imgl4d0WPms_GYld-6uUM5z7iWGiCA-M814kzY,12123
|
|
@@ -28,7 +28,7 @@ gwaslab/ldsc_parse.py,sha256=MBnfgcWlV4oHp9MoDRh1mpilaHhAR15Af77hMFn4-5k,10564
|
|
|
28
28
|
gwaslab/ldsc_regressions.py,sha256=yzbGjgNV7u-SWXNPsh9S8y9mK97Bim_Nmad9G9V18ZU,30078
|
|
29
29
|
gwaslab/ldsc_sumstats.py,sha256=O0olsDxKlh1MJ1gAuEN1t40rxhajOEwOQ20ak7xoDrI,26245
|
|
30
30
|
gwaslab/qc_check_datatype.py,sha256=kW68uk4dTLOU2b1dHoVat6n0loundDysAjIqxsXW28Q,3379
|
|
31
|
-
gwaslab/qc_fix_sumstats.py,sha256=
|
|
31
|
+
gwaslab/qc_fix_sumstats.py,sha256=cpJibJ_77p4cg39R4zRunhOK2deIK4PfQA9wmYZgyqk,92745
|
|
32
32
|
gwaslab/run_script.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
33
|
gwaslab/util_ex_calculate_ldmatrix.py,sha256=LpE__LoYRHLgVKlCHo6lYWlz9LEUVUDqYPEAP-Svbm0,14598
|
|
34
34
|
gwaslab/util_ex_calculate_prs.py,sha256=5l1eiZs8YwIpEgp7i3IurP8n5KwQM5awbG9fWSm4iT4,9053
|
|
@@ -73,9 +73,9 @@ gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz,sha256=qD9RsC5S2h6l-OdpW
|
|
|
73
73
|
gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz,sha256=Y8ZT2FIAhbhlgCJdE9qQVAiwnV_fcsPt72usBa7RSBM,10225828
|
|
74
74
|
gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz,sha256=R7IkssKu0L4WwkU9SrS84xCMdrkkKL0gnTNO_OKbG0Y,219
|
|
75
75
|
gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz,sha256=76CIU0pibDJ72Y6UY-TbIKE9gEPwTELAaIbCXyjm80Q,470
|
|
76
|
-
gwaslab-3.4.
|
|
77
|
-
gwaslab-3.4.
|
|
78
|
-
gwaslab-3.4.
|
|
79
|
-
gwaslab-3.4.
|
|
80
|
-
gwaslab-3.4.
|
|
81
|
-
gwaslab-3.4.
|
|
76
|
+
gwaslab-3.4.45.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
77
|
+
gwaslab-3.4.45.dist-info/LICENSE_before_v3.4.39,sha256=GhLOU_1UDEKeOacYhsRN_m9u-eIuVTazSndZPeNcTZA,1066
|
|
78
|
+
gwaslab-3.4.45.dist-info/METADATA,sha256=5FN5dbVypNPET635Eooi01_1NDFD1dNr1T9Jv0JXmLc,7757
|
|
79
|
+
gwaslab-3.4.45.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
80
|
+
gwaslab-3.4.45.dist-info/top_level.txt,sha256=PyY6hWtrALpv2MAN3kjkIAzJNmmBTH5a2risz9KwH08,8
|
|
81
|
+
gwaslab-3.4.45.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|