PyPI - gwaslab - Versions diffs - 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl - Mend

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (42) hide show

gwaslab/__init__.py +1 -1
gwaslab/data/formatbook.json +722 -721
gwaslab/g_Log.py +8 -0
gwaslab/g_Sumstats.py +80 -178
gwaslab/g_SumstatsPair.py +6 -2
gwaslab/g_Sumstats_summary.py +3 -3
gwaslab/g_meta.py +13 -3
gwaslab/g_version.py +2 -2
gwaslab/hm_casting.py +29 -15
gwaslab/hm_harmonize_sumstats.py +312 -159
gwaslab/hm_rsid_to_chrpos.py +1 -1
gwaslab/io_preformat_input.py +46 -37
gwaslab/io_to_formats.py +428 -295
gwaslab/qc_check_datatype.py +15 -1
gwaslab/qc_fix_sumstats.py +956 -719
gwaslab/util_ex_calculate_ldmatrix.py +29 -11
gwaslab/util_ex_gwascatalog.py +1 -1
gwaslab/util_ex_ldproxyfinder.py +1 -1
gwaslab/util_ex_process_h5.py +26 -17
gwaslab/util_ex_process_ref.py +3 -3
gwaslab/util_ex_run_coloc.py +26 -4
gwaslab/util_in_convert_h2.py +1 -1
gwaslab/util_in_fill_data.py +44 -5
gwaslab/util_in_filter_value.py +122 -34
gwaslab/util_in_get_density.py +2 -2
gwaslab/util_in_get_sig.py +41 -9
gwaslab/viz_aux_quickfix.py +26 -21
gwaslab/viz_aux_reposition_text.py +7 -4
gwaslab/viz_aux_save_figure.py +6 -5
gwaslab/viz_plot_compare_af.py +5 -5
gwaslab/viz_plot_compare_effect.py +22 -5
gwaslab/viz_plot_miamiplot2.py +28 -20
gwaslab/viz_plot_mqqplot.py +214 -98
gwaslab/viz_plot_qqplot.py +11 -8
gwaslab/viz_plot_regionalplot.py +16 -9
gwaslab/viz_plot_trumpetplot.py +15 -6
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
gwaslab-3.4.38.dist-info/RECORD +72 -0
gwaslab-3.4.36.dist-info/RECORD +0 -72
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0

gwaslab/hm_harmonize_sumstats.py CHANGED Viewed

@@ -11,11 +11,19 @@ import gc
 from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixchr
 from gwaslab.qc_fix_sumstats import fixpos
+from gwaslab.qc_fix_sumstats import sortcolumn
+from gwaslab.qc_fix_sumstats import _df_split
+from gwaslab.qc_fix_sumstats import check_col
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import skipped
+from gwaslab.qc_check_datatype import check_dataframe_shape
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.bd_common_data import get_chr_list
 from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
 #rsidtochrpos
 #checkref
 #parallelizeassignrsid
@@ -27,17 +35,35 @@ from gwaslab.g_version import _get_version
 ###~!!!!
 def rsidtochrpos(sumstats,
-         path="", snpid="SNPID",
+         path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
          rsid="rsID", chrom="CHR",pos="POS",ref_rsid="rsID",ref_chr="CHR",ref_pos="POS", build="19",
               overwrite=False,remove=False,chunksize=5000000,verbose=True,log=Log()):
     '''
     assign chr:pos based on rsID
     '''
-    #########################################################################################################
-    if verbose:  log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
-    if verbose:  log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
+    ##start function with col checking##########################################################
+    _start_line = "assign CHR and POS using rsIDs"
+    _end_line = "assigning CHR and POS using rsIDs"
+    _start_cols = [rsid,chrom,pos]
+    _start_function = ".rsid_to_chrpos()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     if verbose:  log.write(" -rsID dictionary file: "+ path)
+    if ref_rsid_to_chrpos_tsv is not None:
+        path = ref_rsid_to_chrpos_tsv
     if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
         if verbose:  log.write(" -Filling na in rsID columns with SNPID...")
         sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
@@ -75,6 +101,9 @@ def rsidtochrpos(sumstats,
     if verbose:  log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
     sumstats = fixchr(sumstats,verbose=verbose)
     sumstats = fixpos(sumstats,verbose=verbose)
+    sumstats = sortcolumn(sumstats,verbose=verbose)
+    finished(log,verbose,_end_line)
     return sumstats
     ####################################################################################################
@@ -96,9 +125,34 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
     return sumstats_part
-def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None,build="99",status="STATUS",
+def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
                          n_cores=4,block_size=20000000,verbose=True,log=Log()):
-    if verbose:  log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
+    ##start function with col checking##########################################################
+    _start_line = "assign CHR and POS using rsIDs"
+    _end_line = "assigning CHR and POS using rsIDs"
+    _start_cols = [rsid,chrom,pos]
+    _start_function = ".rsid_to_chrpos2()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
+    if ref_rsid_to_chrpos_hdf5 is not None:
+        path = ref_rsid_to_chrpos_hdf5
+    elif ref_rsid_to_chrpos_vcf is not None:
+        vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
+        vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
+        path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
     if path is None:
         raise ValueError("Please provide path to hdf5 file.")
@@ -164,17 +218,20 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     # merge back
     if verbose:  log.write(" -Append data... ")
     sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
     del sumstats_rs
     del sumstats_nonrs
     gc.collect()
     # check
-    sumstats = fixchr(sumstats,verbose=True)
-    sumstats = fixpos(sumstats,verbose=True)
+    sumstats = fixchr(sumstats,verbose=verbose)
+    sumstats = fixpos(sumstats,verbose=verbose)
+    sumstats = sortcolumn(sumstats,verbose=verbose)
     pool.close()
     pool.join()
-    gc.collect()
-    if verbose:  log.write("Finished assigning CHR and POS using rsIDs.")
+    finished(log, verbose, _end_line)
     return sumstats
 ####################################################################################################################
 #20220426 check if non-effect allele is aligned with reference genome
@@ -192,15 +249,15 @@ def check_status(row,record):
     #8 / -----> not on ref genome
     #9 / ------> unchecked
-    status_pre=row[3][:5]
-    status_end=row[3][6:]
+    status_pre=row.iloc[3][:5]
+    status_end=row.iloc[3][6:]
     ## nea == ref
-    if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+    if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
         ## ea == ref
-        if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+        if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
             ## len(nea) >len(ea):
-            if len(row[2])!=len(row[1]):
+            if len(row.iloc[2])!=len(row.iloc[1]):
                 # indels both on ref, unable to identify
                 return status_pre+"6"+status_end
         else:
@@ -209,34 +266,49 @@ def check_status(row,record):
     ## nea!=ref
     else:
         # ea == ref_seq -> need to flip
-        if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+        if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
             return status_pre+"3"+status_end
         # ea !=ref
         else:
             #_reverse_complementary
-            row[1] = get_reverse_complementary_allele(row[1])
-            row[2] = get_reverse_complementary_allele(row[2])
+            row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
+            row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
             ## nea == ref
-            if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+            if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
                 ## ea == ref
-                if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+                if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
                     ## len(nea) >len(ea):
-                    if len(row[2])!=len(row[1]):
+                    if len(row.iloc[2])!=len(row.iloc[1]):
                         return status_pre+"8"+status_end  # indel reverse complementary
                 else:
                     return status_pre+"4"+status_end
             else:
                 # ea == ref_seq -> need to flip
-                if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+                if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
                     return status_pre+"5"+status_end
             # ea !=ref
             return status_pre+"8"+status_end
 def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
-    if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
-    if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-    if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
+    ##start function with col checking##########################################################
+    _start_line = "check if NEA is aligned with reference sequence"
+    _end_line = "checking if NEA is aligned with reference sequence"
+    _start_cols = [chrom,pos,ea,nea,status]
+    _start_function = ".check_ref()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
+    if verbose: log.write(" -Reference genome FASTA file: "+ ref_path)
     if verbose: log.write(" -Checking records: ", end="")
     chromlist = get_chr_list(add_number=True)
     records = SeqIO.parse(ref_path, "fasta")
@@ -255,7 +327,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
     if verbose:  log.write("\n",end="",show_time=False)
-    sumstats.loc[:,status] = sumstats.loc[:,status].astype("string")
+    sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -271,7 +343,7 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
     flip_rate = status_3/available_to_check
     if verbose: log.write("  -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
     if raw_matching_rate <0.8:
-        if verbose: log.write("  -!!!Warning, matching rate is low, please check if the right reference genome is used.")
+        if verbose: log.warning("Matching rate is low, please check if the right reference genome is used.")
     if flip_rate > 0.85 :
         if verbose: log.write("  -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
@@ -284,7 +356,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
         if verbose: log.write(" -Variants not on given reference sequence were removed.")
-    gc.collect()
+    finished(log, verbose, _end_line)
     return sumstats
 #######################################################################################################################################
@@ -314,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
     ## single df assignment
     vcf_reader = VariantFile(path)
     def rsid_helper(x,vcf_reader,chr_dict):
-         return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
+         return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
     map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
     rsID = sumstats.apply(map_func,axis=1)
     return rsID
@@ -327,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
     all ,    overwrite rsid for all availalbe rsid
     invalid,  only assign rsid for variants with invalid rsid
     empty    only assign rsid for variants with na rsid
-    '''
+    '''
     if ref_mode=="vcf":
         ###################################################################################################################
-        if verbose: log.write("Start to assign rsID using vcf...{}".format(_get_version()))
-        if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-        if verbose: log.write(" -CPU Cores to use :",n_cores)
-        if verbose: log.write(" -Reference VCF file:", path)
+        ##start function with col checking##########################################################
+        _start_line = "assign rsID using reference VCF"
+        _end_line = "assign rsID using reference file"
+        _start_cols = [chr,pos,ref,alt,status]
+        _start_function = ".assign_rsid()"
+        _must_args ={}
+        is_enough_info = start_to(sumstats=sumstats,
+                                log=log,
+                                verbose=verbose,
+                                start_line=_start_line,
+                                end_line=_end_line,
+                                start_cols=_start_cols,
+                                start_function=_start_function,
+                                n_cores=n_cores,
+                                ref_vcf=path,
+                                **_must_args)
+        if is_enough_info == False: return sumstats
+        ############################################################################################
         chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
-        if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
+        if verbose: log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...")
         ##############################################
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
@@ -361,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if sum(to_assign)>0:
             if sum(to_assign)<10000: n_cores=1
-            df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+            #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+            df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
             pool = Pool(n_cores)
             map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
             assigned_rsid = pd.concat(pool.map(map_func,df_split))
@@ -380,9 +466,25 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         '''
         assign rsID based on chr:pos
         '''
-        if verbose:  log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
-        if verbose:  log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-        if verbose:  log.write(" -SNPID-rsID text file: "+ path)
+        ##start function with col checking##########################################################
+        _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
+        _end_line = "assign rsID using reference file"
+        _start_cols = [snpid,status]
+        _start_function = ".assign_rsid()"
+        _must_args ={}
+        is_enough_info = start_to(sumstats=sumstats,
+                                log=log,
+                                verbose=verbose,
+                                start_line=_start_line,
+                                end_line=_end_line,
+                                start_cols=_start_cols,
+                                start_function=_start_function,
+                                n_cores=n_cores,
+                                ref_tsv=path,
+                                **_must_args)
+        if is_enough_info == False: return sumstats
+        ############################################################################################
         standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
@@ -390,11 +492,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
             sumstats[rsid]=pd.Series(dtype="string")
         if overwrite == "empty":
-            to_assign = sumstats[rsid].isna()
+            to_assign = sumstats[rsid].isna() & standardized_normalized
         if overwrite=="all":
             to_assign = standardized_normalized
         if overwrite=="invalid":
             to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
         total_number= len(sumstats)
         pre_number = sum(~sumstats[rsid].isna())
         if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
@@ -419,12 +522,13 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
             sumstats = sumstats.rename(columns = {'index':snpid})
             after_number = sum(~sumstats[rsid].isna())
-            if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
+            if verbose: log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!")
             if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
         else:
-            if verbose: log.write(" -No rsID could be fixed...skipping...")
+            if verbose: log.write(" -No rsID can be fixed...skipping...")
         ################################################################################################################
-    gc.collect()
+    finished(log,verbose,_end_line)
     return sumstats
 #################################################################################################################################################
 #single record assignment
@@ -503,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
 def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
     vcf_reader = VariantFile(ref_infer)
-    status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
+    status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
     return status_part
 def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
     vcf_reader = VariantFile(ref_infer)
-    status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
+    status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
     return status_part
 ##################################################################################################################################################
@@ -516,79 +620,98 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                        chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
                        chr_dict=None,verbose=True,log=Log()):
-    if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
-    if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
+    ##start function with col checking##########################################################
+    _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
+    _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".infer_strand()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+    log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     if "p" in mode:
-        # ref_alt_freq INFO in vcf was provided
-        if ref_alt_freq is not None:
-            if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
-            ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
-            good_chrpos =  sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
-            palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
-            not_palindromic_snp = good_chrpos & (~palindromic)
-            ##not palindromic : change status
-            sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
-            if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
-            #palindromic but can not infer
-            maf_can_infer   = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
-            sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
-            if verbose: log.write(" -After filtering by MAF< ", maf_threshold ," , the strand of ", sum(palindromic & maf_can_infer)," palindromic SNPs will be inferred...")
-            #########################################################################################
-            if sum(palindromic & maf_can_infer)>0:
-                if sum(palindromic & maf_can_infer)<10000:
-                    n_cores=1
-                df_split = np.array_split(sumstats.loc[(palindromic & maf_can_infer),[chr,pos,ref,alt,eaf,status]], n_cores)
-                pool = Pool(n_cores)
-                map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
-                status_inferred = pd.concat(pool.map(map_func,df_split))
-                sumstats.loc[(palindromic & maf_can_infer),status] = status_inferred.values
-            pool.close()
-            pool.join()
-            #########################################################################################
-            #0 Not palindromic SNPs
-            #1 Palindromic +strand  -> no need to flip
-            #2 palindromic -strand  -> need to flip -> fixed
-            #3 Indel no need flip
-            #4 Unknown Indel -> fixed
-            #5 Palindromic -strand -> need to flip
-            #6 Indel need flip
-            #7 indistinguishable
-            #8 Not matching or No information
-            #9 Unchecked
-            status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
-            status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
-            status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
-            status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
-            status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
-            if verbose: log.write("  -Non-palindromic : ",sum(status0))
-            if verbose: log.write("  -Palindromic SNPs on + strand: ",sum(status1))
-            if verbose: log.write("  -Palindromic SNPs on - strand and need to be flipped:",sum(status5))
-            if verbose: log.write("  -Palindromic SNPs with maf not available to infer : ",sum(status7))
-            if verbose: log.write("  -Palindromic SNPs with no macthes or no information : ",sum(status8))
-            if ("7" in remove_snp) and ("8" in remove_snp) :
-                if verbose: log.write("  -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
-                sumstats = sumstats.loc[~(status7 | status8),:].copy()
-            elif "8" in remove_snp:
-                if verbose: log.write("  -Palindromic SNPs with no macthes or no information will be removed")
-                sumstats = sumstats.loc[~status8,:].copy()
-            elif "7" in remove_snp:
-                if verbose: log.write("  -Palindromic SNPs with maf not available to infer will be removed")
-                sumstats = sumstats.loc[~status7,:].copy()
+        ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
+        good_chrpos =  sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
+        palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
+        not_palindromic_snp = good_chrpos & (~palindromic)
+        ##not palindromic : change status
+        sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
+        if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
+        #palindromic but can not infer
+        maf_can_infer   = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
+        sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
+        #palindromic WITH UNKNWON OR UNCHECKED STATUS
+        unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
+        unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
+        if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
+        #########################################################################################
+        if sum(unknow_palindromic_to_check)>0:
+            if sum(unknow_palindromic_to_check)<10000:
+                n_cores=1
+            #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+            df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+        pool.close()
+        pool.join()
+        #########################################################################################
+        #0 Not palindromic SNPs
+        #1 Palindromic +strand  -> no need to flip
+        #2 palindromic -strand  -> need to flip -> fixed
+        #3 Indel no need flip
+        #4 Unknown Indel -> fixed
+        #5 Palindromic -strand -> need to flip
+        #6 Indel need flip
+        #7 indistinguishable
+        #8 Not matching or No information
+        #9 Unchecked
+        status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
+        status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
+        status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
+        status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
+        status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
+        if verbose: log.write("  -Non-palindromic : ",sum(status0))
+        if verbose: log.write("  -Palindromic SNPs on + strand: ",sum(status1))
+        if verbose: log.write("  -Palindromic SNPs on - strand and needed to be flipped:",sum(status5))
+        if verbose: log.write("  -Palindromic SNPs with MAF not available to infer : ",sum(status7))
+        if verbose: log.write("  -Palindromic SNPs with no macthes or no information : ",sum(status8))
+        if ("7" in remove_snp) and ("8" in remove_snp) :
+            if verbose: log.write("  -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
+            sumstats = sumstats.loc[~(status7 | status8),:].copy()
+        elif "8" in remove_snp:
+            if verbose: log.write("  -Palindromic SNPs with no macthes or no information will be removed")
+            sumstats = sumstats.loc[~status8,:].copy()
+        elif "7" in remove_snp:
+            if verbose: log.write("  -Palindromic SNPs with maf not available to infer will be removed")
+            sumstats = sumstats.loc[~status7,:].copy()
     ### unknow_indel
     if "i" in mode:
@@ -598,14 +721,15 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
             if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
             #########################################################################################
             #with maf can not infer
-            #maf_can_infer   = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
+            #maf_can_infer   = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
             #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
             if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
             if sum(unknow_indel)>0:
                 if sum(unknow_indel)<10000:
                     n_cores=1
-                df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+                #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+                df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
                 pool = Pool(n_cores)
                 map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
                 status_inferred = pd.concat(pool.map(map_func,df_split))
@@ -624,7 +748,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
             if "8" in remove_indel:
                 if verbose: log.write("  -Indels with no macthes or no information will be removed")
                 sumstats = sumstats.loc[~status8,:].copy()
-    gc.collect()
+    finished(log,verbose,_end_line)
     return sumstats
@@ -648,22 +773,35 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 ################################################################################################################
 def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-    if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
-    if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
-    if verbose: log.write(" -CPU Cores to use :",n_cores)
+    ##start function with col checking##########################################################
+    _start_line = "check the difference between EAF and reference VCF ALT frequency"
+    _end_line = "checking the difference between EAF and reference VCF ALT frequency"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".check_daf()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
     column_name = column_name + suffix
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     # ref_alt_freq INFO in vcf was provided
     if ref_alt_freq is not None:
-        if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
         if not force:
             good_chrpos =  sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
         if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -672,7 +810,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
     ########################
         if sum(~sumstats[eaf].isna())<10000:
             n_cores=1
-        df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+        #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+        df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
         pool = Pool(n_cores)
         if sum(~sumstats[eaf].isna())>0:
             map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
@@ -683,13 +822,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
         #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
         #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
-        #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
-        if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
-        if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
-        if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
+        #sumstats["DAF"]=sumstats["DAF"].astype("float")
+        if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]))
+        if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]))
+        if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]))
+        if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])))
+        if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])))
+        if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])))
         if verbose: log.write("Finished allele frequency checking!")
     return sumstats
@@ -697,11 +836,11 @@ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos
     #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
     vcf_reader = VariantFile(ref_infer)
     def afapply(x,vcf,alt_freq,chr_dict):
-            return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
+            return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
     status_inferred = sumstats.apply(map_func,axis=1)
-    sumstats.loc[:,column_name] = status_inferred.values
-    sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
+    sumstats[column_name] = status_inferred.values
+    sumstats[column_name]=sumstats[column_name].astype("float")
     return sumstats
 def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -716,25 +855,35 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
 ################################################################################################################
 def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-    if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
-    if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
-    if verbose: log.write(" -CPU Cores to use :",n_cores)
+    ##start function with col checking##########################################################
+    _start_line = "infer EAF using reference VCF ALT frequency"
+    _end_line = "inferring EAF using reference VCF ALT frequency"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".infer_af()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     if eaf not in sumstats.columns:
         sumstats[eaf]=np.nan
     prenumber = sum(sumstats[eaf].isna())
     # ref_alt_freq INFO in vcf was provided
     if ref_alt_freq is not None:
-        if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
         if not force:
             good_chrpos =  sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
         if verbose: log.write(" -Checking variants:", sum(good_chrpos))
@@ -742,7 +891,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
     ########################
         if sum(sumstats[eaf].isna())<10000:
             n_cores=1
-        df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+        #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+        df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
         pool = Pool(n_cores)
         map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
         sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
@@ -753,18 +903,19 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
         afternumber = sum(sumstats[eaf].isna())
         if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
         if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
-        if verbose: log.write("Finished allele frequency inferring!")
+    finished(log,verbose,_end_line)
     return sumstats
 def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
     #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
     vcf_reader = VariantFile(ref_infer)
     def afapply(x,vcf,alt_freq,chr_dict):
-            return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
+            return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
     status_inferred = sumstats.apply(map_func,axis=1)
-    sumstats.loc[:,eaf] = status_inferred.values
-    sumstats.loc[:,eaf]=sumstats.loc[:,eaf].astype("float")
+    sumstats[eaf] = status_inferred.values
+    sumstats[eaf]=sumstats[eaf].astype("float")
     return sumstats
 def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
@@ -802,4 +953,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
         if m is not None:
             return m.group(1)
     else:
-        return None
+        return None

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl