PyPI - gwaslab - Versions diffs - 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl - Mend

gwaslab 3.4.37-py3-none-any.whl → 3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (57)

gwaslab/bd_common_data.py +6 -3
gwaslab/bd_download.py +9 -9
gwaslab/bd_get_hapmap3.py +43 -9
gwaslab/data/formatbook.json +722 -721
gwaslab/g_Log.py +22 -5
gwaslab/g_Sumstats.py +110 -163
gwaslab/g_SumstatsPair.py +76 -25
gwaslab/g_SumstatsT.py +2 -2
gwaslab/g_Sumstats_summary.py +3 -3
gwaslab/g_version.py +10 -10
gwaslab/hm_casting.py +36 -17
gwaslab/hm_harmonize_sumstats.py +354 -221
gwaslab/hm_rsid_to_chrpos.py +1 -1
gwaslab/io_preformat_input.py +49 -43
gwaslab/io_read_ldsc.py +49 -1
gwaslab/io_to_formats.py +428 -295
gwaslab/ldsc_irwls.py +198 -0
gwaslab/ldsc_jackknife.py +514 -0
gwaslab/ldsc_ldscore.py +417 -0
gwaslab/ldsc_parse.py +294 -0
gwaslab/ldsc_regressions.py +747 -0
gwaslab/ldsc_sumstats.py +629 -0
gwaslab/qc_check_datatype.py +3 -3
gwaslab/qc_fix_sumstats.py +891 -778
gwaslab/util_ex_calculate_ldmatrix.py +31 -13
gwaslab/util_ex_gwascatalog.py +25 -25
gwaslab/util_ex_ldproxyfinder.py +10 -10
gwaslab/util_ex_ldsc.py +189 -0
gwaslab/util_ex_process_ref.py +3 -3
gwaslab/util_ex_run_coloc.py +26 -4
gwaslab/util_in_calculate_gc.py +6 -6
gwaslab/util_in_calculate_power.py +42 -43
gwaslab/util_in_convert_h2.py +8 -8
gwaslab/util_in_fill_data.py +30 -30
gwaslab/util_in_filter_value.py +201 -74
gwaslab/util_in_get_density.py +10 -10
gwaslab/util_in_get_sig.py +445 -71
gwaslab/viz_aux_annotate_plot.py +12 -12
gwaslab/viz_aux_quickfix.py +42 -37
gwaslab/viz_aux_reposition_text.py +10 -7
gwaslab/viz_aux_save_figure.py +18 -8
gwaslab/viz_plot_compare_af.py +32 -33
gwaslab/viz_plot_compare_effect.py +63 -71
gwaslab/viz_plot_miamiplot2.py +34 -26
gwaslab/viz_plot_mqqplot.py +126 -75
gwaslab/viz_plot_qqplot.py +11 -8
gwaslab/viz_plot_regionalplot.py +36 -33
gwaslab/viz_plot_rg_heatmap.py +28 -26
gwaslab/viz_plot_stackedregional.py +40 -21
gwaslab/viz_plot_trumpetplot.py +65 -61
gwaslab-3.4.39.dist-info/LICENSE +674 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
gwaslab-3.4.39.dist-info/RECORD +80 -0
gwaslab-3.4.37.dist-info/RECORD +0 -72
/gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0

gwaslab/hm_harmonize_sumstats.py CHANGED Viewed

@@ -12,12 +12,18 @@ from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixchr
 from gwaslab.qc_fix_sumstats import fixpos
 from gwaslab.qc_fix_sumstats import sortcolumn
+from gwaslab.qc_fix_sumstats import _df_split
+from gwaslab.qc_fix_sumstats import check_col
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import skipped
 from gwaslab.qc_check_datatype import check_dataframe_shape
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.bd_common_data import get_chr_list
 from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
 #rsidtochrpos
 #checkref
 #parallelizeassignrsid
@@ -35,20 +41,35 @@ def rsidtochrpos(sumstats,
     '''
     assign chr:pos based on rsID
     '''
-    #########################################################################################################
-    if verbose:  log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
-    if verbose:  log.write(" -rsID dictionary file: "+ path)
+    ##start function with col checking##########################################################
+    _start_line = "assign CHR and POS using rsIDs"
+    _end_line = "assigning CHR and POS using rsIDs"
+    _start_cols = [rsid,chrom,pos]
+    _start_function = ".rsid_to_chrpos()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
+    log.write(" -rsID dictionary file: "+ path,verbose=verbose)
     if ref_rsid_to_chrpos_tsv is not None:
         path = ref_rsid_to_chrpos_tsv
     if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
-        if verbose:  log.write(" -Filling na in rsID columns with SNPID...")
+        log.write(" -Filling na in rsID columns with SNPID...",verbose=verbose)
         sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
     if sum(sumstats[rsid].isna())>0:
-        if verbose:  log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())))
+        log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())),verbose=verbose)
         sumstats.loc[sumstats[rsid].isna(),rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(),rsid]))]
     dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_rsid,ref_chr,ref_pos],
@@ -63,8 +84,8 @@ def rsidtochrpos(sumstats,
     if pos not in sumstats.columns:
         sumstats[pos] =pd.Series(dtype="Int64")
-    if verbose:  log.write(" -Setting block size: ",chunksize)
-    if verbose:  log.write(" -Loading block: ",end="")
+    log.write(" -Setting block size: ",chunksize,verbose=verbose)
+    log.write(" -Loading block: ",end="",verbose=verbose)
     for i,dic in enumerate(dic_chuncks):
         dic_to_update = dic[dic.index.notnull()]
         log.write(i," ",end=" ",show_time=False)
@@ -74,13 +95,15 @@ def rsidtochrpos(sumstats,
         sumstats.update(dic_to_update,overwrite="True")
         gc.collect()
-    if verbose:  log.write("\n",end="",show_time=False)
+    log.write("\n",end="",show_time=False,verbose=verbose)
     sumstats = sumstats.reset_index()
     sumstats = sumstats.rename(columns = {'index':rsid})
-    if verbose:  log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
+    log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ",verbose=verbose)
     sumstats = fixchr(sumstats,verbose=verbose)
     sumstats = fixpos(sumstats,verbose=verbose)
     sumstats = sortcolumn(sumstats,verbose=verbose)
+    finished(log,verbose,_end_line)
     return sumstats
     ####################################################################################################
@@ -104,33 +127,48 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
 def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
                          n_cores=4,block_size=20000000,verbose=True,log=Log()):
+    ##start function with col checking##########################################################
+    _start_line = "assign CHR and POS using rsIDs"
+    _end_line = "assigning CHR and POS using rsIDs"
+    _start_cols = [rsid,chrom,pos]
+    _start_function = ".rsid_to_chrpos2()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     if ref_rsid_to_chrpos_hdf5 is not None:
         path = ref_rsid_to_chrpos_hdf5
     elif ref_rsid_to_chrpos_vcf is not None:
         vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
         vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
         path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
-    if verbose:  log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
     if path is None:
         raise ValueError("Please provide path to hdf5 file.")
     sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"),errors="coerce").astype("Int64")
-    if verbose:  log.write(" -Source hdf5 file: ",path)
-    if verbose:  log.write(" -Cores to use : ",n_cores)
-    if verbose:  log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size)
+    log.write(" -Source hdf5 file: ",path,verbose=verbose)
+    log.write(" -Cores to use : ",n_cores,verbose=verbose)
+    log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size,verbose=verbose)
     input_columns= sumstats.columns
     sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna()|sumstats["rsn"].duplicated(keep='first') ,:].copy()
     sumstats_rs  = sumstats.loc[sumstats["rsn"].notnull(),:].copy()
-    if verbose:  log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()))
-    if verbose:  log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')))
-    if verbose:  log.write(" -Valid rsIDs: ", len(sumstats_rs))
+    log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()),verbose=verbose)
+    log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')),verbose=verbose)
+    log.write(" -Valid rsIDs: ", len(sumstats_rs),verbose=verbose)
     del sumstats
     gc.collect()
@@ -147,16 +185,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     #
     pool = Pool(n_cores)
     if chrom not in input_columns:
-        if verbose:  log.write(" -Initiating CHR ... ")
+        log.write(" -Initiating CHR ... ",verbose=verbose)
         sumstats_rs[chrom]=pd.Series(dtype="Int32")
     if pos not in input_columns:
-        if verbose:  log.write(" -Initiating POS ... ")
+        log.write(" -Initiating POS ... ",verbose=verbose)
         sumstats_rs[pos]=pd.Series(dtype="Int64")
     df_split=[y for x, y in sumstats_rs.groupby('group', as_index=False)]
-    if verbose:  log.write(" -Divided into groups: ",len(df_split))
-    if verbose:  log.write("  -",set(sumstats_rs.loc[:,"group"].unique()))
+    log.write(" -Divided into groups: ",len(df_split),verbose=verbose)
+    log.write("  -",set(sumstats_rs.loc[:,"group"].unique()),verbose=verbose)
     # check keys
     store = pd.HDFStore(path, 'r')
@@ -164,21 +202,21 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     all_groups_len = len(all_groups)
     store.close()
     all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
-    if verbose:  log.write(" -Number of groups in HDF5: ",all_groups_len)
-    if verbose:  log.write(" -Max index of groups in HDF5: ",all_groups_max)
+    log.write(" -Number of groups in HDF5: ",all_groups_len,verbose=verbose)
+    log.write(" -Max index of groups in HDF5: ",all_groups_max,verbose=verbose)
     # update CHR and POS using rsID with multiple threads
     sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
     sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
     del df_split
     gc.collect()
-    if verbose:  log.write(" -Merging group data... ")
+    log.write(" -Merging group data... ",verbose=verbose)
     # drop group and rsn
     sumstats_rs = sumstats_rs.drop(columns=["group"])
     sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])
     # merge back
-    if verbose:  log.write(" -Append data... ")
+    log.write(" -Append data... ",verbose=verbose)
     sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
     del sumstats_rs
@@ -192,8 +230,8 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
     pool.close()
     pool.join()
-    gc.collect()
-    if verbose:  log.write("Finished assigning CHR and POS using rsIDs.")
+    finished(log, verbose, _end_line)
     return sumstats
 ####################################################################################################################
 #20220426 check if non-effect allele is aligned with reference genome
@@ -211,15 +249,15 @@ def check_status(row,record):
     #8 / -----> not on ref genome
     #9 / ------> unchecked
-    status_pre=row[3][:5]
-    status_end=row[3][6:]
+    status_pre=row.iloc[3][:5]
+    status_end=row.iloc[3][6:]
     ## nea == ref
-    if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+    if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
         ## ea == ref
-        if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+        if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
             ## len(nea) >len(ea):
-            if len(row[2])!=len(row[1]):
+            if len(row.iloc[2])!=len(row.iloc[1]):
                 # indels both on ref, unable to identify
                 return status_pre+"6"+status_end
         else:
@@ -228,35 +266,50 @@ def check_status(row,record):
     ## nea!=ref
     else:
         # ea == ref_seq -> need to flip
-        if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+        if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
             return status_pre+"3"+status_end
         # ea !=ref
         else:
             #_reverse_complementary
-            row[1] = get_reverse_complementary_allele(row[1])
-            row[2] = get_reverse_complementary_allele(row[2])
+            row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
+            row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
             ## nea == ref
-            if row[2] == record[row[0]-1: row[0]+len(row[2])-1].seq.upper():
+            if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
                 ## ea == ref
-                if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+                if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
                     ## len(nea) >len(ea):
-                    if len(row[2])!=len(row[1]):
+                    if len(row.iloc[2])!=len(row.iloc[1]):
                         return status_pre+"8"+status_end  # indel reverse complementary
                 else:
                     return status_pre+"4"+status_end
             else:
                 # ea == ref_seq -> need to flip
-                if row[1] == record[row[0]-1: row[0]+len(row[1])-1].seq.upper():
+                if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
                     return status_pre+"5"+status_end
             # ea !=ref
             return status_pre+"8"+status_end
 def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
-    if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
-    if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
-    if verbose: log.write(" -Checking records: ", end="")
+    ##start function with col checking##########################################################
+    _start_line = "check if NEA is aligned with reference sequence"
+    _end_line = "checking if NEA is aligned with reference sequence"
+    _start_cols = [chrom,pos,ea,nea,status]
+    _start_function = ".check_ref()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
+    log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
+    log.write(" -Checking records: ", end="",verbose=verbose)
     chromlist = get_chr_list(add_number=True)
     records = SeqIO.parse(ref_path, "fasta")
     for record in records:
@@ -268,13 +321,13 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
             else:
                 i = record_chr
             if i in chromlist:
-                if verbose:  log.write(record_chr," ", end="",show_time=False)
+                log.write(record_chr," ", end="",show_time=False,verbose=verbose)
                 to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
                 sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)
-    if verbose:  log.write("\n",end="",show_time=False)
+    log.write("\n",end="",show_time=False,verbose=verbose)
-    sumstats.loc[:,status] = sumstats.loc[:,status].astype("string")
+    sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -284,26 +337,27 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
     #status_7=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
     status_8=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w", case=False, flags=0, na=False))
-    if verbose: log.write(" -Variants allele on given reference sequence : ",status_0)
-    if verbose: log.write(" -Variants flipped : ",status_3)
+    log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
+    log.write(" -Variants flipped : ",status_3,verbose=verbose)
     raw_matching_rate = (status_3+status_0)/available_to_check
     flip_rate = status_3/available_to_check
-    if verbose: log.write("  -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100))
+    log.write("  -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
     if raw_matching_rate <0.8:
-        if verbose: log.write("  -!!!Warning, matching rate is low, please check if the right reference genome is used.")
+        log.warning("Matching rate is low, please check if the right reference genome is used.")
     if flip_rate > 0.85 :
-        if verbose: log.write("  -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.")
+        log.write("  -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)
-    if verbose: log.write(" -Variants inferred reverse_complement : ",status_4)
-    if verbose: log.write(" -Variants inferred reverse_complement_flipped : ",status_5)
-    if verbose: log.write(" -Both allele on genome + unable to distinguish : ",status_6)
-    #if verbose: log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
-    if verbose: log.write(" -Variants not on given reference sequence : ",status_8)
+    log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
+    log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
+    log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
+    #log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
+    log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)
     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
-        if verbose: log.write(" -Variants not on given reference sequence were removed.")
-    gc.collect()
+        log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
+    finished(log, verbose, _end_line)
     return sumstats
 #######################################################################################################################################
@@ -333,7 +387,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
     ## single df assignment
     vcf_reader = VariantFile(path)
     def rsid_helper(x,vcf_reader,chr_dict):
-         return chrposref_rsid(x[0],x[1],x[2],x[3],vcf_reader,chr_dict)
+         return chrposref_rsid(x.iloc[0],x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,chr_dict)
     map_func=partial(rsid_helper,vcf_reader=vcf_reader,chr_dict=chr_dict)
     rsID = sumstats.apply(map_func,axis=1)
     return rsID
@@ -346,19 +400,31 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
     all ,    overwrite rsid for all availalbe rsid
     invalid,  only assign rsid for variants with invalid rsid
     empty    only assign rsid for variants with na rsid
-    '''
+    '''
     if ref_mode=="vcf":
         ###################################################################################################################
-        if verbose: log.write("Start to assign rsID using vcf...{}".format(_get_version()))
-        if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
-        if verbose: log.write(" -CPU Cores to use :",n_cores)
-        if verbose: log.write(" -Reference VCF file:", path)
+        ##start function with col checking##########################################################
+        _start_line = "assign rsID using reference VCF"
+        _end_line = "assign rsID using reference file"
+        _start_cols = [chr,pos,ref,alt,status]
+        _start_function = ".assign_rsid()"
+        _must_args ={}
+        is_enough_info = start_to(sumstats=sumstats,
+                                log=log,
+                                verbose=verbose,
+                                start_line=_start_line,
+                                end_line=_end_line,
+                                start_cols=_start_cols,
+                                start_function=_start_function,
+                                n_cores=n_cores,
+                                ref_vcf=path,
+                                **_must_args)
+        if is_enough_info == False: return sumstats
+        ############################################################################################
         chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
-        if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
+        log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...",verbose=verbose)
         ##############################################
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
@@ -380,7 +446,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if sum(to_assign)>0:
             if sum(to_assign)<10000: n_cores=1
-            df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+            #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+            df_split = _df_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
             pool = Pool(n_cores)
             map_func = partial(assign_rsid_single,path=path,chr=chr,pos=pos,ref=ref,alt=alt,chr_dict=chr_dict)
             assigned_rsid = pd.concat(pool.map(map_func,df_split))
@@ -391,40 +458,57 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         ##################################################################################################################
         after_number = sum(~sumstats[rsid].isna())
-        if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
-        if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+        log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!",verbose=verbose)
+        log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
     ##################################################################################################################
     elif ref_mode=="tsv":
         '''
         assign rsID based on chr:pos
         '''
-        if verbose:  log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
-        check_dataframe_shape(sumstats, log, verbose)
-        if verbose:  log.write(" -SNPID-rsID text file: "+ path)
+        ##start function with col checking##########################################################
+        _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
+        _end_line = "assign rsID using reference file"
+        _start_cols = [snpid,status]
+        _start_function = ".assign_rsid()"
+        _must_args ={}
+        is_enough_info = start_to(sumstats=sumstats,
+                                log=log,
+                                verbose=verbose,
+                                start_line=_start_line,
+                                end_line=_end_line,
+                                start_cols=_start_cols,
+                                start_function=_start_function,
+                                n_cores=n_cores,
+                                ref_tsv=path,
+                                **_must_args)
+        if is_enough_info == False: return sumstats
+        ############################################################################################
-        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
+        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
         if overwrite == "empty":
-            to_assign = sumstats[rsid].isna()
+            to_assign = sumstats[rsid].isna() & standardized_normalized
         if overwrite=="all":
             to_assign = standardized_normalized
         if overwrite=="invalid":
             to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
         total_number= len(sumstats)
         pre_number = sum(~sumstats[rsid].isna())
-        if verbose: log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...")
+        log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...",verbose=verbose)
         if sum(to_assign)>0:
             sumstats = sumstats.set_index(snpid)
             dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_snpid,ref_rsid],
                               chunksize=chunksize,index_col=ref_snpid,
                               dtype={ref_snpid:"string",ref_rsid:"string"})
-            if verbose:  log.write(" -Setting block size: ",chunksize)
-            if verbose:  log.write(" -Loading block: ",end="")
+            log.write(" -Setting block size: ",chunksize,verbose=verbose)
+            log.write(" -Loading block: ",end="",verbose=verbose)
             for i,dic in enumerate(dic_chuncks):
                 gc.collect()
                 log.write(i," ",end=" ",show_time=False)
@@ -433,17 +517,18 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
                 dic = dic.loc[~dic.index.duplicated(keep=False),:]
                 sumstats.update(dic,overwrite=True)
-            if verbose:  log.write("\n",end="",show_time=False)
+            log.write("\n",end="",show_time=False,verbose=verbose)
             sumstats = sumstats.reset_index()
             sumstats = sumstats.rename(columns = {'index':snpid})
             after_number = sum(~sumstats[rsid].isna())
-            if verbose: log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!")
-            if verbose: log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!")
+            log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!",verbose=verbose)
+            log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
         else:
-            if verbose: log.write(" -No rsID could be fixed...skipping...")
+            log.write(" -No rsID can be fixed...skipping...",verbose=verbose)
         ################################################################################################################
-    gc.collect()
+    finished(log,verbose,_end_line)
     return sumstats
 #################################################################################################################################################
 #single record assignment
@@ -522,12 +607,12 @@ def is_palindromic(sumstats,a1="EA",a2="NEA"):
 def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
     vcf_reader = VariantFile(ref_infer)
-    status_part = sumstats.apply(lambda x:check_strand_status(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict),axis=1)
+    status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
     return status_part
 def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
     vcf_reader = VariantFile(ref_infer)
-    status_part = sumstats.apply(lambda x:check_unkonwn_indel(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,x[5],chr_dict,daf_tolerance),axis=1)
+    status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
     return status_part
 ##################################################################################################################################################
@@ -535,121 +620,141 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                        chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
                        chr_dict=None,verbose=True,log=Log()):
-    if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
+    ##start function with col checking##########################################################
+    _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
+    _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".infer_strand()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+    log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     if "p" in mode:
-        # ref_alt_freq INFO in vcf was provided
-        if ref_alt_freq is not None:
-            if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
-            ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
-            good_chrpos =  sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
-            palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
-            not_palindromic_snp = good_chrpos & (~palindromic)
-            ##not palindromic : change status
-            sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
-            if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
-            #palindromic but can not infer
-            maf_can_infer   = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
-            sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
-            #palindromic WITH UNKNWON OR UNCHECKED STATUS
-            unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
+        ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
+        good_chrpos =  sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
+        palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
+        not_palindromic_snp = good_chrpos & (~palindromic)
+        ##not palindromic : change status
+        sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
+        log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)
+        #palindromic but can not infer
+        maf_can_infer   = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
+        sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
+        #palindromic WITH UNKNWON OR UNCHECKED STATUS
+        unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
-            unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
-            if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
+        unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
+        log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)),verbose=verbose)
-            #########################################################################################
-            if sum(unknow_palindromic_to_check)>0:
-                if sum(unknow_palindromic_to_check)<10000:
-                    n_cores=1
-                df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
-                pool = Pool(n_cores)
-                map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
-                status_inferred = pd.concat(pool.map(map_func,df_split))
-                sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+        #########################################################################################
+        if sum(unknow_palindromic_to_check)>0:
+            if sum(unknow_palindromic_to_check)<10000:
+                n_cores=1
+            #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+            df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
             pool.close()
             pool.join()
-            #########################################################################################
-            #0 Not palindromic SNPs
-            #1 Palindromic +strand  -> no need to flip
-            #2 palindromic -strand  -> need to flip -> fixed
-            #3 Indel no need flip
-            #4 Unknown Indel -> fixed
-            #5 Palindromic -strand -> need to flip
-            #6 Indel need flip
-            #7 indistinguishable
-            #8 Not matching or No information
-            #9 Unchecked
-            status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
-            status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
-            status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
-            status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
-            status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
-            if verbose: log.write("  -Non-palindromic : ",sum(status0))
-            if verbose: log.write("  -Palindromic SNPs on + strand: ",sum(status1))
-            if verbose: log.write("  -Palindromic SNPs on - strand and need to be flipped:",sum(status5))
-            if verbose: log.write("  -Palindromic SNPs with maf not available to infer : ",sum(status7))
-            if verbose: log.write("  -Palindromic SNPs with no macthes or no information : ",sum(status8))
-            if ("7" in remove_snp) and ("8" in remove_snp) :
-                if verbose: log.write("  -Palindromic SNPs with maf not available to infer and with no macthes or no information will will be removed")
-                sumstats = sumstats.loc[~(status7 | status8),:].copy()
-            elif "8" in remove_snp:
-                if verbose: log.write("  -Palindromic SNPs with no macthes or no information will be removed")
-                sumstats = sumstats.loc[~status8,:].copy()
-            elif "7" in remove_snp:
-                if verbose: log.write("  -Palindromic SNPs with maf not available to infer will be removed")
-                sumstats = sumstats.loc[~status7,:].copy()
+        else:
+            log.warning("No palindromic variants available for checking.")
+        #########################################################################################
+        #0 Not palindromic SNPs
+        #1 Palindromic +strand  -> no need to flip
+        #2 palindromic -strand  -> need to flip -> fixed
+        #3 Indel no need flip
+        #4 Unknown Indel -> fixed
+        #5 Palindromic -strand -> need to flip
+        #6 Indel need flip
+        #7 indistinguishable
+        #8 Not matching or No information
+        #9 Unchecked
+        status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
+        status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
+        status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
+        status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
+        status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
+        log.write("  -Non-palindromic : ",sum(status0),verbose=verbose)
+        log.write("  -Palindromic SNPs on + strand: ",sum(status1),verbose=verbose)
+        log.write("  -Palindromic SNPs on - strand and needed to be flipped:",sum(status5),verbose=verbose)
+        log.write("  -Palindromic SNPs with MAF not available to infer : ",sum(status7),verbose=verbose)
+        log.write("  -Palindromic SNPs with no macthes or no information : ",sum(status8),verbose=verbose)
+        if ("7" in remove_snp) and ("8" in remove_snp) :
+            log.write("  -Palindromic SNPs with MAF not available to infer and with no macthes or no information will will be removed",verbose=verbose)
+            sumstats = sumstats.loc[~(status7 | status8),:].copy()
+        elif "8" in remove_snp:
+            log.write("  -Palindromic SNPs with no macthes or no information will be removed",verbose=verbose)
+            sumstats = sumstats.loc[~status8,:].copy()
+        elif "7" in remove_snp:
+            log.write("  -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
+            sumstats = sumstats.loc[~status7,:].copy()
     ### unknow_indel
     if "i" in mode:
         unknow_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
-        if verbose: log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...")
+        log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...",verbose=verbose)
         if sum(unknow_indel)>0:
-            if verbose: log.write(" -Indistinguishable indels will be inferred from reference vcf ref and alt...")
+            log.write(" -Indistinguishable indels will be inferred from reference vcf REF and ALT...",verbose=verbose)
             #########################################################################################
             #with maf can not infer
-            #maf_can_infer   = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
+            #maf_can_infer   = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
             #sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
-            if verbose: log.write(" -DAF tolerance: {}".format(daf_tolerance))
+            log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)
             if sum(unknow_indel)>0:
                 if sum(unknow_indel)<10000:
                     n_cores=1
-                df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+                #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+                df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
                 pool = Pool(n_cores)
                 map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
                 status_inferred = pd.concat(pool.map(map_func,df_split))
                 sumstats.loc[unknow_indel,status] = status_inferred.values
-            pool.close()
-            pool.join()
+                pool.close()
+                pool.join()
             #########################################################################################
             status3 =  sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
             status6 =  sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
             status8 =  sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)
-            if verbose: log.write("  -Indels ea/nea match reference : ",sum(status3))
-            if verbose: log.write("  -Indels ea/nea need to be flipped : ",sum(status6))
-            if verbose: log.write("  -Indels with no macthes or no information : ",sum(status8))
+            log.write("  -Indels ea/nea match reference : ",sum(status3),verbose=verbose)
+            log.write("  -Indels ea/nea need to be flipped : ",sum(status6),verbose=verbose)
+            log.write("  -Indels with no macthes or no information : ",sum(status8),verbose=verbose)
             if "8" in remove_indel:
-                if verbose: log.write("  -Indels with no macthes or no information will be removed")
-                sumstats = sumstats.loc[~status8,:].copy()
-    gc.collect()
+                log.write("  -Indels with no macthes or no information will be removed",verbose=verbose)
+                sumstats = sumstats.loc[~status8,:].copy()
+        else:
+            log.warning("No indistinguishable indels available for checking.")
+    finished(log,verbose,_end_line)
     return sumstats
@@ -673,31 +778,45 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 ################################################################################################################
 def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-    if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
-    if verbose: log.write(" -CPU Cores to use :",n_cores)
+    ##start function with col checking##########################################################
+    _start_line = "check the difference between EAF and reference VCF ALT frequency"
+    _end_line = "checking the difference between EAF and reference VCF ALT frequency"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".check_daf()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
     column_name = column_name + suffix
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     # ref_alt_freq INFO in vcf was provided
     if ref_alt_freq is not None:
-        if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
         if not force:
             good_chrpos =  sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
-        if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+        log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
         sumstats[column_name]=np.nan
     ########################
         if sum(~sumstats[eaf].isna())<10000:
             n_cores=1
-        df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+        #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+        df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
         pool = Pool(n_cores)
         if sum(~sumstats[eaf].isna())>0:
             map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
@@ -708,25 +827,25 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
         #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
         #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
-        #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
-        if verbose: log.write(" - {} max:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - {} min:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
-        if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
-        if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
-        if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
-        if verbose: log.write("Finished allele frequency checking!")
+        #sumstats["DAF"]=sumstats["DAF"].astype("float")
+        log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
+        log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
+        log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
+        log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
+        log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
+        log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
+        log.write("Finished allele frequency checking!")
     return sumstats
 def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
     #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
     vcf_reader = VariantFile(ref_infer)
     def afapply(x,vcf,alt_freq,chr_dict):
-            return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
+            return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,chr_dict)
     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
     status_inferred = sumstats.apply(map_func,axis=1)
-    sumstats.loc[:,column_name] = status_inferred.values
-    sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
+    sumstats[column_name] = status_inferred.values
+    sumstats[column_name]=sumstats[column_name].astype("float")
     return sumstats
 def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -741,33 +860,44 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
 ################################################################################################################
 def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
-    if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
-    check_dataframe_shape(sumstats, log, verbose)
-    if verbose: log.write(" -Reference vcf file:", ref_infer)
-    if verbose: log.write(" -CPU Cores to use :",n_cores)
+    ##start function with col checking##########################################################
+    _start_line = "infer EAF using reference VCF ALT frequency"
+    _end_line = "inferring EAF using reference VCF ALT frequency"
+    _start_cols = [chr,pos,ref,alt,eaf,status]
+    _start_function = ".infer_af()"
+    _must_args ={"ref_alt_freq":ref_alt_freq}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            n_cores=n_cores,
+                            ref_vcf=ref_infer,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    ############################################################################################
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
-    # check if the columns are complete
-    if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
-        raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
     if eaf not in sumstats.columns:
         sumstats[eaf]=np.nan
     prenumber = sum(sumstats[eaf].isna())
     # ref_alt_freq INFO in vcf was provided
     if ref_alt_freq is not None:
-        if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
+        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
         if not force:
             good_chrpos =  sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
-        if verbose: log.write(" -Checking variants:", sum(good_chrpos))
+        log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
     ########################
         if sum(sumstats[eaf].isna())<10000:
             n_cores=1
-        df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+        #df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+        df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
         pool = Pool(n_cores)
         map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
         sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
@@ -776,20 +906,21 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
     ###########################
         afternumber = sum(sumstats[eaf].isna())
-        if verbose: log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber))
-        if verbose: log.write(" -EAF is still missing for {} variants.".format(afternumber))
-        if verbose: log.write("Finished allele frequency inferring!")
+        log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
+        log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
+    finished(log,verbose,_end_line)
     return sumstats
 def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
     #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
     vcf_reader = VariantFile(ref_infer)
     def afapply(x,vcf,alt_freq,chr_dict):
-            return infer_af(x[0],x[1]-1,x[1],x[2],x[3],vcf_reader,ref_alt_freq,chr_dict)
+            return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf_reader,ref_alt_freq,chr_dict)
     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
     status_inferred = sumstats.apply(map_func,axis=1)
-    sumstats.loc[:,eaf] = status_inferred.values
-    sumstats.loc[:,eaf]=sumstats.loc[:,eaf].astype("float")
+    sumstats[eaf] = status_inferred.values
+    sumstats[eaf]=sumstats[eaf].astype("float")
     return sumstats
 def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
@@ -810,13 +941,13 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
 def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
     if vcf_path is not None:
         if vcf_chr_dict is None:
-            if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
+            log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
             prefix = check_vcf_chr_prefix(vcf_path)
             if prefix is not None:
-                if verbose: log.write(" -Prefix for chromosomes: ",prefix)
+                log.write(" -Prefix for chromosomes: ",prefix)
                 vcf_chr_dict = get_number_to_chr(prefix=prefix)
             else:
-                if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
+                log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
                 vcf_chr_dict = get_number_to_chr()
     return vcf_chr_dict
@@ -827,4 +958,6 @@ def check_vcf_chr_prefix(vcf_bcf_path):
         if m is not None:
             return m.group(1)
     else:
-        return None
+        return None

gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl