gwaslab 3.4.41__py3-none-any.whl → 3.4.43__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -3,30 +3,30 @@ from gwaslab.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
 from gwaslab.bd_common_data import get_format_dict
 
-def _read_tabular(path, fmt, **args):
+def _read_tabular(path, fmt, **kwargs):
 
     # default
     load_args_dict = {"sep":"\t",
                       "header":None}
 
     # if specified by user
-    if len(args)>0:
-        load_args_dict = args
+    if len(kwargs)>0:
+        load_args_dict = kwargs
 
     # load format
     meta_data, rename_dictionary = get_format_dict(fmt)
 
-    if "format_separator" in meta_data and "sep" not in args:
+    if "format_separator" in meta_data and "sep" not in kwargs:
         load_args_dict["sep"] = meta_data["format_separator"]
 
-    if "format_comment" in meta_data and "comment" not in args:
+    if "format_comment" in meta_data and "comment" not in kwargs:
         if meta_data["format_comment"] is not None:
             load_args_dict["comment"] = meta_data["format_comment"]
 
-    if "format_header" in meta_data and "header" not in args:
+    if "format_header" in meta_data and "header" not in kwargs:
         load_args_dict["header"] = meta_data["format_header"]
 
-    if "format_na" in meta_data and "na_values" not in args:
+    if "format_na" in meta_data and "na_values" not in kwargs:
         if meta_data["format_na"] is not None:
             load_args_dict["na_values"] = meta_data["format_na"]
 
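A note on the behavior this hunk preserves: when the caller passes any keyword arguments, _read_tabular replaces the default load_args_dict wholesale rather than merging into it. A minimal sketch of that pattern (the helper name _load is illustrative, not part of gwaslab):

def _load(path, **kwargs):
    # defaults, mirroring load_args_dict above
    load_args = {"sep": "\t", "header": None}
    if len(kwargs) > 0:
        load_args = kwargs  # replaces the defaults, does not merge
    return load_args

print(_load("sumstats.tsv"))           # {'sep': '\t', 'header': None}
print(_load("sumstats.tsv", sep=","))  # {'sep': ','} -- the 'header' default is dropped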
gwaslab/io_to_formats.py CHANGED
@@ -2,6 +2,7 @@ import pandas as pd
 import yaml
 import hashlib
 import copy
+import gzip
 from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
@@ -306,26 +307,30 @@ def tofmt(sumstats,
         vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
 
         log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
-        # output header
-        with open(path,"w") as file:
-            file.write(vcf_header)
-
-        with open(path,"a") as file:
-            log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
-            file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
-            log.write(" -Outputing data...")
-            QUAL="."
-            FILTER="PASS"
-            for index,row in sumstats.iterrows():
-                CHROM=str(row["#CHROM"])
-                POS=str(row["POS"])
-                ID=str(row["ID"])
-                REF=str(row["REF"])
-                ALT=str(row["ALT"])
-                INFO=str(row["INFO"])
-                FORMAT=":".join(output_format)
-                DATA=":".join(row[output_format].astype("string"))
-                file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
+        try:
+            fast_to_vcf(sumstats, path, vcf_header, output_format, meta_data, meta)
+        except:
+            log.write(f"Error in using fast_to_vcf. Falling back to original implementation.",verbose=verbose)
+            # output header
+            with open(path,"w") as file:
+                file.write(vcf_header)
+
+            with open(path,"a") as file:
+                log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
+                file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
+                log.write(" -Outputing data...")
+                QUAL="."
+                FILTER="PASS"
+                for index,row in sumstats.iterrows():
+                    CHROM=str(row["#CHROM"])
+                    POS=str(row["POS"])
+                    ID=str(row["ID"])
+                    REF=str(row["REF"])
+                    ALT=str(row["ALT"])
+                    INFO=str(row["INFO"])
+                    FORMAT=":".join(output_format)
+                    DATA=":".join(row[output_format].astype("string"))
+                    file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
 
         _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
 
     ####################################################################################################################
@@ -342,7 +347,11 @@ def tofmt(sumstats,
         sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
 
         log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
-        sumstats.to_csv(path, index=None,**to_csvargs)
+        try:
+            fast_to_csv(sumstats, path, to_csvargs=to_csvargs, compress=True, write_in_chunks=True)
+        except:
+            log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
+            sumstats.to_csv(path, index=None, **to_csvargs)
 
         if md5sum == True:
             md5_value = md5sum_file(path,log,verbose)
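Both writers now share the same fast-path-with-fallback shape: attempt the optimized writer and, on any exception, log a message and fall back to pandas. A self-contained sketch of the pattern (fast_writer is a stand-in for fast_to_csv/fast_to_vcf, and the forced failure is only for demonstration):

import pandas as pd

def fast_writer(df, path):
    raise RuntimeError("simulated failure")  # stand-in for the optimized writer

def write_sumstats(df, path):
    try:
        fast_writer(df, path)               # fast path
    except Exception as e:                  # any failure triggers the fallback
        print(f"fast writer failed ({e}); falling back to pandas")
        df.to_csv(path, index=None, sep="\t")

write_sumstats(pd.DataFrame({"P": [0.5]}), "out.tsv")

Note that the released code uses a bare except:, while the sketch narrows it to Exception; a bare except also catches KeyboardInterrupt and SystemExit.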
@@ -353,6 +362,72 @@ def tofmt(sumstats,
         _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
 
     return sumstats
+
+####################################################################################################################
+def fast_to_csv(dataframe, path, to_csvargs=None, compress=True, write_in_chunks=True):
+    df_numpy = dataframe.to_numpy()
+
+    if path.endswith(".gz"):
+        path = path[:-3]
+
+    if to_csvargs is None:
+        to_csvargs = {}
+
+    if 'sep' in to_csvargs:
+        sep = to_csvargs['sep']
+    else:
+        sep = '\t'
+
+    # this operation slows the process down a bit, but it is necessary to stay consistent with the pandas.to_csv() behavior
+    if 'na_rep' in to_csvargs:
+        df_numpy[pd.isna(df_numpy)] = to_csvargs['na_rep'] # replace NaNs. We have to use pd.isna because np.isnan does not work with 'object' and 'string' dtypes
+
+    # np.savetxt() is faster than df.to_csv(), but it loops through the rows of X and formats each row individually:
+    # https://github.com/numpy/numpy/blob/d35cd07ea997f033b2d89d349734c61f5de54b0d/numpy/lib/npyio.py#L1613
+    # We can speed up the process by building the whole format string and then applying the formatting in one single call
+    out_string = sep.join(dataframe.columns) + '\n'
+    fmt = sep.join(['%s']*dataframe.shape[1]) # build formatting for one single row
+    fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
+    out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
+    out_string += '\n'
+
+    if write_in_chunks:
+        chunk_size = 50000000
+        lines = [out_string[i:i+chunk_size] for i in range(0, len(out_string), chunk_size)]
+    else:
+        lines = [out_string]
+
+    if compress:
+        lines = [line.encode() for line in lines]
+        with gzip.open(path+".gz", 'wb', compresslevel=1) as f:
+            f.writelines(lines)
+    else:
+        with open(path, 'w') as f:
+            f.writelines(lines)
+
+
+def fast_to_vcf(dataframe, path, vcf_header, output_format, meta_data, meta):
+    # Get the columns in the right order and convert to numpy
+    df_numpy = dataframe[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] + output_format].to_numpy()
+
+    sep = '\t'
+    QUAL = "."
+    FILTER = "PASS"
+    FORMAT = ":".join(output_format)
+    format_format = ':'.join(['%s']*len(output_format))
+
+    single_row_format = f'%s %s %s %s %s {QUAL} {FILTER} %s {FORMAT} {format_format}'
+
+    out_string = vcf_header
+    out_string += sep.join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]) + "\n"
+    fmt = sep.join(single_row_format.split(' ')) # build formatting for one single row
+    fmt = '\n'.join([fmt]*dataframe.shape[0]) # add newline and replicate the formatting for all rows
+    out_string += fmt % tuple(df_numpy.ravel()) # flatten the array and then apply formatting
+    out_string += '\n'
+
+    with open(path, 'w') as f:
+        f.write(out_string)
+
 
 ####################################################################################################################
 def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
     # grab format cols that exist in sumstats
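The core speed-up in both fast_to_csv and fast_to_vcf is avoiding per-row formatting: build a '%s' template for one row, replicate it for every row joined by newlines, and apply it in a single % call over the flattened numpy array. A self-contained sketch of the idea with toy columns (the real functions also handle NA replacement, chunked writes, and gzip):

import pandas as pd

df = pd.DataFrame({"CHR": ["1", "2"], "POS": [12345, 67890], "P": [5e-8, 0.3]})
arr = df.to_numpy()                            # object array; values keep their repr

row_fmt = "\t".join(["%s"] * df.shape[1])      # template for a single row
full_fmt = "\n".join([row_fmt] * df.shape[0])  # replicated for all rows
out = "\t".join(df.columns) + "\n" + full_fmt % tuple(arr.ravel()) + "\n"
print(out)  # CHR/POS/P header followed by two tab-separated rows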
gwaslab/io_to_pickle.py CHANGED
@@ -13,7 +13,7 @@ def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
         with open(path, 'wb') as file:
             glsumstats.log.write(" -Dump the Sumstats Object to : ", path)
             pickle.dump(glsumstats, file)
-            Log().write("Finished dumping.")
+            glsumstats.log.write("Finished dumping.")
 
 def load_pickle(path):
     if os.path.exists(path):
gwaslab/ldsc_ldscore.py CHANGED
@@ -296,7 +296,7 @@ class PlinkBEDFile(__GenotypeArrayInMemory__):
         nru_new = n_new + e
         nru = self.nru
         z = ba.bitarray(m*2*nru_new, endian="little")
-        z.setall(0)
+        z.setall(0)
         for e, i in enumerate(keep_indivs):
             z[2*e::2*nru_new] = geno[2*i::2*nru]
             z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
@@ -1532,7 +1532,7 @@ def start_to(sumstats,
              ref_fasta=None,
              n_cores=None,
              ref_tsv=None,
-             **args
+             **kwargs
              ):
 
     log.write("Start to {}...{}".format(start_line,_get_version()), verbose=verbose)
@@ -1557,7 +1557,7 @@ def start_to(sumstats,
         log.write(" -Reference TSV: {}".format(ref_tsv))
 
     is_args_valid = True
-    for key, value in args.items():
+    for key, value in kwargs.items():
         is_args_valid = is_args_valid & check_arg(log, verbose, key, value, start_function)
     is_enough_col = is_args_valid & is_enough_col
 
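The rename also touches the keyword-validation loop in start_to: each extra keyword is checked and the boolean results are AND-ed together, so a single unrecognized argument invalidates the whole call. A hedged sketch of the accumulation pattern (this check_arg is a simplified stand-in; gwaslab's real one also takes log, verbose, and the calling function):

def check_arg(key, value, allowed=("ref_fasta", "n_cores", "ref_tsv")):
    ok = key in allowed
    if not ok:
        print(f"unrecognized argument: {key}={value!r}")
    return ok

kwargs = {"n_cores": 4, "typo_arg": True}
is_args_valid = True
for key, value in kwargs.items():
    is_args_valid = is_args_valid & check_arg(key, value)
print(is_args_valid)  # False -- 'typo_arg' is not an accepted keyword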
@@ -1611,12 +1611,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 
 ###############################################################################################################
 def _df_split(dataframe, n):
-    chunks = []
-    chunk_size = int(dataframe.shape[0] // n)+1
-
-    for index in range(0, dataframe.shape[0], chunk_size):
-        chunks.append(
-            dataframe.iloc[index:index + chunk_size]
-        )
-
-    return chunks
+    k, m = divmod(len(dataframe), n)
+    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
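The rewritten _df_split uses the standard divmod idiom for splitting into n near-equal parts: the first m chunks get k+1 rows and the rest get k. Unlike the old ceiling-division loop, which could return fewer than n chunks (e.g. 10 rows with n=5 gave chunks of 3, 3, 3, 1), it always returns exactly n chunks, empty ones included. A quick check (split_even mirrors the new logic):

import pandas as pd

def split_even(df, n):  # same logic as the new _df_split
    k, m = divmod(len(df), n)
    return [df.iloc[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"x": range(10)})
print([len(c) for c in split_even(df, 3)])  # [4, 3, 3]
print([len(c) for c in split_even(df, 5)])  # [2, 2, 2, 2, 2]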
@@ -27,7 +27,7 @@ def tofinemapping(sumstats,
                   log=Log(),
                   suffixes=None,
                   verbose=True,
-                  **args):
+                  **kwargs):
     ##start function with col checking##########################################################
     _start_line = "calculate LD matrix"
     _end_line = "calculating LD matrix"
@@ -84,7 +84,7 @@ def tofinemapping(sumstats,
                            n_cores=n_cores,
                            log=log,
                            load_bim=True,
-                           overwrite=overwrite,**args)
+                           overwrite=overwrite,**kwargs)
 
     ## check available snps with reference file
     matched_sumstats = _align_sumstats_with_bim(row=row,
@@ -18,7 +18,7 @@ def _calculate_prs(sumstats,
                    memory=None,
                    overwrite=False,
                    mode=None,delete=True,
-                   log=Log(),**args):
+                   log=Log(),**kwargs):
 
     #matching_alleles
     #read_bim
@@ -37,7 +37,7 @@ def _calculate_prs(sumstats,
                        n_cores=n_cores,
                        log=log,
                        load_bim=False,
-                       overwrite=overwrite,**args)
+                       overwrite=overwrite,**kwargs)
     score_file_path_list =[]
     for index, chrom in enumerate(chrlist):
         chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()