PyPI - gwaslab - Versions diffs - 3.4.43__tar.gz → 3.4.45__tar.gz - Mend

gwaslab 3.4.43 (tar.gz) → 3.4.45 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (86)

{gwaslab-3.4.43/src/gwaslab.egg-info → gwaslab-3.4.45}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.43
+Version: 3.4.45
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: <=3.10,>=3.9
+Requires-Python: <3.11,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_before_v3.4.39
@@ -17,7 +17,7 @@ Requires-Dist: numpy>=1.21.2
 Requires-Dist: matplotlib!=3.7.2,>=3.5
 Requires-Dist: seaborn>=0.12
 Requires-Dist: scipy>=1.12
-Requires-Dist: pySAM<0.20,>=0.18.1
+Requires-Dist: pySAM==0.22.1
 Requires-Dist: Biopython>=1.79
 Requires-Dist: adjustText<=0.8,>=0.7.3
 Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.43 → gwaslab-3.4.45}/README.md RENAMED Viewed

@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 ```python
@@ -62,7 +62,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.43 → gwaslab-3.4.45}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gwaslab"
-version = "3.4.43"
+version = "3.4.45"
 authors = [
   { name="Yunye", email="yunye@gwaslab.com" },
 ]
@@ -21,7 +21,7 @@ dependencies = [
     "matplotlib>=3.5,!=3.7.2",
     "seaborn>=0.12",
     "scipy>=1.12",
-    "pySAM>=0.18.1,<0.20",
+    "pySAM==0.22.1",
     "Biopython>=1.79",
     "adjustText>=0.7.3, <=0.8",
     "liftover>=1.1.13",
@@ -31,7 +31,7 @@ dependencies = [
     "h5py>=3.10.0"
 ]
-requires-python = ">=3.9,<=3.10"
+requires-python = ">=3.9,<3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Sumstats.py RENAMED Viewed

@@ -356,8 +356,10 @@ class Sumstats():
         if ref_seq is not None:
             if ref_seq_mode=="v":
                 self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
-            else:
+            elif ref_seq_mode=="s":
                 self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
+            else:
+                raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
@@ -429,7 +431,7 @@ class Sumstats():
         if ref_seq_mode=="v":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
-        else:
+        elif ref_seq_mode=="s":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
     def infer_strand(self,ref_infer,**kwargs):

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_SumstatsPair.py RENAMED Viewed

@@ -139,7 +139,7 @@ class SumstatsPair( ):
         self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
     def to_coloc(self,**kwargs):
-        self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
+        self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
     def run_coloc_susie(self,**kwargs):

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_vchange_status.py RENAMED Viewed

@@ -1,13 +1,15 @@
 import pandas as pd
+CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
 def vchange_status(status,digit,before,after):
     dic={}
     for i in range(len(before)):
         dic[before[i]]=after[i]
     if digit>1:
-        return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
     else:
-        return status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
 def copy_status(from_status,to_status, digit):
     if digit>1:

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_version.py RENAMED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.43",
-   "release_date":"20240403"
+   "version":"3.4.45",
+   "release_date":"20240509"
     }
     return dic

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_harmonize_sumstats.py RENAMED Viewed

@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     log.write("\n",end="",show_time=False,verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -389,7 +393,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     return sumstats
 #20240320 check if non-effect allele is aligned with reference genome
-def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
+def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
+    # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+    # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
     # status
     #0 /  ----->  match
     #1 /  ----->  Flipped Fixed
@@ -431,6 +438,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     max_len_nea = _nea.str.len().max()
     max_len_ea = _ea.str.len().max()
+    ########################################## mask for variants with out of range POS
+    mask_outlier = pos > records_len[chrom]
+    #########################################
     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
     # a numpy array of integers in a very fast way.
@@ -442,7 +452,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
     nea[nea == 0] = PADDING_VALUE # padding value
+    ###########################################
+    ###########################################
     # Create a mask holding True at the position of non-padding values
     mask_nea = nea != PADDING_VALUE
@@ -458,7 +470,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
     ea[ea == 0] = PADDING_VALUE # padding value
+    ###########################################
+    ###########################################
     mask_ea = ea != PADDING_VALUE
     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
@@ -503,8 +517,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     # Index the record array using the computed indices.
     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
-    output_nea = np.take(record, indices)
+    output_nea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_nea[mask_outlier] = PADDING_VALUE
+    ##################################################################
     # Check if the NEA is equal to the reference sequence at the given position
     # In a non-matrix way, this is equivalent (for one single element) to:
     # nea == record[pos-1: pos+len(nea)-1]
@@ -527,7 +544,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     indices_range = np.arange(max_len_ea)
     indices = pos + indices_range
     indices = indices + modified_indices
-    output_ea = np.take(record, indices)
+    output_ea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_ea[mask_outlier] = PADDING_VALUE
+    ##################################################################
     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
@@ -582,24 +602,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
     chrom,pos,ea,nea,status = sumstats.columns
     # First, convert the fasta records to a single numpy array of integers
-    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+    record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
     # arrays are smaller) and save memory.
     max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
-    condition = (sumstats[nea].str.len() <= max_len) * (sumstats[ea].str.len() <= max_len)
+    condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
     log.write(f"   -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
     sumstats_cond = sumstats[condition]
-    starting_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_cond[chrom].unique()])
-    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond)
+    unique_chrom_cond = sumstats_cond[chrom].unique()
+    starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+    records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
     log.write(f"   -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
     sumstats_not_cond = sumstats[~condition]
-    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_not_cond[chrom].unique()])
-    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond)
+    unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+    records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
     return sumstats[status].values
@@ -649,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
         sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
         sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
         log.write(" -Finished checking records", verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -680,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
         log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
     finished(log, verbose, _end_line)
     return sumstats
@@ -709,10 +736,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
     starting_positions = np.cumsum(records_len) - records_len
     if pos_as_dict:
         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+        records_len_dict =  {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
     record = np.concatenate(all_r)
     del all_r # free memory
-    return record, starting_positions
+    return record, starting_positions,records_len_dict
 #######################################################################################################################################

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/qc_fix_sumstats.py RENAMED Viewed

@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
 ###############################################################################################################
 # 20220721
-def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
+def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "normalize indels"
     _end_line = "normalizing indels"
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         log.write("Finished normalizing variants successfully!", verbose=verbose)
         return sumstats
     ###############################################################################################################
-    if sum(variants_to_check)>0:
+    if mode=="v":
+        if sum(variants_to_check)<100000:
+            n_cores=1
+        if n_cores==1:
+            normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
+        else:
+            pool = Pool(n_cores)
+            map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
+            df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
+            results = pool.map(map_func,df_split)
+            normalized_pd = pd.concat([i[0] for i in results])
+            changed_index = np.concatenate([i[1] for i in results])
+            del results
+            pool.close()
+            pool.join()
+            gc.collect()
+        ###############################################################################################################
+        try:
+            example_sumstats = sumstats.loc[changed_index,:].head()
+            changed_num = len(changed_index)
+            if changed_num>0:
+                if snpid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
+                elif rsid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
+                else:
+                    before_normalize_id = example_sumstats.index
+                log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+                for i in before_normalize_id.values:
+                    log.write(i,end=" ",show_time=False)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Not normalized allele:",end="", verbose=verbose)
+                for i in example_sumstats[[ea,nea]].values:
+                    log.write(i,end="",show_time=False, verbose=verbose)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+            else:
+                log.write(" -All variants are already normalized..", verbose=verbose)
+        except:
+            pass
+    ##########################################################################################################################################################
+    elif mode=="s":
         if sum(variants_to_check)<10000:
             n_cores=1
         pool = Pool(n_cores)
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         normalized_pd = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
-    ###############################################################################################################
-    before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
-    changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
-    if changed_num>0:
-        if snpid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,snpid]
-        elif rsid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,rsid]
-        else:
-            before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+        before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
+        changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
+        if changed_num>0:
+            if snpid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,snpid]
+            elif rsid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,rsid]
+            else:
+                before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+            log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+            for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
+                log.write(i,end=" ",show_time=False)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
-        for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
-            log.write(i,end=" ",show_time=False)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Not normalized allele:",end="", verbose=verbose)
-        for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
-            log.write(i,end="",show_time=False, verbose=verbose)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
-    else:
-        log.write(" -All variants are already normalized..", verbose=verbose)
-    ###################################################################################################################
+            log.write(" -Not normalized allele:",end="", verbose=verbose)
+            for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
+                log.write(i,end="",show_time=False, verbose=verbose)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
+            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+        else:
+            log.write(" -All variants are already normalized..", verbose=verbose)
+        ###################################################################################################################
     categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
     sumstats[ea]  = pd.Categorical(sumstats[ea],categories = categories)
     sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
     sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
     try:
         sumstats[pos] = sumstats[pos].astype('Int64')
     except:
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
     sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
     return sumstats
+def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
+    log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
+    log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
+    log.write(" -Processing in chunks:",end="", verbose=verbose)
+    changed_index = np.array([])
+    for part_n in range(len(insumstats)//chunk+1):
+        log.write(part_n, end=" ",show_time=False, verbose=verbose)
+        insumstats["NEA"] = insumstats["NEA"].astype("string")
+        insumstats["EA"] = insumstats["EA"].astype("string")
+        insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single  = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
+        changed_index = np.concatenate([changed_index,changed_index_single])
+        gc.collect()
+    log.write("\n",end="",show_time=False, verbose=verbose)
+    return insumstats, changed_index
+def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
+    # already normalized
+    is_same = sumstats["NEA"] == sumstats["EA"]
+    is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+    # a series to keep tracking of variants that are modified
+    changed = sumstats["NEA"] != sumstats["NEA"]
+    # right side
+    ea_len = sumstats["NEA"].str.len()
+    nea_len = sumstats["EA"].str.len()
+    max_length=max(ea_len.max(), nea_len.max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        nea_len[is_pop] = nea_len[is_pop] -1
+        ea_len[is_pop] = ea_len[is_pop] -1
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+    # left side
+    max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
+        sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+    sumstats.loc[is_normalized,status]     = vchange_status(sumstats.loc[is_normalized, status],  5,"4","0")
+    sumstats.loc[is_same,status]     = vchange_status(sumstats.loc[is_same, status],  5,"4","3")
+    changed_index = sumstats[changed].index
+    return sumstats, changed_index.values
 def normalizevariant(pos,a,b,status):
     # single record
     # https://genome.sph.umich.edu/wiki/Variant_Normalization

{gwaslab-3.4.43 → gwaslab-3.4.45/src/gwaslab.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.43
+Version: 3.4.45
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: <=3.10,>=3.9
+Requires-Python: <3.11,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_before_v3.4.39
@@ -17,7 +17,7 @@ Requires-Dist: numpy>=1.21.2
 Requires-Dist: matplotlib!=3.7.2,>=3.5
 Requires-Dist: seaborn>=0.12
 Requires-Dist: scipy>=1.12
-Requires-Dist: pySAM<0.20,>=0.18.1
+Requires-Dist: pySAM==0.22.1
 Requires-Dist: Biopython>=1.79
 Requires-Dist: adjustText<=0.8,>=0.7.3
 Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/requires.txt RENAMED Viewed

@@ -3,7 +3,7 @@ numpy>=1.21.2
 matplotlib!=3.7.2,>=3.5
 seaborn>=0.12
 scipy>=1.12
-pySAM<0.20,>=0.18.1
+pySAM==0.22.1
 Biopython>=1.79
 adjustText<=0.8,>=0.7.3
 liftover>=1.1.13

{gwaslab-3.4.43 → gwaslab-3.4.45}/LICENSE RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/LICENSE_before_v3.4.39 RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/setup.cfg RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/__init__.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_common_data.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_config.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_download.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/bd_get_hapmap3.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/cache_manager.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/formatbook.json RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/data/reference.json RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Log.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Phenotypes.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_SumstatsT.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_Sumstats_summary.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/g_meta.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_casting.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/hm_rsid_to_chrpos.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_preformat_input.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_read_ldsc.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_read_tabular.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_to_formats.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/io_to_pickle.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_irwls.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_jackknife.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_ldscore.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_parse.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_regressions.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/ldsc_sumstats.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/qc_check_datatype.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/run_script.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_calculate_ldmatrix.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_calculate_prs.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_gwascatalog.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_ldproxyfinder.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_ldsc.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_plink_filter.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_process_h5.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_process_ref.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_2samplemr.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_clumping.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_coloc.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_ex_run_susie.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_calculate_gc.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_calculate_power.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_convert_h2.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_correct_winnerscurse.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_fill_data.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_filter_value.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_get_density.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/util_in_get_sig.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_annotate_plot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_quickfix.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_reposition_text.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_aux_save_figure.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_compare_af.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_compare_effect.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_forestplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_miamiplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_miamiplot2.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_mqqplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_qqplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_regionalplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_rg_heatmap.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_stackedregional.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab/viz_plot_trumpetplot.py RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{gwaslab-3.4.43 → gwaslab-3.4.45}/src/gwaslab.egg-info/top_level.txt RENAMED Viewed

File without changes

gwaslab 3.4.43__tar.gz → 3.4.45__tar.gz

Potentially problematic release.

gwaslab 3.4.43__tar.gz → 3.4.45__tar.gz