PyPI - gwaslab - Versions diffs - 3.4.43__py3-none-any.whl → 3.4.45__py3-none-any.whl - Mend

gwaslab 3.4.43__py3-none-any.whl → 3.4.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (12)

gwaslab/g_Sumstats.py +4 -2
gwaslab/g_SumstatsPair.py +1 -1
gwaslab/g_vchange_status.py +4 -2
gwaslab/g_version.py +2 -2
gwaslab/hm_harmonize_sumstats.py +45 -17
gwaslab/qc_fix_sumstats.py +132 -26
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/METADATA +5 -5
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/RECORD +12 -12
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE +0 -0
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE_before_v3.4.39 +0 -0
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/WHEEL +0 -0
{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/top_level.txt +0 -0

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -356,8 +356,10 @@ class Sumstats():
         if ref_seq is not None:
             if ref_seq_mode=="v":
                 self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
-            else:
+            elif ref_seq_mode=="s":
                 self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
+            else:
+                raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
@@ -429,7 +431,7 @@ class Sumstats():
         if ref_seq_mode=="v":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
-        else:
+        elif ref_seq_mode=="s":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
     def infer_strand(self,ref_infer,**kwargs):

gwaslab/g_SumstatsPair.py CHANGED Viewed

@@ -139,7 +139,7 @@ class SumstatsPair( ):
         self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
     def to_coloc(self,**kwargs):
-        self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
+        self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
     def run_coloc_susie(self,**kwargs):

gwaslab/g_vchange_status.py CHANGED Viewed

@@ -1,13 +1,15 @@
 import pandas as pd
+CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
 def vchange_status(status,digit,before,after):
     dic={}
     for i in range(len(before)):
         dic[before[i]]=after[i]
     if digit>1:
-        return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
     else:
-        return status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
 def copy_status(from_status,to_status, digit):
     if digit>1:

gwaslab/g_version.py CHANGED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.43",
-   "release_date":"20240403"
+   "version":"3.4.45",
+   "release_date":"20240509"
     }
     return dic

gwaslab/hm_harmonize_sumstats.py CHANGED Viewed

@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     log.write("\n",end="",show_time=False,verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -389,7 +393,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     return sumstats
 #20240320 check if non-effect allele is aligned with reference genome
-def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
+def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
+    # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+    # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
     # status
     #0 /  ----->  match
     #1 /  ----->  Flipped Fixed
@@ -431,6 +438,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     max_len_nea = _nea.str.len().max()
     max_len_ea = _ea.str.len().max()
+    ########################################## mask for variants with out of range POS
+    mask_outlier = pos > records_len[chrom]
+    #########################################
     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
     # a numpy array of integers in a very fast way.
@@ -442,7 +452,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
     nea[nea == 0] = PADDING_VALUE # padding value
+    ###########################################
+    ###########################################
     # Create a mask holding True at the position of non-padding values
     mask_nea = nea != PADDING_VALUE
@@ -458,7 +470,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
     ea[ea == 0] = PADDING_VALUE # padding value
+    ###########################################
+    ###########################################
     mask_ea = ea != PADDING_VALUE
     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
@@ -503,8 +517,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     # Index the record array using the computed indices.
     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
-    output_nea = np.take(record, indices)
+    output_nea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_nea[mask_outlier] = PADDING_VALUE
+    ##################################################################
     # Check if the NEA is equal to the reference sequence at the given position
     # In a non-matrix way, this is equivalent (for one single element) to:
     # nea == record[pos-1: pos+len(nea)-1]
@@ -527,7 +544,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     indices_range = np.arange(max_len_ea)
     indices = pos + indices_range
     indices = indices + modified_indices
-    output_ea = np.take(record, indices)
+    output_ea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_ea[mask_outlier] = PADDING_VALUE
+    ##################################################################
     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
@@ -582,24 +602,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
     chrom,pos,ea,nea,status = sumstats.columns
     # First, convert the fasta records to a single numpy array of integers
-    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+    record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
     # arrays are smaller) and save memory.
     max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
-    condition = (sumstats[nea].str.len() <= max_len) * (sumstats[ea].str.len() <= max_len)
+    condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
     log.write(f"   -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
     sumstats_cond = sumstats[condition]
-    starting_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_cond[chrom].unique()])
-    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond)
+    unique_chrom_cond = sumstats_cond[chrom].unique()
+    starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+    records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
     log.write(f"   -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
     sumstats_not_cond = sumstats[~condition]
-    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_not_cond[chrom].unique()])
-    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond)
+    unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+    records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
     return sumstats[status].values
@@ -649,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
         sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
         sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
         log.write(" -Finished checking records", verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -680,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
         log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
     finished(log, verbose, _end_line)
     return sumstats
@@ -709,10 +736,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
     starting_positions = np.cumsum(records_len) - records_len
     if pos_as_dict:
         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+        records_len_dict =  {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
     record = np.concatenate(all_r)
     del all_r # free memory
-    return record, starting_positions
+    return record, starting_positions,records_len_dict
 #######################################################################################################################################

gwaslab/qc_fix_sumstats.py CHANGED Viewed

@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
 ###############################################################################################################
 # 20220721
-def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
+def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "normalize indels"
     _end_line = "normalizing indels"
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         log.write("Finished normalizing variants successfully!", verbose=verbose)
         return sumstats
     ###############################################################################################################
-    if sum(variants_to_check)>0:
+    if mode=="v":
+        if sum(variants_to_check)<100000:
+            n_cores=1
+        if n_cores==1:
+            normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
+        else:
+            pool = Pool(n_cores)
+            map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
+            df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
+            results = pool.map(map_func,df_split)
+            normalized_pd = pd.concat([i[0] for i in results])
+            changed_index = np.concatenate([i[1] for i in results])
+            del results
+            pool.close()
+            pool.join()
+            gc.collect()
+        ###############################################################################################################
+        try:
+            example_sumstats = sumstats.loc[changed_index,:].head()
+            changed_num = len(changed_index)
+            if changed_num>0:
+                if snpid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
+                elif rsid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
+                else:
+                    before_normalize_id = example_sumstats.index
+                log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+                for i in before_normalize_id.values:
+                    log.write(i,end=" ",show_time=False)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Not normalized allele:",end="", verbose=verbose)
+                for i in example_sumstats[[ea,nea]].values:
+                    log.write(i,end="",show_time=False, verbose=verbose)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+            else:
+                log.write(" -All variants are already normalized..", verbose=verbose)
+        except:
+            pass
+    ##########################################################################################################################################################
+    elif mode=="s":
         if sum(variants_to_check)<10000:
             n_cores=1
         pool = Pool(n_cores)
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         normalized_pd = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
-    ###############################################################################################################
-    before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
-    changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
-    if changed_num>0:
-        if snpid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,snpid]
-        elif rsid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,rsid]
-        else:
-            before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+        before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
+        changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
+        if changed_num>0:
+            if snpid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,snpid]
+            elif rsid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,rsid]
+            else:
+                before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+            log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+            for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
+                log.write(i,end=" ",show_time=False)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
-        for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
-            log.write(i,end=" ",show_time=False)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Not normalized allele:",end="", verbose=verbose)
-        for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
-            log.write(i,end="",show_time=False, verbose=verbose)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
-    else:
-        log.write(" -All variants are already normalized..", verbose=verbose)
-    ###################################################################################################################
+            log.write(" -Not normalized allele:",end="", verbose=verbose)
+            for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
+                log.write(i,end="",show_time=False, verbose=verbose)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
+            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+        else:
+            log.write(" -All variants are already normalized..", verbose=verbose)
+        ###################################################################################################################
     categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
     sumstats[ea]  = pd.Categorical(sumstats[ea],categories = categories)
     sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
     sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
     try:
         sumstats[pos] = sumstats[pos].astype('Int64')
     except:
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
     sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
     return sumstats
+def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
+    log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
+    log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
+    log.write(" -Processing in chunks:",end="", verbose=verbose)
+    changed_index = np.array([])
+    for part_n in range(len(insumstats)//chunk+1):
+        log.write(part_n, end=" ",show_time=False, verbose=verbose)
+        insumstats["NEA"] = insumstats["NEA"].astype("string")
+        insumstats["EA"] = insumstats["EA"].astype("string")
+        insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single  = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
+        changed_index = np.concatenate([changed_index,changed_index_single])
+        gc.collect()
+    log.write("\n",end="",show_time=False, verbose=verbose)
+    return insumstats, changed_index
+def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
+    # already normalized
+    is_same = sumstats["NEA"] == sumstats["EA"]
+    is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+    # a series to keep tracking of variants that are modified
+    changed = sumstats["NEA"] != sumstats["NEA"]
+    # right side
+    ea_len = sumstats["NEA"].str.len()
+    nea_len = sumstats["EA"].str.len()
+    max_length=max(ea_len.max(), nea_len.max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        nea_len[is_pop] = nea_len[is_pop] -1
+        ea_len[is_pop] = ea_len[is_pop] -1
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+    # left side
+    max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
+        sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+    sumstats.loc[is_normalized,status]     = vchange_status(sumstats.loc[is_normalized, status],  5,"4","0")
+    sumstats.loc[is_same,status]     = vchange_status(sumstats.loc[is_same, status],  5,"4","3")
+    changed_index = sumstats[changed].index
+    return sumstats, changed_index.values
 def normalizevariant(pos,a,b,status):
     # single record
     # https://genome.sph.umich.edu/wiki/Variant_Normalization

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.43
+Version: 3.4.45
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: <=3.10,>=3.9
+Requires-Python: <3.11,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_before_v3.4.39
@@ -17,7 +17,7 @@ Requires-Dist: numpy >=1.21.2
 Requires-Dist: matplotlib !=3.7.2,>=3.5
 Requires-Dist: seaborn >=0.12
 Requires-Dist: scipy >=1.12
-Requires-Dist: pySAM <0.20,>=0.18.1
+Requires-Dist: pySAM ==0.22.1
 Requires-Dist: Biopython >=1.79
 Requires-Dist: adjustText <=0.8,>=0.7.3
 Requires-Dist: liftover >=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.41
+pip install gwaslab==3.4.43
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/RECORD RENAMED Viewed

@@ -6,15 +6,15 @@ gwaslab/bd_get_hapmap3.py,sha256=asNjQYeGfQi8u3jnfenRvDdKMs5ptql5wpcUzqMlwUI,393
 gwaslab/cache_manager.py,sha256=HOTnSkCOyGEPLRl90WT8D_6pAdI8d8AzenMIDGuCeWc,28113
 gwaslab/g_Log.py,sha256=C3Zv-_6c3C9ms8bgQ-ytplz22sjk7euqXYkWr9zNeAs,1573
 gwaslab/g_Phenotypes.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-gwaslab/g_Sumstats.py,sha256=GS0YUdvNYlwiR-mu6VJIv_JRqgBpHmTq9123XX5kiMI,35132
-gwaslab/g_SumstatsPair.py,sha256=tbgT-_1CqDEw22s4LbVJWF3ChELxz0gy1DEBzLc9ODU,8833
+gwaslab/g_Sumstats.py,sha256=NOEQd00guGch_GIt5bHv1wcrAvETfChqzmtgm-nIx_I,35298
+gwaslab/g_SumstatsPair.py,sha256=20snPb4SlI6ftMGVjgxAuyxsxYRQF-GzzlBSnoB-3Lo,8851
 gwaslab/g_SumstatsT.py,sha256=u_DighLMnMxwTLnqm-B58pA0G6WXRj6pudPyKMVKjSU,2133
 gwaslab/g_Sumstats_summary.py,sha256=FECvvFXJVKaCX5dggBvvk9YvJ6AbdbcLfjltysX7wEE,6380
 gwaslab/g_meta.py,sha256=htWlgURWclm9R6UqFcX1a93WN27xny7lGUeyJZOtszQ,2583
-gwaslab/g_vchange_status.py,sha256=eX0jdIb6Spa07ZdpWNqUWqdVBWS0fuH2yrt4PDi3Res,1746
-gwaslab/g_version.py,sha256=79WGi9pB-TL4T-lRgKtkq1p5WXZOYfBG5KdKplTJxfs,1818
+gwaslab/g_vchange_status.py,sha256=jLoVzMJFhB5k_cJKzHuBNc2HZGBWydAunCNa0n_d54g,1923
+gwaslab/g_version.py,sha256=49_gR8lEQ_jgmfO9XJszEzuzDIESj5dHj6gta3Ilkmw,1818
 gwaslab/hm_casting.py,sha256=FqP4EQl83Q2OKLw004OgLIvUH795TVCGwziLk5jsHqY,11368
-gwaslab/hm_harmonize_sumstats.py,sha256=Lu3UkNK6S9imwOgjK1ZBZTu2gDSFEDjBbgSwSOGfzcI,76705
+gwaslab/hm_harmonize_sumstats.py,sha256=ympk2MZkbb0MnZ1n2ajkV36L8EAm7nBEaYhjqjI38tU,78548
 gwaslab/hm_rsid_to_chrpos.py,sha256=ODWREO0jPN0RAfNzL5fRzSRANfhiksOvUVPuEsFZQqA,6552
 gwaslab/io_preformat_input.py,sha256=w62JLAr16Ru0EgUtBCEV2eXRO89OqhidQxwf2IPAM38,20014
 gwaslab/io_read_ldsc.py,sha256=8S9n4imgl4d0WPms_GYld-6uUM5z7iWGiCA-M814kzY,12123
@@ -28,7 +28,7 @@ gwaslab/ldsc_parse.py,sha256=MBnfgcWlV4oHp9MoDRh1mpilaHhAR15Af77hMFn4-5k,10564
 gwaslab/ldsc_regressions.py,sha256=yzbGjgNV7u-SWXNPsh9S8y9mK97Bim_Nmad9G9V18ZU,30078
 gwaslab/ldsc_sumstats.py,sha256=O0olsDxKlh1MJ1gAuEN1t40rxhajOEwOQ20ak7xoDrI,26245
 gwaslab/qc_check_datatype.py,sha256=kW68uk4dTLOU2b1dHoVat6n0loundDysAjIqxsXW28Q,3379
-gwaslab/qc_fix_sumstats.py,sha256=YtuADrWFhT1kdRp9CmhWF9IQkkXwN8SLnmbF9DIIZ-Y,87231
+gwaslab/qc_fix_sumstats.py,sha256=cpJibJ_77p4cg39R4zRunhOK2deIK4PfQA9wmYZgyqk,92745
 gwaslab/run_script.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 gwaslab/util_ex_calculate_ldmatrix.py,sha256=LpE__LoYRHLgVKlCHo6lYWlz9LEUVUDqYPEAP-Svbm0,14598
 gwaslab/util_ex_calculate_prs.py,sha256=5l1eiZs8YwIpEgp7i3IurP8n5KwQM5awbG9fWSm4iT4,9053
@@ -73,9 +73,9 @@ gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz,sha256=qD9RsC5S2h6l-OdpW
 gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz,sha256=Y8ZT2FIAhbhlgCJdE9qQVAiwnV_fcsPt72usBa7RSBM,10225828
 gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz,sha256=R7IkssKu0L4WwkU9SrS84xCMdrkkKL0gnTNO_OKbG0Y,219
 gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz,sha256=76CIU0pibDJ72Y6UY-TbIKE9gEPwTELAaIbCXyjm80Q,470
-gwaslab-3.4.43.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-gwaslab-3.4.43.dist-info/LICENSE_before_v3.4.39,sha256=GhLOU_1UDEKeOacYhsRN_m9u-eIuVTazSndZPeNcTZA,1066
-gwaslab-3.4.43.dist-info/METADATA,sha256=bziEH7fBqmzBIWDEZQUaa9w_DinQxI2SbjaatoN-jYw,7764
-gwaslab-3.4.43.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-gwaslab-3.4.43.dist-info/top_level.txt,sha256=PyY6hWtrALpv2MAN3kjkIAzJNmmBTH5a2risz9KwH08,8
-gwaslab-3.4.43.dist-info/RECORD,,
+gwaslab-3.4.45.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+gwaslab-3.4.45.dist-info/LICENSE_before_v3.4.39,sha256=GhLOU_1UDEKeOacYhsRN_m9u-eIuVTazSndZPeNcTZA,1066
+gwaslab-3.4.45.dist-info/METADATA,sha256=5FN5dbVypNPET635Eooi01_1NDFD1dNr1T9Jv0JXmLc,7757
+gwaslab-3.4.45.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+gwaslab-3.4.45.dist-info/top_level.txt,sha256=PyY6hWtrALpv2MAN3kjkIAzJNmmBTH5a2risz9KwH08,8
+gwaslab-3.4.45.dist-info/RECORD,,

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE RENAMED Viewed

File without changes

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/LICENSE_before_v3.4.39 RENAMED Viewed

File without changes

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/WHEEL RENAMED Viewed

File without changes

{gwaslab-3.4.43.dist-info → gwaslab-3.4.45.dist-info}/top_level.txt RENAMED Viewed

File without changes

gwaslab 3.4.43__py3-none-any.whl → 3.4.45__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.43__py3-none-any.whl → 3.4.45__py3-none-any.whl