gwaslab 3.4.42__py3-none-any.whl → 3.4.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


gwaslab/g_Sumstats.py CHANGED
@@ -356,8 +356,10 @@ class Sumstats():
         if ref_seq is not None:
             if ref_seq_mode=="v":
                 self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
-            else:
+            elif ref_seq_mode=="s":
                 self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
+            else:
+                raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
 
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
 
@@ -429,7 +431,7 @@ class Sumstats():
         if ref_seq_mode=="v":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
-        else:
+        elif ref_seq_mode=="s":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
     def infer_strand(self,ref_infer,**kwargs):
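
The change above tightens ref_seq_mode handling: "v" routes to the vectorized checkref(), "s" to the sequential oldcheckref(), and anything else now raises a ValueError instead of silently falling back to the sequential path. A minimal sketch of how a caller might select the mode, assuming the usual gl.Sumstats/harmonize entry points accept ref_seq_mode as this diff suggests (file names are illustrative):

import gwaslab as gl

# Hypothetical input file; any gwaslab-loadable sumstats would do.
mysumstats = gl.Sumstats("mysumstats.tsv.gz", fmt="gwaslab")

# "v" = vectorized (faster), "s" = sequential (slower); anything else now raises ValueError.
mysumstats.harmonize(ref_seq="hg19.fa", ref_seq_mode="v")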
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-        "version":"3.4.42",
-        "release_date":"20240328"
+        "version":"3.4.44",
+        "release_date":"20240424"
     }
     return dic
 
@@ -24,6 +24,7 @@ from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
+from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
 
 #rsidtochrpos
 #checkref
@@ -388,7 +389,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     return sumstats
 
 #20240320 check if non-effect allele is aligned with reference genome
-def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
+def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
+    # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+    # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
+
     # status
     #0 / -----> match
     #1 / -----> Flipped Fixed
@@ -430,6 +434,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     max_len_nea = _nea.str.len().max()
     max_len_ea = _ea.str.len().max()
 
+    ########################################## mask for variants with out of range POS
+    mask_outlier = pos > records_len[chrom]
+    #########################################
 
     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
     # a numpy array of integers in a very fast way.
@@ -441,7 +448,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
     nea[nea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     # Create a mask holding True at the position of non-padding values
     mask_nea = nea != PADDING_VALUE
 
@@ -457,7 +466,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
     ea[ea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     mask_ea = ea != PADDING_VALUE
 
     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
@@ -502,8 +513,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     # Index the record array using the computed indices.
     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
-    output_nea = np.take(record, indices)
-
+    output_nea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_nea[mask_outlier] = PADDING_VALUE
+    ##################################################################
+
     # Check if the NEA is equal to the reference sequence at the given position
     # In a non-matrix way, this is equivalent (for one single element) to:
     # nea == record[pos-1: pos+len(nea)-1]
@@ -526,7 +540,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     indices_range = np.arange(max_len_ea)
     indices = pos + indices_range
     indices = indices + modified_indices
-    output_ea = np.take(record, indices)
+    output_ea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_ea[mask_outlier] = PADDING_VALUE
+    ##################################################################
 
     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
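
The mode="clip" plus mask_outlier pattern above guards against variants whose POS lies beyond the end of their chromosome's record: clipping keeps the gather in bounds, and overwriting the clipped rows with the padding value guarantees they can never compare equal to a real allele. A minimal, self-contained sketch of the same pattern (illustrative values only):

import numpy as np

PADDING_VALUE = 0
record = np.array([65, 67, 71, 84], dtype=np.uint8)  # a tiny stand-in for the reference array
indices = np.array([[1, 2], [3, 9]])                 # the second row reads past the end
mask_outlier = np.array([False, True])               # rows whose POS is out of range

out = np.take(record, indices, mode="clip")          # out-of-range indices clamp to the last element
out[mask_outlier] = PADDING_VALUE                    # clipped rows can never match a real allele
print(out)  # [[67 71] [ 0  0]]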
@@ -581,24 +598,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
     chrom,pos,ea,nea,status = sumstats.columns
 
     # First, convert the fasta records to a single numpy array of integers
-    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+    record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
 
     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
     # arrays are smaller) and save memory.
     max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
-    condition = (sumstats[nea].str.len() <= max_len) * (sumstats[ea].str.len() <= max_len)
+    condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
 
     log.write(f" -Checking records for ( len(NEA) <= 71,384 and len(EA) <= 71,384 )", verbose=verbose)
     sumstats_cond = sumstats[condition]
-    starting_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_cond[chrom].unique()])
-    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond)
+    unique_chrom_cond = sumstats_cond[chrom].unique()
+    starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+    records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
 
     log.write(f" -Checking records for ( len(NEA) > 71,384 or len(EA) > 71,384 )", verbose=verbose)
     sumstats_not_cond = sumstats[~condition]
-    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_not_cond[chrom].unique()])
-    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond)
+    unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+    records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
 
     return sumstats[status].values
 
@@ -708,10 +729,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
     starting_positions = np.cumsum(records_len) - records_len
     if pos_as_dict:
         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+        records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
     record = np.concatenate(all_r)
     del all_r # free memory
 
-    return record, starting_positions
+    return record, starting_positions, records_len_dict
 
 #######################################################################################################################################
 
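The starting offsets come from an exclusive prefix sum: once the per-chromosome records are concatenated into one flat array, each chromosome starts at the cumulative length of everything before it. A tiny sketch of the arithmetic used above:

import numpy as np

records_len = np.array([5, 3, 4])                         # per-chromosome record lengths
starting_positions = np.cumsum(records_len) - records_len
print(starting_positions)  # [0 5 8] -> offset of each record in the concatenated array
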
@@ -912,6 +934,56 @@ def check_strand_status(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
             return status_pre+"5"+status_end
     return status_pre+"8"+status_end
 
+def check_strand_status_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None and not trust_cache:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+        start = pos - 1
+        end = pos
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key = f"{chrom}:{pos}:{ref}:{alt}"
+        if cache_key in cache:
+            in_cache += 1
+            record = cache[cache_key]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if (record<0.5) and (eaf<0.5):
+                    new_status = status_pre+"1"+status_end
+                elif (record>0.5) and (eaf>0.5):
+                    new_status = status_pre+"1"+status_end
+                else:
+                    new_status = status_pre+"5"+status_end
+        else:
+            if not trust_cache:
+                # If we don't trust the cache (it may be incomplete), perform the check by reading from the VCF file
+                new_status = check_strand_status(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr_dict=get_number_to_chr(),daf_tolerance=0.2):
     ### input : unknown indel, both on genome (xx1[45]x)
@@ -939,6 +1011,65 @@ def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
 
     return status_pre+"8"+status_end
 
+
+def check_unkonwn_indel_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+        start = pos - 1
+        end = pos
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key_ref_alt = f"{chrom}:{pos}:{ref}:{alt}"
+        cache_key_alt_ref = f"{chrom}:{pos}:{alt}:{ref}"
+
+        if cache_key_ref_alt in cache:
+            in_cache += 1
+            record = cache[cache_key_ref_alt]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - eaf)<daf_tolerance:
+                    new_status = status_pre+"3"+status_end
+
+        elif cache_key_alt_ref in cache:
+            in_cache += 1
+            record = cache[cache_key_alt_ref]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - (1 - eaf))<daf_tolerance:
+                    new_status = status_pre+"6"+status_end
+
+        else:
+            if not trust_cache:
+                # If we don't trust the cache (it may be incomplete), perform the check by reading from the VCF file
+                new_status = check_unkonwn_indel(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict, daf_tolerance)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def get_reverse_complementary_allele(a):
     dic = str.maketrans({
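
Both cache helpers above share one lookup scheme: keys are "CHR:POS:REF:ALT" strings and values are reference ALT allele frequencies, with None marking variants known to be absent from the VCF. A minimal sketch of that contract (the entries are made up; real caches are built by CacheManager from the reference VCF):

cache = {"1:12345:A:T": 0.12, "1:22222:AT:A": None}

key = "1:12345:A:T"
if key in cache and cache[key] is not None:
    print("reference ALT frequency:", cache[key])  # 0.12 -> usable for strand/indel inference
else:
    print("no usable record -> status digit stays 8")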
@@ -963,16 +1094,40 @@ def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="N
     status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
     return status_part
 
+def check_strand_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_strand_status_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,trust_cache,log,verbose)
+    return status_part
+
 def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
     vcf_reader = VariantFile(ref_infer)
     status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
     return status_part
 
+def check_indel_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_unkonwn_indel_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,daf_tolerance,trust_cache,log,verbose)
+    return status_part
+
 ##################################################################################################################################################
 
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                         chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
-                        chr_dict=None,verbose=True,log=Log()):
+                        chr_dict=None,cache_options={},verbose=True,log=Log()):
+    '''
+    Args:
+        cache_options : A dictionary with the following keys:
+            - cache_manager: CacheManager object or None. If either cache_loader or cache_process is not None, or use_cache is True, a CacheManager object will be created automatically.
+            - trust_cache: bool (optional, default: True). Whether to trust the cache completely. Trusting the cache means that any key not found in the cache is treated as a missing value even in the VCF file.
+            - cache_loader: Object with a get_cache() method, or None.
+            - cache_process: Object with an apply_fn() method, or None.
+            - use_cache: bool (optional, default: False). If any of cache_manager, cache_loader or cache_process is not None, this will be set to True automatically.
+              If set to True while cache_manager, cache_loader and cache_process are all None, the cache will be loaded (or built) on the spot.
+
+    A cache_loader or cache_process object is useful for passing in a custom object which already has the cache loaded. This can help when the cache is loaded in the background in another thread/process while other operations are performed.
+    cache_manager is a CacheManager object that exposes the API for interacting with the cache.
+    '''
+
     ##start function with col checking##########################################################
     _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
     _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
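
Based only on the docstring above, a minimal sketch of passing cache_options through parallelinferstrand(); the file path and the INFO field name are illustrative:

cache_options = {
    "use_cache": True,    # load (or build) the cache on the spot if no manager/loader/process is given
    "trust_cache": True,  # keys missing from the cache are treated as missing from the VCF as well
}
sumstats = parallelinferstrand(sumstats, ref_infer="ref.vcf.gz", ref_alt_freq="AF",
                               cache_options=cache_options)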
@@ -995,6 +1150,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
 
+    # Setup cache variables
+    cache_manager = cache_options.get("cache_manager", None)
+    if cache_manager is not None:
+        assert isinstance(cache_manager, CacheManager), "cache_manager must be a CacheManager object"
+    trust_cache = cache_options.get("trust_cache", True)
+    cache_loader = cache_options.get("cache_loader", None)
+    cache_process = cache_options.get("cache_process", None)
+    use_cache = any(c is not None for c in [cache_manager, cache_loader, cache_process]) or cache_options.get('use_cache', False)
+    _n_cores = n_cores # backup n_cores
+
     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
 
     if "p" in mode:
@@ -1022,16 +1187,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
     #########################################################################################
     if sum(unknow_palindromic_to_check)>0:
         if sum(unknow_palindromic_to_check)<10000:
-            n_cores=1
-
-        #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
-        df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
-        pool = Pool(n_cores)
-        map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
-        status_inferred = pd.concat(pool.map(map_func,df_split))
-        sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
-        pool.close()
-        pool.join()
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting strand inference for palindromic SNPs...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for strand inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_strand_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred
+        else:
+            #df_split = np.array_split(df_to_check, n_cores)
+            df_split = _df_split(df_to_check, n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished strand inference.",verbose=verbose)
     else:
         log.warning("No palindromic variants available for checking.")
     #########################################################################################
@@ -1082,15 +1261,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     if sum(unknow_indel)>0:
         if sum(unknow_indel)<10000:
-            n_cores=1
-        #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
-        df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
-        pool = Pool(n_cores)
-        map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
-        status_inferred = pd.concat(pool.map(map_func,df_split))
-        sumstats.loc[unknow_indel,status] = status_inferred.values
-        pool.close()
-        pool.join()
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting indistinguishable indel inference...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_indel,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for indel inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_indel_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, daf_tolerance=daf_tolerance, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_indel,status] = status_inferred
+        else:
+            #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_indel,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished indistinguishable indel inference.",verbose=verbose)
 
     #########################################################################################
 
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
 ###############################################################################################################
 # 20220721
 
-def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
+def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "normalize indels"
     _end_line = "normalizing indels"
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
     log.write("Finished normalizing variants successfully!", verbose=verbose)
     return sumstats
     ###############################################################################################################
-    if sum(variants_to_check)>0:
+    if mode=="v":
+        if sum(variants_to_check)<100000:
+            n_cores=1
+        if n_cores==1:
+            normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
+        else:
+            pool = Pool(n_cores)
+            map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
+            df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
+            results = pool.map(map_func,df_split)
+            normalized_pd = pd.concat([i[0] for i in results])
+            changed_index = np.concatenate([i[1] for i in results])
+            del results
+            pool.close()
+            pool.join()
+            gc.collect()
+        ###############################################################################################################
+        try:
+            example_sumstats = sumstats.loc[changed_index,:].head()
+            changed_num = len(changed_index)
+            if changed_num>0:
+                if snpid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
+                elif rsid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
+                else:
+                    before_normalize_id = example_sumstats.index
+
+                log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+                for i in before_normalize_id.values:
+                    log.write(i,end=" ",show_time=False)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+
+                log.write(" -Not normalized allele:",end="", verbose=verbose)
+                for i in example_sumstats[[ea,nea]].values:
+                    log.write(i,end="",show_time=False, verbose=verbose)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+            else:
+                log.write(" -All variants are already normalized..", verbose=verbose)
+        except:
+            pass
+
+    ##########################################################################################################################################################
+    elif mode=="s":
         if sum(variants_to_check)<10000:
             n_cores=1
         pool = Pool(n_cores)
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         normalized_pd = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
-    ###############################################################################################################
-
-    before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
-    changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
-    if changed_num>0:
-        if snpid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,snpid]
-        elif rsid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,rsid]
-        else:
-            before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+
+        before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
+        changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
+        if changed_num>0:
+            if snpid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,snpid]
+            elif rsid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,rsid]
+            else:
+                before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+
+            log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+            for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
+                log.write(i,end=" ",show_time=False)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
 
-        log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
-        for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
-            log.write(i,end=" ",show_time=False)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-
-        log.write(" -Not normalized allele:",end="", verbose=verbose)
-        for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
-            log.write(i,end="",show_time=False, verbose=verbose)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
-    else:
-        log.write(" -All variants are already normalized..", verbose=verbose)
-    ###################################################################################################################
+            log.write(" -Not normalized allele:",end="", verbose=verbose)
+            for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
+                log.write(i,end="",show_time=False, verbose=verbose)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
+            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+        else:
+            log.write(" -All variants are already normalized..", verbose=verbose)
+        ###################################################################################################################
+
     categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
     sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
     sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
     sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
+
     try:
         sumstats[pos] = sumstats[pos].astype('Int64')
     except:
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
     sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
     return sumstats
 
+def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
+    log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
+    log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
+    log.write(" -Processing in chunks:",end="", verbose=verbose)
+    changed_index = np.array([])
+    for part_n in range(len(insumstats)//chunk+1):
+        log.write(part_n, end=" ",show_time=False, verbose=verbose)
+        insumstats["NEA"] = insumstats["NEA"].astype("string")
+        insumstats["EA"] = insumstats["EA"].astype("string")
+        insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
+        changed_index = np.concatenate([changed_index,changed_index_single])
+        gc.collect()
+    log.write("\n",end="",show_time=False, verbose=verbose)
+    return insumstats, changed_index
+
+def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
+    # already normalized
+
+    is_same = sumstats["NEA"] == sumstats["EA"]
+    is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+
+    # a series to keep track of variants that are modified
+    changed = sumstats["NEA"] != sumstats["NEA"]
+
+    # right side
+    ea_len = sumstats["NEA"].str.len()
+    nea_len = sumstats["EA"].str.len()
+    max_length=max(ea_len.max(), nea_len.max())
+
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        nea_len[is_pop] = nea_len[is_pop] -1
+        ea_len[is_pop] = ea_len[is_pop] -1
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    # left side
+    max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
+        sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
+    sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
+    changed_index = sumstats[changed].index
+    return sumstats, changed_index.values
+
 def normalizevariant(pos,a,b,status):
     # single record
     # https://genome.sph.umich.edu/wiki/Variant_Normalization
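
A worked example of the trimming rules in normalizae_chunk(), reduced to plain strings (this sketch mirrors the per-row logic, not the vectorized DataFrame code path):

pos, nea, ea = 100, "GAC", "GAT"
while len(nea) > 1 and len(ea) > 1 and nea[-1] == ea[-1]:  # right side: drop shared trailing base
    nea, ea = nea[:-1], ea[:-1]
while len(nea) > 1 and len(ea) > 1 and nea[0] == ea[0]:    # left side: drop shared leading base, POS advances
    nea, ea, pos = nea[1:], ea[1:], pos + 1
print(pos, nea, ea)  # 102 C T -> normalized to a plain SNP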
@@ -1611,12 +1717,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 
 ###############################################################################################################
 def _df_split(dataframe, n):
-    chunks = []
-    chunk_size = int(dataframe.shape[0] // n)+1
-
-    for index in range(0, dataframe.shape[0], chunk_size):
-        chunks.append(
-            dataframe.iloc[index:index + chunk_size]
-        )
-
-    return chunks
+    k, m = divmod(len(dataframe), n)
+    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
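
The rewritten _df_split() always returns exactly n chunks whose sizes differ by at most one, whereas the old rounding-up version could yield fewer chunks than requested (e.g., 3 chunks for n=4 on an 8-row frame) or a tiny trailing chunk. A quick check of the new behavior:

import pandas as pd

def _df_split(dataframe, n):
    k, m = divmod(len(dataframe), n)
    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"x": range(10)})
print([len(c) for c in _df_split(df, 4)])  # [3, 3, 2, 2]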