gwaslab 3.4.42__py3-none-any.whl → 3.4.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


gwaslab/g_Sumstats.py CHANGED
@@ -356,8 +356,10 @@ class Sumstats():
         if ref_seq is not None:
             if ref_seq_mode=="v":
                 self.data = checkref(self.data,ref_seq,log=self.log,**checkref_args)
-            else:
+            elif ref_seq_mode=="s":
                 self.data = oldcheckref(self.data,ref_seq,log=self.log,**checkref_args)
+            else:
+                raise ValueError("ref_seq_mode should be 'v' (vectorized, faster) or 's' (sequential, slower)")
 
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
 
@@ -429,7 +431,7 @@ class Sumstats():
         if ref_seq_mode=="v":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = checkref(self.data,ref_seq,log=self.log,**kwargs)
-        else:
+        elif ref_seq_mode=="s":
             self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
             self.data = oldcheckref(self.data,ref_seq,log=self.log,**kwargs)
     def infer_strand(self,ref_infer,**kwargs):
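
The change above tightens ref_seq_mode handling: "v" routes to the vectorized checkref(), "s" to the sequential oldcheckref(), and anything else now raises a ValueError instead of silently falling back to the sequential path. A minimal sketch of how a caller might select the mode, assuming the usual gl.Sumstats/harmonize entry points accept ref_seq_mode as this diff suggests (file names are illustrative):

import gwaslab as gl

# Hypothetical input file; any gwaslab-loadable sumstats would do.
mysumstats = gl.Sumstats("mysumstats.tsv.gz", fmt="gwaslab")

# "v" = vectorized (faster), "s" = sequential (slower); anything else now raises ValueError.
mysumstats.harmonize(ref_seq="hg19.fa", ref_seq_mode="v")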
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-        "version":"3.4.42",
-        "release_date":"20240328"
+        "version":"3.4.44",
+        "release_date":"20240424"
     }
     return dic
 
@@ -24,6 +24,7 @@ from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
+from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
 
 #rsidtochrpos
 #checkref
@@ -388,7 +389,10 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     return sumstats
 
 #20240320 check if non-effect allele is aligned with reference genome
-def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array):
+def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
+    # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+    # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
+
     # status
     #0 / -----> match
     #1 / -----> Flipped Fixed
@@ -430,6 +434,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     max_len_nea = _nea.str.len().max()
     max_len_ea = _ea.str.len().max()
 
+    ########################################## mask for variants with out of range POS
+    mask_outlier = pos > records_len[chrom]
+    #########################################
 
     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
     # a numpy array of integers in a very fast way.
@@ -441,7 +448,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
     nea[nea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     # Create a mask holding True at the position of non-padding values
     mask_nea = nea != PADDING_VALUE
 
@@ -457,7 +466,9 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
     ea[ea == 0] = PADDING_VALUE # padding value
-
+    ###########################################
+
+    ###########################################
     mask_ea = ea != PADDING_VALUE
 
     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
@@ -502,8 +513,11 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     # Index the record array using the computed indices.
     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
-    output_nea = np.take(record, indices)
-
+    output_nea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_nea[mask_outlier] = PADDING_VALUE
+    ##################################################################
+
     # Check if the NEA is equal to the reference sequence at the given position
     # In a non-matrix way, this is equivalent (for one single element) to:
     # nea == record[pos-1: pos+len(nea)-1]
@@ -526,7 +540,10 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
     indices_range = np.arange(max_len_ea)
     indices = pos + indices_range
     indices = indices + modified_indices
-    output_ea = np.take(record, indices)
+    output_ea = np.take(record, indices, mode="clip")
+    ##################################################################
+    output_ea[mask_outlier] = PADDING_VALUE
+    ##################################################################
 
     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
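
The mode="clip" plus mask_outlier pattern above guards against variants whose POS lies beyond the end of their chromosome's record: clipping keeps the gather in bounds, and overwriting the clipped rows with the padding value guarantees they can never compare equal to a real allele. A minimal, self-contained sketch of the same pattern (illustrative values only):

import numpy as np

PADDING_VALUE = 0
record = np.array([65, 67, 71, 84], dtype=np.uint8)  # a tiny stand-in for the reference array
indices = np.array([[1, 2], [3, 9]])                 # the second row reads past the end
mask_outlier = np.array([False, True])               # rows whose POS is out of range

out = np.take(record, indices, mode="clip")          # out-of-range indices clamp to the last element
out[mask_outlier] = PADDING_VALUE                    # clipped rows can never match a real allele
print(out)  # [[67 71] [ 0  0]]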
@@ -581,24 +598,28 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
     chrom,pos,ea,nea,status = sumstats.columns
 
     # First, convert the fasta records to a single numpy array of integers
-    record, starting_positions_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+    record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
 
     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea
     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
     # arrays are smaller) and save memory.
     max_len = 4 # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
-    condition = (sumstats[nea].str.len() <= max_len) * (sumstats[ea].str.len() <= max_len)
+    condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
 
     log.write(f" -Checking records for ( len(NEA) <= 71,384 and len(EA) <= 71,384 )", verbose=verbose)
     sumstats_cond = sumstats[condition]
-    starting_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_cond[chrom].unique()])
-    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond)
+    unique_chrom_cond = sumstats_cond[chrom].unique()
+    starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+    records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+    sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
 
     log.write(f" -Checking records for ( len(NEA) > 71,384 or len(EA) > 71,384 )", verbose=verbose)
     sumstats_not_cond = sumstats[~condition]
-    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in sumstats_not_cond[chrom].unique()])
-    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond)
+    unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+    starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+    records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+    sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
 
     return sumstats[status].values
 
@@ -708,10 +729,11 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
     starting_positions = np.cumsum(records_len) - records_len
     if pos_as_dict:
         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+        records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
     record = np.concatenate(all_r)
     del all_r # free memory
 
-    return record, starting_positions
+    return record, starting_positions, records_len_dict
 
 #######################################################################################################################################
 
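The starting offsets come from an exclusive prefix sum: once the per-chromosome records are concatenated into one flat array, each chromosome starts at the cumulative length of everything before it. A tiny sketch of the arithmetic used above:

import numpy as np

records_len = np.array([5, 3, 4])                         # per-chromosome record lengths
starting_positions = np.cumsum(records_len) - records_len
print(starting_positions)  # [0 5 8] -> offset of each record in the concatenated array
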
@@ -912,6 +934,56 @@ def check_strand_status(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
             return status_pre+"5"+status_end
     return status_pre+"8"+status_end
 
+def check_strand_status_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None and not trust_cache:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+        start = pos - 1
+        end = pos
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key = f"{chrom}:{pos}:{ref}:{alt}"
+        if cache_key in cache:
+            in_cache += 1
+            record = cache[cache_key]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if (record<0.5) and (eaf<0.5):
+                    new_status = status_pre+"1"+status_end
+                elif (record>0.5) and (eaf>0.5):
+                    new_status = status_pre+"1"+status_end
+                else:
+                    new_status = status_pre+"5"+status_end
+        else:
+            if not trust_cache:
+                # If we don't trust the cache (it may be incomplete), perform the check by reading from the VCF file
+                new_status = check_strand_status(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr_dict=get_number_to_chr(),daf_tolerance=0.2):
     ### input : unknown indel, both on genome (xx1[45]x)
@@ -939,6 +1011,65 @@ def check_unkonwn_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr
 
     return status_pre+"8"+status_end
 
+
+def check_unkonwn_indel_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    if not trust_cache:
+        assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+        log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
+
+    if ref_infer is not None:
+        vcf_reader = VariantFile(ref_infer)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.values
+
+    in_cache = 0
+    new_statuses = []
+
+    for i in range(data.shape[0]):
+        _chrom, pos, ref, alt, eaf, status = data[i]
+        chrom = _chrom
+
+        if chr_dict is not None: chrom=chr_dict[chrom]
+        start = pos - 1
+        end = pos
+
+        status_pre=status[:6]
+        status_end=""
+
+        new_status = status_pre+"8"+status_end # default value
+
+        cache_key_ref_alt = f"{chrom}:{pos}:{ref}:{alt}"
+        cache_key_alt_ref = f"{chrom}:{pos}:{alt}:{ref}"
+
+        if cache_key_ref_alt in cache:
+            in_cache += 1
+            record = cache[cache_key_ref_alt]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - eaf)<daf_tolerance:
+                    new_status = status_pre+"3"+status_end
+
+        elif cache_key_alt_ref in cache:
+            in_cache += 1
+            record = cache[cache_key_alt_ref]
+            if record is None:
+                new_status = status_pre+"8"+status_end
+            else:
+                if abs(record - (1 - eaf))<daf_tolerance:
+                    new_status = status_pre+"6"+status_end
+
+        else:
+            if not trust_cache:
+                # If we don't trust the cache (it may be incomplete), perform the check by reading from the VCF file
+                new_status = check_unkonwn_indel(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict, daf_tolerance)
+
+        new_statuses.append(new_status)
+
+    log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+    return new_statuses
+
 
 def get_reverse_complementary_allele(a):
     dic = str.maketrans({
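
Both cache helpers above share one lookup scheme: keys are "CHR:POS:REF:ALT" strings and values are reference ALT allele frequencies, with None marking variants known to be absent from the VCF. A minimal sketch of that contract (the entries are made up; real caches are built by CacheManager from the reference VCF):

cache = {"1:12345:A:T": 0.12, "1:22222:AT:A": None}

key = "1:12345:A:T"
if key in cache and cache[key] is not None:
    print("reference ALT frequency:", cache[key])  # 0.12 -> usable for strand/indel inference
else:
    print("no usable record -> status digit stays 8")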
@@ -963,16 +1094,40 @@ def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="N
     status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
     return status_part
 
+def check_strand_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_strand_status_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,trust_cache,log,verbose)
+    return status_part
+
 def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
     vcf_reader = VariantFile(ref_infer)
     status_part = sumstats.apply(lambda x:check_unkonwn_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
     return status_part
 
+def check_indel_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+    assert cache is not None, "Cache must be provided"
+    status_part = check_unkonwn_indel_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,daf_tolerance,trust_cache,log,verbose)
+    return status_part
+
 ##################################################################################################################################################
 
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                         chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
-                        chr_dict=None,verbose=True,log=Log()):
+                        chr_dict=None,cache_options={},verbose=True,log=Log()):
+    '''
+    Args:
+        cache_options : A dictionary with the following keys:
+            - cache_manager: CacheManager object or None. If either cache_loader or cache_process is not None, or use_cache is True, a CacheManager object will be created automatically.
+            - trust_cache: bool (optional, default: True). Whether to trust the cache completely. Trusting the cache means that any key not found in the cache is treated as a missing value even in the VCF file.
+            - cache_loader: Object with a get_cache() method, or None.
+            - cache_process: Object with an apply_fn() method, or None.
+            - use_cache: bool (optional, default: False). If any of cache_manager, cache_loader or cache_process is not None, this will be set to True automatically.
+              If set to True while cache_manager, cache_loader and cache_process are all None, the cache will be loaded (or built) on the spot.
+
+    A cache_loader or cache_process object is useful for passing in a custom object which already has the cache loaded. This can help when the cache is loaded in the background in another thread/process while other operations are performed.
+    cache_manager is a CacheManager object that exposes the API for interacting with the cache.
+    '''
+
     ##start function with col checking##########################################################
     _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
     _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
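
Based only on the docstring above, a minimal sketch of passing cache_options through parallelinferstrand(); the file path and the INFO field name are illustrative:

cache_options = {
    "use_cache": True,    # load (or build) the cache on the spot if no manager/loader/process is given
    "trust_cache": True,  # keys missing from the cache are treated as missing from the VCF as well
}
sumstats = parallelinferstrand(sumstats, ref_infer="ref.vcf.gz", ref_alt_freq="AF",
                               cache_options=cache_options)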
@@ -995,6 +1150,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
 
+    # Setup cache variables
+    cache_manager = cache_options.get("cache_manager", None)
+    if cache_manager is not None:
+        assert isinstance(cache_manager, CacheManager), "cache_manager must be a CacheManager object"
+    trust_cache = cache_options.get("trust_cache", True)
+    cache_loader = cache_options.get("cache_loader", None)
+    cache_process = cache_options.get("cache_process", None)
+    use_cache = any(c is not None for c in [cache_manager, cache_loader, cache_process]) or cache_options.get('use_cache', False)
+    _n_cores = n_cores # backup n_cores
+
     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
 
     if "p" in mode:
@@ -1022,16 +1187,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
     #########################################################################################
     if sum(unknow_palindromic_to_check)>0:
         if sum(unknow_palindromic_to_check)<10000:
-            n_cores=1
-
-        #df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
-        df_split = _df_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
-        pool = Pool(n_cores)
-        map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
-        status_inferred = pd.concat(pool.map(map_func,df_split))
-        sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
-        pool.close()
-        pool.join()
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting strand inference for palindromic SNPs...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for strand inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_strand_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred
+        else:
+            #df_split = np.array_split(df_to_check, n_cores)
+            df_split = _df_split(df_to_check, n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished strand inference.",verbose=verbose)
     else:
         log.warning("No palindromic variants available for checking.")
     #########################################################################################
@@ -1082,15 +1261,30 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 
     if sum(unknow_indel)>0:
         if sum(unknow_indel)<10000:
-            n_cores=1
-        #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
-        df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
-        pool = Pool(n_cores)
-        map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
-        status_inferred = pd.concat(pool.map(map_func,df_split))
-        sumstats.loc[unknow_indel,status] = status_inferred.values
-        pool.close()
-        pool.join()
+            n_cores=1
+
+        if use_cache and cache_manager is None:
+            cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                         ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                         n_cores=_n_cores, log=log, verbose=verbose)
+
+        log.write(" -Starting indistinguishable indel inference...",verbose=verbose)
+        df_to_check = sumstats.loc[unknow_indel,[chr,pos,ref,alt,eaf,status]]
+
+        if use_cache and cache_manager.cache_len > 0:
+            log.write(" -Using cache for indel inference",verbose=verbose)
+            status_inferred = cache_manager.apply_fn(check_indel_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, daf_tolerance=daf_tolerance, trust_cache=trust_cache, log=log, verbose=verbose)
+            sumstats.loc[unknow_indel,status] = status_inferred
+        else:
+            #df_split = np.array_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            df_split = _df_split(sumstats.loc[unknow_indel, [chr,pos,ref,alt,eaf,status]], n_cores)
+            pool = Pool(n_cores)
+            map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
+            status_inferred = pd.concat(pool.map(map_func,df_split))
+            sumstats.loc[unknow_indel,status] = status_inferred.values
+            pool.close()
+            pool.join()
+        log.write(" -Finished indistinguishable indel inference.",verbose=verbose)
 
     #########################################################################################
 
@@ -792,7 +792,7 @@ def fixallele(sumstats,ea="EA", nea="NEA",status="STATUS",remove=False,verbose=T
 ###############################################################################################################
 # 20220721
 
-def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",n_cores=1,verbose=True,log=Log()):
+def parallelnormalizeallele(sumstats,mode="s",snpid="SNPID",rsid="rsID",pos="POS",nea="NEA",ea="EA" ,status="STATUS",chunk=3000000,n_cores=1,verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "normalize indels"
     _end_line = "normalizing indels"
@@ -819,7 +819,51 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
     log.write("Finished normalizing variants successfully!", verbose=verbose)
     return sumstats
     ###############################################################################################################
-    if sum(variants_to_check)>0:
+    if mode=="v":
+        if sum(variants_to_check)<100000:
+            n_cores=1
+        if n_cores==1:
+            normalized_pd, changed_index = fastnormalizeallele(sumstats.loc[variants_to_check,[pos,nea,ea,status]],pos=pos ,nea=nea,ea=ea,status=status,chunk=chunk, log=log, verbose=verbose)
+        else:
+            pool = Pool(n_cores)
+            map_func = partial(fastnormalizeallele,pos=pos,nea=nea,ea=ea,status=status)
+            df_split = _df_split(sumstats.loc[variants_to_check,[pos,nea,ea,status]], n_cores)
+            results = pool.map(map_func,df_split)
+            normalized_pd = pd.concat([i[0] for i in results])
+            changed_index = np.concatenate([i[1] for i in results])
+            del results
+            pool.close()
+            pool.join()
+            gc.collect()
+        ###############################################################################################################
+        try:
+            example_sumstats = sumstats.loc[changed_index,:].head()
+            changed_num = len(changed_index)
+            if changed_num>0:
+                if snpid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,snpid]
+                elif rsid in example_sumstats.columns:
+                    before_normalize_id = example_sumstats.loc[variants_to_check,rsid]
+                else:
+                    before_normalize_id = example_sumstats.index
+
+                log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+                for i in before_normalize_id.values:
+                    log.write(i,end=" ",show_time=False)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+
+                log.write(" -Not normalized allele:",end="", verbose=verbose)
+                for i in example_sumstats[[ea,nea]].values:
+                    log.write(i,end="",show_time=False, verbose=verbose)
+                log.write("... \n",end="",show_time=False, verbose=verbose)
+                log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+            else:
+                log.write(" -All variants are already normalized..", verbose=verbose)
+        except:
+            pass
+
+    ##########################################################################################################################################################
+    elif mode=="s":
         if sum(variants_to_check)<10000:
             n_cores=1
         pool = Pool(n_cores)
@@ -829,35 +873,36 @@ def parallelnormalizeallele(sumstats,snpid="SNPID",rsid="rsID",pos="POS",nea="NE
         normalized_pd = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
-    ###############################################################################################################
-
-    before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
-    changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
-    if changed_num>0:
-        if snpid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,snpid]
-        elif rsid in sumstats.columns:
-            before_normalize_id = sumstats.loc[variants_to_check,rsid]
-        else:
-            before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+
+        before_normalize = sumstats.loc[variants_to_check,[ea,nea]]
+        changed_num = len(normalized_pd.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),:])
+        if changed_num>0:
+            if snpid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,snpid]
+            elif rsid in sumstats.columns:
+                before_normalize_id = sumstats.loc[variants_to_check,rsid]
+            else:
+                before_normalize_id = pd.DataFrame(sumstats.index[variants_to_check],index=sumstats.index[variants_to_check])
+
+            log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
+            for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
+                log.write(i,end=" ",show_time=False)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
 
-        log.write(" -Not normalized allele IDs:",end="", verbose=verbose)
-        for i in before_normalize_id.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea])].head().values:
-            log.write(i,end=" ",show_time=False)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-
-        log.write(" -Not normalized allele:",end="", verbose=verbose)
-        for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
-            log.write(i,end="",show_time=False, verbose=verbose)
-        log.write("... \n",end="",show_time=False, verbose=verbose)
-        log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
-    else:
-        log.write(" -All variants are already normalized..", verbose=verbose)
-    ###################################################################################################################
+            log.write(" -Not normalized allele:",end="", verbose=verbose)
+            for i in before_normalize.loc[(before_normalize[ea]!=normalized_pd[ea]) | (before_normalize[nea]!=normalized_pd[nea]),[ea,nea]].head().values:
+                log.write(i,end="",show_time=False, verbose=verbose)
+            log.write("... \n",end="",show_time=False, verbose=verbose)
+            log.write(" -Modified "+str(changed_num) +" variants according to parsimony and left alignment principal.", verbose=verbose)
+        else:
+            log.write(" -All variants are already normalized..", verbose=verbose)
+        ###################################################################################################################
+
     categories = set(sumstats[ea])|set(sumstats[nea]) |set(normalized_pd.loc[:,ea]) |set(normalized_pd.loc[:,nea])
     sumstats[ea] = pd.Categorical(sumstats[ea],categories = categories)
     sumstats[nea] = pd.Categorical(sumstats[nea],categories = categories )
     sumstats.loc[variants_to_check,[pos,nea,ea,status]] = normalized_pd.values
+
     try:
         sumstats[pos] = sumstats[pos].astype('Int64')
     except:
@@ -873,6 +918,67 @@ def normalizeallele(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
     sumstats = pd.DataFrame(normalized.to_list(), columns=[pos,nea,ea,status],index=sumstats.index)
     return sumstats
 
+def fastnormalizeallele(insumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS",chunk=3000000,log=Log(),verbose=False):
+    log.write(" -Number of variants to check:{}".format(len(insumstats)), verbose=verbose)
+    log.write(" -Chunk size:{}".format(chunk), verbose=verbose)
+    log.write(" -Processing in chunks:",end="", verbose=verbose)
+    changed_index = np.array([])
+    for part_n in range(len(insumstats)//chunk+1):
+        log.write(part_n, end=" ",show_time=False, verbose=verbose)
+        insumstats["NEA"] = insumstats["NEA"].astype("string")
+        insumstats["EA"] = insumstats["EA"].astype("string")
+        insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:],changed_index_single = normalizae_chunk(insumstats.iloc[part_n*chunk:(part_n+1)*chunk,:].copy())
+        changed_index = np.concatenate([changed_index,changed_index_single])
+        gc.collect()
+    log.write("\n",end="",show_time=False, verbose=verbose)
+    return insumstats, changed_index
+
+def normalizae_chunk(sumstats,pos="POS" ,nea="NEA",ea="EA",status="STATUS"):
+    # already normalized
+
+    is_same = sumstats["NEA"] == sumstats["EA"]
+    is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+
+    # a series to keep track of variants that are modified
+    changed = sumstats["NEA"] != sumstats["NEA"]
+
+    # right side
+    ea_len = sumstats["NEA"].str.len()
+    nea_len = sumstats["EA"].str.len()
+    max_length=max(ea_len.max(), nea_len.max())
+
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[-1] == sumstats["EA"].str[-1]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        nea_len[is_pop] = nea_len[is_pop] -1
+        ea_len[is_pop] = ea_len[is_pop] -1
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[:-1]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[:-1]
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    # left side
+    max_length=max(sumstats["NEA"].str.len().max(), sumstats["EA"].str.len().max())
+    for i in range(1, max_length):
+        is_pop = (sumstats["NEA"].str[0] == sumstats["EA"].str[0]) & (~is_normalized)
+        if sum(is_pop)==0:
+            break
+        if i ==1:
+            changed = changed | is_pop
+        sumstats.loc[is_pop, "NEA"] = sumstats.loc[is_pop,"NEA"].str[1:]
+        sumstats.loc[is_pop, "EA"] = sumstats.loc[is_pop,"EA"].str[1:]
+        sumstats.loc[is_pop, "POS"] = sumstats.loc[is_pop,"POS"] + 1
+        is_normalized = ((sumstats["NEA"].str.len()==1) | (sumstats["EA"].str.len()==1) ) & (~is_same)
+        gc.collect()
+
+    sumstats.loc[is_normalized,status] = vchange_status(sumstats.loc[is_normalized, status], 5,"4","0")
+    sumstats.loc[is_same,status] = vchange_status(sumstats.loc[is_same, status], 5,"4","3")
+    changed_index = sumstats[changed].index
+    return sumstats, changed_index.values
+
 def normalizevariant(pos,a,b,status):
     # single record
     # https://genome.sph.umich.edu/wiki/Variant_Normalization
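
A worked example of the trimming rules in normalizae_chunk(), reduced to plain strings (this sketch mirrors the per-row logic, not the vectorized DataFrame code path):

pos, nea, ea = 100, "GAC", "GAT"
while len(nea) > 1 and len(ea) > 1 and nea[-1] == ea[-1]:  # right side: drop shared trailing base
    nea, ea = nea[:-1], ea[:-1]
while len(nea) > 1 and len(ea) > 1 and nea[0] == ea[0]:    # left side: drop shared leading base, POS advances
    nea, ea, pos = nea[1:], ea[1:], pos + 1
print(pos, nea, ea)  # 102 C T -> normalized to a plain SNP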
@@ -1611,12 +1717,5 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 
 ###############################################################################################################
 def _df_split(dataframe, n):
-    chunks = []
-    chunk_size = int(dataframe.shape[0] // n)+1
-
-    for index in range(0, dataframe.shape[0], chunk_size):
-        chunks.append(
-            dataframe.iloc[index:index + chunk_size]
-        )
-
-    return chunks
+    k, m = divmod(len(dataframe), n)
+    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
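
The rewritten _df_split() always returns exactly n chunks whose sizes differ by at most one, whereas the old rounding-up version could yield fewer chunks than requested (e.g., 3 chunks for n=4 on an 8-row frame) or a tiny trailing chunk. A quick check of the new behavior:

import pandas as pd

def _df_split(dataframe, n):
    k, m = divmod(len(dataframe), n)
    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"x": range(10)})
print([len(c) for c in _df_split(df, 4)])  # [3, 3, 2, 2]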