gwaslab-3.6.6-py3-none-any.whl → gwaslab-3.6.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110)
  1. gwaslab/__init__.py +57 -47
  2. gwaslab/{bd_common_data.py → bd/bd_common_data.py} +10 -9
  3. gwaslab/bd/bd_config.py +28 -0
  4. gwaslab/{bd_download.py → bd/bd_download.py} +1 -1
  5. gwaslab/{bd_get_hapmap3.py → bd/bd_get_hapmap3.py} +9 -6
  6. gwaslab/bd/bd_path_manager.py +110 -0
  7. gwaslab/data/formatbook.json +805 -9
  8. gwaslab/{ldsc_irwls.py → extension/ldsc/ldsc_irwls.py} +1 -1
  9. gwaslab/{ldsc_regressions.py → extension/ldsc/ldsc_regressions.py} +2 -2
  10. gwaslab/{ldsc_sumstats.py → extension/ldsc/ldsc_sumstats.py} +2 -2
  11. gwaslab/{prscs_mcmc_gtb.py → extension/prscs/prscs_mcmc_gtb.py} +1 -1
  12. gwaslab/g_Sumstats.py +130 -96
  13. gwaslab/g_SumstatsMulti.py +69 -40
  14. gwaslab/g_SumstatsPair.py +54 -37
  15. gwaslab/g_SumstatsSet.py +88 -81
  16. gwaslab/g_SumstatsT.py +6 -6
  17. gwaslab/g_Sumstats_polars.py +84 -84
  18. gwaslab/g_meta_update.py +1 -1
  19. gwaslab/g_vchange_status.py +4 -4
  20. gwaslab/g_version.py +2 -2
  21. gwaslab/{hm_casting.py → hm/hm_casting.py} +4 -4
  22. gwaslab/{hm_casting_polars.py → hm/hm_casting_polars.py} +4 -4
  23. gwaslab/hm/hm_harmonize_sumstats.py +1635 -0
  24. gwaslab/hm_harmonize_sumstats.py +3 -8
  25. gwaslab/{io_load_ld.py → io/io_load_ld.py} +16 -13
  26. gwaslab/{io_preformat_input.py → io/io_preformat_input.py} +152 -73
  27. gwaslab/{io_preformat_input_polars.py → io/io_preformat_input_polars.py} +7 -7
  28. gwaslab/{io_read_pipcs.py → io/io_read_pipcs.py} +2 -2
  29. gwaslab/{io_read_tabular.py → io/io_read_tabular.py} +2 -2
  30. gwaslab/{io_to_formats.py → io/io_to_formats.py} +11 -8
  31. gwaslab/{io_to_pickle.py → io/io_to_pickle.py} +16 -1
  32. gwaslab/{qc_check_datatype_polars.py → qc/qc_check_datatype_polars.py} +2 -2
  33. gwaslab/{qc_fix_sumstats.py → qc/qc_fix_sumstats.py} +60 -33
  34. gwaslab/{qc_fix_sumstats_polars.py → qc/qc_fix_sumstats_polars.py} +15 -11
  35. gwaslab/{util_abf_finemapping.py → util/util_abf_finemapping.py} +2 -2
  36. gwaslab/{util_ex_calculate_ldmatrix.py → util/util_ex_calculate_ldmatrix.py} +18 -8
  37. gwaslab/{util_ex_calculate_prs.py → util/util_ex_calculate_prs.py} +2 -2
  38. gwaslab/{util_ex_ldproxyfinder.py → util/util_ex_ldproxyfinder.py} +6 -6
  39. gwaslab/{util_ex_ldsc.py → util/util_ex_ldsc.py} +18 -13
  40. gwaslab/{util_ex_match_ldmatrix.py → util/util_ex_match_ldmatrix.py} +8 -7
  41. gwaslab/util/util_ex_phewwas.py +117 -0
  42. gwaslab/{util_ex_process_h5.py → util/util_ex_process_h5.py} +2 -2
  43. gwaslab/{util_ex_process_ref.py → util/util_ex_process_ref.py} +2 -2
  44. gwaslab/{util_ex_run_2samplemr.py → util/util_ex_run_2samplemr.py} +18 -7
  45. gwaslab/{util_ex_run_ccgwas.py → util/util_ex_run_ccgwas.py} +4 -4
  46. gwaslab/{util_ex_run_clumping.py → util/util_ex_run_clumping.py} +28 -13
  47. gwaslab/{util_ex_run_coloc.py → util/util_ex_run_coloc.py} +22 -10
  48. gwaslab/{util_ex_run_hyprcoloc.py → util/util_ex_run_hyprcoloc.py} +4 -4
  49. gwaslab/{util_ex_run_magma.py → util/util_ex_run_magma.py} +21 -11
  50. gwaslab/{util_ex_run_mesusie.py → util/util_ex_run_mesusie.py} +3 -3
  51. gwaslab/{util_ex_run_mtag.py → util/util_ex_run_mtag.py} +50 -18
  52. gwaslab/{util_ex_run_prscs.py → util/util_ex_run_prscs.py} +3 -3
  53. gwaslab/{util_ex_run_scdrs.py → util/util_ex_run_scdrs.py} +10 -4
  54. gwaslab/{util_ex_run_susie.py → util/util_ex_run_susie.py} +49 -26
  55. gwaslab/{util_in_fill_data.py → util/util_in_fill_data.py} +1 -1
  56. gwaslab/{util_in_filter_value.py → util/util_in_filter_value.py} +18 -11
  57. gwaslab/{util_in_get_sig.py → util/util_in_get_sig.py} +15 -13
  58. gwaslab/{util_in_meta.py → util/util_in_meta.py} +1 -1
  59. gwaslab/{util_in_meta_polars.py → util/util_in_meta_polars.py} +1 -1
  60. gwaslab/{viz_aux_annotate_plot.py → viz/viz_aux_annotate_plot.py} +1 -1
  61. gwaslab/{viz_aux_quickfix.py → viz/viz_aux_quickfix.py} +2 -2
  62. gwaslab/{viz_plot_compare_af.py → viz/viz_plot_compare_af.py} +1 -1
  63. gwaslab/{viz_plot_compare_effect.py → viz/viz_plot_compare_effect.py} +16 -8
  64. gwaslab/{viz_plot_credible_sets.py → viz/viz_plot_credible_sets.py} +6 -6
  65. gwaslab/{viz_plot_effect.py → viz/viz_plot_effect.py} +37 -69
  66. gwaslab/{viz_plot_miamiplot.py → viz/viz_plot_miamiplot.py} +28 -20
  67. gwaslab/{viz_plot_miamiplot2.py → viz/viz_plot_miamiplot2.py} +27 -22
  68. gwaslab/{viz_plot_mqqplot.py → viz/viz_plot_mqqplot.py} +48 -38
  69. gwaslab/{viz_plot_phe_heatmap.py → viz/viz_plot_phe_heatmap.py} +18 -15
  70. gwaslab/{viz_plot_qqplot.py → viz/viz_plot_qqplot.py} +4 -2
  71. gwaslab/{viz_plot_regional2.py → viz/viz_plot_regional2.py} +11 -9
  72. gwaslab/{viz_plot_regionalplot.py → viz/viz_plot_regionalplot.py} +5 -4
  73. gwaslab/{viz_plot_rg_heatmap.py → viz/viz_plot_rg_heatmap.py} +1 -1
  74. gwaslab/{viz_plot_scatter_with_reg.py → viz/viz_plot_scatter_with_reg.py} +10 -7
  75. gwaslab/{viz_plot_stackedregional.py → viz/viz_plot_stackedregional.py} +67 -33
  76. gwaslab/{viz_plot_trumpetplot.py → viz/viz_plot_trumpetplot.py} +11 -9
  77. {gwaslab-3.6.6.dist-info → gwaslab-3.6.7.dist-info}/METADATA +1 -1
  78. gwaslab-3.6.7.dist-info/RECORD +123 -0
  79. gwaslab/bd_config.py +0 -18
  80. gwaslab-3.6.6.dist-info/RECORD +0 -120
  81. /gwaslab/{ldsc_jackknife.py → extension/ldsc/ldsc_jackknife.py} +0 -0
  82. /gwaslab/{ldsc_ldscore.py → extension/ldsc/ldsc_ldscore.py} +0 -0
  83. /gwaslab/{ldsc_parse.py → extension/ldsc/ldsc_parse.py} +0 -0
  84. /gwaslab/{prscs_gigrnd.py → extension/prscs/prscs_gigrnd.py} +0 -0
  85. /gwaslab/{prscs_parse_genet.py → extension/prscs/prscs_parse_genet.py} +0 -0
  86. /gwaslab/{hm_rsid_to_chrpos.py → hm/hm_rsid_to_chrpos.py} +0 -0
  87. /gwaslab/{io_process_args.py → io/io_process_args.py} +0 -0
  88. /gwaslab/{io_read_ldsc.py → io/io_read_ldsc.py} +0 -0
  89. /gwaslab/{qc_build.py → qc/qc_build.py} +0 -0
  90. /gwaslab/{qc_check_datatype.py → qc/qc_check_datatype.py} +0 -0
  91. /gwaslab/{util_ex_gwascatalog.py → util/util_ex_gwascatalog.py} +0 -0
  92. /gwaslab/{util_ex_infer_ancestry.py → util/util_ex_infer_ancestry.py} +0 -0
  93. /gwaslab/{util_ex_plink_filter.py → util/util_ex_plink_filter.py} +0 -0
  94. /gwaslab/{util_in_calculate_gc.py → util/util_in_calculate_gc.py} +0 -0
  95. /gwaslab/{util_in_calculate_power.py → util/util_in_calculate_power.py} +0 -0
  96. /gwaslab/{util_in_convert_h2.py → util/util_in_convert_h2.py} +0 -0
  97. /gwaslab/{util_in_correct_winnerscurse.py → util/util_in_correct_winnerscurse.py} +0 -0
  98. /gwaslab/{util_in_estimate_ess.py → util/util_in_estimate_ess.py} +0 -0
  99. /gwaslab/{util_in_get_density.py → util/util_in_get_density.py} +0 -0
  100. /gwaslab/{util_in_merge.py → util/util_in_merge.py} +0 -0
  101. /gwaslab/{util_in_snphwe.py → util/util_in_snphwe.py} +0 -0
  102. /gwaslab/{viz_aux_chromatin.py → viz/viz_aux_chromatin.py} +0 -0
  103. /gwaslab/{viz_aux_property.py → viz/viz_aux_property.py} +0 -0
  104. /gwaslab/{viz_aux_reposition_text.py → viz/viz_aux_reposition_text.py} +0 -0
  105. /gwaslab/{viz_aux_save_figure.py → viz/viz_aux_save_figure.py} +0 -0
  106. /gwaslab/{viz_plot_forestplot.py → viz/viz_plot_forestplot.py} +0 -0
  107. {gwaslab-3.6.6.dist-info → gwaslab-3.6.7.dist-info}/WHEEL +0 -0
  108. {gwaslab-3.6.6.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE +0 -0
  109. {gwaslab-3.6.6.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE_before_v3.4.39 +0 -0
  110. {gwaslab-3.6.6.dist-info → gwaslab-3.6.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1635 @@
+ import pandas as pd
+ import numpy as np
+ from pysam import VariantFile
+ from Bio import SeqIO
+ from itertools import repeat
+ from multiprocessing import Pool
+ from functools import partial
+ import re
+ import os
+ import gc
+ from gwaslab.g_Log import Log
+ from gwaslab.qc.qc_fix_sumstats import fixchr
+ from gwaslab.qc.qc_fix_sumstats import fixpos
+ from gwaslab.qc.qc_fix_sumstats import sortcolumn
+ from gwaslab.qc.qc_fix_sumstats import _df_split
+ from gwaslab.qc.qc_fix_sumstats import check_col
+ from gwaslab.qc.qc_fix_sumstats import start_to
+ from gwaslab.qc.qc_fix_sumstats import finished
+ from gwaslab.qc.qc_fix_sumstats import skipped
+ from gwaslab.qc.qc_fix_sumstats import sortcoordinate
+ from gwaslab.qc.qc_check_datatype import check_dataframe_shape
+ from gwaslab.bd.bd_common_data import get_number_to_chr
+ from gwaslab.bd.bd_common_data import get_chr_list
+ from gwaslab.bd.bd_common_data import get_chr_to_number
+ from gwaslab.bd.bd_common_data import get_number_to_NC
+ from gwaslab.bd.bd_common_data import _maketrans
+ from gwaslab.g_vchange_status import vchange_status
+ from gwaslab.g_version import _get_version
+ from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
+ from gwaslab.g_vchange_status import STATUS_CATEGORIES
+ #rsidtochrpos
+ #checkref
+ #parallelizeassignrsid
+ #inferstrand
+ #parallelecheckaf
+ 
+ ### CONSTANTS AND MAPPINGS ###
+ 
+ PADDING_VALUE = 100
+ 
+ # chr(0) should not be used in the mapping dict because it's a reserved value.
+ # Instead of starting from chr(1), we start from chr(2) because this could be useful in the future
+ # to compute the complementary allele with a simple XOR operation (e.g. 2 ^ 1 = 3, 3 ^ 1 = 2, 4 ^ 1 = 5, 5 ^ 1 = 4, ...)
+ MAPPING = {
+     "A": chr(2),
+     "T": chr(3),
+     "C": chr(4),
+     "G": chr(5),
+     "N": chr(6),
+ }
+ assert all(value != chr(0) for value in MAPPING.values()), "Mapping in the dictionary should not be equal to chr(0). This is a reserved value"
+ 
+ _COMPLEMENTARY_MAPPING = {
+     "A": "T",
+     "C": "G",
+     "G": "C",
+     "T": "A",
+     "N": "N",
+ }
+ COMPLEMENTARY_MAPPING = {k: MAPPING[v] for k, v in _COMPLEMENTARY_MAPPING.items()}
+ 
+ TRANSLATE_TABLE = _maketrans(MAPPING)
+ TRANSLATE_TABLE_COMPL = _maketrans(COMPLEMENTARY_MAPPING)
+ 
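The XOR property described in the comment above is not exercised by this code (complements go through COMPLEMENTARY_MAPPING instead), but a minimal self-contained sketch shows why the encoding starts at chr(2):

    # Hypothetical demonstration only: complementary bases pair up under XOR with 1,
    # i.e. A(2) <-> T(3) and C(4) <-> G(5); N(6) has no XOR partner and is excluded.
    MAPPING = {"A": chr(2), "T": chr(3), "C": chr(4), "G": chr(5)}
    for base, complement in [("A", "T"), ("T", "A"), ("C", "G"), ("G", "C")]:
        assert chr(ord(MAPPING[base]) ^ 1) == MAPPING[complement]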
+ #20220808
+ #################################################################################################################
+ 
+ ###~!!!!
+ def rsidtochrpos(sumstats,
+                  path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
+                  rsid="rsID", chrom="CHR", pos="POS", ref_rsid="rsID", ref_chr="CHR", ref_pos="POS", build="19",
+                  overwrite=False, remove=False, chunksize=5000000, verbose=True, log=Log()):
+     '''
+     assign CHR:POS based on rsID
+     '''
+     ##start function with col checking##########################################################
+     _start_line = "assign CHR and POS using rsIDs"
+     _end_line = "assigning CHR and POS using rsIDs"
+     _start_cols = [rsid]
+     _start_function = ".rsid_to_chrpos()"
+     _must_args = {}
+ 
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+ 
+     if ref_rsid_to_chrpos_tsv is not None:
+         path = ref_rsid_to_chrpos_tsv
+ 
+     log.write(" -rsID dictionary file: " + path, verbose=verbose)
+ 
+     if snpid in sumstats.columns and sum(sumstats[rsid].isna()) > 0:
+         log.write(" -Filling NA in the rsID column with SNPID...", verbose=verbose)
+         sumstats.loc[sumstats[rsid].isna(), rsid] = sumstats.loc[sumstats[rsid].isna(), snpid]
+ 
+     if sum(sumstats[rsid].isna()) > 0:
+         log.write(" -Filling NA in the rsID column with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())), verbose=verbose)
+         sumstats.loc[sumstats[rsid].isna(), rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(), rsid]))]
+ 
+     dic_chuncks = pd.read_csv(path, sep="\t", usecols=[ref_rsid, ref_chr, ref_pos],
+                               chunksize=chunksize, index_col=ref_rsid,
+                               dtype={ref_rsid: "string", ref_chr: "Int64", ref_pos: "Int64"})
+ 
+     sumstats = sumstats.set_index(rsid)
+ 
+     # if the CHR or POS column is not in the sumstats, initiate it
+     if chrom not in sumstats.columns:
+         sumstats[chrom] = pd.Series(dtype="Int64")
+     if pos not in sumstats.columns:
+         sumstats[pos] = pd.Series(dtype="Int64")
+ 
+     log.write(" -Setting block size: ", chunksize, verbose=verbose)
+     log.write(" -Loading block: ", end="", verbose=verbose)
+     for i, dic in enumerate(dic_chuncks):
+         dic_to_update = dic[dic.index.notnull()]
+         log.write(i, " ", end=" ", show_time=False)
+         dic_to_update = dic_to_update.rename(index={ref_rsid: rsid})
+         dic_to_update = dic_to_update.rename(columns={ref_chr: chrom, ref_pos: pos})
+         dic_to_update = dic_to_update[~dic_to_update.index.duplicated(keep='first')]
+         sumstats.update(dic_to_update, overwrite=True)
+         gc.collect()
+ 
+     log.write("\n", end="", show_time=False, verbose=verbose)
+     sumstats = sumstats.reset_index()
+     sumstats = sumstats.rename(columns={'index': rsid})
+     log.write(" -Finished updating CHR and POS. Starting to re-fix CHR and POS...", verbose=verbose)
+     sumstats = fixchr(sumstats, verbose=verbose)
+     sumstats = fixpos(sumstats, verbose=verbose)
+     sumstats = sortcolumn(sumstats, verbose=verbose)
+ 
+     finished(log, verbose, _end_line)
+     return sumstats
+ ####################################################################################################
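A minimal, self-contained sketch of the chunked lookup-and-update pattern used by rsidtochrpos() above (toy frames; a real run would stream a large rsID-to-CHR/POS TSV instead of the in-memory buffer):

    import io
    import pandas as pd

    sumstats = pd.DataFrame({"rsID": ["rs1", "rs2"]}).set_index("rsID")
    sumstats["CHR"] = pd.Series(dtype="Int64")
    sumstats["POS"] = pd.Series(dtype="Int64")
    toy_reference = io.StringIO("rsID\tCHR\tPOS\nrs1\t1\t1000\nrs2\t2\t2000\n")
    for chunk in pd.read_csv(toy_reference, sep="\t", index_col="rsID",
                             dtype={"CHR": "Int64", "POS": "Int64"}, chunksize=1):
        chunk = chunk[~chunk.index.duplicated(keep="first")]   # keep first occurrence only
        sumstats.update(chunk, overwrite=True)                 # fill CHR/POS where the index matches
    print(sumstats.reset_index())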
+ 
+ 
+ ####################################################################################################################
+ 
+ def merge_chrpos(sumstats_part, all_groups_max, path, build, status):
+     group = str(sumstats_part["group"].mode(dropna=True)[0])
+     if group in [str(i) for i in range(all_groups_max+1)]:
+         try:
+             to_merge = pd.read_hdf(path, key="group_"+str(group)).drop_duplicates(subset="rsn")
+             to_merge = to_merge.set_index("rsn")
+             is_chrpos_fixable = sumstats_part.index.isin(to_merge.index)
+             sumstats_part.loc[is_chrpos_fixable, status] = vchange_status(sumstats_part.loc[is_chrpos_fixable, status], 1, "139", 3*build[0])
+             sumstats_part.loc[is_chrpos_fixable, status] = vchange_status(sumstats_part.loc[is_chrpos_fixable, status], 2, "987", 3*build[1])
+             sumstats_part.update(to_merge)
+         except Exception:
+             pass
+     return sumstats_part
+ 
+ 
+ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR", pos="POS", path=None, ref_rsid_to_chrpos_vcf=None, ref_rsid_to_chrpos_hdf5=None, build="99", status="STATUS",
+                          n_cores=4, block_size=20000000, verbose=True, log=Log()):
+ 
+     ##start function with col checking##########################################################
+     _start_line = "assign CHR and POS using rsIDs"
+     _end_line = "assigning CHR and POS using rsIDs"
+     _start_cols = [rsid]
+     _start_function = ".rsid_to_chrpos2()"
+     _must_args = {}
+ 
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+ 
+     if ref_rsid_to_chrpos_hdf5 is not None:
+         path = ref_rsid_to_chrpos_hdf5
+     elif ref_rsid_to_chrpos_vcf is not None:
+         vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
+         vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
+         path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path, vcf_file_name, int(block_size))
+ 
+     if path is None:
+         raise ValueError("Please provide the path to the HDF5 file.")
+ 
+     sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"), errors="coerce").astype("Int64")
+ 
+     log.write(" -Source hdf5 file: ", path, verbose=verbose)
+     log.write(" -Cores to use : ", n_cores, verbose=verbose)
+     log.write(" -Blocksize (make sure it is the same as the hdf5 file): ", block_size, verbose=verbose)
+ 
+     input_columns = sumstats.columns
+     sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna() | sumstats["rsn"].duplicated(keep='first'), :].copy()
+     sumstats_rs = sumstats.loc[sumstats["rsn"].notnull(), :].copy()
+ 
+     log.write(" -Non-valid rsIDs: ", sum(sumstats["rsn"].isna()), verbose=verbose)
+     log.write(" -Duplicated rsIDs except for the first occurrence: ", sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')), verbose=verbose)
+     log.write(" -Valid rsIDs: ", len(sumstats_rs), verbose=verbose)
+ 
+     del sumstats
+     gc.collect()
+ 
+     # assign group number
+     sumstats_rs.loc[:, "group"] = sumstats_rs.loc[:, "rsn"]//block_size
+ 
+     # all groups
+ 
+ 
+     # set index
+     sumstats_rs = sumstats_rs.set_index("rsn")
+ 
+     #
+     pool = Pool(n_cores)
+     if chrom not in input_columns:
+         log.write(" -Initiating CHR ... ", verbose=verbose)
+         sumstats_rs[chrom] = pd.Series(dtype="Int64")
+ 
+     if pos not in input_columns:
+         log.write(" -Initiating POS ... ", verbose=verbose)
+         sumstats_rs[pos] = pd.Series(dtype="Int64")
+ 
+     df_split = [y for x, y in sumstats_rs.groupby('group', as_index=False)]
+     log.write(" -Divided into groups: ", len(df_split), verbose=verbose)
+     log.write(" -", set(sumstats_rs.loc[:, "group"].unique()), verbose=verbose)
+ 
+     # check keys
+     store = pd.HDFStore(path, 'r')
+     all_groups = store.keys()
+     all_groups_len = len(all_groups)
+     store.close()
+     all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
+     log.write(" -Number of groups in HDF5: ", all_groups_len, verbose=verbose)
+     log.write(" -Max index of groups in HDF5: ", all_groups_max, verbose=verbose)
+ 
+     # update CHR and POS using rsID with multiple threads
+     sumstats_rs = pd.concat(pool.map(partial(merge_chrpos, all_groups_max=all_groups_max, path=path, build=build, status=status), df_split), ignore_index=True)
+     sumstats_rs[["CHR", "POS"]] = sumstats_rs[["CHR", "POS"]].astype("Int64")
+     del df_split
+     gc.collect()
+     log.write(" -Merging group data... ", verbose=verbose)
+     # drop group and rsn
+     sumstats_rs = sumstats_rs.drop(columns=["group"])
+     sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])
+ 
+     # merge back
+     log.write(" -Appending data... ", verbose=verbose)
+     sumstats = pd.concat([sumstats_rs, sumstats_nonrs], ignore_index=True)
+ 
+     del sumstats_rs
+     del sumstats_nonrs
+     gc.collect()
+ 
+     # check
+     sumstats = fixchr(sumstats, verbose=verbose)
+     sumstats = fixpos(sumstats, verbose=verbose)
+     sumstats = sortcolumn(sumstats, verbose=verbose)
+ 
+     pool.close()
+     pool.join()
+ 
+     finished(log, verbose, _end_line)
+     return sumstats
+ ####################################################################################################################
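The HDF5 lookup above assumes the reference rsIDs were pre-binned by their integer value; a small sketch of that blocking scheme with the default block_size:

    import pandas as pd

    block_size = 20_000_000
    rsn = pd.to_numeric(pd.Series(["rs123456789", "rs42", "not_an_rsid"]).str.strip("rs"),
                        errors="coerce").astype("Int64")
    print(rsn // block_size)   # 6, 0, <NA> -> looked up under HDF5 keys "group_6" and "group_0"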
+ # old version
+ def _old_check_status(row, record):
+     #pos,ea,nea
+     # status
+     #0 / -----> match
+     #1 / -----> flipped, fixed
+     #2 / -----> reverse_complementary, fixed
+     #3 / -----> flipped
+     #4 / -----> reverse_complementary
+     #5 / -----> reverse_complementary + flipped
+     #6 / -----> both alleles on genome + unable to distinguish
+     #7 / -----> reverse_complementary + both alleles on genome + unable to distinguish
+     #8 / -----> not on ref genome
+     #9 / -----> unchecked
+ 
+     status_pre = row.iloc[3][:5]
+     status_end = row.iloc[3][6:]
+ 
+     ## nea == ref
+     if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
+         ## ea == ref
+         if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
+             ## len(nea) != len(ea)
+             if len(row.iloc[2]) != len(row.iloc[1]):
+                 # indels both on ref, unable to identify
+                 return status_pre+"6"+status_end
+         else:
+             # nea == ref & ea != ref
+             return status_pre+"0"+status_end
+     ## nea != ref
+     else:
+         # ea == ref_seq -> need to flip
+         if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
+             return status_pre+"3"+status_end
+         # ea != ref
+         else:
+             #_reverse_complementary
+             row.iloc[1] = get_reverse_complementary_allele(row.iloc[1])
+             row.iloc[2] = get_reverse_complementary_allele(row.iloc[2])
+             ## nea == ref
+             if row.iloc[2] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[2])-1].seq.upper():
+                 ## ea == ref
+                 if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
+                     ## len(nea) != len(ea)
+                     if len(row.iloc[2]) != len(row.iloc[1]):
+                         return status_pre+"8"+status_end  # indel reverse complementary
+                 else:
+                     return status_pre+"4"+status_end
+             else:
+                 # ea == ref_seq -> need to flip
+                 if row.iloc[1] == record[row.iloc[0]-1: row.iloc[0]+len(row.iloc[1])-1].seq.upper():
+                     return status_pre+"5"+status_end
+                 # ea != ref
+                 return status_pre+"8"+status_end
+ 
+ def oldcheckref(sumstats, ref_seq, chrom="CHR", pos="POS", ea="EA", nea="NEA", status="STATUS", chr_dict=get_chr_to_number(), remove=False, verbose=True, log=Log()):
+     ##start function with col checking##########################################################
+     _start_line = "check if NEA is aligned with reference sequence"
+     _end_line = "checking if NEA is aligned with reference sequence"
+     _start_cols = [chrom, pos, ea, nea, status]
+     _start_function = ".check_ref()"
+     _must_args = {}
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+     log.write(" -Reference genome FASTA file: " + ref_seq, verbose=verbose)
+     log.write(" -Checking records: ", end="", verbose=verbose)
+     chromlist = get_chr_list(add_number=True)
+     records = SeqIO.parse(ref_seq, "fasta")
+     for record in records:
+         #record = next(records)
+         if record is not None:
+             record_chr = str(record.id).strip("chrCHR").upper()
+             if record_chr in chr_dict.keys():
+                 i = chr_dict[record_chr]
+             else:
+                 i = record_chr
+             if i in chromlist:
+                 log.write(record_chr, " ", end="", show_time=False, verbose=verbose)
+                 to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
+                 sumstats.loc[to_check_ref, status] = sumstats.loc[to_check_ref, [pos, ea, nea, status]].apply(lambda x: _old_check_status(x, record), axis=1)
+ 
+     log.write("\n", end="", show_time=False, verbose=verbose)
+ 
+     #CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+     sumstats[status] = pd.Categorical(sumstats[status], categories=STATUS_CATEGORIES)
+     #sumstats[status] = sumstats[status].astype("string")
+ 
+ 
+     available_to_check = sum((~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
+     status_0 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
+     status_3 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
+     status_4 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[4]\w", case=False, flags=0, na=False))
+     status_5 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[5]\w", case=False, flags=0, na=False))
+     status_6 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[6]\w", case=False, flags=0, na=False))
+     #status_7 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
+     status_8 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[8]\w", case=False, flags=0, na=False))
+ 
+     log.write(" -Variant alleles on given reference sequence : ", status_0, verbose=verbose)
+     log.write(" -Variants flipped : ", status_3, verbose=verbose)
+     raw_matching_rate = (status_3+status_0)/available_to_check
+     flip_rate = status_3/available_to_check
+     log.write(" -Raw matching rate : ", "{:.2f}%".format(raw_matching_rate*100), verbose=verbose)
+     if raw_matching_rate < 0.8:
+         log.warning("Matching rate is low, please check if the right reference genome is used.")
+     if flip_rate > 0.85:
+         log.write(" -Flipped variant rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.", verbose=verbose)
+ 
+     log.write(" -Variants inferred reverse_complement : ", status_4, verbose=verbose)
+     log.write(" -Variants inferred reverse_complement_flipped : ", status_5, verbose=verbose)
+     log.write(" -Both alleles on genome + unable to distinguish : ", status_6, verbose=verbose)
+     #log.write(" -Reverse_complementary + both alleles on genome + unable to distinguish: ", status_7)
+     log.write(" -Variants not on given reference sequence : ", status_8, verbose=verbose)
+ 
+     if remove is True:
+         sumstats = sumstats.loc[~sumstats["STATUS"].str.match(r"\w\w\w\w\w[8]\w"), :]
+         log.write(" -Variants not on given reference sequence were removed.", verbose=verbose)
+ 
+     finished(log, verbose, _end_line)
+     return sumstats
+ 
+ #20240320 check if non-effect allele is aligned with reference genome
+ def _fast_check_status(x: pd.DataFrame, record: np.ndarray, starting_positions: np.ndarray, records_len: np.ndarray):
+     # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
+     # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
+     # status
+     #0 / -----> match
+     #1 / -----> flipped, fixed
+     #2 / -----> reverse_complementary, fixed
+     #3 / -----> flipped
+     #4 / -----> reverse_complementary
+     #5 / -----> reverse_complementary + flipped
+     #6 / -----> both alleles on genome + unable to distinguish
+     #7 / -----> reverse_complementary + both alleles on genome + unable to distinguish
+     #8 / -----> not on ref genome
+     #9 / -----> unchecked
+     if x.empty:
+         return np.array([])
+ 
+     # x is expected to be a DataFrame with these columns in that order: ['CHR', 'POS', 'EA', 'NEA', 'STATUS']
+     # In this way, we don't need to specify the column names
+     _chrom = x.iloc[:, 0]
+     _pos = x.iloc[:, 1]
+     _ea = x.iloc[:, 2]
+     _nea = x.iloc[:, 3]
+     _status = x.iloc[:, 4]
+ 
+     # position of the status digit (i.e. in x['STATUS']) that will be modified
+     status_flip_idx = 5
+ 
+     pos = _pos.values.astype(np.int64)  # convert to int64 because they could be of type 'object'
+ 
+     # Rebase the chromosome numbers to 0-based indexing
+     # e.g. ['1', '2', '4', '2'] -> [0, 1, 2, 1]
+     # This is needed because record is a single 1D array containing all the records for all the selected chromosomes,
+     # so for instance if record contains the records for chr1, chr2, chr4 ([...chr1...chr2...chr4...]), we need to
+     # rebase the chromosome numbers to 0-based indexing to index the correct record portion when we do starting_positions[chrom]
+     # Note that in x there are only the rows for the same chromosomes for which we have the records in record
+     # (i.e. we don't have rows for chr3 if we don't have the record for chr3). This filtering is done in the caller function
+     _chrom = _chrom.values
+     unique_values, _ = np.unique(_chrom, return_inverse=True)  # Get the sorted unique values and their indices
+     chrom = np.searchsorted(unique_values, _chrom)  # Replace each value in '_chrom' with its corresponding index in the sorted unique values
+ 
+     max_len_nea = _nea.str.len().max()
+     max_len_ea = _ea.str.len().max()
+ 
+     ########################################## mask for variants with out-of-range POS
+     mask_outlier = pos > records_len[chrom]
+ 
+     #########################################
+ 
+     # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
+     # a numpy array of integers in a very fast way.
+     # In this case we start from a pd.Series so we can apply some built-in methods.
+     # Also, when doing nea.view('<u4'), each row will be automatically right-padded with zeros to reach max_len_nea.
+     # For this reason, we then replace the zeros with our padding value
+     # (and that's why the mapping dict can't have chr(0) as a value, otherwise we would have zeros for both padding and a character)
+     # Reshaping is needed because .view('<u4') will create a flattened array
+     nea = _nea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_nea}')
+     nea = nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
+     nea[nea == 0] = PADDING_VALUE  # padding value
+     ###########################################
+ 
+     ###########################################
+     # Create a mask holding True at the positions of non-padding values
+     mask_nea = nea != PADDING_VALUE
+ 
+     # Create the reverse complement of NEA
+     # In this case, we manually left-pad the translated string with the padding value, since the padding done by view('<u4') would be right-padded,
+     # which would make the reverse operation hard (because we would have e.g. [2, 2, 4, 100, ..., 100], which is hard to convert into [4, 2, 2, 100, ..., 100])
+     rev_nea = _nea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_nea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_nea}')
+     rev_nea = rev_nea.view('<u4').reshape(-1, max_len_nea).astype(np.uint8)
+     rev_nea = rev_nea[:, ::-1]
+ 
+ 
+     # Let's do everything again for EA
+     ea = _ea.str.translate(TRANSLATE_TABLE).to_numpy().astype(f'<U{max_len_ea}')
+     ea = ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
+     ea[ea == 0] = PADDING_VALUE  # padding value
+     ###########################################
+ 
+     ###########################################
+     mask_ea = ea != PADDING_VALUE
+ 
+     rev_ea = _ea.str.translate(TRANSLATE_TABLE_COMPL).str.pad(max_len_ea, 'left', chr(PADDING_VALUE)).to_numpy().astype(f'<U{max_len_ea}')
+     rev_ea = rev_ea.view('<u4').reshape(-1, max_len_ea).astype(np.uint8)
+     rev_ea = rev_ea[:, ::-1]
+ 
+ 
+     # Convert the statuses (which are integers represented as strings) to a numpy array of integers.
+     # Again, use the same concept as before to do this in a very fast way.
+     # e.g. ["9999999", "9939999", "9929999"] -> [[9, 9, 9, 9, 9, 9, 9], [9, 9, 3, 9, 9, 9, 9], [9, 9, 2, 9, 9, 9, 9]]
+     assert _status.str.len().value_counts().nunique() == 1  # all the status strings should have the same length, let's be sure of that
+     status_len = len(_status.iloc[0])
+     mapping_status = {str(v): chr(v) for v in range(10)}
+     table_stats = _maketrans(mapping_status)
+     status = _status.str.translate(table_stats).to_numpy().astype(f'<U{status_len}')
+     status = status.view('<u4').reshape(-1, status_len).astype(np.uint8)
+ 
+ 
+     # Expand the positions to a 2D array and subtract 1 to convert to 0-based indexing
+     # e.g. [2, 21, 46] -> [[1], [20], [45]]
+     pos = np.expand_dims(pos, axis=-1) - 1
+ 
+     # Create a modified indices array specifying the starting position of each chromosome in the concatenated record array
+     modified_indices = starting_positions[chrom]
+     modified_indices = modified_indices[:, np.newaxis]  # Add a new axis to modified_indices to align with the dimensions of pos
+ 
+     # Create the range of indices: [0, ..., max_len_nea-1]
+     indices_range = np.arange(max_len_nea)
+ 
+     # Add the range of indices to the starting indices
+     # e.g. pos = [[1], [20], [45]], indices_range = [0, 1, 2], indices = [[1, 2, 3], [20, 21, 22], [45, 46, 47]]
+     indices = pos + indices_range
+ 
+     # Modify indices to select the correct absolute position in the concatenated record array
+     indices = indices + modified_indices
+ 
+     # Let's pad the fasta records array because if there is a (pos, chrom) for which pos+starting_positions[chrom]+max_len_nea > len(record), we would get an out-of-bounds error.
+     # This basically happens if there is a pos for the last chromosome for which pos+max_len_nea > len(record for that chrom).
+     # This is very unlikely to happen, but we should handle this case.
+     record = np.pad(record, (0, max_len_nea), constant_values=PADDING_VALUE)
+ 
+     # Index the record array using the computed indices.
+     # Since we use np.take, indices must all have the same length, and this is why we added the padding to NEA
+     # and we create the indices using max_len_nea (long story short, we can't obtain a scattered/ragged array)
+     output_nea = np.take(record, indices, mode="clip")
+     ##################################################################
+     output_nea[mask_outlier] = PADDING_VALUE
+     ##################################################################
+ 
+     # Check if the NEA is equal to the reference sequence at the given position
+     # In a non-matrix way, this is equivalent (for one single element) to:
+     # nea == record[pos-1: pos+len(nea)-1]
+     # where for example:
+     # a) nea = "AC", record = "ACTG", pos = 1 -> True
+     # b) nea = "T", record = "ACTG", pos = 3 -> True
+     # c) nea = "AG", record = "ACTG", pos = 1 -> False
+     # Since we want to do everything in a vectorized way, we compare the padded NEA with the output
+     # and then we use the mask to focus only on the non-padded elements
+     # Pseudo example (X represents the padding value):
+     # nea = ['AC', 'T'], record = 'ACTGAAG', pos = [1, 3]
+     # -> nea = ['AC', 'TX'], indices = [[1, 2], [3, 4]], mask = [[True, True], [True, False]], output_nea = [['A', 'C'], ['T', 'G']]
+     # -> nea == output_nea: [[True, True], [True, False]], mask: [[True, True], [True, False]]
+     # -> nea == output_nea + ~mask: [[True, True], [True, True]]
+     # -> np.all(nea == output_nea + ~mask, 1): [True, True]
+ 
+     nea_eq_ref = np.all((nea == output_nea) + ~mask_nea, 1)
+     rev_nea_eq_ref = np.all((rev_nea == output_nea) + ~mask_nea, 1)
+ 
+     # Let's do everything again for EA
+     indices_range = np.arange(max_len_ea)
+     indices = pos + indices_range
+     indices = indices + modified_indices
+     output_ea = np.take(record, indices, mode="clip")
+     ##################################################################
+     output_ea[mask_outlier] = PADDING_VALUE
+     ##################################################################
+ 
+ 
+     ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
+     rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
+ 
+     masks_max_len = max(mask_nea.shape[1], mask_ea.shape[1])
+ 
+     len_nea_eq_len_ea = np.all(
+         np.pad(mask_nea, ((0, 0), (0, masks_max_len-mask_nea.shape[1])), constant_values=False) ==
+         np.pad(mask_ea, ((0, 0), (0, masks_max_len-mask_ea.shape[1])), constant_values=False)
+     , axis=1)  # pad masks with False to reach the same shape
+     len_rev_nea_eq_rev_len_ea = len_nea_eq_len_ea
+ 
+     # The following conditions replicate the if-else statements of the original check_status function:
+     # https://github.com/Cloufield/gwaslab/blob/f6b4c4e58a26e5d67d6587141cde27acf9ce2a11/src/gwaslab/hm_harmonize_sumstats.py#L238
+ 
+     # nea == ref && ea == ref && len(nea) != len(ea)
+     status[nea_eq_ref * ea_eq_ref * ~len_nea_eq_len_ea, status_flip_idx] = 6
+ 
+     # nea == ref && ea != ref
+     status[nea_eq_ref * ~ea_eq_ref, status_flip_idx] = 0
+ 
+     # nea != ref && ea == ref
+     status[~nea_eq_ref * ea_eq_ref, status_flip_idx] = 3
+ 
+     # nea != ref && ea != ref && rev_nea == ref && rev_ea == ref && len(rev_nea) != len(rev_ea)
+     status[~nea_eq_ref * ~ea_eq_ref * rev_nea_eq_ref * rev_ea_eq_ref * ~len_rev_nea_eq_rev_len_ea, status_flip_idx] = 8
+ 
+     # nea != ref && ea != ref && rev_nea == ref && rev_ea != ref
+     status[~nea_eq_ref * ~ea_eq_ref * rev_nea_eq_ref * ~rev_ea_eq_ref, status_flip_idx] = 4
+ 
+     # nea != ref && ea != ref && rev_nea != ref && rev_ea == ref
+     status[~nea_eq_ref * ~ea_eq_ref * ~rev_nea_eq_ref * rev_ea_eq_ref, status_flip_idx] = 5
+ 
+     # nea != ref && ea != ref && rev_nea != ref && rev_ea != ref
+     status[~nea_eq_ref * ~ea_eq_ref * ~rev_nea_eq_ref * ~rev_ea_eq_ref, status_flip_idx] = 8
+ 
+     # Convert the (now modified) 2D status array back to a numpy array of strings in a very fast way.
+     # Since 'status' is a 2D array of integers ranging from 0 to 9, we can build the integer representation
+     # of each row using the efficient operation below (e.g. [1, 2, 3, 4, 5] -> [12345]).
+     # Then we convert this integer to a string using the f'<U{status.shape[1]}' dtype (e.g. 12345 -> '12345')
+     # The "naive" way would be:
+     # status_str = [''.join(map(str, l)) for l in status]
+     # status_arr = np.array(status_str)
+     status_flat = np.sum(status * 10**np.arange(status.shape[1]-1, -1, -1), axis=1)
+     status_arr = status_flat.astype(f'<U{status.shape[1]}')
+ 
+     return status_arr
+ 
+
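A toy run of the digit-packing step at the end of _fast_check_status(), showing the round trip from rows of status digits back to fixed-width strings:

    import numpy as np

    status = np.array([[9, 9, 9, 9, 9, 3, 9],
                       [9, 9, 9, 9, 9, 0, 9]], dtype=np.uint8)
    flat = np.sum(status * 10**np.arange(status.shape[1]-1, -1, -1), axis=1)
    print(flat.astype('<U7'))  # ['9999939' '9999909']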
+ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=True):
+ 
+     chrom, pos, ea, nea, status = sumstats.columns
+ 
+     # First, convert the fasta records to a single numpy array of integers
+     record, starting_positions_dict, records_len_dict = build_fasta_records(fasta_records_dict, pos_as_dict=True, log=log, verbose=verbose)
+ 
+     # In _fast_check_status(), several 2D numpy arrays are created and they are padded to have shape[1] == max_len_nea or max_len_ea.
+     # Since most of the NEA and EA strings are short, we perform the check first on the records having short NEA and EA strings,
+     # and then we perform the check on the records having long NEA and EA strings. In this way we can speed up the process (since the
+     # arrays are smaller) and save memory.
+     max_len = 4  # this is a chosen value, we could compute it using some stats about the length and count of NEA and EA strings
+     condition = (sumstats[nea].str.len() <= max_len) & (sumstats[ea].str.len() <= max_len)
+ 
+     log.write(f" -Checking records for ( len(NEA) <= {max_len} and len(EA) <= {max_len} )", verbose=verbose)
+     sumstats_cond = sumstats[condition]
+     unique_chrom_cond = sumstats_cond[chrom].unique()
+     starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
+     records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
+ 
+     sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
+ 
+     log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
+     sumstats_not_cond = sumstats[~condition]
+     unique_chrom_not_cond = sumstats_not_cond[chrom].unique()
+     starting_not_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_not_cond])
+     records_len_not_cond = np.array([records_len_dict[k] for k in unique_chrom_not_cond])
+     sumstats.loc[~condition, status] = _fast_check_status(sumstats_not_cond, record=record, starting_positions=starting_not_pos_cond, records_len=records_len_not_cond)
+ 
+     return sumstats[status].values
+ 
+ 
+ def checkref(sumstats, ref_seq, chrom="CHR", pos="POS", ea="EA", nea="NEA", status="STATUS", chr_dict=get_chr_to_number(), remove=False, verbose=True, log=Log()):
+     ##start function with col checking##########################################################
+     _start_line = "check if NEA is aligned with reference sequence"
+     _end_line = "checking if NEA is aligned with reference sequence"
+     _start_cols = [chrom, pos, ea, nea, status]
+     _start_function = ".check_ref()"
+     _must_args = {}
+ 
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+     log.write(" -Reference genome FASTA file: " + ref_seq, verbose=verbose)
+     log.write(" -Loading fasta records:", end="", verbose=verbose)
+     chromlist = get_chr_list(add_number=True)
+     records = SeqIO.parse(ref_seq, "fasta")
+ 
+     sumstats = sortcoordinate(sumstats, verbose=False)
+ 
+     all_records_dict = {}
+     chroms_in_sumstats = sumstats[chrom].unique()  # load records from the FASTA file only for the chromosomes present in the sumstats
+     for record in records:
+         #record = next(records)
+         if record is not None:
+             record_chr = str(record.id).strip("chrCHR").upper()
+             if record_chr in chr_dict.keys():
+                 i = chr_dict[record_chr]
+             else:
+                 i = record_chr
+             if (i in chromlist) and (i in chroms_in_sumstats):
+                 log.write(record_chr, " ", end="", show_time=False, verbose=verbose)
+                 all_records_dict.update({i: record})
+     log.write("", show_time=False, verbose=verbose)
+ 
+     if len(all_records_dict) > 0:
+         log.write(" -Checking records", verbose=verbose)
+         all_records_dict = dict(sorted(all_records_dict.items()))  # sort by key in case the fasta records are not already ordered by chromosome
+         to_check_ref = (sumstats[chrom].isin(list(all_records_dict.keys()))) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
+         sumstats_to_check = sumstats.loc[to_check_ref, [chrom, pos, ea, nea, status]]
+         sumstats.loc[to_check_ref, status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
+         log.write(" -Finished checking records", verbose=verbose)
+ 
+     #CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+     sumstats[status] = pd.Categorical(sumstats[status], categories=STATUS_CATEGORIES)
+     #sumstats[status] = sumstats[status].astype("string")
+ 
+     available_to_check = sum((~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
+     status_0 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
+     status_3 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
+     status_4 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[4]\w", case=False, flags=0, na=False))
+     status_5 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[5]\w", case=False, flags=0, na=False))
+     status_6 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[6]\w", case=False, flags=0, na=False))
+     #status_7 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
+     status_8 = sum(sumstats["STATUS"].str.match(r"\w\w\w\w\w[8]\w", case=False, flags=0, na=False))
+ 
+     log.write(" -Variant alleles on given reference sequence : ", status_0, verbose=verbose)
+     log.write(" -Variants flipped : ", status_3, verbose=verbose)
+     raw_matching_rate = (status_3+status_0)/available_to_check
+     flip_rate = status_3/available_to_check
+     log.write(" -Raw matching rate : ", "{:.2f}%".format(raw_matching_rate*100), verbose=verbose)
+     if raw_matching_rate < 0.8:
+         log.warning("Matching rate is low, please check if the right reference genome is used.")
+     if flip_rate > 0.85:
+         log.write(" -Flipped variant rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.", verbose=verbose)
+ 
+     log.write(" -Variants inferred reverse_complement : ", status_4, verbose=verbose)
+     log.write(" -Variants inferred reverse_complement_flipped : ", status_5, verbose=verbose)
+     log.write(" -Both alleles on genome + unable to distinguish : ", status_6, verbose=verbose)
+     #log.write(" -Reverse_complementary + both alleles on genome + unable to distinguish: ", status_7)
+     log.write(" -Variants not on given reference sequence : ", status_8, verbose=verbose)
+ 
+     if remove is True:
+         sumstats = sumstats.loc[~sumstats["STATUS"].str.match(r"\w\w\w\w\w[8]\w"), :]
+         log.write(" -Variants not on given reference sequence were removed.", verbose=verbose)
+ 
+ 
+     finished(log, verbose, _end_line)
+     return sumstats
+ 
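The counting block above inspects the STATUS digit at index 5; a tiny sketch of the pattern matching on toy status strings:

    import pandas as pd

    s = pd.Series(["9999909", "9999939", None])
    print(s.str.match(r"\w\w\w\w\w[0]\w", na=False))  # True, False, False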
+ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose=True):
+     log.write(" -Building numpy fasta records from dict", verbose=verbose)
+ 
+     # Let's do some magic to convert the fasta record to a numpy array of integers in a very fast way.
+     # fasta_record.seq._data is a byte-string, so we can use bytes.maketrans to apply a translation.
+     # Here we map the bytes to the unicode character representing the desired integer as defined in the mapping dict
+     # (i.e. b'A' -> '\x02', b'T' -> '\x03', b'C' -> '\x04', b'G' -> '\x05', b'N' -> '\x06')
+     # Then, using np.array(... dtype=<U..) we convert the string to a numpy array of unicode characters.
+     # Then, we do a magic with view('<u4') to convert the unicode characters to 4-byte integers, so we obtain the actual integer representation of the characters.
+     # Lastly, we cast the array to np.uint8 to convert the 4-byte integers to 1-byte integers to save memory.
+     # Full example:
+     # fasta_record.seq._data = b'ACTGN' -> b'\x02\x04\x03\x05\x06' -> np.array(['\x02\x04\x03\x05\x06'], dtype='<U5') -> np.array([2, 4, 3, 5, 6], dtype=uint32) -> np.array([2, 4, 3, 5, 6], dtype=uint8)
+     all_r = []
+     for r in fasta_records_dict.values():
+         r = r.seq._data.translate(TRANSLATE_TABLE)
+         r = np.array([r], dtype=f'<U{len(r)}').view('<u4').astype(np.uint8)
+         all_r.append(r)
+ 
+     # We've just created a list of numpy arrays, so we can concatenate them to obtain a single numpy array.
+     # Then we keep track of the starting position of each record in the concatenated array. This will be useful later
+     # to index the record array depending on the position of the variant and the chromosome
+     records_len = np.array([len(r) for r in all_r])
+ 
+     starting_positions = np.cumsum(records_len) - records_len
+ 
+ 
+     if pos_as_dict:
+         starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
+         records_len = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
+     record = np.concatenate(all_r)
+     del all_r  # free memory
+ 
+ 
+     return record, starting_positions, records_len
+ 
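A standalone sketch of the view('<u4') trick described in the comments above, using plain str.maketrans in place of gwaslab's _maketrans helper:

    import numpy as np

    table = str.maketrans({"A": chr(2), "T": chr(3), "C": chr(4), "G": chr(5), "N": chr(6)})
    seq = "ACTGN".translate(table)                      # '\x02\x04\x03\x05\x06'
    arr = np.array([seq], dtype='<U5').view('<u4').astype(np.uint8)
    print(arr)                                          # [2 4 3 5 6]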
+ #######################################################################################################################################
+ 
+ #20220721
+ def chrposref_rsid(chr, end, ref, alt, vcf_reader, chr_dict=get_number_to_chr()):
+     ## single record assignment
+     start = end-1
+     if chr_dict is not None: chr = chr_dict[chr]
+ 
+     try:
+         chr_seq = vcf_reader.fetch(chr, start, end)
+     except Exception:
+         return pd.NA
+ 
+     for record in chr_seq:
+         if record.pos == end:
+             if record.alts is None:
+                 return pd.NA
+             if record.ref == ref and (alt in record.alts):
+                 return record.id
+             elif (ref in record.alts) and record.ref == alt:
+                 return record.id
+     return pd.NA
+ 
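A minimal pysam sketch of the fetch-and-match pattern used by chrposref_rsid() above; the VCF path and the variant are hypothetical, and the file must be bgzipped and indexed for fetch() to work:

    from pysam import VariantFile

    vcf = VariantFile("dbsnp.vcf.gz")          # hypothetical tabix-indexed VCF
    for rec in vcf.fetch("1", 999, 1000):      # 0-based, half-open window around POS 1000
        if rec.pos == 1000 and rec.alts and rec.ref == "A" and "G" in rec.alts:
            print(rec.id)                      # the rsID if the site is annotated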
+ def assign_rsid_single(sumstats, path, rsid="rsID", chr="CHR", pos="POS", ref="NEA", alt="EA", chr_dict=get_number_to_chr()):
+     ## single df assignment
+     vcf_reader = VariantFile(path)
+     def rsid_helper(x, vcf_reader, chr_dict):
+         return chrposref_rsid(x.iloc[0], x.iloc[1], x.iloc[2], x.iloc[3], vcf_reader, chr_dict)
+     map_func = partial(rsid_helper, vcf_reader=vcf_reader, chr_dict=chr_dict)
+     rsID = sumstats.apply(map_func, axis=1)
+     return rsID
+ 
+ def parallelizeassignrsid(sumstats, path, ref_mode="vcf", snpid="SNPID", rsid="rsID", chr="CHR", pos="POS", ref="NEA", alt="EA", status="STATUS",
+                           n_cores=1, chunksize=5000000, ref_snpid="SNPID", ref_rsid="rsID",
+                           overwrite="empty", verbose=True, log=Log(), chr_dict=None):
+     '''
+     overwrite mode:
+         all,     overwrite rsID for all available variants
+         invalid, only assign rsID for variants with an invalid rsID
+         empty,   only assign rsID for variants with an NA rsID
+     '''
+ 
+     if ref_mode == "vcf":
+         ###################################################################################################################
+         ##start function with col checking##########################################################
+         _start_line = "assign rsID using reference VCF"
+         _end_line = "assign rsID using reference file"
+         _start_cols = [chr, pos, ref, alt, status]
+         _start_function = ".assign_rsid()"
+         _must_args = {}
+ 
+         is_enough_info = start_to(sumstats=sumstats,
+                                   log=log,
+                                   verbose=verbose,
+                                   start_line=_start_line,
+                                   end_line=_end_line,
+                                   start_cols=_start_cols,
+                                   start_function=_start_function,
+                                   n_cores=n_cores,
+                                   ref_vcf=path,
+                                   **_must_args)
+         if is_enough_info == False: return sumstats
+         ############################################################################################
+         chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
+         log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...", verbose=verbose)
+         ##############################################
+         if rsid not in sumstats.columns:
+             sumstats[rsid] = pd.Series(dtype="string")
+ 
+         ###############################################
+         total_number = len(sumstats)
+         pre_number = sum(~sumstats[rsid].isna())
+ 
+         ##################################################################################################################
+         standardized_normalized = sumstats["STATUS"].str.match(r"\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+         if overwrite == "all":
+             to_assign = standardized_normalized
+         if overwrite == "invalid":
+             to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
+         if overwrite == "empty":
+             to_assign = sumstats[rsid].isna() & standardized_normalized
+         ##################################################################################################################
+         # multicore arrangement
+ 
+         if sum(to_assign) > 0:
+             if sum(to_assign) < 10000: n_cores = 1
+             #df_split = np.array_split(sumstats.loc[to_assign, [chr,pos,ref,alt]], n_cores)
+             df_split = _df_split(sumstats.loc[to_assign, [chr, pos, ref, alt]], n_cores)
+             pool = Pool(n_cores)
+             map_func = partial(assign_rsid_single, path=path, chr=chr, pos=pos, ref=ref, alt=alt, chr_dict=chr_dict)
+             assigned_rsid = pd.concat(pool.map(map_func, df_split))
+             sumstats.loc[to_assign, rsid] = assigned_rsid.values
+             pool.close()
+             pool.join()
+             gc.collect()
+         ##################################################################################################################
+ 
+         after_number = sum(~sumstats[rsid].isna())
+         log.write(" -rsID annotation for " + str(total_number - after_number) + " variants still needs to be fixed!", verbose=verbose)
+         log.write(" -Annotated " + str(after_number - pre_number) + " rsIDs successfully!", verbose=verbose)
+ 
+         ##################################################################################################################
+     elif ref_mode == "tsv":
+         '''
+         assign rsID based on chr:pos
+         '''
+         ##start function with col checking##########################################################
+         _start_line = "assign rsID by matching SNPID with CHR:POS:REF:ALT in the reference TSV"
+         _end_line = "assign rsID using reference file"
+         _start_cols = [snpid, status]
+         _start_function = ".assign_rsid()"
+         _must_args = {}
+ 
+         is_enough_info = start_to(sumstats=sumstats,
+                                   log=log,
+                                   verbose=verbose,
+                                   start_line=_start_line,
+                                   end_line=_end_line,
+                                   start_cols=_start_cols,
+                                   start_function=_start_function,
+                                   n_cores=n_cores,
+                                   ref_tsv=path,
+                                   **_must_args)
+         if is_enough_info == False: return sumstats
+         ############################################################################################
+ 
+         #standardized_normalized = sumstats["STATUS"].str.match(r"\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+         standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
+ 
+         if rsid not in sumstats.columns:
+             sumstats[rsid] = pd.Series(dtype="string")
+ 
+         if overwrite == "empty":
+             to_assign = sumstats[rsid].isna() & standardized_normalized
+         if overwrite == "all":
+             to_assign = standardized_normalized
+         if overwrite == "invalid":
+             to_assign = (~sumstats[rsid].str.match(r'rs([0-9]+)', case=False, flags=0, na=False)) & standardized_normalized
+ 
+         total_number = len(sumstats)
+         pre_number = sum(~sumstats[rsid].isna())
+         log.write(" -" + str(sum(to_assign)) + " rsIDs could possibly be fixed...", verbose=verbose)
+         if sum(to_assign) > 0:
+             sumstats = sumstats.set_index(snpid)
+             dic_chuncks = pd.read_csv(path, sep="\t", usecols=[ref_snpid, ref_rsid],
+                                       chunksize=chunksize, index_col=ref_snpid,
+                                       dtype={ref_snpid: "string", ref_rsid: "string"})
+ 
+             log.write(" -Setting block size: ", chunksize, verbose=verbose)
+             log.write(" -Loading block: ", end="", verbose=verbose)
+             for i, dic in enumerate(dic_chuncks):
+                 gc.collect()
+                 log.write(i, " ", end=" ", show_time=False)
+                 dic = dic.rename(index={ref_snpid: snpid})
+                 dic = dic.rename(columns={ref_rsid: rsid})
+                 dic = dic.loc[~dic.index.duplicated(keep=False), :]
+                 sumstats.update(dic, overwrite=True)
+ 
+             log.write("\n", end="", show_time=False, verbose=verbose)
+             sumstats = sumstats.reset_index()
+             sumstats = sumstats.rename(columns={'index': snpid})
+ 
+             after_number = sum(~sumstats[rsid].isna())
+             log.write(" -rsID annotation for " + str(total_number - after_number) + " variants still needed to be fixed!", verbose=verbose)
+             log.write(" -Annotated " + str(after_number - pre_number) + " rsIDs successfully!", verbose=verbose)
+         else:
+             log.write(" -No rsID can be fixed...skipping...", verbose=verbose)
+     ################################################################################################################
+ 
+     finished(log, verbose, _end_line)
+     return sumstats
+ #################################################################################################################################################
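A toy illustration of how the three overwrite modes translate into selection masks (hypothetical rsID values; the STATUS filter is omitted for brevity):

    import pandas as pd

    rsid = pd.Series(["rs123", "chr1:100:A:G", None], dtype="string")
    empty = rsid.isna()                                  # [False, False, True]
    invalid = ~rsid.str.match(r'rs([0-9]+)', na=False)   # [False, True, True]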
+ #single record assignment
+ 
+ def check_strand_status(chr, start, end, ref, alt, eaf, vcf_reader, alt_freq, status, chr_dict=get_number_to_chr()):
+     ### 0 : not palindromic
+     ### 1 : palindromic, + strand
+     ### 2 : palindromic, - strand -> need to flip -> flipped
+     ### 5 : palindromic, - strand -> need to flip
+     ### 8 : no ref data
+     if chr_dict is not None: chr = chr_dict[chr]
+     status_pre = status[:6]
+     status_end = ""
+     try:
+         chr_seq = vcf_reader.fetch(chr, start, end)
+     except Exception:
+         return status_pre+"8"+status_end
+ 
+ 
+     for record in chr_seq:
+         if record.pos == end and record.ref == ref and (alt in record.alts):
+ 
+             if (record.info[alt_freq][0] < 0.5) and (eaf < 0.5):
+                 return status_pre+"1"+status_end
+             elif (record.info[alt_freq][0] > 0.5) and (eaf > 0.5):
+                 return status_pre+"1"+status_end
+             else:
+                 return status_pre+"5"+status_end
+     return status_pre+"8"+status_end
+ 
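Condensed, the frequency heuristic above keeps a palindromic variant as "+" strand only when the sumstats EAF and the reference ALT frequency fall on the same side of 0.5 (illustrative helper, not part of gwaslab's API):

    def infer_palindromic_strand_digit(eaf, ref_alt_freq):
        # "1": same strand; "5": likely - strand, needs flipping
        if ref_alt_freq < 0.5 and eaf < 0.5:
            return "1"
        elif ref_alt_freq > 0.5 and eaf > 0.5:
            return "1"
        return "5"

    print(infer_palindromic_strand_digit(0.12, 0.10))  # "1"
    print(infer_palindromic_strand_digit(0.88, 0.10))  # "5"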
956
+ def check_strand_status_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
957
+ if not trust_cache:
958
+ assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
959
+ log.warning("You are not trusting the cache, this will slow down the process. Please consider building a complete cache.")
960
+
961
+ if ref_infer is not None and not trust_cache:
962
+ vcf_reader = VariantFile(ref_infer)
963
+
964
+ if isinstance(data, pd.DataFrame):
965
+ data = data.values
966
+
967
+ in_cache = 0
968
+ new_statuses = []
969
+
970
+ for i in range(data.shape[0]):
971
+ _chrom, pos, ref, alt, eaf, status = data[i]
972
+ chrom = _chrom
973
+ start = pos - 1
974
+ end = pos
975
+
976
+ if chr_dict is not None: chrom=chr_dict[chrom]
977
+
978
+ status_pre=status[:6]
979
+ status_end=""
980
+
981
+ new_status = status_pre+"8"+status_end # default value
982
+
983
+ cache_key = f"{chrom}:{pos}:{ref}:{alt}"
984
+ if cache_key in cache:
985
+ in_cache += 1
986
+ record = cache[cache_key]
987
+ if record is None:
988
+ new_status = status_pre+"8"+status_end
989
+ else:
990
+ if (record<0.5) and (eaf<0.5):
991
+ new_status = status_pre+"1"+status_end
992
+ elif (record>0.5) and (eaf>0.5):
993
+ new_status = status_pre+"1"+status_end
994
+ else:
995
+ new_status = status_pre+"5"+status_end
996
+ else:
997
+ if not trust_cache:
998
+ # If we don't trust the cache as a not complete cache, we should perform the check reading from the VCF file
999
+ new_status = check_strand_status(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict)
1000
+
1001
+ new_statuses.append(new_status)
1002
+
1003
+ log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
1004
+ return new_statuses
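
The cache consulted above is a flat mapping from a "chrom:pos:ref:alt" key to the reference ALT allele frequency, with None marking a key known to have no usable frequency. A minimal illustration of the expected shape (all values made up):

    import pandas as pd

    cache = {"1:10583:G:A": 0.14,   # variant found in the reference VCF
             "1:10611:C:G": None}   # key present but no usable frequency

    df = pd.DataFrame([["1", 10583, "G", "A", 0.12, "9999999"]])  # CHR, POS, NEA, EA, EAF, STATUS
    statuses = check_strand_status_cache(df, cache, chr_dict=None)
    # -> ["9999991"]: EAF and reference ALT AF agree (both < 0.5), so + strand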
+
+
+ def check_unknown_indel(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,status,chr_dict=get_number_to_chr(),daf_tolerance=0.2):
+     ### input : unknown indel, both alleles on the genome (xx1[45]x)
+     ### 3 : no flip needed
+     ### 4 : unknown indel, fixed (6->5)
+     ### 6 : need to flip
+     ### 8 : no reference data / no match
+     if chr_dict is not None: chr=chr_dict[chr]
+     status_pre=status[:6]
+     status_end=""
+
+     try:
+         chr_seq = vcf_reader.fetch(chr,start,end)
+     except Exception:
+         return status_pre+"8"+status_end
+
+     for record in chr_seq:
+         if record.pos==end and record.ref==ref and (alt in record.alts):
+             if abs(record.info[alt_freq][0] - eaf)<daf_tolerance:
+                 return status_pre+"3"+status_end
+         elif record.pos==end and record.ref==alt and (ref in record.alts):
+             if abs(record.info[alt_freq][0] - (1 - eaf))<daf_tolerance:
+                 return status_pre+"6"+status_end
+
+     return status_pre+"8"+status_end
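
Indistinguishable indels are resolved by frequency: a reference record counts as a match if its ALT frequency is within daf_tolerance of the sumstats EAF (unflipped orientation), or of 1 - EAF (flipped orientation). Worked arithmetic with the default tolerance:

    daf_tolerance = 0.2
    eaf = 0.30

    assert abs(0.35 - eaf) < daf_tolerance          # reference AF 0.35 matches as-is   -> status "3"
    assert abs(0.72 - (1 - eaf)) < daf_tolerance    # reference AF 0.72 matches flipped -> status "6"
    assert not abs(0.72 - eaf) < daf_tolerance      # no match in the unflipped orientation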
+
+
+ def check_unknown_indel_cache(data,cache,ref_infer=None,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+     if not trust_cache:
+         assert ref_infer is not None, "If trust_cache is False, ref_infer must be provided"
+         log.warning("You are not trusting the cache; this will slow down the process. Please consider building a complete cache.")
+
+     if ref_infer is not None:
+         vcf_reader = VariantFile(ref_infer)
+
+     if isinstance(data, pd.DataFrame):
+         data = data.values
+
+     in_cache = 0
+     new_statuses = []
+
+     for i in range(data.shape[0]):
+         _chrom, pos, ref, alt, eaf, status = data[i]
+         chrom = _chrom
+
+         if chr_dict is not None: chrom=chr_dict[chrom]
+         start = pos - 1
+         end = pos
+
+         status_pre=status[:6]
+         status_end=""
+
+         new_status = status_pre+"8"+status_end # default value
+
+         cache_key_ref_alt = f"{chrom}:{pos}:{ref}:{alt}"
+         cache_key_alt_ref = f"{chrom}:{pos}:{alt}:{ref}"
+
+         if cache_key_ref_alt in cache:
+             in_cache += 1
+             record = cache[cache_key_ref_alt]
+             if record is None:
+                 new_status = status_pre+"8"+status_end
+             else:
+                 if abs(record - eaf)<daf_tolerance:
+                     new_status = status_pre+"3"+status_end
+         elif cache_key_alt_ref in cache:
+             in_cache += 1
+             record = cache[cache_key_alt_ref]
+             if record is None:
+                 new_status = status_pre+"8"+status_end
+             else:
+                 if abs(record - (1 - eaf))<daf_tolerance:
+                     new_status = status_pre+"6"+status_end
+         else:
+             if not trust_cache:
+                 # The cache may be incomplete, so fall back to reading the variant directly from the VCF file
+                 new_status = check_unknown_indel(_chrom, start, end, ref, alt, eaf, vcf_reader, ref_alt_freq, status, chr_dict, daf_tolerance)
+
+         new_statuses.append(new_status)
+
+     log.write(f" -Elements in cache: {in_cache}", verbose=verbose)
+     return new_statuses
+
+
+ def get_reverse_complementary_allele(a):
+     dic = str.maketrans({
+         "A":"T",
+         "T":"A",
+         "C":"G",
+         "G":"C"})
+     return a[::-1].translate(dic)
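
This works on multi-base alleles too, since the string is reversed before each base is complemented:

    get_reverse_complementary_allele("A")     # -> "T"
    get_reverse_complementary_allele("ACG")   # -> "CGT" (reverse to "GCA", then complement)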
+
+ def is_palindromic(sumstats,a1="EA",a2="NEA"):
+     gc = (sumstats[a1]=="G") & (sumstats[a2]=="C")
+     cg = (sumstats[a1]=="C") & (sumstats[a2]=="G")
+     at = (sumstats[a1]=="A") & (sumstats[a2]=="T")
+     ta = (sumstats[a1]=="T") & (sumstats[a2]=="A")
+     palindromic = gc | cg | at | ta
+     return palindromic
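
A quick illustration of the returned boolean mask:

    import pandas as pd

    df = pd.DataFrame({"EA": ["A", "A"], "NEA": ["T", "G"]})
    list(is_palindromic(df))   # -> [True, False]: A/T is palindromic, A/G is not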
+ ##################################################################################################################################################
+ #single df assignment
+
+ def check_strand(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS"):
+     vcf_reader = VariantFile(ref_infer)
+     status_part = sumstats.apply(lambda x:check_strand_status(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict),axis=1)
+     return status_part
+
+ def check_strand_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),trust_cache=True,log=Log(),verbose=True):
+     assert cache is not None, "Cache must be provided"
+     status_part = check_strand_status_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,trust_cache,log,verbose)
+     return status_part
+
+ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=get_number_to_chr(),status="STATUS",daf_tolerance=0.2):
+     vcf_reader = VariantFile(ref_infer)
+     status_part = sumstats.apply(lambda x:check_unknown_indel(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf_reader,ref_alt_freq,x.iloc[5],chr_dict,daf_tolerance),axis=1)
+     return status_part
+
+ def check_indel_cache(sumstats,cache,ref_infer,ref_alt_freq=None,chr_dict=get_number_to_chr(),daf_tolerance=0.2,trust_cache=True,log=Log(),verbose=True):
+     assert cache is not None, "Cache must be provided"
+     status_part = check_unknown_indel_cache(sumstats,cache,ref_infer,ref_alt_freq,chr_dict,daf_tolerance,trust_cache,log,verbose)
+     return status_part
+
+ ##################################################################################################################################################
+
+ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,daf_tolerance=0.20,remove_snp="",mode="pi",n_cores=1,remove_indel="",
+                         chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
+                         chr_dict=None,cache_options={},verbose=True,log=Log()):
+     '''
+     Args:
+         cache_options : A dictionary with the following keys:
+             - cache_manager: CacheManager object or None. If either cache_loader or cache_process is not None, or use_cache is True, a CacheManager object will be created automatically.
+             - trust_cache: bool (optional, default: True). Whether to trust the cache completely. Trusting the cache means that any key not found in the cache is treated as missing from the VCF file as well.
+             - cache_loader: Object with a get_cache() method, or None.
+             - cache_process: Object with an apply_fn() method, or None.
+             - use_cache: bool (optional, default: False). If any of cache_manager, cache_loader or cache_process is not None, this is set to True automatically.
+               If set to True while cache_manager, cache_loader and cache_process are all None, the cache will be loaded (or built) on the spot.
+
+         A cache_loader or cache_process object lets the caller pass in a custom object that already holds the cache. This is useful when the cache is loaded in the background in another thread/process while other operations run.
+         The cache_manager is a CacheManager object that exposes the API for interacting with the cache. See the usage sketch after this function.
+     '''
+
+     ##start function with col checking##########################################################
+     _start_line = "infer strand for palindromic SNPs/align indistinguishable indels"
+     _end_line = "inferring strand for palindromic SNPs/align indistinguishable indels"
+     _start_cols = [chr,pos,ref,alt,eaf,status]
+     _start_function = ".infer_strand()"
+     _must_args ={"ref_alt_freq":ref_alt_freq}
+
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               n_cores=n_cores,
+                               ref_vcf=ref_infer,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+
+     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+
+     # Setup cache variables
+     cache_manager = cache_options.get("cache_manager", None)
+     if cache_manager is not None:
+         assert isinstance(cache_manager, CacheManager), "cache_manager must be a CacheManager object"
+     trust_cache = cache_options.get("trust_cache", True)
+     cache_loader = cache_options.get("cache_loader", None)
+     cache_process = cache_options.get("cache_process", None)
+     use_cache = any(c is not None for c in [cache_manager, cache_loader, cache_process]) or cache_options.get('use_cache', False)
+     _n_cores = n_cores # backup n_cores
+
+     log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
+
+     if "p" in mode:
+         ## checking \w\w\w[0][0]\w\w -> standardized and normalized SNP
+         good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
+         palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
+         not_palindromic_snp = good_chrpos & (~palindromic)
+
+         ## not palindromic : set the 7th status digit to 0
+         sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
+         log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)
+
+         # palindromic SNPs whose MAF is too close to 0.5 cannot be inferred
+         maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
+         sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
+
+         # palindromic SNPs with UNKNOWN or UNCHECKED status
+         unknown_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
+         unknown_palindromic_to_check = palindromic & maf_can_infer & unknown_palindromic
+
+         log.write(" -After filtering by MAF < {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknown_palindromic_to_check)),verbose=verbose)
+
+         #########################################################################################
+         if sum(unknown_palindromic_to_check)>0:
+             if sum(unknown_palindromic_to_check)<10000:
+                 n_cores=1
+
+             if use_cache and cache_manager is None:
+                 cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                              ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                              n_cores=_n_cores, log=log, verbose=verbose)
+
+             log.write(" -Starting strand inference for palindromic SNPs...",verbose=verbose)
+             df_to_check = sumstats.loc[unknown_palindromic_to_check,[chr,pos,ref,alt,eaf,status]]
+
+             if use_cache and cache_manager.cache_len > 0:
+                 log.write(" -Using cache for strand inference",verbose=verbose)
+                 status_inferred = cache_manager.apply_fn(check_strand_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, trust_cache=trust_cache, log=log, verbose=verbose)
+                 sumstats.loc[unknown_palindromic_to_check,status] = status_inferred
+             else:
+                 df_split = _df_split(df_to_check, n_cores)
+                 pool = Pool(n_cores)
+                 map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+                 status_inferred = pd.concat(pool.map(map_func,df_split))
+                 sumstats.loc[unknown_palindromic_to_check,status] = status_inferred.values
+                 pool.close()
+                 pool.join()
+             log.write(" -Finished strand inference.",verbose=verbose)
+         else:
+             log.warning("No palindromic variants available for checking.")
+         #########################################################################################
+         # Status digit legend (7th digit):
+         # 0 Not palindromic SNP
+         # 1 Palindromic, + strand -> no need to flip
+         # 2 Palindromic, - strand -> need to flip -> fixed
+         # 3 Indel, no need to flip
+         # 4 Unknown indel -> fixed
+         # 5 Palindromic, - strand -> need to flip
+         # 6 Indel, need to flip
+         # 7 Indistinguishable
+         # 8 Not matching or no information
+         # 9 Unchecked
+
+         status0 = sumstats[status].str.match(r'\w\w\w\w\w\w[0]', case=False, flags=0, na=False)
+         status1 = sumstats[status].str.match(r'\w\w\w\w\w\w[1]', case=False, flags=0, na=False)
+         status5 = sumstats[status].str.match(r'\w\w\w\w\w\w[5]', case=False, flags=0, na=False)
+         status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
+         status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
+
+         log.write(" -Non-palindromic SNPs : ",sum(status0),verbose=verbose)
+         log.write(" -Palindromic SNPs on + strand : ",sum(status1),verbose=verbose)
+         log.write(" -Palindromic SNPs on - strand that need to be flipped : ",sum(status5),verbose=verbose)
+         log.write(" -Palindromic SNPs whose MAF is not available to infer : ",sum(status7),verbose=verbose)
+         log.write(" -Palindromic SNPs with no matches or no information : ",sum(status8),verbose=verbose)
+
+         if ("7" in remove_snp) and ("8" in remove_snp):
+             log.write(" -Palindromic SNPs with MAF not available to infer and with no matches or no information will be removed",verbose=verbose)
+             sumstats = sumstats.loc[~(status7 | status8),:].copy()
+         elif "8" in remove_snp:
+             log.write(" -Palindromic SNPs with no matches or no information will be removed",verbose=verbose)
+             sumstats = sumstats.loc[~status8,:].copy()
+         elif "7" in remove_snp:
+             log.write(" -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
+             sumstats = sumstats.loc[~status7,:].copy()
+
+     ### unknown indels
+     if "i" in mode:
+         unknown_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
+         log.write(" -Identified ", sum(unknown_indel)," indistinguishable indels...",verbose=verbose)
+         if sum(unknown_indel)>0:
+             log.write(" -Indistinguishable indels will be inferred from reference VCF REF and ALT...",verbose=verbose)
+             log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)
+             #########################################################################################
+             if sum(unknown_indel)<10000:
+                 n_cores=1
+
+             if use_cache and cache_manager is None:
+                 cache_manager = CacheManager(base_path=ref_infer, cache_loader=cache_loader, cache_process=cache_process,
+                                              ref_alt_freq=ref_alt_freq, category=PALINDROMIC_INDEL,
+                                              n_cores=_n_cores, log=log, verbose=verbose)
+
+             log.write(" -Starting indistinguishable indel inference...",verbose=verbose)
+             df_to_check = sumstats.loc[unknown_indel,[chr,pos,ref,alt,eaf,status]]
+
+             if use_cache and cache_manager.cache_len > 0:
+                 log.write(" -Using cache for indel inference",verbose=verbose)
+                 status_inferred = cache_manager.apply_fn(check_indel_cache, sumstats=df_to_check, ref_infer=ref_infer, ref_alt_freq=ref_alt_freq, chr_dict=chr_dict, daf_tolerance=daf_tolerance, trust_cache=trust_cache, log=log, verbose=verbose)
+                 sumstats.loc[unknown_indel,status] = status_inferred
+             else:
+                 df_split = _df_split(df_to_check, n_cores)
+                 pool = Pool(n_cores)
+                 map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
+                 status_inferred = pd.concat(pool.map(map_func,df_split))
+                 sumstats.loc[unknown_indel,status] = status_inferred.values
+                 pool.close()
+                 pool.join()
+             log.write(" -Finished indistinguishable indel inference.",verbose=verbose)
+
+             #########################################################################################
+
+             status3 = sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
+             status6 = sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
+             status8 = sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)
+
+             log.write(" -Indels with EA/NEA matching the reference : ",sum(status3),verbose=verbose)
+             log.write(" -Indels with EA/NEA that need to be flipped : ",sum(status6),verbose=verbose)
+             log.write(" -Indels with no matches or no information : ",sum(status8),verbose=verbose)
+             if "8" in remove_indel:
+                 log.write(" -Indels with no matches or no information will be removed",verbose=verbose)
+                 sumstats = sumstats.loc[~status8,:].copy()
+         else:
+             log.warning("No indistinguishable indels available for checking.")
+
+     finished(log,verbose,_end_line)
+     return sumstats
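
A minimal sketch of how a caller might wire up cache_options for this function; the VCF path and the "AF" INFO field name are placeholders, and the key semantics are those described in the docstring above:

    cache_options = {
        "use_cache": True,     # build or load the cache on the spot
        "trust_cache": False,  # fall back to the VCF for keys missing from the cache
    }

    sumstats = parallelinferstrand(sumstats,
                                   ref_infer="ref.ALT_AF.vcf.gz",  # hypothetical reference VCF
                                   ref_alt_freq="AF",
                                   mode="pi",
                                   n_cores=4,
                                   cache_options=cache_options)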
+
+
+ ################################################################################################################
+ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
+     ##start function with col checking##########################################################
+     _start_line = "check the difference between EAF (sumstats) and ALT frequency (reference VCF)"
+     _end_line = "checking the difference between EAF (sumstats) and ALT frequency (reference VCF)"
+     _start_cols = [chr,pos,ref,alt,eaf,status]
+     _start_function = ".check_daf()"
+     _must_args ={"ref_alt_freq":ref_alt_freq}
+
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               n_cores=n_cores,
+                               ref_vcf=ref_infer,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+
+     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+
+     column_name = column_name + suffix
+
+     # ref_alt_freq INFO field in the VCF was provided
+     if ref_alt_freq is not None:
+         log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
+         if not force:
+             good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
+         log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
+         sumstats[column_name]=np.nan
+
+         ########################
+         if sum(~sumstats[eaf].isna())<10000:
+             n_cores=1
+         df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
+         pool = Pool(n_cores)
+         if sum(~sumstats[eaf].isna())>0:
+             map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
+             sumstats.loc[good_chrpos,[column_name]] = pd.concat(pool.map(map_func,df_split))
+         pool.close()
+         pool.join()
+         ###########################
+         log.write(" -Difference in allele frequency (DAF) = EAF (sumstats) - ALT_AF (reference VCF)", verbose=verbose)
+         log.write(" -Note: this DAF is not the derived allele frequency.", verbose=verbose)
+         log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
+         log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
+         log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
+         log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
+         log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
+         log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
+
+     finished(log,verbose,_end_line)
+     return sumstats
+
+ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
+     vcf_reader = VariantFile(ref_infer)
+     def afapply(x,vcf,alt_freq,chr_dict):
+         return check_daf(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],x.iloc[4],vcf,alt_freq,chr_dict)
+     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
+     status_inferred = sumstats.apply(map_func,axis=1)
+     sumstats[column_name] = status_inferred.values
+     sumstats[column_name] = sumstats[column_name].astype("float")
+     return sumstats
+
+ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
+     if chr_dict is not None: chr=chr_dict[chr]
+     chr_seq = vcf_reader.fetch(chr,start,end)
+
+     for record in chr_seq:
+         if record.pos==end:
+             if record.ref==ref and (alt in record.alts):
+                 return eaf - record.info[alt_freq][0]
+     return np.nan
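
As a quick worked example of the sign convention: a variant with sumstats EAF = 0.30 whose reference ALT frequency is 0.28 gets a positive DAF, and consistently large |DAF| values usually point to allele or build mismatches.

    eaf, ref_alt_af = 0.30, 0.28
    daf = eaf - ref_alt_af   # +0.02: sumstats frequency slightly above the reference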
+ ################################################################################################################
+
+ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
+     ##start function with col checking##########################################################
+     _start_line = "infer sumstats EAF using reference VCF ALT frequency"
+     _end_line = "inferring sumstats EAF using reference VCF ALT frequency"
+     _start_cols = [chr,pos,ref,alt,status]
+     _start_function = ".infer_af()"
+     _must_args ={"ref_alt_freq":ref_alt_freq}
+
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               n_cores=n_cores,
+                               ref_vcf=ref_infer,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+
+     if eaf not in sumstats.columns:
+         sumstats[eaf]=np.nan
+
+     prenumber = sum(sumstats[eaf].isna())
+
+     # ref_alt_freq INFO field in the VCF was provided
+     if ref_alt_freq is not None:
+         log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
+         if not force:
+             good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
+         log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
+
+         ########################
+         if sum(sumstats[eaf].isna())<10000:
+             n_cores=1
+         df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+         pool = Pool(n_cores)
+         map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+         sumstats.loc[good_chrpos,[eaf]] = pd.concat(pool.map(map_func,df_split))
+         pool.close()
+         pool.join()
+         ###########################
+
+         afternumber = sum(sumstats[eaf].isna())
+         log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
+         log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
+
+     finished(log,verbose,_end_line)
+     return sumstats
+
+ def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
+     vcf_reader = VariantFile(ref_infer)
+     def afapply(x,vcf,alt_freq,chr_dict):
+         return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf,alt_freq,chr_dict)
+     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
+     status_inferred = sumstats.apply(map_func,axis=1)
+     sumstats[eaf] = status_inferred.values
+     sumstats[eaf] = sumstats[eaf].astype("float")
+     return sumstats
+
+ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
+     if chr_dict is not None: chr=chr_dict[chr]
+     chr_seq = vcf_reader.fetch(chr,start,end)
+
+     for record in chr_seq:
+         if record.pos==end:
+             if record.ref==ref and (alt in record.alts):
+                 return record.info[alt_freq][0]
+             elif record.ref==alt and (ref in record.alts):
+                 return 1 - record.info[alt_freq][0]
+     return np.nan
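
When the sumstats EA/NEA orientation is reversed relative to the reference REF/ALT, the ALT frequency is flipped so the returned value is still the effect allele frequency. Worked numbers (illustrative):

    # reference VCF record: REF=A, ALT=G, ALT AF = 0.80
    # sumstats row:         EA=A, NEA=G  (EA matches REF, not ALT)
    eaf = 1 - 0.80   # flip the ALT frequency -> inferred EAF = 0.20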
+ ##############################################################################################################################################################################################
+
+ ################################################################################################################
+
+ def _paralleleinferafwithmaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",
+                              eaf="EAF",maf="MAF",ref_eaf="_REF_EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
+     ##start function with col checking##########################################################
+     _start_line = "infer sumstats EAF from sumstats MAF using reference VCF ALT frequency"
+     _end_line = "inferring sumstats EAF from sumstats MAF using reference VCF ALT frequency"
+     _start_cols = [chr,pos,ref,alt,status]
+     _start_function = ".infer_af()"
+     _must_args ={"ref_alt_freq":ref_alt_freq}
+
+     is_enough_info = start_to(sumstats=sumstats,
+                               log=log,
+                               verbose=verbose,
+                               start_line=_start_line,
+                               end_line=_end_line,
+                               start_cols=_start_cols,
+                               start_function=_start_function,
+                               n_cores=n_cores,
+                               ref_vcf=ref_infer,
+                               **_must_args)
+     if is_enough_info == False: return sumstats
+     ############################################################################################
+     chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+
+     if eaf not in sumstats.columns:
+         sumstats[eaf]=np.nan
+     if ref_eaf not in sumstats.columns:
+         sumstats[ref_eaf]=np.nan
+
+     prenumber = sum(sumstats[eaf].isna())
+
+     # ref_alt_freq INFO field in the VCF was provided
+     if ref_alt_freq is not None:
+         log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
+         if not force:
+             good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
+         log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
+
+         ########################
+         # extract the reference ALT allele frequency
+         if sum(sumstats[eaf].isna())<10000:
+             n_cores=1
+         df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
+         pool = Pool(n_cores)
+         map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=ref_eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
+         sumstats.loc[good_chrpos,[ref_eaf]] = pd.concat(pool.map(map_func,df_split))
+         pool.close()
+         pool.join()
+
+         ###########################
+         # infer sumstats EAF based on sumstats MAF and the reference EAF
+         is_flipped = ((sumstats[ref_eaf]>=0.5)&(sumstats[maf]<=0.5)) | ((sumstats[ref_eaf]<0.5)&(sumstats[maf]>0.5))
+         sumstats[eaf] = sumstats[maf]
+         log.write(" -Flipping MAF to obtain EAF for {} variants".format(sum(is_flipped)),verbose=verbose)
+         sumstats.loc[is_flipped,eaf] = 1 - sumstats.loc[is_flipped,maf]
+
+         ###########################
+         afternumber = sum(sumstats[eaf].isna())
+         log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
+         log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
+         sumstats = sumstats.drop(columns=[ref_eaf])
+
+     finished(log,verbose,_end_line)
+     return sumstats
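
The MAF-to-EAF rule above uses the reference ALT frequency only to decide orientation: the MAF is flipped exactly when it and the reference frequency fall on opposite sides of 0.5. A minimal standalone restatement of that rule (the function name is illustrative):

    def maf_to_eaf(maf, ref_eaf):
        # flip when the reference EAF and the MAF sit on opposite sides of 0.5
        if (ref_eaf >= 0.5 and maf <= 0.5) or (ref_eaf < 0.5 and maf > 0.5):
            return 1 - maf
        return maf

    assert maf_to_eaf(0.10, 0.85) == 0.90   # flipped
    assert maf_to_eaf(0.10, 0.15) == 0.10   # kept as-is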
+
+ ##############################################################################################################################################################################################
+ def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
+     if vcf_path is not None:
+         if vcf_chr_dict is None:
+             log.write(" -Checking chromosome notations in VCF/BCF files...",verbose=verbose)
+             vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
+             if vcf_chr_dict is not None:
+                 return vcf_chr_dict
+             log.write(" -Checking prefix for chromosomes in VCF/BCF files...",verbose=verbose)
+             prefix = check_vcf_chr_prefix(vcf_path, log, verbose)
+             if prefix is not None:
+                 log.write(" -Prefix for chromosomes: ",prefix,verbose=verbose)
+                 vcf_chr_dict = get_number_to_chr(prefix=prefix)
+             else:
+                 log.write(" -No prefix for chromosomes in the VCF/BCF files.",verbose=verbose)
+                 vcf_chr_dict = get_number_to_chr()
+     return vcf_chr_dict
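
The returned dictionary maps gwaslab's numeric chromosome codes to the notation used in the reference file. An illustrative call, assuming get_number_to_chr follows gwaslab's usual numeric-to-string convention:

    vcf_chr_dict = auto_check_vcf_chr_dict("ref.vcf.gz", None, True, Log())  # path is a placeholder
    # e.g. {1: "chr1", 2: "chr2", ...} when a "chr" prefix is detected in the contig names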
+
+ def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
+     vcf_bcf = VariantFile(vcf_bcf_path)
+     for i in list(vcf_bcf.header.contigs):
+         m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
+         if m is not None:
+             return m.group(1)
+     # check all contigs before concluding that no prefix is used
+     return None
+
+ def check_vcf_chr_NC(vcf_bcf_path,log,verbose):
+     vcf_bcf = VariantFile(vcf_bcf_path)
+     for i in list(vcf_bcf.header.contigs):
+         if i in get_number_to_NC(build="19").values():
+             log.write(" -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
+             return get_number_to_NC(build="19")
+         elif i in get_number_to_NC(build="38").values():
+             log.write(" -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
+             return get_number_to_NC(build="38")
+     # check all contigs before concluding that RefSeq notation is not used
+     return None