gwaslab 3.5.7__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (67) hide show
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/data/reference.json +3 -1
  6. gwaslab/g_Sumstats.py +110 -25
  7. gwaslab/g_SumstatsMulti.py +287 -0
  8. gwaslab/g_SumstatsPair.py +101 -16
  9. gwaslab/g_Sumstats_polars.py +245 -0
  10. gwaslab/g_headers.py +12 -3
  11. gwaslab/g_meta.py +124 -47
  12. gwaslab/g_meta_update.py +48 -0
  13. gwaslab/g_vchange_status_polars.py +44 -0
  14. gwaslab/g_version.py +2 -2
  15. gwaslab/hm_casting.py +169 -110
  16. gwaslab/hm_casting_polars.py +202 -0
  17. gwaslab/hm_harmonize_sumstats.py +19 -8
  18. gwaslab/io_load_ld.py +529 -0
  19. gwaslab/io_preformat_input.py +11 -0
  20. gwaslab/io_preformat_input_polars.py +632 -0
  21. gwaslab/io_process_args.py +25 -1
  22. gwaslab/io_read_ldsc.py +34 -3
  23. gwaslab/io_read_pipcs.py +62 -6
  24. gwaslab/prscs_gigrnd.py +122 -0
  25. gwaslab/prscs_mcmc_gtb.py +136 -0
  26. gwaslab/prscs_parse_genet.py +98 -0
  27. gwaslab/qc_build.py +53 -0
  28. gwaslab/qc_check_datatype.py +10 -8
  29. gwaslab/qc_check_datatype_polars.py +128 -0
  30. gwaslab/qc_fix_sumstats.py +25 -23
  31. gwaslab/qc_fix_sumstats_polars.py +193 -0
  32. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  33. gwaslab/util_ex_gwascatalog.py +71 -28
  34. gwaslab/util_ex_infer_ancestry.py +65 -0
  35. gwaslab/util_ex_ldsc.py +67 -21
  36. gwaslab/util_ex_match_ldmatrix.py +396 -0
  37. gwaslab/util_ex_run_2samplemr.py +0 -2
  38. gwaslab/util_ex_run_ccgwas.py +155 -0
  39. gwaslab/util_ex_run_coloc.py +1 -1
  40. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  41. gwaslab/util_ex_run_magma.py +74 -0
  42. gwaslab/util_ex_run_mesusie.py +155 -0
  43. gwaslab/util_ex_run_mtag.py +92 -0
  44. gwaslab/util_ex_run_prscs.py +85 -0
  45. gwaslab/util_ex_run_susie.py +40 -9
  46. gwaslab/util_in_estimate_ess.py +18 -0
  47. gwaslab/util_in_fill_data.py +20 -1
  48. gwaslab/util_in_filter_value.py +10 -5
  49. gwaslab/util_in_get_sig.py +71 -13
  50. gwaslab/util_in_meta.py +168 -4
  51. gwaslab/util_in_meta_polars.py +174 -0
  52. gwaslab/viz_aux_annotate_plot.py +13 -2
  53. gwaslab/viz_plot_compare_effect.py +87 -23
  54. gwaslab/viz_plot_credible_sets.py +55 -11
  55. gwaslab/viz_plot_effect.py +22 -12
  56. gwaslab/viz_plot_miamiplot2.py +3 -2
  57. gwaslab/viz_plot_mqqplot.py +94 -84
  58. gwaslab/viz_plot_qqplot.py +9 -7
  59. gwaslab/viz_plot_regional2.py +2 -1
  60. gwaslab/viz_plot_stackedregional.py +4 -1
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/METADATA +46 -68
  62. gwaslab-3.6.0.dist-info/RECORD +119 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/WHEEL +1 -1
  64. gwaslab-3.5.7.dist-info/RECORD +0 -96
  65. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE +0 -0
  66. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  67. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/top_level.txt +0 -0
gwaslab/hm_casting.py CHANGED
@@ -11,20 +11,38 @@ from gwaslab.util_in_fill_data import filldata
11
11
  from Bio import SeqIO
12
12
  from itertools import combinations
13
13
 
14
- def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
14
+ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None,add_raw_index=False, stats_cols1=None, stats_cols2=None,
15
+ windowsizeb=10,
16
+ log=Log(),
17
+ suffixes=("_MOLD",""),
18
+ merge_mode="inner",
19
+ verbose=True,
20
+ return_not_matched_mold =False):
15
21
 
16
-
22
+ log.write("Start to merge sumstats...", verbose=verbose)
23
+ if merge_mode=="outer":
24
+ sumstats = sumstats.rename(columns={
25
+ "SNPID":"_SNPID_RIGHT",
26
+ "rsID":"_rsID_RIGHT"
27
+ })
28
+
29
+ # drop old ids
17
30
  cols_to_drop = []
18
31
  for i in sumstats.columns:
19
32
  if i in ["SNPID","rsID"]:
20
- cols_to_drop.append(i)
21
-
22
- log.write("Start to merge sumstats...", verbose=verbose)
23
-
33
+ cols_to_drop.append(i)
24
34
  if len(cols_to_drop)>0:
25
35
  log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
26
36
  sumstats = sumstats.drop(columns=cols_to_drop)
27
-
37
+
38
+
39
+ if add_raw_index==True:
40
+ index1= "_INDEX" + suffixes[0]
41
+ index2= "_INDEX" + suffixes[1]
42
+ mold[index1] = mold.index
43
+ sumstats[index2] = sumstats.index
44
+
45
+
28
46
  if ref_path is not None :
29
47
  # index for checking removed variants
30
48
  index1= "_INDEX" + suffixes[0]
@@ -32,11 +50,35 @@ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsiz
32
50
  mold[index1] = range(len(mold))
33
51
  sumstats[index2] = range(len(sumstats))
34
52
 
35
- if return_not_matched_mold:
36
- mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
53
+ #if return_not_matched_mold:
54
+ # mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
55
+ # sumstats["_IDENTIFIER_FOR_VARIANT2"] = range(len(sumstats))
37
56
 
38
57
  # mold sumffix + mold
39
- mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
58
+ mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how=merge_mode,suffixes=suffixes)
59
+
60
+ if merge_mode=="outer":
61
+ is_temp_na = mold_sumstats["EA_1"].isna()
62
+ log.write(" -Detected {} variants not in the template...".format(sum(is_temp_na)), verbose=verbose)
63
+
64
+ mold_sumstats["EA_1"] = mold_sumstats["EA_1"].astype("string")
65
+ mold_sumstats["NEA_1"] = mold_sumstats["NEA_1"].astype("string")
66
+ mold_sumstats["EA"] = mold_sumstats["EA"].astype("string")
67
+ mold_sumstats["NEA"] = mold_sumstats["NEA"].astype("string")
68
+
69
+ # for variants not in template, copy snp info
70
+ mold_sumstats.loc[is_temp_na, ["SNPID","EA_1","NEA_1","STATUS_1"]] = mold_sumstats.loc[is_temp_na, ["_SNPID_RIGHT","EA","NEA","STATUS"]].values
71
+
72
+ #
73
+ if "_rsID_RIGHT" in mold_sumstats.columns:
74
+ mold_sumstats.loc[is_temp_na, "rsID"] = mold_sumstats.loc[is_temp_na, "_rsID_RIGHT"].values
75
+
76
+
77
+ # for variants not in right sumstats, copy snp info
78
+ is_temp_na_2 = mold_sumstats["EA"].isna()
79
+ mold_sumstats.loc[is_temp_na_2, ["EA","NEA"]] = mold_sumstats.loc[is_temp_na_2, ["EA_1","NEA_1"]].values
80
+ mold_sumstats = mold_sumstats.drop(columns=["_SNPID_RIGHT"])
81
+
40
82
  log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
41
83
 
42
84
  mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
@@ -51,10 +93,17 @@ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsiz
51
93
  # mold_sumstats.drop(columns=["_INDEX",""])
52
94
 
53
95
  if return_not_matched_mold == True:
54
- sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
55
- sumstats1= sumstats1.drop(columns=["_IDENTIFIER_FOR_VARIANT"])
56
- mold_sumstats= mold_sumstats.drop(columns=["_IDENTIFIER_FOR_VARIANT"])
57
- return mold_sumstats, sumstats1
96
+
97
+ sumstats1 = mold.loc[~mold["_RAW_INDEX_1"].isin(mold_sumstats["_RAW_INDEX_1"]),:]
98
+ sumstats1 = sumstats1.drop(columns=["_RAW_INDEX_1"])
99
+ sumstats1 = _renaming_cols_r(sumstats1, stats_cols1 +["EA","NEA"],suffix="_1", verbose=False)
100
+
101
+ sumstats2 = sumstats.loc[~sumstats["_RAW_INDEX_2"].isin(mold_sumstats["_RAW_INDEX_2"]),:]
102
+ sumstats2 = sumstats2.drop(columns=["_RAW_INDEX_2"])
103
+
104
+ mold_sumstats= mold_sumstats.drop(columns=["_RAW_INDEX_1","_RAW_INDEX_2"])
105
+
106
+ return mold_sumstats, sumstats1, sumstats2
58
107
 
59
108
  return mold_sumstats
60
109
 
@@ -117,6 +166,16 @@ def _renaming_cols(sumstats, columns, log=Log(),verbose=True, suffixes=("_1","_2
117
166
  log.write(" -Renaming sumstats2 columns by adding suffix {}".format(suffixes[1]),verbose=verbose)
118
167
  return sumstats
119
168
 
169
def _renaming_cols_r(sumstats, columns, log=Log(),verbose=True, suffix=""):
    """Strip ``suffix`` from the listed columns of a DataFrame.

    Parameters
    ----------
    sumstats : DataFrame
        Frame whose columns are renamed (``rename`` is called with the
        ``columns=`` keyword).
    columns : iterable of str
        Column names WITHOUT the suffix. Only names whose suffixed form
        (``name + suffix``) exists in ``sumstats`` are renamed.
    log : Log
        Logger that receives the progress message.
    verbose : bool
        Forwarded to ``log.write``.
    suffix : str
        Suffix to remove.

    Returns
    -------
    DataFrame
        The renamed frame returned by ``rename``.
    """
    mapping = {}
    for base_name in columns:
        suffixed_name = base_name + suffix
        if suffixed_name in sumstats.columns:
            mapping[suffixed_name] = base_name

    renamed = sumstats.rename(columns=mapping)
    log.write(" -Renaming sumstats columns by removing suffix {}".format(suffix),verbose=verbose)
    return renamed
178
+
120
179
  def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_order=None,suffixes=("_1","_2")):
121
180
  if stats_order is None:
122
181
  order = ["SNPID","rsID", "CHR", "POS", "EA", "NEA"]
@@ -158,99 +217,99 @@ def _assign_warning_code(sumstats, threshold=0.2, log=Log(),verbose=True):
158
217
  return sumstats
159
218
 
160
219
 
161
- def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=Log()):
162
-
163
- records = SeqIO.parse(ref_path, "fasta")
164
-
165
- chromlist = list(set(mold["CHR"].values) & set(sumstats["CHR"].values))
166
-
167
- for record in records:
168
- if len(chromlist) ==0:
169
- break
170
-
171
- if record is not None:
172
- ##############################################################################
173
- record_chr = int(str(record.id).strip("chrCHR").upper())
174
-
175
- if record_chr in chromlist:
176
- log.write(record_chr," ", end="",show_time=False,verbose=verbose)
177
- chromlist.remove(record_chr)
178
- else:
179
- continue
180
- ###############################################################################
181
- mold_chr = mold.loc[mold["CHR"]==record_chr,:]
182
- sumstats_chr = sumstats.loc[sumstats["CHR"]==record_chr,:]
183
-
184
- for index, row in sumstats_chr.iterrows():
185
- if len(row["EA"])>1 or len(row["NEA"])>1:
186
- is_in_variants_lista = (mold_chr["POS"] > row["POS"] - windowsizeb) & (mold_chr["POS"]< row["POS"] + windowsizeb)
187
-
188
- is_in_variants_listb = (sumstats_chr["POS"] > row["POS"] - windowsizeb) & (sumstats_chr["POS"]< row["POS"] + windowsizeb)
189
-
190
- if sum(is_in_variants_lista)>0 and sum(is_in_variants_listb)>0 and (sum(is_in_variants_lista) + sum(is_in_variants_listb) >2):
191
- variants_lista = mold.loc[is_in_variants_lista,:]
192
- variants_listb = sumstats.loc[is_in_variants_listb,:]
193
-
194
- refseq = record[row["POS"]-1 - windowsizeb: row["POS"] + windowsizeb].seq.upper()
195
- _match_single_variant(refseq, variants_lista, variants_listb, left_offset=row["POS"] - windowsizeb, windowsizeb=windowsizeb)
196
-
197
- def _match_single_variant(refseq, variants_lista, variants_listb, left_offset,windowsizeb):
198
-
199
-
200
- seta=set()
201
- setb=set()
202
-
203
- seta_pumutations=[]
204
- for i in range(1, len(variants_lista)+1):
205
- seta_pumutations+=combinations(variants_lista.index, i)
206
-
207
- for i in seta_pumutations:
208
- if _is_ref_overlap(variants_lista.loc[i,:],suffix="_MOLD"):
209
- continue
210
- else:
211
- seta = _form_haplotype(refseq, variants_lista.loc[i,:], seta, left_offset,suffix="_MOLD")
212
-
213
- setb_pumutations=[]
214
- for i in range(1,len(variants_listb)+1):
215
- setb_pumutations+=combinations(variants_listb.index, i)
216
- for i in setb_pumutations:
217
- if _is_ref_overlap(variants_listb.loc[i,:],suffix=""):
218
- continue
219
- else:
220
- setb = _form_haplotype(refseq, variants_listb.loc[i,:], setb, left_offset,suffix="")
221
-
222
- if len(seta & setb)>0:
223
- print("-Topmed--------------------------------")
224
- print(variants_lista[["CHR","POS","NEA_MOLD","EA_MOLD","EAF_MOLD"]])
225
- print("-Finngen--------------------------------")
226
- print(variants_listb[["CHR","POS","NEA","EA","EAF"]])
227
- print(refseq,left_offset)
228
- print("-set a--------------------------------")
229
- print(seta)
230
- print("-set b---------------------------------")
231
- print(setb)
232
- print("------------------------------------")
233
- print("maybe equivalent ########################################################################")
234
- a = seta & setb
235
- for i in a:
236
- print(i)
237
-
238
- def _is_ref_overlap(variants_list,suffix="_MOLD"):
239
- previous_end = 0
240
- for index, row in variants_list.iterrows():
241
- if row["POS"] <= previous_end:
242
- return True
243
- if row["POS"] + len(row["NEA"+suffix]) -1 > previous_end:
244
- previous_end = row["POS"] + len(row["NEA"+suffix]) -1
245
- return False
246
-
247
- def _form_haplotype(refseq, variants_list, haplotype_set, left_offset,suffix="_MOLD"):
248
- new_haplotype = ""
249
- lastpos = 0
250
- for index, row in variants_list.iterrows():
251
- new_haplotype += refseq[lastpos:row["POS"] - left_offset]
252
- new_haplotype += row["EA"+suffix]
253
- lastpos = row["POS"] + len(row["NEA"+suffix])- left_offset
254
- new_haplotype += refseq[lastpos:]
255
- haplotype_set.add(new_haplotype)
256
- return haplotype_set
220
+ #def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=Log()):
221
+ #
222
+ # records = SeqIO.parse(ref_path, "fasta")
223
+ #
224
+ # chromlist = list(set(mold["CHR"].values) & set(sumstats["CHR"].values))
225
+ #
226
+ # for record in records:
227
+ # if len(chromlist) ==0:
228
+ # break
229
+ #
230
+ # if record is not None:
231
+ # ##############################################################################
232
+ # record_chr = int(str(record.id).strip("chrCHR").upper())
233
+ #
234
+ # if record_chr in chromlist:
235
+ # log.write(record_chr," ", end="",show_time=False,verbose=verbose)
236
+ # chromlist.remove(record_chr)
237
+ # else:
238
+ # continue
239
+ # ###############################################################################
240
+ # mold_chr = mold.loc[mold["CHR"]==record_chr,:]
241
+ # sumstats_chr = sumstats.loc[sumstats["CHR"]==record_chr,:]
242
+ #
243
+ # for index, row in sumstats_chr.iterrows():
244
+ # if len(row["EA"])>1 or len(row["NEA"])>1:
245
+ # is_in_variants_lista = (mold_chr["POS"] > row["POS"] - windowsizeb) & (mold_chr["POS"]< row["POS"] + windowsizeb)
246
+ #
247
+ # is_in_variants_listb = (sumstats_chr["POS"] > row["POS"] - windowsizeb) & (sumstats_chr["POS"]< row["POS"] + windowsizeb)
248
+ #
249
+ # if sum(is_in_variants_lista)>0 and sum(is_in_variants_listb)>0 and (sum(is_in_variants_lista) + sum(is_in_variants_listb) >2):
250
+ # variants_lista = mold.loc[is_in_variants_lista,:]
251
+ # variants_listb = sumstats.loc[is_in_variants_listb,:]
252
+ #
253
+ # refseq = record[row["POS"]-1 - windowsizeb: row["POS"] + windowsizeb].seq.upper()
254
+ # _match_single_variant(refseq, variants_lista, variants_listb, left_offset=row["POS"] - windowsizeb, windowsizeb=windowsizeb)
255
+ #
256
+ #def _match_single_variant(refseq, variants_lista, variants_listb, left_offset,windowsizeb):
257
+ #
258
+ #
259
+ # seta=set()
260
+ # setb=set()
261
+ #
262
+ # seta_pumutations=[]
263
+ # for i in range(1, len(variants_lista)+1):
264
+ # seta_pumutations+=combinations(variants_lista.index, i)
265
+ #
266
+ # for i in seta_pumutations:
267
+ # if _is_ref_overlap(variants_lista.loc[i,:],suffix="_MOLD"):
268
+ # continue
269
+ # else:
270
+ # seta = _form_haplotype(refseq, variants_lista.loc[i,:], seta, left_offset,suffix="_MOLD")
271
+ #
272
+ # setb_pumutations=[]
273
+ # for i in range(1,len(variants_listb)+1):
274
+ # setb_pumutations+=combinations(variants_listb.index, i)
275
+ # for i in setb_pumutations:
276
+ # if _is_ref_overlap(variants_listb.loc[i,:],suffix=""):
277
+ # continue
278
+ # else:
279
+ # setb = _form_haplotype(refseq, variants_listb.loc[i,:], setb, left_offset,suffix="")
280
+ #
281
+ # if len(seta & setb)>0:
282
+ # print("-Topmed--------------------------------")
283
+ # print(variants_lista[["CHR","POS","NEA_MOLD","EA_MOLD","EAF_MOLD"]])
284
+ # print("-Finngen--------------------------------")
285
+ # print(variants_listb[["CHR","POS","NEA","EA","EAF"]])
286
+ # print(refseq,left_offset)
287
+ # print("-set a--------------------------------")
288
+ # print(seta)
289
+ # print("-set b---------------------------------")
290
+ # print(setb)
291
+ # print("------------------------------------")
292
+ # print("maybe equivalent ########################################################################")
293
+ # a = seta & setb
294
+ # for i in a:
295
+ # print(i)
296
+ #
297
+ #def _is_ref_overlap(variants_list,suffix="_MOLD"):
298
+ # previous_end = 0
299
+ # for index, row in variants_list.iterrows():
300
+ # if row["POS"] <= previous_end:
301
+ # return True
302
+ # if row["POS"] + len(row["NEA"+suffix]) -1 > previous_end:
303
+ # previous_end = row["POS"] + len(row["NEA"+suffix]) -1
304
+ # return False
305
+ #
306
+ #def _form_haplotype(refseq, variants_list, haplotype_set, left_offset,suffix="_MOLD"):
307
+ # new_haplotype = ""
308
+ # lastpos = 0
309
+ # for index, row in variants_list.iterrows():
310
+ # new_haplotype += refseq[lastpos:row["POS"] - left_offset]
311
+ # new_haplotype += row["EA"+suffix]
312
+ # lastpos = row["POS"] + len(row["NEA"+suffix])- left_offset
313
+ # new_haplotype += refseq[lastpos:]
314
+ # haplotype_set.add(new_haplotype)
315
+ # return haplotype_set
@@ -0,0 +1,202 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from gwaslab.g_Log import Log
4
+ from pandas.api.types import CategoricalDtype
5
+ from gwaslab.g_vchange_status import copy_status
6
+ from gwaslab.g_vchange_status_polars import vchange_statusp
7
+ from gwaslab.g_vchange_status_polars import copy_statusp
8
+ from gwaslab.qc_fix_sumstats import flipallelestats
9
+ from gwaslab.qc_check_datatype import check_datatype
10
+ from gwaslab.qc_fix_sumstats import start_to
11
+ from gwaslab.util_in_fill_data import filldata
12
+ from Bio import SeqIO
13
+ from itertools import combinations
14
+ import polars as pl
15
+
16
def _merge_mold_with_sumstats_by_chrposp(mold, sumstats, ref_path=None,add_raw_index=False, stats_cols1=None, stats_cols2=None,
                                         windowsizeb=10,
                                         log=Log(),
                                         suffixes=("_MOLD",""),
                                         merge_mode="full",
                                         verbose=True,
                                         return_not_matched_mold =False):
    """Join a template ("mold") frame with a second sumstats frame on CHR and POS (polars).

    Parameters
    ----------
    mold : polars.DataFrame
        Left side of the join. In ``"full"`` mode the joined frame is read
        through the columns ``SNPID``, ``EA_1``, ``NEA_1`` and ``STATUS_1``
        (and ``rsID`` when the right frame carried an rsID).
    sumstats : polars.DataFrame
        Right side of the join. In ``"full"`` mode its ``SNPID`` / ``rsID``
        are renamed to ``_SNPID_RIGHT`` / ``_rsID_RIGHT`` so they survive the
        join; in any other mode they are dropped. ``EA``, ``NEA`` and
        ``STATUS`` are read from it in ``"full"`` mode.
    merge_mode : str
        Forwarded as ``how=`` to ``DataFrame.join``.
    suffixes : tuple of str
        Only forwarded to ``_keep_variants_with_same_allele_setp``; the join
        itself uses the fixed suffix ``"_"``.
    log, verbose
        Logger and verbosity flag for progress messages.
    ref_path, add_raw_index, stats_cols1, stats_cols2, windowsizeb, return_not_matched_mold
        Accepted for signature parity but not used in this function body.

    Returns
    -------
    polars.DataFrame
        The joined frame after ``_keep_variants_with_same_allele_setp`` has
        removed rows whose allele sets do not match.
    """
    log.write("Start to merge sumstats...", verbose=verbose)
    if merge_mode=="full":
        # keep the right-hand IDs under temporary names; strict=False tolerates
        # a right frame that lacks one of them
        sumstats = sumstats.rename({
            "SNPID":"_SNPID_RIGHT",
            "rsID":"_rsID_RIGHT"
        }, strict=False)

    # drop old ids (nothing is left to drop after the rename in "full" mode)
    cols_to_drop = [i for i in sumstats.columns if i in ("SNPID","rsID")]
    if len(cols_to_drop)>0:
        log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
        # polars DataFrame.drop takes the names positionally; it has no
        # ``columns=`` keyword like pandas
        sumstats = sumstats.drop(cols_to_drop)

    ##################################################################################################################

    # mold suffix + mold
    mold_sumstats = mold.join(sumstats, on=["CHR","POS"], how=merge_mode, suffix="_", coalesce=True)

    if merge_mode=="full":
        # rows that exist only in the right frame have no template allele
        is_temp_na = mold_sumstats["EA_1"].is_null()
        log.write(" -Detected {} variants not in the template...".format(is_temp_na.sum()), verbose=verbose)

        for i in ["EA_1","NEA_1","EA","NEA"]:
            mold_sumstats = mold_sumstats.with_columns(pl.col(i).cast(pl.String).alias(i))

        # for variants not in template, copy snp info
        # (the four expressions read only from the input frame, so they can
        # share one with_columns call)
        mold_sumstats = mold_sumstats.with_columns(
            pl.when( is_temp_na )
              .then( pl.col("_SNPID_RIGHT") )
              .otherwise( pl.col("SNPID") )
              .alias("SNPID"),
            pl.when( is_temp_na )
              .then( pl.col("EA") )
              .otherwise( pl.col("EA_1") )
              .alias("EA_1"),
            pl.when( is_temp_na )
              .then( pl.col("NEA") )
              .otherwise( pl.col("NEA_1") )
              .alias("NEA_1"),
            pl.when( is_temp_na )
              .then( pl.col("STATUS") )
              .otherwise( pl.col("STATUS_1") )
              .alias("STATUS_1"),
        )

        if "_rsID_RIGHT" in mold_sumstats.columns:
            mold_sumstats = mold_sumstats.with_columns(
                pl.when( is_temp_na )
                  .then( pl.col("_rsID_RIGHT") )
                  .otherwise( pl.col("rsID") )
                  .alias("rsID")
            )

        # for variants not in right sumstats, copy snp info
        # (must run after the template alleles were filled above)
        is_temp_na_2 = mold_sumstats["EA"].is_null()

        mold_sumstats = mold_sumstats.with_columns(
            pl.when( is_temp_na_2 )
              .then( pl.col("EA_1") )
              .otherwise( pl.col("EA") )
              .alias("EA"),
            pl.when( is_temp_na_2 )
              .then( pl.col("NEA_1") )
              .otherwise( pl.col("NEA") )
              .alias("NEA"),
        )

        # only the temporary SNPID column is removed here
        mold_sumstats = mold_sumstats.drop(["_SNPID_RIGHT"])

    log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)

    mold_sumstats = _keep_variants_with_same_allele_setp(mold_sumstats,suffixes=suffixes)

    log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)

    return mold_sumstats
109
+
110
def _keep_variants_with_same_allele_setp(sumstats, log=Log(),verbose=True,suffixes=("_MOLD","")):
    """Keep only rows whose two allele pairs describe the same allele set (polars).

    A row is kept when the second pair (``EA``/``NEA`` + ``suffixes[1]``)
    equals the first pair (``EA``/``NEA`` + ``suffixes[0]``) either directly
    ("perfect match") or with effect and non-effect allele swapped
    ("flipped match").

    Parameters
    ----------
    sumstats : polars.DataFrame
        Frame holding the four allele columns named above.
    log : Log
        Logger that receives the match counts.
    verbose : bool
        Forwarded to ``log.write``.
    suffixes : tuple of str
        Column-name suffixes of the first and second allele pair.

    Returns
    -------
    polars.DataFrame
        ``sumstats`` filtered to the matching rows.
    """
    ea1="EA"+suffixes[0]
    nea1="NEA"+suffixes[0]
    ea2="EA"+suffixes[1]
    nea2="NEA"+suffixes[1]

    is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
    is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
    is_allele_set_match = is_flipped_match | is_perfect_match

    # Series.sum() counts True values natively; the builtin sum() would walk
    # the Series row by row in Python and fail on null comparison results.
    n_perfect = is_perfect_match.sum()
    n_flipped = is_flipped_match.sum()
    # Derived from the frame length so rows with a null comparison, which
    # filter() drops as well, are reported as unmatched.
    n_unmatched = len(sumstats) - is_allele_set_match.sum()

    log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
    log.write("  -Perfect match: {}".format(n_perfect), verbose=verbose)
    log.write("  -Flipped match: {}".format(n_flipped), verbose=verbose)
    log.write("  -Unmatched : {}".format(n_unmatched), verbose=verbose)
    return sumstats.filter(is_allele_set_match)
127
+
128
def _align_with_moldp(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
    """Update the second STATUS column according to how its alleles match the first pair (polars).

    Rows are classified as "perfect match" (same EA and NEA) or "flipped
    match" (EA and NEA swapped) between the ``suffixes[0]`` and
    ``suffixes[1]`` allele columns. The perfect-match mask is handed to
    ``copy_statusp`` and the flipped-match mask to ``vchange_statusp``.
    NOTE(review): per the log messages these helpers copy STATUS from the
    reference and rewrite status digit 6 from [456789] to 3; their
    implementations are not visible here, so confirm against
    ``g_vchange_status_polars``.

    Parameters
    ----------
    sumstats : polars.DataFrame
        Frame holding ``EA``/``NEA``/``STATUS`` columns for both suffixes.
    log : Log
        Logger that receives the progress messages.
    verbose : bool
        Forwarded to ``log.write``.
    suffixes : tuple of str
        Column-name suffixes of the reference (first) and target (second) set.

    Returns
    -------
    polars.DataFrame
        The frame returned by the two status helpers.
    """
    ea1="EA"+suffixes[0]
    nea1="NEA"+suffixes[0]
    ea2="EA"+suffixes[1]
    nea2="NEA"+suffixes[1]
    status1="STATUS"+suffixes[0]
    status2="STATUS"+suffixes[1]

    is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
    is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])

    log.write(" -Aligning alleles with reference: ", verbose=verbose)
    # Series.sum() counts True values natively instead of iterating the
    # Series row by row through the builtin sum()
    log.write("  -Perfect match: {}".format(is_perfect_match.sum()), verbose=verbose)
    log.write("  -Flipped match: {}".format(is_flipped_match.sum()), verbose=verbose)

    log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)

    sumstats = copy_statusp(sumstats, is_perfect_match, status1, status2, 6)

    log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)

    sumstats = vchange_statusp(sumstats, is_flipped_match, status2, 6,"456789","333333")

    return sumstats
153
+
154
def _fill_missing_columnsp(sumstats, columns, log=Log(),verbose=True):
    """Delegate to ``filldata`` to fill the requested columns.

    ``columns`` is passed as ``to_fill``; ``log`` and ``verbose`` are accepted
    but not forwarded to ``filldata``.
    """
    return filldata(sumstats, to_fill=columns)
157
+
158
def _renaming_colsp(sumstats, columns, log=Log(),verbose=True, suffixes=("_1","_2")):
    """Append ``suffixes[1]`` to ``STATUS`` and to the listed columns that exist.

    Parameters
    ----------
    sumstats : frame
        Frame to rename; ``rename`` is called with a plain mapping.
    columns : iterable of str
        Candidate column names; names absent from ``sumstats`` are skipped.
        ``STATUS`` is always part of the mapping, whether or not it is listed.
    log : Log
        Logger that receives the progress message.
    verbose : bool
        Forwarded to ``log.write``.
    suffixes : tuple of str
        Only the second element is used.

    Returns
    -------
    frame
        The renamed frame returned by ``rename``.
    """
    # STATUS is always renamed; the remaining names are filtered by presence
    targets = ["STATUS"] + [name for name in columns if name in sumstats.columns]
    mapping = {name: name + suffixes[1] for name in targets}

    renamed = sumstats.rename(mapping)
    log.write(" -Renaming sumstats2 columns by adding suffix {}".format(suffixes[1]),verbose=verbose)
    return renamed
166
+
167
def _renaming_cols_rp(sumstats, columns, log=Log(),verbose=True, suffix=""):
    """Strip ``suffix`` from the listed columns of a frame.

    Parameters
    ----------
    sumstats : frame
        Frame to rename; ``rename`` is called with a plain mapping.
    columns : iterable of str
        Column names WITHOUT the suffix. Only names whose suffixed form
        (``name + suffix``) exists in ``sumstats`` are renamed.
    log : Log
        Logger that receives the progress message.
    verbose : bool
        Forwarded to ``log.write``.
    suffix : str
        Suffix to remove.

    Returns
    -------
    frame
        The renamed frame returned by ``rename``.
    """
    mapping = {}
    for base_name in columns:
        suffixed_name = base_name + suffix
        if suffixed_name in sumstats.columns:
            mapping[suffixed_name] = base_name

    renamed = sumstats.rename(mapping)
    log.write(" -Renaming sumstats columns by removing suffix {}".format(suffix),verbose=verbose)
    return renamed
176
+
177
def _sort_pair_colsp(molded_sumstats, verbose=True, log=Log(), order=None, stats_order=None,suffixes=("_1","_2")):
    """Reorder the columns of a paired sumstats frame.

    The target order is the variant-level columns in ``order`` followed by
    every name in ``stats_order`` with each suffix appended (all names for
    ``suffixes[0]`` first, then all names for ``suffixes[1]``, ...). Columns
    of ``molded_sumstats`` that are not part of that order are kept and
    placed at the end in their existing order.

    Parameters
    ----------
    molded_sumstats : frame
        Frame to reorder; columns are selected with ``frame[list_of_names]``.
    verbose : bool
        Forwarded to ``log.write``.
    log : Log
        Logger that receives the progress messages.
    order : list of str, optional
        Leading columns. Defaults to SNPID, rsID, CHR, POS, EA, NEA. The
        caller's list is copied, never modified.
    stats_order : list of str, optional
        Un-suffixed per-study statistic names. Defaults to the built-in list.
    suffixes : tuple of str
        Suffixes appended to each name in ``stats_order``.

    Returns
    -------
    frame
        ``molded_sumstats`` with its columns reordered.
    """
    # Default the two lists independently: supplying only ``stats_order``
    # previously left ``order`` as None and failed on ``order.append``.
    if order is None:
        order = ["SNPID","rsID", "CHR", "POS", "EA", "NEA"]
    else:
        # work on a copy so the caller's list is not extended in place
        order = list(order)
    if stats_order is None:
        stats_order = ["EAF", "MAF", "BETA", "SE","BETA_95L","BETA_95U", "Z",
                       "CHISQ", "P", "MLOG10P", "OR", "OR_95L", "OR_95U","HR", "HR_95L", "HR_95U","INFO", "N","N_CASE","N_CONTROL","DIRECTION","I2","P_HET","DOF","SNPR2","STATUS"]

    for suffix in suffixes:
        for i in stats_order:
            order.append(i+suffix)

    log.write("Start to reorder the columns...",verbose=verbose)

    existing_columns = molded_sumstats.columns
    ordered_names = set(order)

    # known columns first (in target order), then everything else
    output_columns = [i for i in order if i in existing_columns]
    output_columns.extend(i for i in existing_columns if i not in ordered_names)

    log.write(" -Reordering columns to    :", ",".join(output_columns), verbose=verbose)
    molded_sumstats = molded_sumstats[ output_columns]
    log.write("Finished sorting columns successfully!", verbose=verbose)

    return molded_sumstats
@@ -17,6 +17,7 @@ from gwaslab.qc_fix_sumstats import check_col
17
17
  from gwaslab.qc_fix_sumstats import start_to
18
18
  from gwaslab.qc_fix_sumstats import finished
19
19
  from gwaslab.qc_fix_sumstats import skipped
20
+ from gwaslab.qc_fix_sumstats import sortcoordinate
20
21
  from gwaslab.qc_check_datatype import check_dataframe_shape
21
22
  from gwaslab.bd_common_data import get_number_to_chr
22
23
  from gwaslab.bd_common_data import get_chr_list
@@ -397,7 +398,6 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
397
398
  def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np.array, records_len: np.array):
398
399
  # starting_positions and records_len must be 1D arrays containing data only for the chromosomes contained in x,
399
400
  # and these arrays must be ordered in the same way as the chromosomes in np.unique(x['CHR'].values).
400
-
401
401
  # status
402
402
  #0 / -----> match
403
403
  #1 / -----> Flipped Fixed
@@ -435,12 +435,13 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
435
435
  _chrom = _chrom.values
436
436
  unique_values, _ = np.unique(_chrom, return_inverse=True) # Get the sorted unique values and their indices
437
437
  chrom = np.searchsorted(unique_values, _chrom) # Replace each value in '_chrom' with its corresponding index in the sorted unique values
438
-
438
+
439
439
  max_len_nea = _nea.str.len().max()
440
440
  max_len_ea = _ea.str.len().max()
441
441
 
442
442
  ########################################## mask for variants with out of range POS
443
443
  mask_outlier = pos > records_len[chrom]
444
+
444
445
  #########################################
445
446
 
446
447
  # Let's apply the same magic used for the fasta records (check build_fasta_records() for details) to convert the NEA and EA to
@@ -538,6 +539,7 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
538
539
  # -> nea == output_nea: [[True, True], [True, False]], mask: [[True, True], [True, False]]
539
540
  # -> nea == output_nea + ~mask: [[True, True], [True, True]]
540
541
  # -> np.all(nea == output_nea + ~mask, 1): [True, True]
542
+
541
543
  nea_eq_ref = np.all((nea == output_nea) + ~mask_nea, 1)
542
544
  rev_nea_eq_ref = np.all((rev_nea == output_nea) + ~mask_nea, 1)
543
545
 
@@ -550,6 +552,7 @@ def _fast_check_status(x: pd.DataFrame, record: np.array, starting_positions: np
550
552
  output_ea[mask_outlier] = PADDING_VALUE
551
553
  ##################################################################
552
554
 
555
+
553
556
  ea_eq_ref = np.all((ea == output_ea) + ~mask_ea, 1)
554
557
  rev_ea_eq_ref = np.all((rev_ea == output_ea) + ~mask_ea, 1)
555
558
 
@@ -617,6 +620,7 @@ def check_status(sumstats: pd.DataFrame, fasta_records_dict, log=Log(), verbose=
617
620
  unique_chrom_cond = sumstats_cond[chrom].unique()
618
621
  starting_pos_cond = np.array([starting_positions_dict[k] for k in unique_chrom_cond])
619
622
  records_len_cond = np.array([records_len_dict[k] for k in unique_chrom_cond])
623
+
620
624
  sumstats.loc[condition, status] = _fast_check_status(sumstats_cond, record=record, starting_positions=starting_pos_cond, records_len=records_len_cond)
621
625
 
622
626
  log.write(f" -Checking records for ( len(NEA) > {max_len} or len(EA) > {max_len} )", verbose=verbose)
@@ -651,6 +655,8 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
651
655
  log.write(" -Loading fasta records:",end="", verbose=verbose)
652
656
  chromlist = get_chr_list(add_number=True)
653
657
  records = SeqIO.parse(ref_seq, "fasta")
658
+
659
+ sumstats = sortcoordinate(sumstats,verbose=False)
654
660
 
655
661
  all_records_dict = {}
656
662
  chroms_in_sumstats = sumstats[chrom].unique() # load records from Fasta file only for the chromosomes present in the sumstats
@@ -729,17 +735,21 @@ def build_fasta_records(fasta_records_dict, pos_as_dict=True, log=Log(), verbose
729
735
  r = r.seq._data.translate(TRANSLATE_TABLE)
730
736
  r = np.array([r], dtype=f'<U{len(r)}').view('<u4').astype(np.uint8)
731
737
  all_r.append(r)
732
-
738
+
733
739
  # We've just created a list of numpy arrays, so we can concatenate them to obtain a single numpy array
734
740
  # Then we keep track of the starting position of each record in the concatenated array. This will be useful later
735
741
  # to index the record array depending on the position of the variant and the chromosome
736
742
  records_len = np.array([len(r) for r in all_r])
743
+
737
744
  starting_positions = np.cumsum(records_len) - records_len
745
+
746
+
738
747
  if pos_as_dict:
739
748
  starting_positions = {k: v for k, v in zip(fasta_records_dict.keys(), starting_positions)}
740
749
  records_len_dict = {k: v for k, v in zip(fasta_records_dict.keys(), records_len)}
741
750
  record = np.concatenate(all_r)
742
751
  del all_r # free memory
752
+
743
753
 
744
754
  return record, starting_positions,records_len_dict
745
755
 
@@ -1335,8 +1345,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
1335
1345
  ################################################################################################################
1336
1346
  def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
1337
1347
  ##start function with col checking##########################################################
1338
- _start_line = "check the difference between EAF and reference VCF ALT frequency"
1339
- _end_line = "checking the difference between EAF and reference VCF ALT frequency"
1348
+ _start_line = "check the difference between EAF (sumstats) and ALT frequency (reference VCF)"
1349
+ _end_line = "checking the difference between EAF (sumstats) and ALT frequency (reference VCF)"
1340
1350
  _start_cols = [chr,pos,ref,alt,eaf,status]
1341
1351
  _start_function = ".check_daf()"
1342
1352
  _must_args ={"ref_alt_freq":ref_alt_freq}
@@ -1381,7 +1391,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
1381
1391
  pool.join()
1382
1392
  ###########################
1383
1393
  #status_inferred = sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]].apply(lambda x:check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict),axis=1)
1384
-
1394
+ log.write(" -Difference in allele frequency (DAF) = EAF (sumstats) - ALT_AF (reference VCF)", verbose=verbose)
1395
+ log.write(" -Note: this DAF is not the derived allele frequency.", verbose=verbose)
1385
1396
  #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
1386
1397
  #sumstats["DAF"]=sumstats["DAF"].astype("float")
1387
1398
  log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
@@ -1417,8 +1428,8 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
1417
1428
 
1418
1429
  def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
1419
1430
  ##start function with col checking##########################################################
1420
- _start_line = "infer EAF using reference VCF ALT frequency"
1421
- _end_line = "inferring EAF using reference VCF ALT frequency"
1431
+ _start_line = "infer sumstats EAF using reference VCF ALT frequency"
1432
+ _end_line = "inferring sumstats EAF using reference VCF ALT frequency"
1422
1433
  _start_cols = [chr,pos,ref,alt,status]
1423
1434
  _start_function = ".infer_af()"
1424
1435
  _must_args ={"ref_alt_freq":ref_alt_freq}