gwaslab 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110)
  1. gwaslab/__init__.py +57 -47
  2. gwaslab/{bd_common_data.py → bd/bd_common_data.py} +10 -9
  3. gwaslab/bd/bd_config.py +28 -0
  4. gwaslab/{bd_download.py → bd/bd_download.py} +1 -1
  5. gwaslab/{bd_get_hapmap3.py → bd/bd_get_hapmap3.py} +9 -6
  6. gwaslab/bd/bd_path_manager.py +110 -0
  7. gwaslab/data/formatbook.json +805 -9
  8. gwaslab/{ldsc_irwls.py → extension/ldsc/ldsc_irwls.py} +1 -1
  9. gwaslab/{ldsc_regressions.py → extension/ldsc/ldsc_regressions.py} +2 -2
  10. gwaslab/{ldsc_sumstats.py → extension/ldsc/ldsc_sumstats.py} +2 -2
  11. gwaslab/{prscs_mcmc_gtb.py → extension/prscs/prscs_mcmc_gtb.py} +1 -1
  12. gwaslab/g_Sumstats.py +130 -96
  13. gwaslab/g_SumstatsMulti.py +69 -40
  14. gwaslab/g_SumstatsPair.py +54 -37
  15. gwaslab/g_SumstatsSet.py +88 -81
  16. gwaslab/g_SumstatsT.py +6 -6
  17. gwaslab/g_Sumstats_polars.py +84 -84
  18. gwaslab/g_meta_update.py +1 -1
  19. gwaslab/g_vchange_status.py +4 -4
  20. gwaslab/g_version.py +2 -2
  21. gwaslab/{hm_casting.py → hm/hm_casting.py} +4 -4
  22. gwaslab/{hm_casting_polars.py → hm/hm_casting_polars.py} +4 -4
  23. gwaslab/hm/hm_harmonize_sumstats.py +1635 -0
  24. gwaslab/hm_harmonize_sumstats.py +3 -8
  25. gwaslab/{io_load_ld.py → io/io_load_ld.py} +16 -13
  26. gwaslab/{io_preformat_input.py → io/io_preformat_input.py} +152 -73
  27. gwaslab/{io_preformat_input_polars.py → io/io_preformat_input_polars.py} +7 -7
  28. gwaslab/{io_read_pipcs.py → io/io_read_pipcs.py} +2 -2
  29. gwaslab/{io_read_tabular.py → io/io_read_tabular.py} +2 -2
  30. gwaslab/{io_to_formats.py → io/io_to_formats.py} +11 -8
  31. gwaslab/{io_to_pickle.py → io/io_to_pickle.py} +16 -1
  32. gwaslab/{qc_check_datatype_polars.py → qc/qc_check_datatype_polars.py} +2 -2
  33. gwaslab/{qc_fix_sumstats.py → qc/qc_fix_sumstats.py} +60 -33
  34. gwaslab/{qc_fix_sumstats_polars.py → qc/qc_fix_sumstats_polars.py} +15 -11
  35. gwaslab/{util_abf_finemapping.py → util/util_abf_finemapping.py} +2 -2
  36. gwaslab/{util_ex_calculate_ldmatrix.py → util/util_ex_calculate_ldmatrix.py} +18 -8
  37. gwaslab/{util_ex_calculate_prs.py → util/util_ex_calculate_prs.py} +2 -2
  38. gwaslab/{util_ex_ldproxyfinder.py → util/util_ex_ldproxyfinder.py} +6 -6
  39. gwaslab/{util_ex_ldsc.py → util/util_ex_ldsc.py} +18 -13
  40. gwaslab/{util_ex_match_ldmatrix.py → util/util_ex_match_ldmatrix.py} +8 -7
  41. gwaslab/util/util_ex_phewwas.py +127 -0
  42. gwaslab/{util_ex_process_h5.py → util/util_ex_process_h5.py} +2 -2
  43. gwaslab/{util_ex_process_ref.py → util/util_ex_process_ref.py} +2 -2
  44. gwaslab/{util_ex_run_2samplemr.py → util/util_ex_run_2samplemr.py} +18 -7
  45. gwaslab/{util_ex_run_ccgwas.py → util/util_ex_run_ccgwas.py} +4 -4
  46. gwaslab/{util_ex_run_clumping.py → util/util_ex_run_clumping.py} +28 -13
  47. gwaslab/{util_ex_run_coloc.py → util/util_ex_run_coloc.py} +22 -10
  48. gwaslab/{util_ex_run_hyprcoloc.py → util/util_ex_run_hyprcoloc.py} +4 -4
  49. gwaslab/{util_ex_run_magma.py → util/util_ex_run_magma.py} +21 -11
  50. gwaslab/{util_ex_run_mesusie.py → util/util_ex_run_mesusie.py} +3 -3
  51. gwaslab/{util_ex_run_mtag.py → util/util_ex_run_mtag.py} +50 -18
  52. gwaslab/{util_ex_run_prscs.py → util/util_ex_run_prscs.py} +3 -3
  53. gwaslab/{util_ex_run_scdrs.py → util/util_ex_run_scdrs.py} +10 -4
  54. gwaslab/{util_ex_run_susie.py → util/util_ex_run_susie.py} +49 -26
  55. gwaslab/{util_in_fill_data.py → util/util_in_fill_data.py} +1 -1
  56. gwaslab/{util_in_filter_value.py → util/util_in_filter_value.py} +18 -11
  57. gwaslab/{util_in_get_sig.py → util/util_in_get_sig.py} +15 -13
  58. gwaslab/{util_in_meta.py → util/util_in_meta.py} +1 -1
  59. gwaslab/{util_in_meta_polars.py → util/util_in_meta_polars.py} +1 -1
  60. gwaslab/{viz_aux_annotate_plot.py → viz/viz_aux_annotate_plot.py} +1 -1
  61. gwaslab/{viz_aux_quickfix.py → viz/viz_aux_quickfix.py} +2 -2
  62. gwaslab/{viz_plot_compare_af.py → viz/viz_plot_compare_af.py} +1 -1
  63. gwaslab/{viz_plot_compare_effect.py → viz/viz_plot_compare_effect.py} +16 -8
  64. gwaslab/{viz_plot_credible_sets.py → viz/viz_plot_credible_sets.py} +6 -6
  65. gwaslab/{viz_plot_effect.py → viz/viz_plot_effect.py} +37 -69
  66. gwaslab/{viz_plot_miamiplot.py → viz/viz_plot_miamiplot.py} +28 -20
  67. gwaslab/{viz_plot_miamiplot2.py → viz/viz_plot_miamiplot2.py} +27 -22
  68. gwaslab/{viz_plot_mqqplot.py → viz/viz_plot_mqqplot.py} +100 -46
  69. gwaslab/{viz_plot_phe_heatmap.py → viz/viz_plot_phe_heatmap.py} +18 -15
  70. gwaslab/{viz_plot_qqplot.py → viz/viz_plot_qqplot.py} +12 -28
  71. gwaslab/{viz_plot_regional2.py → viz/viz_plot_regional2.py} +11 -9
  72. gwaslab/{viz_plot_regionalplot.py → viz/viz_plot_regionalplot.py} +5 -4
  73. gwaslab/{viz_plot_rg_heatmap.py → viz/viz_plot_rg_heatmap.py} +1 -1
  74. gwaslab/{viz_plot_scatter_with_reg.py → viz/viz_plot_scatter_with_reg.py} +10 -7
  75. gwaslab/{viz_plot_stackedregional.py → viz/viz_plot_stackedregional.py} +67 -33
  76. gwaslab/{viz_plot_trumpetplot.py → viz/viz_plot_trumpetplot.py} +15 -9
  77. {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/METADATA +1 -1
  78. gwaslab-3.6.8.dist-info/RECORD +123 -0
  79. gwaslab/bd_config.py +0 -18
  80. gwaslab-3.6.6.dist-info/RECORD +0 -120
  81. /gwaslab/{ldsc_jackknife.py → extension/ldsc/ldsc_jackknife.py} +0 -0
  82. /gwaslab/{ldsc_ldscore.py → extension/ldsc/ldsc_ldscore.py} +0 -0
  83. /gwaslab/{ldsc_parse.py → extension/ldsc/ldsc_parse.py} +0 -0
  84. /gwaslab/{prscs_gigrnd.py → extension/prscs/prscs_gigrnd.py} +0 -0
  85. /gwaslab/{prscs_parse_genet.py → extension/prscs/prscs_parse_genet.py} +0 -0
  86. /gwaslab/{hm_rsid_to_chrpos.py → hm/hm_rsid_to_chrpos.py} +0 -0
  87. /gwaslab/{io_process_args.py → io/io_process_args.py} +0 -0
  88. /gwaslab/{io_read_ldsc.py → io/io_read_ldsc.py} +0 -0
  89. /gwaslab/{qc_build.py → qc/qc_build.py} +0 -0
  90. /gwaslab/{qc_check_datatype.py → qc/qc_check_datatype.py} +0 -0
  91. /gwaslab/{util_ex_gwascatalog.py → util/util_ex_gwascatalog.py} +0 -0
  92. /gwaslab/{util_ex_infer_ancestry.py → util/util_ex_infer_ancestry.py} +0 -0
  93. /gwaslab/{util_ex_plink_filter.py → util/util_ex_plink_filter.py} +0 -0
  94. /gwaslab/{util_in_calculate_gc.py → util/util_in_calculate_gc.py} +0 -0
  95. /gwaslab/{util_in_calculate_power.py → util/util_in_calculate_power.py} +0 -0
  96. /gwaslab/{util_in_convert_h2.py → util/util_in_convert_h2.py} +0 -0
  97. /gwaslab/{util_in_correct_winnerscurse.py → util/util_in_correct_winnerscurse.py} +0 -0
  98. /gwaslab/{util_in_estimate_ess.py → util/util_in_estimate_ess.py} +0 -0
  99. /gwaslab/{util_in_get_density.py → util/util_in_get_density.py} +0 -0
  100. /gwaslab/{util_in_merge.py → util/util_in_merge.py} +0 -0
  101. /gwaslab/{util_in_snphwe.py → util/util_in_snphwe.py} +0 -0
  102. /gwaslab/{viz_aux_chromatin.py → viz/viz_aux_chromatin.py} +0 -0
  103. /gwaslab/{viz_aux_property.py → viz/viz_aux_property.py} +0 -0
  104. /gwaslab/{viz_aux_reposition_text.py → viz/viz_aux_reposition_text.py} +0 -0
  105. /gwaslab/{viz_aux_save_figure.py → viz/viz_aux_save_figure.py} +0 -0
  106. /gwaslab/{viz_plot_forestplot.py → viz/viz_plot_forestplot.py} +0 -0
  107. {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/WHEEL +0 -0
  108. {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/licenses/LICENSE +0 -0
  109. {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/licenses/LICENSE_before_v3.4.39 +0 -0
  110. {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/top_level.txt +0 -0
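Most of the churn in this release is a reorganization of the flat module layout into subpackages (bd/, hm/, io/, qc/, util/, viz/, extension/), plus matching import-path updates inside the package. The public entry points re-exported from gwaslab/__init__.py (also updated above) are presumably unaffected, but any downstream code that imported internal modules by their old flat paths would need the new ones. Illustrative only, based on the renames listed above:

# 3.6.6 flat layout
from gwaslab.bd_common_data import get_format_dict
from gwaslab.qc_fix_sumstats import sortcolumn
from gwaslab.util_in_filter_value import _exclude_hla

# 3.6.8 subpackage layout
from gwaslab.bd.bd_common_data import get_format_dict
from gwaslab.qc.qc_fix_sumstats import sortcolumn
from gwaslab.util.util_in_filter_value import _exclude_hla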
@@ -27,7 +27,7 @@ from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
 from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
-
+ from gwaslab.g_vchange_status import STATUS_CATEGORIES
 #rsidtochrpos
 #checkref
 #parallelizeassignrsid
@@ -357,10 +357,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=

 log.write("\n",end="",show_time=False,verbose=verbose)

- CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
- sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
- #sumstats[status] = sumstats[status].astype("string")
-
+ sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)

 available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
 status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
@@ -681,9 +678,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
 sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
 log.write(" -Finished checking records", verbose=verbose)

- CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
- sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
- #sumstats[status] = sumstats[status].astype("string")
+ sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)

 available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
 status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
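The inline category set that used to be rebuilt at each call site is now a shared STATUS_CATEGORIES constant imported from gwaslab.g_vchange_status. Judging only from the set comprehension removed above (the released g_vchange_status.py itself is not shown in this diff), the constant presumably looks like:

# hedged reconstruction, not copied from g_vchange_status.py
STATUS_CATEGORIES = {
    str(j + i)
    for j in [1300000, 1800000, 1900000, 3800000, 9700000, 9800000, 9900000]
    for i in range(0, 100000)
}
# i.e. every 7-digit STATUS code starting with 13, 18, 19, 38, 97, 98 or 99, so
# pd.Categorical(sumstats[status], categories=STATUS_CATEGORIES) accepts any valid code.

Defining the set once keeps checkref, preformat and check_range consistent with each other.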
@@ -1,8 +1,6 @@
-
 import scipy.sparse as sparse
 import numpy as np
 import pandas as pd
- from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 import subprocess
 import os
 import re
@@ -10,16 +8,22 @@ import gc
 import pandas as pd
 import numpy as np
 from gwaslab.g_Log import Log
- from gwaslab.qc_fix_sumstats import start_to
- from gwaslab.qc_fix_sumstats import finished
- from gwaslab.util_in_get_sig import getsig
- from gwaslab.util_ex_process_ref import _process_plink_input_files
 from gwaslab.g_version import _checking_plink_version
- from gwaslab.util_in_filter_value import _exclude_hla
- from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
- from gwaslab.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
- from gwaslab.viz_plot_regional2 import _get_lead_id
- from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+
+ from gwaslab.hm.hm_casting import _merge_mold_with_sumstats_by_chrpos
+
+ from gwaslab.qc.qc_fix_sumstats import start_to
+ from gwaslab.qc.qc_fix_sumstats import finished
+
+ from gwaslab.util.util_in_get_sig import getsig
+ from gwaslab.util.util_ex_process_ref import _process_plink_input_files
+ from gwaslab.util.util_in_filter_value import _exclude_hla
+ from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+ from gwaslab.util.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
+ from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+
+ from gwaslab.viz.viz_plot_regional2 import _get_lead_id
+

 def tofinemapping_using_ld(sumstats,
 study=None,
@@ -72,7 +76,7 @@ def tofinemapping_using_ld(sumstats,
 sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
 else:
 sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
-
+ log.write(" -Number of loci: {}...".format(len(sig_df)),verbose=verbose)
 # Drop duplicate!!!!
 log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
 sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -447,7 +451,6 @@ def _merge_ld_map_with_sumstats(row,
 # matching by SNPID
 # preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
 combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
-
 # match allele
 perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
 log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
@@ -3,14 +3,16 @@ import numpy as np
 import scipy.stats as ss
 import gzip
 import os
+ import re
 import gc
- from gwaslab.bd_common_data import get_format_dict
- from gwaslab.qc_fix_sumstats import sortcolumn
- from gwaslab.qc_fix_sumstats import _process_build
- from gwaslab.qc_check_datatype import check_datatype
- from gwaslab.qc_check_datatype import quick_convert_datatype
- from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+ from gwaslab.bd.bd_common_data import get_format_dict
+ from gwaslab.qc.qc_fix_sumstats import sortcolumn
+ from gwaslab.qc.qc_fix_sumstats import _process_build
+ from gwaslab.qc.qc_check_datatype import check_datatype
+ from gwaslab.qc.qc_check_datatype import quick_convert_datatype
+ from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
 from gwaslab.g_headers import _check_overlap_with_reserved_keys
+ from gwaslab.g_vchange_status import STATUS_CATEGORIES
 #20221030
 def preformat(sumstats,
 fmt=None,
@@ -122,62 +124,22 @@ def preformat(sumstats,

 if "sep" not in readargs.keys():
 readargs["sep"] = "\t"
+ else:
+ meta_data = None

 #########################################################################################################################################################

- # check chr-separated path / vcf / then print header.
- try:
- if type(sumstats) is str:
- ## loading data from path #################################################
- inpath = sumstats
- ###load sumstats by each chromosome #################################################
- if "@" in inpath:
- log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
- inpath_chr_list=[]
- inpath_chr_num_list=[]
- for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
- inpath_chr = inpath.replace("@",str(chromosome))
- if isfile_casesensitive(inpath_chr):
- inpath_chr_num_list.append(str(chromosome))
- inpath_chr_list.append(inpath_chr)
- log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
- readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
- row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
- # columns in the sumstats
- raw_cols = row_one.columns
- else:
- ##### loading data from tabular file#################################################
- readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
- row_one = pd.read_table(inpath,**readargs_header)
- raw_cols = row_one.columns
-
- if fmt=="vcf":
- # expanded
- format_cols = list(row_one["FORMAT"].str.split(":"))[0]
- # fixed + study1 + expanded
- raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
+ # check chr-separated path / vcf / then print header.
+ inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary = check_path_and_header(sumstats,
+ fmt,
+ meta_data,
+ readargs,
+ usecols,
+ dtype_dictionary,
+ rename_dictionary,
+ log,
+ verbose)

- ######################################################################################
- elif type(sumstats) is pd.DataFrame:
- ## loading data from dataframe
- raw_cols = sumstats.columns
-
- ################################################
- for key,value in rename_dictionary.items():
- # check avaiable keys key->raw header
- # usecols : a list of raw headers to load from file/DataFrame
- if key in raw_cols:
- usecols.append(key)
- if value in ["EA","NEA"]:
- dtype_dictionary[key]="category"
- if value in ["STATUS"]:
- dtype_dictionary[key]="string"
- if value in ["CHR"]:
- dtype_dictionary[key]="string"
-
- except ValueError:
- raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
-
 ###################################################################################################################################################
 ## check columns/datatype to use
 if snpid:
@@ -359,7 +321,7 @@ def preformat(sumstats,
 try:
 if type(sumstats) is str:
 ## loading data from path
- inpath = sumstats
+ #inpath = sumstats
 if "@" in inpath:
 log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
 sumstats_chr_list=[]
@@ -445,14 +407,14 @@ def preformat(sumstats,
 sumstats["N_CONTROL"] = ncontrol

 ### status ######################################################################################################
- if status is None:
- sumstats = process_status(sumstats=sumstats,build=build,log=log,verbose=verbose)
+
+ sumstats = process_status(sumstats=sumstats,build=build,status=status,log=log,verbose=verbose)

 ## ea/nea, ref/alt ##############################################################################################
 sumstats = process_allele(sumstats=sumstats,log=log,verbose=verbose)

 ## NEAF to EAF ###########################################################################################################
- if neaf is not None :
+ if neaf is not None or ("NEAF" in sumstats.columns and "EAF" not in sumstats.columns):
 sumstats = process_neaf(sumstats=sumstats,log=log,verbose=verbose)

 ## reodering ###################################################################################################
@@ -562,9 +524,15 @@ def process_neaf(sumstats,log,verbose):
 log.write(" -NEAF is specified...",verbose=verbose)
 pre_number=len(sumstats)
 log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
- sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
- sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
- sumstats["EAF"] = 1- sumstats["EAF"]
+ if "NEAF" in sumstats.columns:
+ sumstats["NEAF"] = pd.to_numeric(sumstats["NEAF"], errors='coerce')
+ sumstats = sumstats.loc[(sumstats["NEAF"]>=0) & (sumstats["NEAF"]<=1),:]
+ sumstats["EAF"] = 1- sumstats["NEAF"]
+ sumstats.drop(columns=["NEAF"], inplace=True)
+ else:
+ sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
+ sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
+ sumstats["EAF"] = 1- sumstats["EAF"]
 log.write(" -Converted NEAF to EAF.",verbose=verbose)
 after_number=len(sumstats)
 log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
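A minimal, self-contained illustration of the new NEAF branch above (a sketch mirroring the added lines, not gwaslab's own code): when a NEAF column is present it is validated, flipped to EAF = 1 - NEAF, and dropped.

import pandas as pd

sumstats = pd.DataFrame({"NEAF": ["0.25", "1.2", "0.6"]})
sumstats["NEAF"] = pd.to_numeric(sumstats["NEAF"], errors="coerce")
sumstats = sumstats.loc[(sumstats["NEAF"] >= 0) & (sumstats["NEAF"] <= 1), :]   # drops the 1.2 row
sumstats["EAF"] = 1 - sumstats["NEAF"]                                          # 0.75 and 0.4
sumstats.drop(columns=["NEAF"], inplace=True)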
@@ -599,13 +567,14 @@ def process_allele(sumstats,log,verbose):
 sumstats["NEA"]=sumstats["NEA"].astype("category")
 return sumstats

- def process_status(sumstats,build,log,verbose):
- log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
- #sumstats["STATUS"] = int(build)*(10**5) +99999
- build = _process_build(build,log,verbose)
- sumstats["STATUS"] = build +"99999"
- categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
- sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
+ def process_status(sumstats,build,status, log,verbose):
+ if status is None:
+ log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
+ #sumstats["STATUS"] = int(build)*(10**5) +99999
+ build = _process_build(build,log,verbose)
+ sumstats["STATUS"] = build +"99999"
+
+ sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=STATUS_CATEGORIES)
 return sumstats


@@ -649,4 +618,114 @@ def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_
 log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
 sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
 log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
- return sumstats_filtered
+ return sumstats_filtered
+
+
+ def check_path_and_header(sumstats=None,
+ fmt=None,
+ meta_data=None,
+ readargs=None,
+ usecols=None,
+ dtype_dictionary=None,
+ rename_dictionary=None,
+ log=None,
+ verbose=None):
+
+
+ if type(sumstats) is str:
+ ## loading data from path #################################################
+ inpath = sumstats
+
+ try:
+ format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+
+ except (FileNotFoundError, IndexError):
+ log.warning("Loading {} failed...Tesing if compressed/uncompressed...".format(inpath),verbose=verbose)
+ try:
+ if inpath[-3:]==".gz":
+ inpath = inpath[:-3]
+ log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
+ format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list =process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+ else:
+ inpath = inpath+".gz"
+ log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
+ format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+ except:
+ raise ValueError("Please input a valid path, and make sure the separator is correct and the columns you specified are in the file.")
+
+ ######################################################################################
+ elif type(sumstats) is pd.DataFrame:
+ inpath = None
+ format_cols = None
+ inpath_chr_list = None
+ inpath_chr_num_list = None
+ ## loading data from dataframe
+ raw_cols = sumstats.columns
+
+ ################################################
+ for key,value in rename_dictionary.items():
+ # check avaiable keys key->raw header
+ # usecols : a list of raw headers to load from file/DataFrame
+ if key in raw_cols:
+ usecols.append(key)
+ if value in ["EA","NEA"]:
+ dtype_dictionary[key]="category"
+ if value in ["STATUS"]:
+ dtype_dictionary[key]="string"
+ if value in ["CHR"]:
+ dtype_dictionary[key]="string"
+
+ return inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary
+
+ def process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose):
+
+ format_cols = None
+ inpath_chr_list = None
+ inpath_chr_num_list = None
+
+ if "@" in inpath:
+ log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
+ inpath_chr_list=[]
+ inpath_chr_num_list=[]
+
+ # create a regex pattern for matching
+ pat = os.path.basename(inpath).replace("@","(\w+)")
+
+ # get dir
+ dirname = os.path.dirname(inpath)
+
+ # all files in the directory
+ files = os.listdir(dirname)
+
+ files.sort()
+
+ for file in files:
+ # match
+ result = re.match(pat, file)
+ if result:
+ # get chr
+ chr_matched = str(result.group(1))
+ inpath_chr_num_list.append(chr_matched)
+ inpath_chr_list.append(inpath.replace("@",str(chr_matched)) )
+
+ log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
+
+ #if inpath_chr_list is empty-> IndexError
+ readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
+ row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
+ # columns in the sumstats
+ raw_cols = row_one.columns
+ else:
+ ##### loading data from tabular file#################################################
+ #if file not found, FileNotFoundError
+ readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
+ row_one = pd.read_table(inpath,**readargs_header)
+ raw_cols = row_one.columns
+
+ if fmt=="vcf":
+ # expanded
+ format_cols = list(row_one["FORMAT"].str.split(":"))[0]
+ # fixed + study1 + expanded
+ raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
+
+ return format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list
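Chromosome-split loading no longer probes a fixed list of chromosome names; it builds a regex from the file-name pattern and scans the directory. A self-contained sketch of that logic under assumed file names (the directory and files below are hypothetical, not part of gwaslab):

import os
import re
import tempfile

workdir = tempfile.mkdtemp()
for c in ["1", "2", "X"]:                                   # fake chr-split sumstats files
    open(os.path.join(workdir, "height_chr{}.tsv.gz".format(c)), "w").close()

inpath = os.path.join(workdir, "height_chr@.tsv.gz")
pat = os.path.basename(inpath).replace("@", r"(\w+)")       # height_chr(\w+).tsv.gz
inpath_chr_num_list, inpath_chr_list = [], []
for file in sorted(os.listdir(os.path.dirname(inpath))):
    result = re.match(pat, file)
    if result:
        chr_matched = str(result.group(1))
        inpath_chr_num_list.append(chr_matched)
        inpath_chr_list.append(inpath.replace("@", chr_matched))

print(inpath_chr_num_list)                                  # ['1', '2', 'X']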
@@ -5,12 +5,12 @@ import scipy.stats as ss
 import gzip
 import os
 import gc
- from gwaslab.bd_common_data import get_format_dict
- from gwaslab.qc_fix_sumstats import sortcolumn
- from gwaslab.qc_fix_sumstats import _process_build
- from gwaslab.qc_check_datatype_polars import check_datatype
- from gwaslab.qc_check_datatype_polars import quick_convert_datatype
- from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
+ from gwaslab.bd.bd_common_data import get_format_dict
+ from gwaslab.qc.qc_fix_sumstats import sortcolumn
+ from gwaslab.qc.qc_fix_sumstats import _process_build
+ from gwaslab.qc.qc_check_datatype_polars import check_datatype_polars
+ from gwaslab.qc.qc_check_datatype_polars import quick_convert_datatype
+ from gwaslab.qc.qc_check_datatype_polars import check_dataframe_memory_usage
 from gwaslab.g_headers import _check_overlap_with_reserved_keys
 #20221030
 def preformatp(sumstats,
@@ -433,7 +433,7 @@ def preformatp(sumstats,
 #sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
 sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)

- check_datatype(sumstats,log=log,verbose=verbose)
+ check_datatype_polars(sumstats,log=log,verbose=verbose)
 #gc.collect()
 check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)

@@ -1,7 +1,7 @@
 import pandas as pd
 from gwaslab.g_Log import Log
- from gwaslab.qc_check_datatype import check_datatype
- from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+ from gwaslab.qc.qc_check_datatype import check_datatype
+ from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
 import re
 import os

@@ -1,7 +1,7 @@
 import pandas as pd
- from gwaslab.bd_common_data import get_formats_list
+ from gwaslab.bd.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
- from gwaslab.bd_common_data import get_format_dict
+ from gwaslab.bd.bd_common_data import get_format_dict

 def _read_tabular(path, fmt, **kwargs):

@@ -7,16 +7,19 @@ from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
 from datetime import date
- from gwaslab.io_preformat_input import print_format_info
- from gwaslab.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
- from gwaslab.bd_common_data import get_format_dict
- from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.g_version import gwaslab_info
- from gwaslab.bd_get_hapmap3 import gethapmap3
- from gwaslab.util_in_filter_value import _exclude_hla
- from gwaslab.util_in_filter_value import _exclude
- from gwaslab.util_in_filter_value import _extract
+
+ from gwaslab.io.io_preformat_input import print_format_info
+
+ from gwaslab.bd.bd_common_data import get_format_dict
+ from gwaslab.bd.bd_common_data import get_number_to_chr
+ from gwaslab.bd.bd_common_data import get_formats_list
+ from gwaslab.bd.bd_get_hapmap3 import gethapmap3
+
+ from gwaslab.util.util_in_filter_value import _exclude_hla
+ from gwaslab.util.util_in_filter_value import _exclude
+ from gwaslab.util.util_in_filter_value import _extract
 # to vcf
 # to fmt
 ## vcf
@@ -44,4 +44,19 @@ def load_data_from_pickle(path,usecols=None):
 existing_cols.append(i)
 data = data.loc[:,existing_cols]
 gc.collect()
- return data
+ return data
+
+ def _offload(df,path,log):
+ with open(path, 'wb') as file:
+ pickle.dump(df, file)
+ log.write("Dumpping dataframe to : ", path)
+
+ def _reload(path,log):
+ with open(path, 'rb') as file:
+ df = pickle.load(file)
+ log.write("Loaded dataframe back from : ", path)
+ try:
+ os.remove(path)
+ except:
+ pass
+ return df
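A hedged usage sketch for the new offloading helpers; the import path follows the new io/ layout shown in this diff, and the scratch file name is hypothetical:

import pandas as pd
from gwaslab.g_Log import Log
from gwaslab.io.io_to_pickle import _offload, _reload   # assumed location per this diff

log = Log()
df = pd.DataFrame({"CHR": ["1", "2"], "POS": [12345, 67890]})
tmp_path = "offloaded_sumstats.pkl"                     # hypothetical scratch file

_offload(df, tmp_path, log)                             # pickle the DataFrame to disk
df_back = _reload(tmp_path, log)                        # load it back; _reload also removes the file
assert df_back.equals(df)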
@@ -56,7 +56,7 @@ dtype_dict ={
 'P_RANDOM' :[pl.Float64()]
 }

- def check_datatype(sumstats, verbose=True, log=Log()):
+ def check_datatype_polars(sumstats, verbose=True, log=Log()):

 #try:
 headers = []
@@ -112,7 +112,7 @@ def quick_convert_datatype(sumstats, log, verbose):
 pass
 return sumstats

- def check_dataframe_shape(sumstats, log, verbose):
+ def check_dataframe_shape_polars(sumstats, log, verbose):
 memory_in_mb = sumstats.estimated_size(unit="mb")
 try:
 log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
@@ -7,22 +7,29 @@ from multiprocessing import Pool
 from liftover import get_lifter
 from liftover import ChainFile
 from functools import partial
+
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_vchange_status import status_match
 from gwaslab.g_vchange_status import change_status
 from gwaslab.g_Log import Log
- from gwaslab.bd_common_data import get_chr_to_number
- from gwaslab.bd_common_data import get_number_to_chr
- from gwaslab.bd_common_data import get_chr_list
- from gwaslab.qc_check_datatype import check_datatype
- from gwaslab.qc_check_datatype import check_dataframe_shape
- from gwaslab.qc_build import _process_build
- from gwaslab.qc_build import _set_build
 from gwaslab.g_version import _get_version
- from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
- from gwaslab.util_in_fill_data import _convert_betase_to_p
- from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
- from gwaslab.bd_common_data import get_chain
+ from gwaslab.g_vchange_status import STATUS_CATEGORIES
+
+ from gwaslab.bd.bd_common_data import get_chr_to_number
+ from gwaslab.bd.bd_common_data import get_number_to_chr
+ from gwaslab.bd.bd_common_data import get_chr_list
+ from gwaslab.bd.bd_common_data import get_chain
+ from gwaslab.bd.bd_common_data import NA_STRINGS
+
+ from gwaslab.qc.qc_check_datatype import check_datatype
+ from gwaslab.qc.qc_check_datatype import check_dataframe_shape
+ from gwaslab.qc.qc_build import _process_build
+ from gwaslab.qc.qc_build import _set_build
+
+ from gwaslab.util.util_in_fill_data import _convert_betase_to_mlog10p
+ from gwaslab.util.util_in_fill_data import _convert_betase_to_p
+ from gwaslab.util.util_in_fill_data import _convert_mlog10p_to_p
+
 #process build
 #setbuild
 #fixID
@@ -69,7 +76,7 @@ from gwaslab.bd_common_data import get_chain

 def fixID(sumstats,
 snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
- fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
+ fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False, reversea=False,
 overwrite=False,verbose=True,forcefixid=False,log=Log()):
 '''
 1. fx SNPid
@@ -120,7 +127,21 @@
 except:
 log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
 sumstats[snpid] = sumstats[snpid].astype("string")
+ ############################ checking string NA ###################################################
+ log.write(" -Checking NA strings :{}".format(",".join(NA_STRINGS)),verbose=verbose)
+ if snpid in sumstats.columns:
+ log.write(" -Checking if SNPID contains NA strings...",verbose=verbose)
+ is_snpid_string_na = sumstats[snpid].isin(NA_STRINGS)
+ if sum(is_snpid_string_na) >0:
+ log.write(" -Converting {} NA strings in SNPID to pd.NA...".format(sum(is_snpid_string_na)),verbose=verbose)
+ sumstats.loc[is_snpid_string_na ,snpid] = pd.NA

+ if rsid in sumstats.columns:
+ log.write(" -Checking if rsID contains NA strings...",verbose=verbose)
+ is_rsid_string_na = sumstats[rsid].isin(NA_STRINGS)
+ if sum(is_rsid_string_na) >0:
+ log.write(" -Converting {} NA strings in rsID to pd.NA...".format(sum(is_rsid_string_na)),verbose=verbose)
+ sumstats.loc[is_rsid_string_na ,rsid] = pd.NA
 ############################ checking ###################################################
 if snpid in sumstats.columns:
 log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
@@ -148,7 +169,15 @@
 log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)

 ############################ fixing chr pos###################################################
-
+ if reversea == True:
+ if snpid in sumstats.columns:
+ log.write(" -Reversing Alleles in SNPID...", verbose=verbose)
+ to_fix = is_chrposrefalt
+ to_fix_num = sum(to_fix)
+ if to_fix_num>0 and verbose: log.write(" -Number of variants could be reversed: "+str(to_fix_num)+" ...")
+ extracted = sumstats.loc[to_fix, snpid].str.extract(r'(^\w+[:_-]\d+[:_-])([ATCG]+)([:_-])([ATCG]+$)', flags=re.IGNORECASE)
+ sumstats.loc[to_fix, snpid] = extracted[0] + extracted[3] + extracted[2] + extracted[1]
+
 if fixchrpos == True:
 # from snpid or rsid, extract CHR:POS to fix CHR and POS
 if snpid in sumstats.columns:
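To make the new reversea option concrete, here is a small self-contained sketch of the allele-swapping extraction it performs on CHR:POS:NEA:EA style SNPIDs (the example IDs are made up):

import re
import pandas as pd

snpid = pd.Series(["1:12345:A:G", "2_6789_TTC_T"])
extracted = snpid.str.extract(r'(^\w+[:_-]\d+[:_-])([ATCG]+)([:_-])([ATCG]+$)', flags=re.IGNORECASE)
print((extracted[0] + extracted[3] + extracted[2] + extracted[1]).tolist())
# ['1:12345:G:A', '2_6789_T_TTC']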
@@ -537,24 +566,24 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea=
 pre_number =len(sumstats)
 specified_columns = []
 if "d" in mode:
- specified_columns.append(rsid)
- specified_columns.append(snpid)
- specified_columns.append(chrom)
- specified_columns.append(pos)
- specified_columns.append(ea)
- specified_columns.append(nea)
+ if rsid in sumstats.columns: specified_columns.append(rsid)
+ if snpid in sumstats.columns: specified_columns.append(snpid)
+ if chrom in sumstats.columns: specified_columns.append(chrom)
+ if pos in sumstats.columns: specified_columns.append(pos)
+ if ea in sumstats.columns: specified_columns.append(ea)
+ if nea in sumstats.columns: specified_columns.append(nea)
 if "r" in mode:
- specified_columns.append(rsid)
+ if rsid in sumstats.columns:specified_columns.append(rsid)
 if "s" in mode:
- specified_columns.append(snpid)
+ if snpid in sumstats.columns:specified_columns.append(snpid)
 if "m" in mode:
- specified_columns.append(chrom)
- specified_columns.append(pos)
+ if chrom in sumstats.columns:specified_columns.append(chrom)
+ if pos in sumstats.columns:specified_columns.append(pos)
 if "c" in mode:
- specified_columns.append(chrom)
- specified_columns.append(pos)
- specified_columns.append(ea)
- specified_columns.append(nea)
+ if chrom in sumstats.columns:specified_columns.append(chrom)
+ if pos in sumstats.columns:specified_columns.append(pos)
+ if ea in sumstats.columns:specified_columns.append(ea)
+ if nea in sumstats.columns:specified_columns.append(nea)
 sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
 after_number=len(sumstats)
 log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
@@ -1123,19 +1152,17 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
 cols_to_check.append(header)
 if header=="STATUS":
 log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
- categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
- sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
+ sumstats[header] = pd.Categorical(sumstats[header],categories=STATUS_CATEGORIES)
 return sumstats

 if dtype in ["Int64","Int32","int","int32","in64"]:
 log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
 sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
-
+ is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
 elif dtype in ["Float64","Float32","float","float64","float32"]:
 log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
 sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)
-
- is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
+ is_valid = (sumstats[header]>var_range[0]) & (sumstats[header]<var_range[1])
 is_valid = is_valid.fillna(False)

 if header=="P":
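In effect, integer-typed columns are now validated inclusively while float-typed columns keep a strict open-interval check. A self-contained illustration (not gwaslab code; the range is arbitrary):

import pandas as pd

var_range = (0, 25)
int_col = pd.Series([0, 10, 26], dtype="Int64")
float_col = pd.Series([0.0, 0.5, 25.0], dtype="Float64")

is_valid_int = (int_col >= var_range[0]) & (int_col <= var_range[1])       # True, True, False
is_valid_float = (float_col > var_range[0]) & (float_col < var_range[1])   # False, True, False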
@@ -1835,4 +1862,4 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
 ###############################################################################################################
 def _df_split(dataframe, n):
 k, m = divmod(len(dataframe), n)
- return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
+ return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
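This last hunk only restores the trailing newline on _df_split. For completeness, a hedged usage sketch of what the helper does: it splits a DataFrame into n nearly equal row chunks, with the first len(df) % n chunks taking one extra row.

import pandas as pd

def _df_split(dataframe, n):
    k, m = divmod(len(dataframe), n)
    return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]

df = pd.DataFrame({"POS": range(10)})
print([len(chunk) for chunk in _df_split(df, 3)])   # [4, 3, 3]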