gwaslab 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110)
  1. gwaslab/__init__.py +57 -47
  2. gwaslab/{bd_common_data.py → bd/bd_common_data.py} +10 -9
  3. gwaslab/bd/bd_config.py +28 -0
  4. gwaslab/{bd_download.py → bd/bd_download.py} +1 -1
  5. gwaslab/{bd_get_hapmap3.py → bd/bd_get_hapmap3.py} +9 -6
  6. gwaslab/bd/bd_path_manager.py +110 -0
  7. gwaslab/data/formatbook.json +805 -9
  8. gwaslab/{ldsc_irwls.py → extension/ldsc/ldsc_irwls.py} +1 -1
  9. gwaslab/{ldsc_regressions.py → extension/ldsc/ldsc_regressions.py} +2 -2
  10. gwaslab/{ldsc_sumstats.py → extension/ldsc/ldsc_sumstats.py} +2 -2
  11. gwaslab/{prscs_mcmc_gtb.py → extension/prscs/prscs_mcmc_gtb.py} +1 -1
  12. gwaslab/g_Sumstats.py +130 -96
  13. gwaslab/g_SumstatsMulti.py +69 -40
  14. gwaslab/g_SumstatsPair.py +54 -37
  15. gwaslab/g_SumstatsSet.py +88 -81
  16. gwaslab/g_SumstatsT.py +6 -6
  17. gwaslab/g_Sumstats_polars.py +84 -84
  18. gwaslab/g_meta_update.py +1 -1
  19. gwaslab/g_vchange_status.py +4 -4
  20. gwaslab/g_version.py +2 -2
  21. gwaslab/{hm_casting.py → hm/hm_casting.py} +4 -4
  22. gwaslab/{hm_casting_polars.py → hm/hm_casting_polars.py} +4 -4
  23. gwaslab/hm/hm_harmonize_sumstats.py +1635 -0
  24. gwaslab/hm_harmonize_sumstats.py +3 -8
  25. gwaslab/{io_load_ld.py → io/io_load_ld.py} +16 -13
  26. gwaslab/{io_preformat_input.py → io/io_preformat_input.py} +152 -73
  27. gwaslab/{io_preformat_input_polars.py → io/io_preformat_input_polars.py} +7 -7
  28. gwaslab/{io_read_pipcs.py → io/io_read_pipcs.py} +6 -5
  29. gwaslab/{io_read_tabular.py → io/io_read_tabular.py} +2 -2
  30. gwaslab/{io_to_formats.py → io/io_to_formats.py} +13 -9
  31. gwaslab/{io_to_pickle.py → io/io_to_pickle.py} +16 -1
  32. gwaslab/{qc_check_datatype_polars.py → qc/qc_check_datatype_polars.py} +2 -2
  33. gwaslab/{qc_fix_sumstats.py → qc/qc_fix_sumstats.py} +60 -33
  34. gwaslab/{qc_fix_sumstats_polars.py → qc/qc_fix_sumstats_polars.py} +15 -11
  35. gwaslab/{util_abf_finemapping.py → util/util_abf_finemapping.py} +2 -2
  36. gwaslab/{util_ex_calculate_ldmatrix.py → util/util_ex_calculate_ldmatrix.py} +18 -8
  37. gwaslab/{util_ex_calculate_prs.py → util/util_ex_calculate_prs.py} +2 -2
  38. gwaslab/{util_ex_ldproxyfinder.py → util/util_ex_ldproxyfinder.py} +6 -6
  39. gwaslab/{util_ex_ldsc.py → util/util_ex_ldsc.py} +18 -13
  40. gwaslab/{util_ex_match_ldmatrix.py → util/util_ex_match_ldmatrix.py} +8 -7
  41. gwaslab/util/util_ex_phewwas.py +117 -0
  42. gwaslab/{util_ex_process_h5.py → util/util_ex_process_h5.py} +2 -2
  43. gwaslab/{util_ex_process_ref.py → util/util_ex_process_ref.py} +2 -2
  44. gwaslab/{util_ex_run_2samplemr.py → util/util_ex_run_2samplemr.py} +18 -7
  45. gwaslab/{util_ex_run_ccgwas.py → util/util_ex_run_ccgwas.py} +4 -4
  46. gwaslab/{util_ex_run_clumping.py → util/util_ex_run_clumping.py} +28 -13
  47. gwaslab/{util_ex_run_coloc.py → util/util_ex_run_coloc.py} +22 -10
  48. gwaslab/{util_ex_run_hyprcoloc.py → util/util_ex_run_hyprcoloc.py} +4 -4
  49. gwaslab/{util_ex_run_magma.py → util/util_ex_run_magma.py} +21 -11
  50. gwaslab/{util_ex_run_mesusie.py → util/util_ex_run_mesusie.py} +3 -3
  51. gwaslab/{util_ex_run_mtag.py → util/util_ex_run_mtag.py} +50 -18
  52. gwaslab/{util_ex_run_prscs.py → util/util_ex_run_prscs.py} +3 -3
  53. gwaslab/{util_ex_run_scdrs.py → util/util_ex_run_scdrs.py} +10 -4
  54. gwaslab/{util_ex_run_susie.py → util/util_ex_run_susie.py} +49 -26
  55. gwaslab/{util_in_fill_data.py → util/util_in_fill_data.py} +1 -1
  56. gwaslab/{util_in_filter_value.py → util/util_in_filter_value.py} +18 -11
  57. gwaslab/{util_in_get_sig.py → util/util_in_get_sig.py} +15 -13
  58. gwaslab/{util_in_meta.py → util/util_in_meta.py} +1 -1
  59. gwaslab/{util_in_meta_polars.py → util/util_in_meta_polars.py} +1 -1
  60. gwaslab/{viz_aux_annotate_plot.py → viz/viz_aux_annotate_plot.py} +1 -1
  61. gwaslab/{viz_aux_quickfix.py → viz/viz_aux_quickfix.py} +2 -2
  62. gwaslab/{viz_plot_compare_af.py → viz/viz_plot_compare_af.py} +1 -1
  63. gwaslab/{viz_plot_compare_effect.py → viz/viz_plot_compare_effect.py} +16 -8
  64. gwaslab/{viz_plot_credible_sets.py → viz/viz_plot_credible_sets.py} +6 -6
  65. gwaslab/{viz_plot_effect.py → viz/viz_plot_effect.py} +37 -69
  66. gwaslab/{viz_plot_miamiplot.py → viz/viz_plot_miamiplot.py} +28 -20
  67. gwaslab/{viz_plot_miamiplot2.py → viz/viz_plot_miamiplot2.py} +27 -22
  68. gwaslab/{viz_plot_mqqplot.py → viz/viz_plot_mqqplot.py} +48 -38
  69. gwaslab/{viz_plot_phe_heatmap.py → viz/viz_plot_phe_heatmap.py} +18 -15
  70. gwaslab/{viz_plot_qqplot.py → viz/viz_plot_qqplot.py} +4 -2
  71. gwaslab/{viz_plot_regional2.py → viz/viz_plot_regional2.py} +11 -9
  72. gwaslab/{viz_plot_regionalplot.py → viz/viz_plot_regionalplot.py} +5 -4
  73. gwaslab/{viz_plot_rg_heatmap.py → viz/viz_plot_rg_heatmap.py} +1 -1
  74. gwaslab/{viz_plot_scatter_with_reg.py → viz/viz_plot_scatter_with_reg.py} +10 -7
  75. gwaslab/{viz_plot_stackedregional.py → viz/viz_plot_stackedregional.py} +67 -33
  76. gwaslab/{viz_plot_trumpetplot.py → viz/viz_plot_trumpetplot.py} +11 -9
  77. {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/METADATA +1 -1
  78. gwaslab-3.6.7.dist-info/RECORD +123 -0
  79. gwaslab/bd_config.py +0 -18
  80. gwaslab-3.6.5.dist-info/RECORD +0 -120
  81. /gwaslab/{ldsc_jackknife.py → extension/ldsc/ldsc_jackknife.py} +0 -0
  82. /gwaslab/{ldsc_ldscore.py → extension/ldsc/ldsc_ldscore.py} +0 -0
  83. /gwaslab/{ldsc_parse.py → extension/ldsc/ldsc_parse.py} +0 -0
  84. /gwaslab/{prscs_gigrnd.py → extension/prscs/prscs_gigrnd.py} +0 -0
  85. /gwaslab/{prscs_parse_genet.py → extension/prscs/prscs_parse_genet.py} +0 -0
  86. /gwaslab/{hm_rsid_to_chrpos.py → hm/hm_rsid_to_chrpos.py} +0 -0
  87. /gwaslab/{io_process_args.py → io/io_process_args.py} +0 -0
  88. /gwaslab/{io_read_ldsc.py → io/io_read_ldsc.py} +0 -0
  89. /gwaslab/{qc_build.py → qc/qc_build.py} +0 -0
  90. /gwaslab/{qc_check_datatype.py → qc/qc_check_datatype.py} +0 -0
  91. /gwaslab/{util_ex_gwascatalog.py → util/util_ex_gwascatalog.py} +0 -0
  92. /gwaslab/{util_ex_infer_ancestry.py → util/util_ex_infer_ancestry.py} +0 -0
  93. /gwaslab/{util_ex_plink_filter.py → util/util_ex_plink_filter.py} +0 -0
  94. /gwaslab/{util_in_calculate_gc.py → util/util_in_calculate_gc.py} +0 -0
  95. /gwaslab/{util_in_calculate_power.py → util/util_in_calculate_power.py} +0 -0
  96. /gwaslab/{util_in_convert_h2.py → util/util_in_convert_h2.py} +0 -0
  97. /gwaslab/{util_in_correct_winnerscurse.py → util/util_in_correct_winnerscurse.py} +0 -0
  98. /gwaslab/{util_in_estimate_ess.py → util/util_in_estimate_ess.py} +0 -0
  99. /gwaslab/{util_in_get_density.py → util/util_in_get_density.py} +0 -0
  100. /gwaslab/{util_in_merge.py → util/util_in_merge.py} +0 -0
  101. /gwaslab/{util_in_snphwe.py → util/util_in_snphwe.py} +0 -0
  102. /gwaslab/{viz_aux_chromatin.py → viz/viz_aux_chromatin.py} +0 -0
  103. /gwaslab/{viz_aux_property.py → viz/viz_aux_property.py} +0 -0
  104. /gwaslab/{viz_aux_reposition_text.py → viz/viz_aux_reposition_text.py} +0 -0
  105. /gwaslab/{viz_aux_save_figure.py → viz/viz_aux_save_figure.py} +0 -0
  106. /gwaslab/{viz_plot_forestplot.py → viz/viz_plot_forestplot.py} +0 -0
  107. {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/WHEEL +0 -0
  108. {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE +0 -0
  109. {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE_before_v3.4.39 +0 -0
  110. {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/top_level.txt +0 -0
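
Most of the 110 changed files are straight moves of the former flat modules into new subpackages (bd/, extension/ldsc/, extension/prscs/, hm/, io/, qc/, util/, viz/), alongside a handful of new modules (bd/bd_config.py, bd/bd_path_manager.py, hm/hm_harmonize_sumstats.py, util/util_ex_phewwas.py). Code that imports gwaslab internals directly has to switch to the new dotted paths, as in the sketch below; the top-level gl.Sumstats entry point is presumably unaffected.

    # Internal import paths before and after the reorganization
    # (example taken from the import changes shown in the hunks below).
    # 3.6.5: from gwaslab.qc_fix_sumstats import sortcolumn
    from gwaslab.qc.qc_fix_sumstats import sortcolumn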
@@ -27,7 +27,7 @@ from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
 from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
-
+from gwaslab.g_vchange_status import STATUS_CATEGORIES
 #rsidtochrpos
 #checkref
 #parallelizeassignrsid
@@ -357,10 +357,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
 
 log.write("\n",end="",show_time=False,verbose=verbose)
 
-CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
-sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
-#sumstats[status] = sumstats[status].astype("string")
-
+sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
 
 available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
 status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
@@ -681,9 +678,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
 sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
 log.write(" -Finished checking records", verbose=verbose)
 
-CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
-sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
-#sumstats[status] = sumstats[status].astype("string")
+sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
 
 available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
 status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
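
Note: both checkref variants now take their category set from a shared STATUS_CATEGORIES constant imported from gwaslab.g_vchange_status instead of rebuilding it inline. The constant's definition is not part of this diff; judging from the removed comprehension it is presumably equivalent to the following sketch.

    # Hypothetical reconstruction based only on the removed inline comprehension;
    # the actual constant is defined in gwaslab/g_vchange_status.py.
    STATUS_CATEGORIES = {
        str(j + i)
        for j in [1300000, 1800000, 1900000, 3800000, 9700000, 9800000, 9900000]
        for i in range(0, 100000)
    }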
@@ -1,8 +1,6 @@
-
 import scipy.sparse as sparse
 import numpy as np
 import pandas as pd
-from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 import subprocess
 import os
 import re
@@ -10,16 +8,22 @@ import gc
 import pandas as pd
 import numpy as np
 from gwaslab.g_Log import Log
-from gwaslab.qc_fix_sumstats import start_to
-from gwaslab.qc_fix_sumstats import finished
-from gwaslab.util_in_get_sig import getsig
-from gwaslab.util_ex_process_ref import _process_plink_input_files
 from gwaslab.g_version import _checking_plink_version
-from gwaslab.util_in_filter_value import _exclude_hla
-from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
-from gwaslab.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
-from gwaslab.viz_plot_regional2 import _get_lead_id
-from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+
+from gwaslab.hm.hm_casting import _merge_mold_with_sumstats_by_chrpos
+
+from gwaslab.qc.qc_fix_sumstats import start_to
+from gwaslab.qc.qc_fix_sumstats import finished
+
+from gwaslab.util.util_in_get_sig import getsig
+from gwaslab.util.util_ex_process_ref import _process_plink_input_files
+from gwaslab.util.util_in_filter_value import _exclude_hla
+from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+from gwaslab.util.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
+from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+
+from gwaslab.viz.viz_plot_regional2 import _get_lead_id
+
 
 def tofinemapping_using_ld(sumstats,
 study=None,
@@ -72,7 +76,7 @@ def tofinemapping_using_ld(sumstats,
 sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
 else:
 sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
-
+log.write(" -Number of loci: {}...".format(len(sig_df)),verbose=verbose)
 # Drop duplicate!!!!
 log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
 sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -447,7 +451,6 @@ def _merge_ld_map_with_sumstats(row,
 # matching by SNPID
 # preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
 combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
-
 # match allele
 perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
 log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
@@ -3,14 +3,16 @@ import numpy as np
 import scipy.stats as ss
 import gzip
 import os
+import re
 import gc
-from gwaslab.bd_common_data import get_format_dict
-from gwaslab.qc_fix_sumstats import sortcolumn
-from gwaslab.qc_fix_sumstats import _process_build
-from gwaslab.qc_check_datatype import check_datatype
-from gwaslab.qc_check_datatype import quick_convert_datatype
-from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+from gwaslab.bd.bd_common_data import get_format_dict
+from gwaslab.qc.qc_fix_sumstats import sortcolumn
+from gwaslab.qc.qc_fix_sumstats import _process_build
+from gwaslab.qc.qc_check_datatype import check_datatype
+from gwaslab.qc.qc_check_datatype import quick_convert_datatype
+from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
 from gwaslab.g_headers import _check_overlap_with_reserved_keys
+from gwaslab.g_vchange_status import STATUS_CATEGORIES
 #20221030
 def preformat(sumstats,
 fmt=None,
@@ -122,62 +124,22 @@ def preformat(sumstats,
 
 if "sep" not in readargs.keys():
 readargs["sep"] = "\t"
+else:
+meta_data = None
 
 #########################################################################################################################################################
 
-# check chr-separated path / vcf / then print header.
-try:
-if type(sumstats) is str:
-## loading data from path #################################################
-inpath = sumstats
-###load sumstats by each chromosome #################################################
-if "@" in inpath:
-log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
-inpath_chr_list=[]
-inpath_chr_num_list=[]
-for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
-inpath_chr = inpath.replace("@",str(chromosome))
-if isfile_casesensitive(inpath_chr):
-inpath_chr_num_list.append(str(chromosome))
-inpath_chr_list.append(inpath_chr)
-log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
-readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
-row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
-# columns in the sumstats
-raw_cols = row_one.columns
-else:
-##### loading data from tabular file#################################################
-readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
-row_one = pd.read_table(inpath,**readargs_header)
-raw_cols = row_one.columns
-
-if fmt=="vcf":
-# expanded
-format_cols = list(row_one["FORMAT"].str.split(":"))[0]
-# fixed + study1 + expanded
-raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
+# check chr-separated path / vcf / then print header.
+inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary = check_path_and_header(sumstats,
+fmt,
+meta_data,
+readargs,
+usecols,
+dtype_dictionary,
+rename_dictionary,
+log,
+verbose)
 
-######################################################################################
-elif type(sumstats) is pd.DataFrame:
-## loading data from dataframe
-raw_cols = sumstats.columns
-
-################################################
-for key,value in rename_dictionary.items():
-# check avaiable keys key->raw header
-# usecols : a list of raw headers to load from file/DataFrame
-if key in raw_cols:
-usecols.append(key)
-if value in ["EA","NEA"]:
-dtype_dictionary[key]="category"
-if value in ["STATUS"]:
-dtype_dictionary[key]="string"
-if value in ["CHR"]:
-dtype_dictionary[key]="string"
-
-except ValueError:
-raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
-
 ###################################################################################################################################################
 ## check columns/datatype to use
 if snpid:
@@ -359,7 +321,7 @@ def preformat(sumstats,
 try:
 if type(sumstats) is str:
 ## loading data from path
-inpath = sumstats
+#inpath = sumstats
 if "@" in inpath:
 log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
 sumstats_chr_list=[]
@@ -445,14 +407,14 @@ def preformat(sumstats,
 sumstats["N_CONTROL"] = ncontrol
 
 ### status ######################################################################################################
-if status is None:
-sumstats = process_status(sumstats=sumstats,build=build,log=log,verbose=verbose)
+
+sumstats = process_status(sumstats=sumstats,build=build,status=status,log=log,verbose=verbose)
 
 ## ea/nea, ref/alt ##############################################################################################
 sumstats = process_allele(sumstats=sumstats,log=log,verbose=verbose)
 
 ## NEAF to EAF ###########################################################################################################
-if neaf is not None :
+if neaf is not None or ("NEAF" in sumstats.columns and "EAF" not in sumstats.columns):
 sumstats = process_neaf(sumstats=sumstats,log=log,verbose=verbose)
 
 ## reodering ###################################################################################################
@@ -562,9 +524,15 @@ def process_neaf(sumstats,log,verbose):
 log.write(" -NEAF is specified...",verbose=verbose)
 pre_number=len(sumstats)
 log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
-sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
-sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
-sumstats["EAF"] = 1- sumstats["EAF"]
+if "NEAF" in sumstats.columns:
+sumstats["NEAF"] = pd.to_numeric(sumstats["NEAF"], errors='coerce')
+sumstats = sumstats.loc[(sumstats["NEAF"]>=0) & (sumstats["NEAF"]<=1),:]
+sumstats["EAF"] = 1- sumstats["NEAF"]
+sumstats.drop(columns=["NEAF"], inplace=True)
+else:
+sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
+sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
+sumstats["EAF"] = 1- sumstats["EAF"]
 log.write(" -Converted NEAF to EAF.",verbose=verbose)
 after_number=len(sumstats)
 log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
@@ -599,13 +567,14 @@ def process_allele(sumstats,log,verbose):
 sumstats["NEA"]=sumstats["NEA"].astype("category")
 return sumstats
 
-def process_status(sumstats,build,log,verbose):
-log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
-#sumstats["STATUS"] = int(build)*(10**5) +99999
-build = _process_build(build,log,verbose)
-sumstats["STATUS"] = build +"99999"
-categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
-sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
+def process_status(sumstats,build,status, log,verbose):
+if status is None:
+log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
+#sumstats["STATUS"] = int(build)*(10**5) +99999
+build = _process_build(build,log,verbose)
+sumstats["STATUS"] = build +"99999"
+
+sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=STATUS_CATEGORIES)
 return sumstats
 
 
@@ -649,4 +618,114 @@ def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_
 log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
 sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
 log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
-return sumstats_filtered
+return sumstats_filtered
+
+
+def check_path_and_header(sumstats=None,
+fmt=None,
+meta_data=None,
+readargs=None,
+usecols=None,
+dtype_dictionary=None,
+rename_dictionary=None,
+log=None,
+verbose=None):
+
+
+if type(sumstats) is str:
+## loading data from path #################################################
+inpath = sumstats
+
+try:
+format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+
+except (FileNotFoundError, IndexError):
+log.warning("Loading {} failed...Tesing if compressed/uncompressed...".format(inpath),verbose=verbose)
+try:
+if inpath[-3:]==".gz":
+inpath = inpath[:-3]
+log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
+format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list =process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+else:
+inpath = inpath+".gz"
+log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
+format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
+except:
+raise ValueError("Please input a valid path, and make sure the separator is correct and the columns you specified are in the file.")
+
+######################################################################################
+elif type(sumstats) is pd.DataFrame:
+inpath = None
+format_cols = None
+inpath_chr_list = None
+inpath_chr_num_list = None
+## loading data from dataframe
+raw_cols = sumstats.columns
+
+################################################
+for key,value in rename_dictionary.items():
+# check avaiable keys key->raw header
+# usecols : a list of raw headers to load from file/DataFrame
+if key in raw_cols:
+usecols.append(key)
+if value in ["EA","NEA"]:
+dtype_dictionary[key]="category"
+if value in ["STATUS"]:
+dtype_dictionary[key]="string"
+if value in ["CHR"]:
+dtype_dictionary[key]="string"
+
+return inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary
+
+def process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose):
+
+format_cols = None
+inpath_chr_list = None
+inpath_chr_num_list = None
+
+if "@" in inpath:
+log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
+inpath_chr_list=[]
+inpath_chr_num_list=[]
+
+# create a regex pattern for matching
+pat = os.path.basename(inpath).replace("@","(\w+)")
+
+# get dir
+dirname = os.path.dirname(inpath)
+
+# all files in the directory
+files = os.listdir(dirname)
+
+files.sort()
+
+for file in files:
+# match
+result = re.match(pat, file)
+if result:
+# get chr
+chr_matched = str(result.group(1))
+inpath_chr_num_list.append(chr_matched)
+inpath_chr_list.append(inpath.replace("@",str(chr_matched)) )
+
+log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
+
+#if inpath_chr_list is empty-> IndexError
+readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
+row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
+# columns in the sumstats
+raw_cols = row_one.columns
+else:
+##### loading data from tabular file#################################################
+#if file not found, FileNotFoundError
+readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
+row_one = pd.read_table(inpath,**readargs_header)
+raw_cols = row_one.columns
+
+if fmt=="vcf":
+# expanded
+format_cols = list(row_one["FORMAT"].str.split(":"))[0]
+# fixed + study1 + expanded
+raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
+
+return format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list
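
Note: the chromosome-split ("@" in path) handling moved into the new check_path_and_header / process_inpath_and_load_header helpers, which now discover chromosomes by scanning the directory with a regex instead of probing a fixed 1-25/X/Y/MT list. A compact sketch of that expansion logic under an assumed file-name pattern (expand_chromosome_pattern is an illustrative name, not part of gwaslab):

    import os
    import re

    def expand_chromosome_pattern(inpath):
        # Mirror of the added directory-scan logic: build a regex from the
        # basename, list the directory, and collect every matching chromosome.
        pat = os.path.basename(inpath).replace("@", r"(\w+)")
        files = sorted(os.listdir(os.path.dirname(inpath) or "."))
        chroms = [m.group(1) for f in files if (m := re.match(pat, f))]
        return chroms, [inpath.replace("@", c) for c in chroms]

    # e.g. expand_chromosome_pattern("mydata/sumstats_chr@.tsv.gz")
    #      -> (["1", "2", ...], ["mydata/sumstats_chr1.tsv.gz", ...])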
@@ -5,12 +5,12 @@ import scipy.stats as ss
 import gzip
 import os
 import gc
-from gwaslab.bd_common_data import get_format_dict
-from gwaslab.qc_fix_sumstats import sortcolumn
-from gwaslab.qc_fix_sumstats import _process_build
-from gwaslab.qc_check_datatype_polars import check_datatype
-from gwaslab.qc_check_datatype_polars import quick_convert_datatype
-from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
+from gwaslab.bd.bd_common_data import get_format_dict
+from gwaslab.qc.qc_fix_sumstats import sortcolumn
+from gwaslab.qc.qc_fix_sumstats import _process_build
+from gwaslab.qc.qc_check_datatype_polars import check_datatype_polars
+from gwaslab.qc.qc_check_datatype_polars import quick_convert_datatype
+from gwaslab.qc.qc_check_datatype_polars import check_dataframe_memory_usage
 from gwaslab.g_headers import _check_overlap_with_reserved_keys
 #20221030
 def preformatp(sumstats,
@@ -433,7 +433,7 @@ def preformatp(sumstats,
 #sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
 sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
 
-check_datatype(sumstats,log=log,verbose=verbose)
+check_datatype_polars(sumstats,log=log,verbose=verbose)
 #gc.collect()
 check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
 
@@ -1,7 +1,7 @@
 import pandas as pd
 from gwaslab.g_Log import Log
-from gwaslab.qc_check_datatype import check_datatype
-from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+from gwaslab.qc.qc_check_datatype import check_datatype
+from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
 import re
 import os
 
@@ -11,7 +11,8 @@ def _read_pipcs(data,
 group=None,
 studie_names=None,
 log=Log(),
-verbose=True):
+verbose=True,
+**readcsv_kwargs):
 
 log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
 log.write(" -File:{}".format(output_prefix),verbose=verbose)
@@ -32,14 +33,14 @@
 pipcs_single_list=[]
 for index,pipcs_path in enumerate(pipcs_path_list):
 log.write(" -Loading {}:".format(pipcs_loci_list[index]) + pipcs_path)
-pipcs_single = pd.read_csv(pipcs_path)
+pipcs_single = pd.read_csv(pipcs_path,**readcsv_kwargs)
 if "LOCUS" not in pipcs_single.columns:
 pipcs_single["LOCUS"]=pipcs_loci_list[index]
 pipcs_single_list.append(pipcs_single)
 
 pipcs = pd.concat(pipcs_single_list, axis=0, ignore_index=True)
 else:
-pipcs = pd.read_csv("{}".format(output_prefix))
+pipcs = pd.read_csv("{}".format(output_prefix),**readcsv_kwargs)
 
 if "CHR" not in pipcs.columns:
 log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
@@ -1,7 +1,7 @@
 import pandas as pd
-from gwaslab.bd_common_data import get_formats_list
+from gwaslab.bd.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
-from gwaslab.bd_common_data import get_format_dict
+from gwaslab.bd.bd_common_data import get_format_dict
 
 def _read_tabular(path, fmt, **kwargs):
 
@@ -7,16 +7,19 @@ from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
 from datetime import date
-from gwaslab.io_preformat_input import print_format_info
-from gwaslab.bd_common_data import get_formats_list
 from gwaslab.g_Log import Log
-from gwaslab.bd_common_data import get_format_dict
-from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.g_version import gwaslab_info
-from gwaslab.bd_get_hapmap3 import gethapmap3
-from gwaslab.util_in_filter_value import _exclude_hla
-from gwaslab.util_in_filter_value import _exclude
-from gwaslab.util_in_filter_value import _extract
+
+from gwaslab.io.io_preformat_input import print_format_info
+
+from gwaslab.bd.bd_common_data import get_format_dict
+from gwaslab.bd.bd_common_data import get_number_to_chr
+from gwaslab.bd.bd_common_data import get_formats_list
+from gwaslab.bd.bd_get_hapmap3 import gethapmap3
+
+from gwaslab.util.util_in_filter_value import _exclude_hla
+from gwaslab.util.util_in_filter_value import _exclude
+from gwaslab.util.util_in_filter_value import _extract
 # to vcf
 # to fmt
 ## vcf
@@ -402,11 +405,11 @@ def tofmt(sumstats,
 
 ####################################################################################################################
 def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tabular_kwargs, log, verbose):
-chr_header = rename_dictionary["CHR"]
 if tab_fmt=="tsv" or tab_fmt=="csv":
 try:
 log.write(f" -Fast to csv mode...",verbose=verbose)
 if "@" in path:
+chr_header = rename_dictionary["CHR"]
 log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
 log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
 for single_chr in list(sumstats["CHR"].unique()):
@@ -420,6 +423,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
 except:
 log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
 if "@" in path:
+chr_header = rename_dictionary["CHR"]
 log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
 log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
 for single_chr in list(sumstats["CHR"].unique()):
@@ -44,4 +44,19 @@ def load_data_from_pickle(path,usecols=None):
 existing_cols.append(i)
 data = data.loc[:,existing_cols]
 gc.collect()
-return data
+return data
+
+def _offload(df,path,log):
+with open(path, 'wb') as file:
+pickle.dump(df, file)
+log.write("Dumpping dataframe to : ", path)
+
+def _reload(path,log):
+with open(path, 'rb') as file:
+df = pickle.load(file)
+log.write("Loaded dataframe back from : ", path)
+try:
+os.remove(path)
+except:
+pass
+return df
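
Note: the new _offload/_reload helpers round-trip a dataframe through a pickle file, and _reload removes the file after loading it back. A minimal usage sketch with a toy frame and a temporary path (both placeholders):

    import pandas as pd
    from gwaslab.g_Log import Log
    from gwaslab.io.io_to_pickle import _offload, _reload

    log = Log()
    df = pd.DataFrame({"SNPID": ["rs1", "rs2"], "P": [1e-8, 0.05]})

    _offload(df, "/tmp/gwaslab_offload.pickle", log)   # dump to disk to free memory
    df = _reload("/tmp/gwaslab_offload.pickle", log)   # load back; the pickle file is deleted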
@@ -56,7 +56,7 @@ dtype_dict ={
 'P_RANDOM' :[pl.Float64()]
 }
 
-def check_datatype(sumstats, verbose=True, log=Log()):
+def check_datatype_polars(sumstats, verbose=True, log=Log()):
 
 #try:
 headers = []
@@ -112,7 +112,7 @@ def quick_convert_datatype(sumstats, log, verbose):
 pass
 return sumstats
 
-def check_dataframe_shape(sumstats, log, verbose):
+def check_dataframe_shape_polars(sumstats, log, verbose):
 memory_in_mb = sumstats.estimated_size(unit="mb")
 try:
 log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)