gwaslab 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +57 -47
- gwaslab/{bd_common_data.py → bd/bd_common_data.py} +10 -9
- gwaslab/bd/bd_config.py +28 -0
- gwaslab/{bd_download.py → bd/bd_download.py} +1 -1
- gwaslab/{bd_get_hapmap3.py → bd/bd_get_hapmap3.py} +9 -6
- gwaslab/bd/bd_path_manager.py +110 -0
- gwaslab/data/formatbook.json +805 -9
- gwaslab/{ldsc_irwls.py → extension/ldsc/ldsc_irwls.py} +1 -1
- gwaslab/{ldsc_regressions.py → extension/ldsc/ldsc_regressions.py} +2 -2
- gwaslab/{ldsc_sumstats.py → extension/ldsc/ldsc_sumstats.py} +2 -2
- gwaslab/{prscs_mcmc_gtb.py → extension/prscs/prscs_mcmc_gtb.py} +1 -1
- gwaslab/g_Sumstats.py +130 -96
- gwaslab/g_SumstatsMulti.py +69 -40
- gwaslab/g_SumstatsPair.py +54 -37
- gwaslab/g_SumstatsSet.py +88 -81
- gwaslab/g_SumstatsT.py +6 -6
- gwaslab/g_Sumstats_polars.py +84 -84
- gwaslab/g_meta_update.py +1 -1
- gwaslab/g_vchange_status.py +4 -4
- gwaslab/g_version.py +2 -2
- gwaslab/{hm_casting.py → hm/hm_casting.py} +4 -4
- gwaslab/{hm_casting_polars.py → hm/hm_casting_polars.py} +4 -4
- gwaslab/hm/hm_harmonize_sumstats.py +1635 -0
- gwaslab/hm_harmonize_sumstats.py +3 -8
- gwaslab/{io_load_ld.py → io/io_load_ld.py} +16 -13
- gwaslab/{io_preformat_input.py → io/io_preformat_input.py} +152 -73
- gwaslab/{io_preformat_input_polars.py → io/io_preformat_input_polars.py} +7 -7
- gwaslab/{io_read_pipcs.py → io/io_read_pipcs.py} +6 -5
- gwaslab/{io_read_tabular.py → io/io_read_tabular.py} +2 -2
- gwaslab/{io_to_formats.py → io/io_to_formats.py} +13 -9
- gwaslab/{io_to_pickle.py → io/io_to_pickle.py} +16 -1
- gwaslab/{qc_check_datatype_polars.py → qc/qc_check_datatype_polars.py} +2 -2
- gwaslab/{qc_fix_sumstats.py → qc/qc_fix_sumstats.py} +60 -33
- gwaslab/{qc_fix_sumstats_polars.py → qc/qc_fix_sumstats_polars.py} +15 -11
- gwaslab/{util_abf_finemapping.py → util/util_abf_finemapping.py} +2 -2
- gwaslab/{util_ex_calculate_ldmatrix.py → util/util_ex_calculate_ldmatrix.py} +18 -8
- gwaslab/{util_ex_calculate_prs.py → util/util_ex_calculate_prs.py} +2 -2
- gwaslab/{util_ex_ldproxyfinder.py → util/util_ex_ldproxyfinder.py} +6 -6
- gwaslab/{util_ex_ldsc.py → util/util_ex_ldsc.py} +18 -13
- gwaslab/{util_ex_match_ldmatrix.py → util/util_ex_match_ldmatrix.py} +8 -7
- gwaslab/util/util_ex_phewwas.py +117 -0
- gwaslab/{util_ex_process_h5.py → util/util_ex_process_h5.py} +2 -2
- gwaslab/{util_ex_process_ref.py → util/util_ex_process_ref.py} +2 -2
- gwaslab/{util_ex_run_2samplemr.py → util/util_ex_run_2samplemr.py} +18 -7
- gwaslab/{util_ex_run_ccgwas.py → util/util_ex_run_ccgwas.py} +4 -4
- gwaslab/{util_ex_run_clumping.py → util/util_ex_run_clumping.py} +28 -13
- gwaslab/{util_ex_run_coloc.py → util/util_ex_run_coloc.py} +22 -10
- gwaslab/{util_ex_run_hyprcoloc.py → util/util_ex_run_hyprcoloc.py} +4 -4
- gwaslab/{util_ex_run_magma.py → util/util_ex_run_magma.py} +21 -11
- gwaslab/{util_ex_run_mesusie.py → util/util_ex_run_mesusie.py} +3 -3
- gwaslab/{util_ex_run_mtag.py → util/util_ex_run_mtag.py} +50 -18
- gwaslab/{util_ex_run_prscs.py → util/util_ex_run_prscs.py} +3 -3
- gwaslab/{util_ex_run_scdrs.py → util/util_ex_run_scdrs.py} +10 -4
- gwaslab/{util_ex_run_susie.py → util/util_ex_run_susie.py} +49 -26
- gwaslab/{util_in_fill_data.py → util/util_in_fill_data.py} +1 -1
- gwaslab/{util_in_filter_value.py → util/util_in_filter_value.py} +18 -11
- gwaslab/{util_in_get_sig.py → util/util_in_get_sig.py} +15 -13
- gwaslab/{util_in_meta.py → util/util_in_meta.py} +1 -1
- gwaslab/{util_in_meta_polars.py → util/util_in_meta_polars.py} +1 -1
- gwaslab/{viz_aux_annotate_plot.py → viz/viz_aux_annotate_plot.py} +1 -1
- gwaslab/{viz_aux_quickfix.py → viz/viz_aux_quickfix.py} +2 -2
- gwaslab/{viz_plot_compare_af.py → viz/viz_plot_compare_af.py} +1 -1
- gwaslab/{viz_plot_compare_effect.py → viz/viz_plot_compare_effect.py} +16 -8
- gwaslab/{viz_plot_credible_sets.py → viz/viz_plot_credible_sets.py} +6 -6
- gwaslab/{viz_plot_effect.py → viz/viz_plot_effect.py} +37 -69
- gwaslab/{viz_plot_miamiplot.py → viz/viz_plot_miamiplot.py} +28 -20
- gwaslab/{viz_plot_miamiplot2.py → viz/viz_plot_miamiplot2.py} +27 -22
- gwaslab/{viz_plot_mqqplot.py → viz/viz_plot_mqqplot.py} +48 -38
- gwaslab/{viz_plot_phe_heatmap.py → viz/viz_plot_phe_heatmap.py} +18 -15
- gwaslab/{viz_plot_qqplot.py → viz/viz_plot_qqplot.py} +4 -2
- gwaslab/{viz_plot_regional2.py → viz/viz_plot_regional2.py} +11 -9
- gwaslab/{viz_plot_regionalplot.py → viz/viz_plot_regionalplot.py} +5 -4
- gwaslab/{viz_plot_rg_heatmap.py → viz/viz_plot_rg_heatmap.py} +1 -1
- gwaslab/{viz_plot_scatter_with_reg.py → viz/viz_plot_scatter_with_reg.py} +10 -7
- gwaslab/{viz_plot_stackedregional.py → viz/viz_plot_stackedregional.py} +67 -33
- gwaslab/{viz_plot_trumpetplot.py → viz/viz_plot_trumpetplot.py} +11 -9
- {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/METADATA +1 -1
- gwaslab-3.6.7.dist-info/RECORD +123 -0
- gwaslab/bd_config.py +0 -18
- gwaslab-3.6.5.dist-info/RECORD +0 -120
- /gwaslab/{ldsc_jackknife.py → extension/ldsc/ldsc_jackknife.py} +0 -0
- /gwaslab/{ldsc_ldscore.py → extension/ldsc/ldsc_ldscore.py} +0 -0
- /gwaslab/{ldsc_parse.py → extension/ldsc/ldsc_parse.py} +0 -0
- /gwaslab/{prscs_gigrnd.py → extension/prscs/prscs_gigrnd.py} +0 -0
- /gwaslab/{prscs_parse_genet.py → extension/prscs/prscs_parse_genet.py} +0 -0
- /gwaslab/{hm_rsid_to_chrpos.py → hm/hm_rsid_to_chrpos.py} +0 -0
- /gwaslab/{io_process_args.py → io/io_process_args.py} +0 -0
- /gwaslab/{io_read_ldsc.py → io/io_read_ldsc.py} +0 -0
- /gwaslab/{qc_build.py → qc/qc_build.py} +0 -0
- /gwaslab/{qc_check_datatype.py → qc/qc_check_datatype.py} +0 -0
- /gwaslab/{util_ex_gwascatalog.py → util/util_ex_gwascatalog.py} +0 -0
- /gwaslab/{util_ex_infer_ancestry.py → util/util_ex_infer_ancestry.py} +0 -0
- /gwaslab/{util_ex_plink_filter.py → util/util_ex_plink_filter.py} +0 -0
- /gwaslab/{util_in_calculate_gc.py → util/util_in_calculate_gc.py} +0 -0
- /gwaslab/{util_in_calculate_power.py → util/util_in_calculate_power.py} +0 -0
- /gwaslab/{util_in_convert_h2.py → util/util_in_convert_h2.py} +0 -0
- /gwaslab/{util_in_correct_winnerscurse.py → util/util_in_correct_winnerscurse.py} +0 -0
- /gwaslab/{util_in_estimate_ess.py → util/util_in_estimate_ess.py} +0 -0
- /gwaslab/{util_in_get_density.py → util/util_in_get_density.py} +0 -0
- /gwaslab/{util_in_merge.py → util/util_in_merge.py} +0 -0
- /gwaslab/{util_in_snphwe.py → util/util_in_snphwe.py} +0 -0
- /gwaslab/{viz_aux_chromatin.py → viz/viz_aux_chromatin.py} +0 -0
- /gwaslab/{viz_aux_property.py → viz/viz_aux_property.py} +0 -0
- /gwaslab/{viz_aux_reposition_text.py → viz/viz_aux_reposition_text.py} +0 -0
- /gwaslab/{viz_aux_save_figure.py → viz/viz_aux_save_figure.py} +0 -0
- /gwaslab/{viz_plot_forestplot.py → viz/viz_plot_forestplot.py} +0 -0
- {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/WHEEL +0 -0
- {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE +0 -0
- {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/licenses/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.6.5.dist-info → gwaslab-3.6.7.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -27,7 +27,7 @@ from gwaslab.bd_common_data import _maketrans
|
|
|
27
27
|
from gwaslab.g_vchange_status import vchange_status
|
|
28
28
|
from gwaslab.g_version import _get_version
|
|
29
29
|
from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
|
|
30
|
-
|
|
30
|
+
from gwaslab.g_vchange_status import STATUS_CATEGORIES
|
|
31
31
|
#rsidtochrpos
|
|
32
32
|
#checkref
|
|
33
33
|
#parallelizeassignrsid
|
|
@@ -357,10 +357,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
357
357
|
|
|
358
358
|
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
359
359
|
|
|
360
|
-
|
|
361
|
-
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
362
|
-
#sumstats[status] = sumstats[status].astype("string")
|
|
363
|
-
|
|
360
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
|
|
364
361
|
|
|
365
362
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
366
363
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
@@ -681,9 +678,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
681
678
|
sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
|
|
682
679
|
log.write(" -Finished checking records", verbose=verbose)
|
|
683
680
|
|
|
684
|
-
|
|
685
|
-
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
686
|
-
#sumstats[status] = sumstats[status].astype("string")
|
|
681
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
|
|
687
682
|
|
|
688
683
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
689
684
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
|
|
2
1
|
import scipy.sparse as sparse
|
|
3
2
|
import numpy as np
|
|
4
3
|
import pandas as pd
|
|
5
|
-
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
6
4
|
import subprocess
|
|
7
5
|
import os
|
|
8
6
|
import re
|
|
@@ -10,16 +8,22 @@ import gc
|
|
|
10
8
|
import pandas as pd
|
|
11
9
|
import numpy as np
|
|
12
10
|
from gwaslab.g_Log import Log
|
|
13
|
-
from gwaslab.qc_fix_sumstats import start_to
|
|
14
|
-
from gwaslab.qc_fix_sumstats import finished
|
|
15
|
-
from gwaslab.util_in_get_sig import getsig
|
|
16
|
-
from gwaslab.util_ex_process_ref import _process_plink_input_files
|
|
17
11
|
from gwaslab.g_version import _checking_plink_version
|
|
18
|
-
|
|
19
|
-
from gwaslab.
|
|
20
|
-
|
|
21
|
-
from gwaslab.
|
|
22
|
-
from gwaslab.
|
|
12
|
+
|
|
13
|
+
from gwaslab.hm.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
14
|
+
|
|
15
|
+
from gwaslab.qc.qc_fix_sumstats import start_to
|
|
16
|
+
from gwaslab.qc.qc_fix_sumstats import finished
|
|
17
|
+
|
|
18
|
+
from gwaslab.util.util_in_get_sig import getsig
|
|
19
|
+
from gwaslab.util.util_ex_process_ref import _process_plink_input_files
|
|
20
|
+
from gwaslab.util.util_in_filter_value import _exclude_hla
|
|
21
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
22
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
|
|
23
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
24
|
+
|
|
25
|
+
from gwaslab.viz.viz_plot_regional2 import _get_lead_id
|
|
26
|
+
|
|
23
27
|
|
|
24
28
|
def tofinemapping_using_ld(sumstats,
|
|
25
29
|
study=None,
|
|
@@ -72,7 +76,7 @@ def tofinemapping_using_ld(sumstats,
|
|
|
72
76
|
sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
|
|
73
77
|
else:
|
|
74
78
|
sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
|
|
75
|
-
|
|
79
|
+
log.write(" -Number of loci: {}...".format(len(sig_df)),verbose=verbose)
|
|
76
80
|
# Drop duplicate!!!!
|
|
77
81
|
log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
|
|
78
82
|
sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
|
|
@@ -447,7 +451,6 @@ def _merge_ld_map_with_sumstats(row,
|
|
|
447
451
|
# matching by SNPID
|
|
448
452
|
# preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
|
|
449
453
|
combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
|
|
450
|
-
|
|
451
454
|
# match allele
|
|
452
455
|
perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
|
|
453
456
|
log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
|
|
@@ -3,14 +3,16 @@ import numpy as np
|
|
|
3
3
|
import scipy.stats as ss
|
|
4
4
|
import gzip
|
|
5
5
|
import os
|
|
6
|
+
import re
|
|
6
7
|
import gc
|
|
7
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
8
|
-
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
9
|
-
from gwaslab.qc_fix_sumstats import _process_build
|
|
10
|
-
from gwaslab.qc_check_datatype import check_datatype
|
|
11
|
-
from gwaslab.qc_check_datatype import quick_convert_datatype
|
|
12
|
-
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
8
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
9
|
+
from gwaslab.qc.qc_fix_sumstats import sortcolumn
|
|
10
|
+
from gwaslab.qc.qc_fix_sumstats import _process_build
|
|
11
|
+
from gwaslab.qc.qc_check_datatype import check_datatype
|
|
12
|
+
from gwaslab.qc.qc_check_datatype import quick_convert_datatype
|
|
13
|
+
from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
|
|
13
14
|
from gwaslab.g_headers import _check_overlap_with_reserved_keys
|
|
15
|
+
from gwaslab.g_vchange_status import STATUS_CATEGORIES
|
|
14
16
|
#20221030
|
|
15
17
|
def preformat(sumstats,
|
|
16
18
|
fmt=None,
|
|
@@ -122,62 +124,22 @@ def preformat(sumstats,
|
|
|
122
124
|
|
|
123
125
|
if "sep" not in readargs.keys():
|
|
124
126
|
readargs["sep"] = "\t"
|
|
127
|
+
else:
|
|
128
|
+
meta_data = None
|
|
125
129
|
|
|
126
130
|
#########################################################################################################################################################
|
|
127
131
|
|
|
128
|
-
# check chr-separated path / vcf / then print header.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
|
|
139
|
-
inpath_chr = inpath.replace("@",str(chromosome))
|
|
140
|
-
if isfile_casesensitive(inpath_chr):
|
|
141
|
-
inpath_chr_num_list.append(str(chromosome))
|
|
142
|
-
inpath_chr_list.append(inpath_chr)
|
|
143
|
-
log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
144
|
-
readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
|
|
145
|
-
row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
|
|
146
|
-
# columns in the sumstats
|
|
147
|
-
raw_cols = row_one.columns
|
|
148
|
-
else:
|
|
149
|
-
##### loading data from tabular file#################################################
|
|
150
|
-
readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
|
|
151
|
-
row_one = pd.read_table(inpath,**readargs_header)
|
|
152
|
-
raw_cols = row_one.columns
|
|
153
|
-
|
|
154
|
-
if fmt=="vcf":
|
|
155
|
-
# expanded
|
|
156
|
-
format_cols = list(row_one["FORMAT"].str.split(":"))[0]
|
|
157
|
-
# fixed + study1 + expanded
|
|
158
|
-
raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
|
|
132
|
+
# check chr-separated path / vcf / then print header.
|
|
133
|
+
inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary = check_path_and_header(sumstats,
|
|
134
|
+
fmt,
|
|
135
|
+
meta_data,
|
|
136
|
+
readargs,
|
|
137
|
+
usecols,
|
|
138
|
+
dtype_dictionary,
|
|
139
|
+
rename_dictionary,
|
|
140
|
+
log,
|
|
141
|
+
verbose)
|
|
159
142
|
|
|
160
|
-
######################################################################################
|
|
161
|
-
elif type(sumstats) is pd.DataFrame:
|
|
162
|
-
## loading data from dataframe
|
|
163
|
-
raw_cols = sumstats.columns
|
|
164
|
-
|
|
165
|
-
################################################
|
|
166
|
-
for key,value in rename_dictionary.items():
|
|
167
|
-
# check avaiable keys key->raw header
|
|
168
|
-
# usecols : a list of raw headers to load from file/DataFrame
|
|
169
|
-
if key in raw_cols:
|
|
170
|
-
usecols.append(key)
|
|
171
|
-
if value in ["EA","NEA"]:
|
|
172
|
-
dtype_dictionary[key]="category"
|
|
173
|
-
if value in ["STATUS"]:
|
|
174
|
-
dtype_dictionary[key]="string"
|
|
175
|
-
if value in ["CHR"]:
|
|
176
|
-
dtype_dictionary[key]="string"
|
|
177
|
-
|
|
178
|
-
except ValueError:
|
|
179
|
-
raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
|
|
180
|
-
|
|
181
143
|
###################################################################################################################################################
|
|
182
144
|
## check columns/datatype to use
|
|
183
145
|
if snpid:
|
|
@@ -359,7 +321,7 @@ def preformat(sumstats,
|
|
|
359
321
|
try:
|
|
360
322
|
if type(sumstats) is str:
|
|
361
323
|
## loading data from path
|
|
362
|
-
inpath = sumstats
|
|
324
|
+
#inpath = sumstats
|
|
363
325
|
if "@" in inpath:
|
|
364
326
|
log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
|
|
365
327
|
sumstats_chr_list=[]
|
|
@@ -445,14 +407,14 @@ def preformat(sumstats,
|
|
|
445
407
|
sumstats["N_CONTROL"] = ncontrol
|
|
446
408
|
|
|
447
409
|
### status ######################################################################################################
|
|
448
|
-
|
|
449
|
-
|
|
410
|
+
|
|
411
|
+
sumstats = process_status(sumstats=sumstats,build=build,status=status,log=log,verbose=verbose)
|
|
450
412
|
|
|
451
413
|
## ea/nea, ref/alt ##############################################################################################
|
|
452
414
|
sumstats = process_allele(sumstats=sumstats,log=log,verbose=verbose)
|
|
453
415
|
|
|
454
416
|
## NEAF to EAF ###########################################################################################################
|
|
455
|
-
if neaf is not None :
|
|
417
|
+
if neaf is not None or ("NEAF" in sumstats.columns and "EAF" not in sumstats.columns):
|
|
456
418
|
sumstats = process_neaf(sumstats=sumstats,log=log,verbose=verbose)
|
|
457
419
|
|
|
458
420
|
## reodering ###################################################################################################
|
|
@@ -562,9 +524,15 @@ def process_neaf(sumstats,log,verbose):
|
|
|
562
524
|
log.write(" -NEAF is specified...",verbose=verbose)
|
|
563
525
|
pre_number=len(sumstats)
|
|
564
526
|
log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
527
|
+
if "NEAF" in sumstats.columns:
|
|
528
|
+
sumstats["NEAF"] = pd.to_numeric(sumstats["NEAF"], errors='coerce')
|
|
529
|
+
sumstats = sumstats.loc[(sumstats["NEAF"]>=0) & (sumstats["NEAF"]<=1),:]
|
|
530
|
+
sumstats["EAF"] = 1- sumstats["NEAF"]
|
|
531
|
+
sumstats.drop(columns=["NEAF"], inplace=True)
|
|
532
|
+
else:
|
|
533
|
+
sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
|
|
534
|
+
sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
|
|
535
|
+
sumstats["EAF"] = 1- sumstats["EAF"]
|
|
568
536
|
log.write(" -Converted NEAF to EAF.",verbose=verbose)
|
|
569
537
|
after_number=len(sumstats)
|
|
570
538
|
log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
|
|
@@ -599,13 +567,14 @@ def process_allele(sumstats,log,verbose):
|
|
|
599
567
|
sumstats["NEA"]=sumstats["NEA"].astype("category")
|
|
600
568
|
return sumstats
|
|
601
569
|
|
|
602
|
-
def process_status(sumstats,build,log,verbose):
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
570
|
+
def process_status(sumstats,build,status, log,verbose):
    """Make sure the STATUS column exists and is stored as a categorical.

    Parameters
    ----------
    sumstats : pd.DataFrame
        Summary statistics table; modified in place and returned.
    build : object
        Genome build; passed through ``_process_build`` before use.
    status : object or None
        When None, a fresh STATUS column is created.
    log : Log
        Logger exposing ``write``.
    verbose : bool
        Forwarded to the logger.

    Returns
    -------
    pd.DataFrame
        The same dataframe with STATUS cast to ``STATUS_CATEGORIES``.
    """
    needs_new_status = status is None
    if needs_new_status:
        log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
        # presumably _process_build returns the build as a string prefix - verify against its definition
        build_prefix = _process_build(build,log,verbose)
        sumstats["STATUS"] = build_prefix +"99999"

    # Cast in every case, whether the column was just created or was already present.
    current_status = sumstats["STATUS"]
    sumstats["STATUS"] = pd.Categorical(current_status,categories=STATUS_CATEGORIES)
    return sumstats
|
|
610
579
|
|
|
611
580
|
|
|
@@ -649,4 +618,114 @@ def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_
|
|
|
649
618
|
log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
|
|
650
619
|
sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
|
|
651
620
|
log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
|
|
652
|
-
return sumstats_filtered
|
|
621
|
+
return sumstats_filtered
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def check_path_and_header(sumstats=None,
                          fmt=None,
                          meta_data=None,
                          readargs=None,
                          usecols=None,
                          dtype_dictionary=None,
                          rename_dictionary=None,
                          log=None,
                          verbose=None):
    """Resolve the input source and decide which raw columns and dtypes to use.

    Parameters
    ----------
    sumstats : str or pd.DataFrame
        Either a path (may contain ``@`` as a chromosome placeholder) or an
        already loaded dataframe.
    fmt, meta_data, readargs
        Forwarded unchanged to ``process_inpath_and_load_header`` when
        ``sumstats`` is a path.
    usecols : list, optional
        Raw headers selected so far; matching keys are appended in place.
        A new list is created when None.
    dtype_dictionary : dict, optional
        Raw header -> dtype; updated in place. A new dict is created when None.
    rename_dictionary : dict, optional
        Raw header -> standardized header. Treated as empty when None.
    log : Log
        Logger exposing ``write`` and ``warning``; only used for path input.
    verbose : bool
        Forwarded to the logger.

    Returns
    -------
    tuple
        ``(inpath, inpath_chr_list, inpath_chr_num_list, format_cols,
        raw_cols, usecols, dtype_dictionary)``. For dataframe input the first
        four items are None. For path input ``inpath`` is the path that was
        actually readable, which may differ from the input by a ``.gz`` suffix.

    Raises
    ------
    ValueError
        If the path cannot be read even after toggling the ``.gz`` suffix, or
        if ``sumstats`` is neither a path nor a dataframe.
    """
    # The signature defaults are None; fall back to empty containers so the
    # column scan below does not fail on .append()/.items().
    if usecols is None:
        usecols = []
    if dtype_dictionary is None:
        dtype_dictionary = {}
    if rename_dictionary is None:
        rename_dictionary = {}

    if isinstance(sumstats, str):
        ## loading data from path #################################################
        inpath = sumstats
        try:
            header_info = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
        except (FileNotFoundError, IndexError):
            log.warning("Loading {} failed...Tesing if compressed/uncompressed...".format(inpath),verbose=verbose)
            # Retry once with the ".gz" suffix toggled.
            if inpath.endswith(".gz"):
                inpath = inpath[:-3]
            else:
                inpath = inpath+".gz"
            try:
                log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
                header_info = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
            except Exception as error:
                # Keep the original failure attached so the root cause stays visible.
                raise ValueError("Please input a valid path, and make sure the separator is correct and the columns you specified are in the file.") from error
        format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = header_info

    elif isinstance(sumstats, pd.DataFrame):
        ## loading data from dataframe
        inpath = None
        format_cols = None
        inpath_chr_list = None
        inpath_chr_num_list = None
        raw_cols = sumstats.columns

    else:
        # Without this branch the return statement would hit unbound locals.
        raise ValueError("Please input a path or a pd.DataFrame.")

    ################################################
    for key,value in rename_dictionary.items():
        # check available keys: key -> raw header
        # usecols : a list of raw headers to load from file/DataFrame
        if key in raw_cols:
            usecols.append(key)
        if value in ["EA","NEA"]:
            dtype_dictionary[key]="category"
        if value in ["STATUS"]:
            dtype_dictionary[key]="string"
        if value in ["CHR"]:
            dtype_dictionary[key]="string"

    return inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary
|
|
679
|
+
|
|
680
|
+
def process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose):
    """Expand a possibly chromosome-separated path and read its header row.

    Parameters
    ----------
    inpath : str
        Path to a tabular file. If it contains ``@``, the directory is scanned
        for files whose name equals the base name with ``@`` replaced by a run
        of word characters.
    fmt : str
        Format name; ``"vcf"`` triggers expansion of the FORMAT column.
    meta_data : dict
        Must provide ``"format_fixed"`` when ``fmt == "vcf"``.
    readargs : dict
        Reader options, passed to ``get_readargs_header``.
    log : Log
        Logger exposing ``write``.
    verbose : bool
        Forwarded to the logger.

    Returns
    -------
    tuple
        ``(format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list)``.
        ``format_cols`` is None unless ``fmt == "vcf"``; the two lists are None
        unless ``inpath`` contains ``@``.

    Raises
    ------
    IndexError
        If ``@`` is in the path but no file matches the pattern.
    FileNotFoundError
        If the file or directory does not exist.
    """
    format_cols = None
    inpath_chr_list = None
    inpath_chr_num_list = None

    if "@" in inpath:
        log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
        inpath_chr_list=[]
        inpath_chr_num_list=[]

        # Build the pattern from the base name: literal pieces are escaped so
        # that e.g. "." only matches a dot, and every "@" becomes a capture group.
        literal_parts = os.path.basename(inpath).split("@")
        pat = re.compile(r"(\w+)".join(re.escape(part) for part in literal_parts))

        # os.path.dirname() returns "" for a bare file name, which os.listdir()
        # rejects; fall back to the current directory in that case.
        dirname = os.path.dirname(inpath) or "."

        # all files in the directory, in sorted order
        files = sorted(os.listdir(dirname))

        for file in files:
            # Require the whole name to match; a prefix match (e.g. an extra
            # ".gz" or ".bak" suffix) would yield a path that is not this file.
            result = pat.fullmatch(file)
            if result:
                chr_matched = str(result.group(1))
                inpath_chr_num_list.append(chr_matched)
                inpath_chr_list.append(inpath.replace("@",chr_matched))

        log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)

        # if inpath_chr_list is empty -> IndexError (handled by the caller)
        readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
        row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
        # columns in the sumstats
        raw_cols = row_one.columns
    else:
        ##### loading data from tabular file#################################################
        # if file not found -> FileNotFoundError (handled by the caller)
        readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
        row_one = pd.read_table(inpath,**readargs_header)
        raw_cols = row_one.columns

    if fmt=="vcf":
        # expanded: sub-fields listed in the FORMAT column of the first row
        format_cols = list(row_one["FORMAT"].str.split(":"))[0]
        # fixed + study1 + expanded
        raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols

    return format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list
|
|
@@ -5,12 +5,12 @@ import scipy.stats as ss
|
|
|
5
5
|
import gzip
|
|
6
6
|
import os
|
|
7
7
|
import gc
|
|
8
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
9
|
-
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
10
|
-
from gwaslab.qc_fix_sumstats import _process_build
|
|
11
|
-
from gwaslab.qc_check_datatype_polars import
|
|
12
|
-
from gwaslab.qc_check_datatype_polars import quick_convert_datatype
|
|
13
|
-
from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
|
|
8
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
9
|
+
from gwaslab.qc.qc_fix_sumstats import sortcolumn
|
|
10
|
+
from gwaslab.qc.qc_fix_sumstats import _process_build
|
|
11
|
+
from gwaslab.qc.qc_check_datatype_polars import check_datatype_polars
|
|
12
|
+
from gwaslab.qc.qc_check_datatype_polars import quick_convert_datatype
|
|
13
|
+
from gwaslab.qc.qc_check_datatype_polars import check_dataframe_memory_usage
|
|
14
14
|
from gwaslab.g_headers import _check_overlap_with_reserved_keys
|
|
15
15
|
#20221030
|
|
16
16
|
def preformatp(sumstats,
|
|
@@ -433,7 +433,7 @@ def preformatp(sumstats,
|
|
|
433
433
|
#sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
|
|
434
434
|
sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
|
|
435
435
|
|
|
436
|
-
|
|
436
|
+
check_datatype_polars(sumstats,log=log,verbose=verbose)
|
|
437
437
|
#gc.collect()
|
|
438
438
|
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
|
439
439
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from gwaslab.g_Log import Log
|
|
3
|
-
from gwaslab.qc_check_datatype import check_datatype
|
|
4
|
-
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
3
|
+
from gwaslab.qc.qc_check_datatype import check_datatype
|
|
4
|
+
from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
|
|
5
5
|
import re
|
|
6
6
|
import os
|
|
7
7
|
|
|
@@ -11,7 +11,8 @@ def _read_pipcs(data,
|
|
|
11
11
|
group=None,
|
|
12
12
|
studie_names=None,
|
|
13
13
|
log=Log(),
|
|
14
|
-
verbose=True
|
|
14
|
+
verbose=True,
|
|
15
|
+
**readcsv_kwargs):
|
|
15
16
|
|
|
16
17
|
log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
|
|
17
18
|
log.write(" -File:{}".format(output_prefix),verbose=verbose)
|
|
@@ -32,14 +33,14 @@ def _read_pipcs(data,
|
|
|
32
33
|
pipcs_single_list=[]
|
|
33
34
|
for index,pipcs_path in enumerate(pipcs_path_list):
|
|
34
35
|
log.write(" -Loading {}:".format(pipcs_loci_list[index]) + pipcs_path)
|
|
35
|
-
pipcs_single = pd.read_csv(pipcs_path)
|
|
36
|
+
pipcs_single = pd.read_csv(pipcs_path,**readcsv_kwargs)
|
|
36
37
|
if "LOCUS" not in pipcs_single.columns:
|
|
37
38
|
pipcs_single["LOCUS"]=pipcs_loci_list[index]
|
|
38
39
|
pipcs_single_list.append(pipcs_single)
|
|
39
40
|
|
|
40
41
|
pipcs = pd.concat(pipcs_single_list, axis=0, ignore_index=True)
|
|
41
42
|
else:
|
|
42
|
-
pipcs = pd.read_csv("{}".format(output_prefix))
|
|
43
|
+
pipcs = pd.read_csv("{}".format(output_prefix),**readcsv_kwargs)
|
|
43
44
|
|
|
44
45
|
if "CHR" not in pipcs.columns:
|
|
45
46
|
log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from gwaslab.bd_common_data import get_formats_list
|
|
2
|
+
from gwaslab.bd.bd_common_data import get_formats_list
|
|
3
3
|
from gwaslab.g_Log import Log
|
|
4
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
4
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
5
5
|
|
|
6
6
|
def _read_tabular(path, fmt, **kwargs):
|
|
7
7
|
|
|
@@ -7,16 +7,19 @@ from pysam import tabix_compress
|
|
|
7
7
|
from pysam import tabix_index
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from datetime import date
|
|
10
|
-
from gwaslab.io_preformat_input import print_format_info
|
|
11
|
-
from gwaslab.bd_common_data import get_formats_list
|
|
12
10
|
from gwaslab.g_Log import Log
|
|
13
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
14
|
-
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
11
|
from gwaslab.g_version import gwaslab_info
|
|
16
|
-
|
|
17
|
-
from gwaslab.
|
|
18
|
-
|
|
19
|
-
from gwaslab.
|
|
12
|
+
|
|
13
|
+
from gwaslab.io.io_preformat_input import print_format_info
|
|
14
|
+
|
|
15
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
16
|
+
from gwaslab.bd.bd_common_data import get_number_to_chr
|
|
17
|
+
from gwaslab.bd.bd_common_data import get_formats_list
|
|
18
|
+
from gwaslab.bd.bd_get_hapmap3 import gethapmap3
|
|
19
|
+
|
|
20
|
+
from gwaslab.util.util_in_filter_value import _exclude_hla
|
|
21
|
+
from gwaslab.util.util_in_filter_value import _exclude
|
|
22
|
+
from gwaslab.util.util_in_filter_value import _extract
|
|
20
23
|
# to vcf
|
|
21
24
|
# to fmt
|
|
22
25
|
## vcf
|
|
@@ -402,11 +405,11 @@ def tofmt(sumstats,
|
|
|
402
405
|
|
|
403
406
|
####################################################################################################################
|
|
404
407
|
def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tabular_kwargs, log, verbose):
|
|
405
|
-
chr_header = rename_dictionary["CHR"]
|
|
406
408
|
if tab_fmt=="tsv" or tab_fmt=="csv":
|
|
407
409
|
try:
|
|
408
410
|
log.write(f" -Fast to csv mode...",verbose=verbose)
|
|
409
411
|
if "@" in path:
|
|
412
|
+
chr_header = rename_dictionary["CHR"]
|
|
410
413
|
log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
|
|
411
414
|
log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
|
|
412
415
|
for single_chr in list(sumstats["CHR"].unique()):
|
|
@@ -420,6 +423,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
|
|
|
420
423
|
except:
|
|
421
424
|
log.write(f"Error in using fast_to_csv. Falling back to original implementation.",verbose=verbose)
|
|
422
425
|
if "@" in path:
|
|
426
|
+
chr_header = rename_dictionary["CHR"]
|
|
423
427
|
log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
|
|
424
428
|
log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
|
|
425
429
|
for single_chr in list(sumstats["CHR"].unique()):
|
|
@@ -44,4 +44,19 @@ def load_data_from_pickle(path,usecols=None):
|
|
|
44
44
|
existing_cols.append(i)
|
|
45
45
|
data = data.loc[:,existing_cols]
|
|
46
46
|
gc.collect()
|
|
47
|
-
return data
|
|
47
|
+
return data
|
|
48
|
+
|
|
49
|
+
def _offload(df,path,log):
    """Serialize ``df`` to ``path`` with pickle and log the destination.

    Parameters
    ----------
    df : object
        Object to pickle (a dataframe in practice).
    path : str
        Destination file; overwritten if it exists.
    log : Log
        Logger exposing ``write``.
    """
    with open(path, 'wb') as handle:
        pickler = pickle.Pickler(handle)
        pickler.dump(df)
    log.write("Dumpping dataframe to : ", path)
|
|
53
|
+
|
|
54
|
+
def _reload(path,log):
    """Load a pickled object back from ``path`` and delete the file.

    Parameters
    ----------
    path : str
        File to unpickle.
    log : Log
        Logger exposing ``write``.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Removing the file afterwards is best-effort: a failure to delete it is
    ignored and the loaded object is still returned.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file. Only call this on files this process wrote itself.
    with open(path, 'rb') as file:
        df = pickle.load(file)
    log.write("Loaded dataframe back from : ", path)
    try:
        os.remove(path)
    except OSError:
        # Only filesystem errors are ignored; anything else (including
        # KeyboardInterrupt) propagates.
        pass
    return df
|
|
@@ -56,7 +56,7 @@ dtype_dict ={
|
|
|
56
56
|
'P_RANDOM' :[pl.Float64()]
|
|
57
57
|
}
|
|
58
58
|
|
|
59
|
-
def
|
|
59
|
+
def check_datatype_polars(sumstats, verbose=True, log=Log()):
|
|
60
60
|
|
|
61
61
|
#try:
|
|
62
62
|
headers = []
|
|
@@ -112,7 +112,7 @@ def quick_convert_datatype(sumstats, log, verbose):
|
|
|
112
112
|
pass
|
|
113
113
|
return sumstats
|
|
114
114
|
|
|
115
|
-
def
|
|
115
|
+
def check_dataframe_shape_polars(sumstats, log, verbose):
|
|
116
116
|
memory_in_mb = sumstats.estimated_size(unit="mb")
|
|
117
117
|
try:
|
|
118
118
|
log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
|