gwaslab 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +57 -47
- gwaslab/{bd_common_data.py → bd/bd_common_data.py} +10 -9
- gwaslab/bd/bd_config.py +28 -0
- gwaslab/{bd_download.py → bd/bd_download.py} +1 -1
- gwaslab/{bd_get_hapmap3.py → bd/bd_get_hapmap3.py} +9 -6
- gwaslab/bd/bd_path_manager.py +110 -0
- gwaslab/data/formatbook.json +805 -9
- gwaslab/{ldsc_irwls.py → extension/ldsc/ldsc_irwls.py} +1 -1
- gwaslab/{ldsc_regressions.py → extension/ldsc/ldsc_regressions.py} +2 -2
- gwaslab/{ldsc_sumstats.py → extension/ldsc/ldsc_sumstats.py} +2 -2
- gwaslab/{prscs_mcmc_gtb.py → extension/prscs/prscs_mcmc_gtb.py} +1 -1
- gwaslab/g_Sumstats.py +130 -96
- gwaslab/g_SumstatsMulti.py +69 -40
- gwaslab/g_SumstatsPair.py +54 -37
- gwaslab/g_SumstatsSet.py +88 -81
- gwaslab/g_SumstatsT.py +6 -6
- gwaslab/g_Sumstats_polars.py +84 -84
- gwaslab/g_meta_update.py +1 -1
- gwaslab/g_vchange_status.py +4 -4
- gwaslab/g_version.py +2 -2
- gwaslab/{hm_casting.py → hm/hm_casting.py} +4 -4
- gwaslab/{hm_casting_polars.py → hm/hm_casting_polars.py} +4 -4
- gwaslab/hm/hm_harmonize_sumstats.py +1635 -0
- gwaslab/hm_harmonize_sumstats.py +3 -8
- gwaslab/{io_load_ld.py → io/io_load_ld.py} +16 -13
- gwaslab/{io_preformat_input.py → io/io_preformat_input.py} +152 -73
- gwaslab/{io_preformat_input_polars.py → io/io_preformat_input_polars.py} +7 -7
- gwaslab/{io_read_pipcs.py → io/io_read_pipcs.py} +2 -2
- gwaslab/{io_read_tabular.py → io/io_read_tabular.py} +2 -2
- gwaslab/{io_to_formats.py → io/io_to_formats.py} +11 -8
- gwaslab/{io_to_pickle.py → io/io_to_pickle.py} +16 -1
- gwaslab/{qc_check_datatype_polars.py → qc/qc_check_datatype_polars.py} +2 -2
- gwaslab/{qc_fix_sumstats.py → qc/qc_fix_sumstats.py} +60 -33
- gwaslab/{qc_fix_sumstats_polars.py → qc/qc_fix_sumstats_polars.py} +15 -11
- gwaslab/{util_abf_finemapping.py → util/util_abf_finemapping.py} +2 -2
- gwaslab/{util_ex_calculate_ldmatrix.py → util/util_ex_calculate_ldmatrix.py} +18 -8
- gwaslab/{util_ex_calculate_prs.py → util/util_ex_calculate_prs.py} +2 -2
- gwaslab/{util_ex_ldproxyfinder.py → util/util_ex_ldproxyfinder.py} +6 -6
- gwaslab/{util_ex_ldsc.py → util/util_ex_ldsc.py} +18 -13
- gwaslab/{util_ex_match_ldmatrix.py → util/util_ex_match_ldmatrix.py} +8 -7
- gwaslab/util/util_ex_phewwas.py +127 -0
- gwaslab/{util_ex_process_h5.py → util/util_ex_process_h5.py} +2 -2
- gwaslab/{util_ex_process_ref.py → util/util_ex_process_ref.py} +2 -2
- gwaslab/{util_ex_run_2samplemr.py → util/util_ex_run_2samplemr.py} +18 -7
- gwaslab/{util_ex_run_ccgwas.py → util/util_ex_run_ccgwas.py} +4 -4
- gwaslab/{util_ex_run_clumping.py → util/util_ex_run_clumping.py} +28 -13
- gwaslab/{util_ex_run_coloc.py → util/util_ex_run_coloc.py} +22 -10
- gwaslab/{util_ex_run_hyprcoloc.py → util/util_ex_run_hyprcoloc.py} +4 -4
- gwaslab/{util_ex_run_magma.py → util/util_ex_run_magma.py} +21 -11
- gwaslab/{util_ex_run_mesusie.py → util/util_ex_run_mesusie.py} +3 -3
- gwaslab/{util_ex_run_mtag.py → util/util_ex_run_mtag.py} +50 -18
- gwaslab/{util_ex_run_prscs.py → util/util_ex_run_prscs.py} +3 -3
- gwaslab/{util_ex_run_scdrs.py → util/util_ex_run_scdrs.py} +10 -4
- gwaslab/{util_ex_run_susie.py → util/util_ex_run_susie.py} +49 -26
- gwaslab/{util_in_fill_data.py → util/util_in_fill_data.py} +1 -1
- gwaslab/{util_in_filter_value.py → util/util_in_filter_value.py} +18 -11
- gwaslab/{util_in_get_sig.py → util/util_in_get_sig.py} +15 -13
- gwaslab/{util_in_meta.py → util/util_in_meta.py} +1 -1
- gwaslab/{util_in_meta_polars.py → util/util_in_meta_polars.py} +1 -1
- gwaslab/{viz_aux_annotate_plot.py → viz/viz_aux_annotate_plot.py} +1 -1
- gwaslab/{viz_aux_quickfix.py → viz/viz_aux_quickfix.py} +2 -2
- gwaslab/{viz_plot_compare_af.py → viz/viz_plot_compare_af.py} +1 -1
- gwaslab/{viz_plot_compare_effect.py → viz/viz_plot_compare_effect.py} +16 -8
- gwaslab/{viz_plot_credible_sets.py → viz/viz_plot_credible_sets.py} +6 -6
- gwaslab/{viz_plot_effect.py → viz/viz_plot_effect.py} +37 -69
- gwaslab/{viz_plot_miamiplot.py → viz/viz_plot_miamiplot.py} +28 -20
- gwaslab/{viz_plot_miamiplot2.py → viz/viz_plot_miamiplot2.py} +27 -22
- gwaslab/{viz_plot_mqqplot.py → viz/viz_plot_mqqplot.py} +100 -46
- gwaslab/{viz_plot_phe_heatmap.py → viz/viz_plot_phe_heatmap.py} +18 -15
- gwaslab/{viz_plot_qqplot.py → viz/viz_plot_qqplot.py} +12 -28
- gwaslab/{viz_plot_regional2.py → viz/viz_plot_regional2.py} +11 -9
- gwaslab/{viz_plot_regionalplot.py → viz/viz_plot_regionalplot.py} +5 -4
- gwaslab/{viz_plot_rg_heatmap.py → viz/viz_plot_rg_heatmap.py} +1 -1
- gwaslab/{viz_plot_scatter_with_reg.py → viz/viz_plot_scatter_with_reg.py} +10 -7
- gwaslab/{viz_plot_stackedregional.py → viz/viz_plot_stackedregional.py} +67 -33
- gwaslab/{viz_plot_trumpetplot.py → viz/viz_plot_trumpetplot.py} +15 -9
- {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/METADATA +1 -1
- gwaslab-3.6.8.dist-info/RECORD +123 -0
- gwaslab/bd_config.py +0 -18
- gwaslab-3.6.6.dist-info/RECORD +0 -120
- /gwaslab/{ldsc_jackknife.py → extension/ldsc/ldsc_jackknife.py} +0 -0
- /gwaslab/{ldsc_ldscore.py → extension/ldsc/ldsc_ldscore.py} +0 -0
- /gwaslab/{ldsc_parse.py → extension/ldsc/ldsc_parse.py} +0 -0
- /gwaslab/{prscs_gigrnd.py → extension/prscs/prscs_gigrnd.py} +0 -0
- /gwaslab/{prscs_parse_genet.py → extension/prscs/prscs_parse_genet.py} +0 -0
- /gwaslab/{hm_rsid_to_chrpos.py → hm/hm_rsid_to_chrpos.py} +0 -0
- /gwaslab/{io_process_args.py → io/io_process_args.py} +0 -0
- /gwaslab/{io_read_ldsc.py → io/io_read_ldsc.py} +0 -0
- /gwaslab/{qc_build.py → qc/qc_build.py} +0 -0
- /gwaslab/{qc_check_datatype.py → qc/qc_check_datatype.py} +0 -0
- /gwaslab/{util_ex_gwascatalog.py → util/util_ex_gwascatalog.py} +0 -0
- /gwaslab/{util_ex_infer_ancestry.py → util/util_ex_infer_ancestry.py} +0 -0
- /gwaslab/{util_ex_plink_filter.py → util/util_ex_plink_filter.py} +0 -0
- /gwaslab/{util_in_calculate_gc.py → util/util_in_calculate_gc.py} +0 -0
- /gwaslab/{util_in_calculate_power.py → util/util_in_calculate_power.py} +0 -0
- /gwaslab/{util_in_convert_h2.py → util/util_in_convert_h2.py} +0 -0
- /gwaslab/{util_in_correct_winnerscurse.py → util/util_in_correct_winnerscurse.py} +0 -0
- /gwaslab/{util_in_estimate_ess.py → util/util_in_estimate_ess.py} +0 -0
- /gwaslab/{util_in_get_density.py → util/util_in_get_density.py} +0 -0
- /gwaslab/{util_in_merge.py → util/util_in_merge.py} +0 -0
- /gwaslab/{util_in_snphwe.py → util/util_in_snphwe.py} +0 -0
- /gwaslab/{viz_aux_chromatin.py → viz/viz_aux_chromatin.py} +0 -0
- /gwaslab/{viz_aux_property.py → viz/viz_aux_property.py} +0 -0
- /gwaslab/{viz_aux_reposition_text.py → viz/viz_aux_reposition_text.py} +0 -0
- /gwaslab/{viz_aux_save_figure.py → viz/viz_aux_save_figure.py} +0 -0
- /gwaslab/{viz_plot_forestplot.py → viz/viz_plot_forestplot.py} +0 -0
- {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/WHEEL +0 -0
- {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/licenses/LICENSE +0 -0
- {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/licenses/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.6.6.dist-info → gwaslab-3.6.8.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -27,7 +27,7 @@ from gwaslab.bd_common_data import _maketrans
|
|
|
27
27
|
from gwaslab.g_vchange_status import vchange_status
|
|
28
28
|
from gwaslab.g_version import _get_version
|
|
29
29
|
from gwaslab.cache_manager import CacheManager, PALINDROMIC_INDEL, NON_PALINDROMIC
|
|
30
|
-
|
|
30
|
+
from gwaslab.g_vchange_status import STATUS_CATEGORIES
|
|
31
31
|
#rsidtochrpos
|
|
32
32
|
#checkref
|
|
33
33
|
#parallelizeassignrsid
|
|
@@ -357,10 +357,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
357
357
|
|
|
358
358
|
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
359
359
|
|
|
360
|
-
|
|
361
|
-
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
362
|
-
#sumstats[status] = sumstats[status].astype("string")
|
|
363
|
-
|
|
360
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
|
|
364
361
|
|
|
365
362
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
366
363
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
@@ -681,9 +678,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
681
678
|
sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
|
|
682
679
|
log.write(" -Finished checking records", verbose=verbose)
|
|
683
680
|
|
|
684
|
-
|
|
685
|
-
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
686
|
-
#sumstats[status] = sumstats[status].astype("string")
|
|
681
|
+
sumstats[status] = pd.Categorical(sumstats[status],categories=STATUS_CATEGORIES)
|
|
687
682
|
|
|
688
683
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
689
684
|
status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
|
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
|
|
2
1
|
import scipy.sparse as sparse
|
|
3
2
|
import numpy as np
|
|
4
3
|
import pandas as pd
|
|
5
|
-
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
6
4
|
import subprocess
|
|
7
5
|
import os
|
|
8
6
|
import re
|
|
@@ -10,16 +8,22 @@ import gc
|
|
|
10
8
|
import pandas as pd
|
|
11
9
|
import numpy as np
|
|
12
10
|
from gwaslab.g_Log import Log
|
|
13
|
-
from gwaslab.qc_fix_sumstats import start_to
|
|
14
|
-
from gwaslab.qc_fix_sumstats import finished
|
|
15
|
-
from gwaslab.util_in_get_sig import getsig
|
|
16
|
-
from gwaslab.util_ex_process_ref import _process_plink_input_files
|
|
17
11
|
from gwaslab.g_version import _checking_plink_version
|
|
18
|
-
|
|
19
|
-
from gwaslab.
|
|
20
|
-
|
|
21
|
-
from gwaslab.
|
|
22
|
-
from gwaslab.
|
|
12
|
+
|
|
13
|
+
from gwaslab.hm.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
14
|
+
|
|
15
|
+
from gwaslab.qc.qc_fix_sumstats import start_to
|
|
16
|
+
from gwaslab.qc.qc_fix_sumstats import finished
|
|
17
|
+
|
|
18
|
+
from gwaslab.util.util_in_get_sig import getsig
|
|
19
|
+
from gwaslab.util.util_ex_process_ref import _process_plink_input_files
|
|
20
|
+
from gwaslab.util.util_in_filter_value import _exclude_hla
|
|
21
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
22
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
|
|
23
|
+
from gwaslab.util.util_ex_calculate_ldmatrix import _extract_variants_in_locus
|
|
24
|
+
|
|
25
|
+
from gwaslab.viz.viz_plot_regional2 import _get_lead_id
|
|
26
|
+
|
|
23
27
|
|
|
24
28
|
def tofinemapping_using_ld(sumstats,
|
|
25
29
|
study=None,
|
|
@@ -72,7 +76,7 @@ def tofinemapping_using_ld(sumstats,
|
|
|
72
76
|
sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
|
|
73
77
|
else:
|
|
74
78
|
sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
|
|
75
|
-
|
|
79
|
+
log.write(" -Number of loci: {}...".format(len(sig_df)),verbose=verbose)
|
|
76
80
|
# Drop duplicate!!!!
|
|
77
81
|
log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
|
|
78
82
|
sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
|
|
@@ -447,7 +451,6 @@ def _merge_ld_map_with_sumstats(row,
|
|
|
447
451
|
# matching by SNPID
|
|
448
452
|
# preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
|
|
449
453
|
combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
|
|
450
|
-
|
|
451
454
|
# match allele
|
|
452
455
|
perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
|
|
453
456
|
log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
|
|
@@ -3,14 +3,16 @@ import numpy as np
|
|
|
3
3
|
import scipy.stats as ss
|
|
4
4
|
import gzip
|
|
5
5
|
import os
|
|
6
|
+
import re
|
|
6
7
|
import gc
|
|
7
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
8
|
-
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
9
|
-
from gwaslab.qc_fix_sumstats import _process_build
|
|
10
|
-
from gwaslab.qc_check_datatype import check_datatype
|
|
11
|
-
from gwaslab.qc_check_datatype import quick_convert_datatype
|
|
12
|
-
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
8
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
9
|
+
from gwaslab.qc.qc_fix_sumstats import sortcolumn
|
|
10
|
+
from gwaslab.qc.qc_fix_sumstats import _process_build
|
|
11
|
+
from gwaslab.qc.qc_check_datatype import check_datatype
|
|
12
|
+
from gwaslab.qc.qc_check_datatype import quick_convert_datatype
|
|
13
|
+
from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
|
|
13
14
|
from gwaslab.g_headers import _check_overlap_with_reserved_keys
|
|
15
|
+
from gwaslab.g_vchange_status import STATUS_CATEGORIES
|
|
14
16
|
#20221030
|
|
15
17
|
def preformat(sumstats,
|
|
16
18
|
fmt=None,
|
|
@@ -122,62 +124,22 @@ def preformat(sumstats,
|
|
|
122
124
|
|
|
123
125
|
if "sep" not in readargs.keys():
|
|
124
126
|
readargs["sep"] = "\t"
|
|
127
|
+
else:
|
|
128
|
+
meta_data = None
|
|
125
129
|
|
|
126
130
|
#########################################################################################################################################################
|
|
127
131
|
|
|
128
|
-
# check chr-separated path / vcf / then print header.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
|
|
139
|
-
inpath_chr = inpath.replace("@",str(chromosome))
|
|
140
|
-
if isfile_casesensitive(inpath_chr):
|
|
141
|
-
inpath_chr_num_list.append(str(chromosome))
|
|
142
|
-
inpath_chr_list.append(inpath_chr)
|
|
143
|
-
log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
144
|
-
readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
|
|
145
|
-
row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
|
|
146
|
-
# columns in the sumstats
|
|
147
|
-
raw_cols = row_one.columns
|
|
148
|
-
else:
|
|
149
|
-
##### loading data from tabular file#################################################
|
|
150
|
-
readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
|
|
151
|
-
row_one = pd.read_table(inpath,**readargs_header)
|
|
152
|
-
raw_cols = row_one.columns
|
|
153
|
-
|
|
154
|
-
if fmt=="vcf":
|
|
155
|
-
# expanded
|
|
156
|
-
format_cols = list(row_one["FORMAT"].str.split(":"))[0]
|
|
157
|
-
# fixed + study1 + expanded
|
|
158
|
-
raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
|
|
132
|
+
# check chr-separated path / vcf / then print header.
|
|
133
|
+
inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary = check_path_and_header(sumstats,
|
|
134
|
+
fmt,
|
|
135
|
+
meta_data,
|
|
136
|
+
readargs,
|
|
137
|
+
usecols,
|
|
138
|
+
dtype_dictionary,
|
|
139
|
+
rename_dictionary,
|
|
140
|
+
log,
|
|
141
|
+
verbose)
|
|
159
142
|
|
|
160
|
-
######################################################################################
|
|
161
|
-
elif type(sumstats) is pd.DataFrame:
|
|
162
|
-
## loading data from dataframe
|
|
163
|
-
raw_cols = sumstats.columns
|
|
164
|
-
|
|
165
|
-
################################################
|
|
166
|
-
for key,value in rename_dictionary.items():
|
|
167
|
-
# check avaiable keys key->raw header
|
|
168
|
-
# usecols : a list of raw headers to load from file/DataFrame
|
|
169
|
-
if key in raw_cols:
|
|
170
|
-
usecols.append(key)
|
|
171
|
-
if value in ["EA","NEA"]:
|
|
172
|
-
dtype_dictionary[key]="category"
|
|
173
|
-
if value in ["STATUS"]:
|
|
174
|
-
dtype_dictionary[key]="string"
|
|
175
|
-
if value in ["CHR"]:
|
|
176
|
-
dtype_dictionary[key]="string"
|
|
177
|
-
|
|
178
|
-
except ValueError:
|
|
179
|
-
raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
|
|
180
|
-
|
|
181
143
|
###################################################################################################################################################
|
|
182
144
|
## check columns/datatype to use
|
|
183
145
|
if snpid:
|
|
@@ -359,7 +321,7 @@ def preformat(sumstats,
|
|
|
359
321
|
try:
|
|
360
322
|
if type(sumstats) is str:
|
|
361
323
|
## loading data from path
|
|
362
|
-
inpath = sumstats
|
|
324
|
+
#inpath = sumstats
|
|
363
325
|
if "@" in inpath:
|
|
364
326
|
log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
|
|
365
327
|
sumstats_chr_list=[]
|
|
@@ -445,14 +407,14 @@ def preformat(sumstats,
|
|
|
445
407
|
sumstats["N_CONTROL"] = ncontrol
|
|
446
408
|
|
|
447
409
|
### status ######################################################################################################
|
|
448
|
-
|
|
449
|
-
|
|
410
|
+
|
|
411
|
+
sumstats = process_status(sumstats=sumstats,build=build,status=status,log=log,verbose=verbose)
|
|
450
412
|
|
|
451
413
|
## ea/nea, ref/alt ##############################################################################################
|
|
452
414
|
sumstats = process_allele(sumstats=sumstats,log=log,verbose=verbose)
|
|
453
415
|
|
|
454
416
|
## NEAF to EAF ###########################################################################################################
|
|
455
|
-
if neaf is not None :
|
|
417
|
+
if neaf is not None or ("NEAF" in sumstats.columns and "EAF" not in sumstats.columns):
|
|
456
418
|
sumstats = process_neaf(sumstats=sumstats,log=log,verbose=verbose)
|
|
457
419
|
|
|
458
420
|
## reodering ###################################################################################################
|
|
@@ -562,9 +524,15 @@ def process_neaf(sumstats,log,verbose):
|
|
|
562
524
|
log.write(" -NEAF is specified...",verbose=verbose)
|
|
563
525
|
pre_number=len(sumstats)
|
|
564
526
|
log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
527
|
+
if "NEAF" in sumstats.columns:
|
|
528
|
+
sumstats["NEAF"] = pd.to_numeric(sumstats["NEAF"], errors='coerce')
|
|
529
|
+
sumstats = sumstats.loc[(sumstats["NEAF"]>=0) & (sumstats["NEAF"]<=1),:]
|
|
530
|
+
sumstats["EAF"] = 1- sumstats["NEAF"]
|
|
531
|
+
sumstats.drop(columns=["NEAF"], inplace=True)
|
|
532
|
+
else:
|
|
533
|
+
sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
|
|
534
|
+
sumstats = sumstats.loc[(sumstats["EAF"]>=0) & (sumstats["EAF"]<=1),:]
|
|
535
|
+
sumstats["EAF"] = 1- sumstats["EAF"]
|
|
568
536
|
log.write(" -Converted NEAF to EAF.",verbose=verbose)
|
|
569
537
|
after_number=len(sumstats)
|
|
570
538
|
log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
|
|
@@ -599,13 +567,14 @@ def process_allele(sumstats,log,verbose):
|
|
|
599
567
|
sumstats["NEA"]=sumstats["NEA"].astype("category")
|
|
600
568
|
return sumstats
|
|
601
569
|
|
|
602
|
-
def process_status(sumstats,build,log,verbose):
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
570
|
+
def process_status(sumstats,build,status, log,verbose):
|
|
571
|
+
if status is None:
|
|
572
|
+
log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
|
|
573
|
+
#sumstats["STATUS"] = int(build)*(10**5) +99999
|
|
574
|
+
build = _process_build(build,log,verbose)
|
|
575
|
+
sumstats["STATUS"] = build +"99999"
|
|
576
|
+
|
|
577
|
+
sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=STATUS_CATEGORIES)
|
|
609
578
|
return sumstats
|
|
610
579
|
|
|
611
580
|
|
|
@@ -649,4 +618,114 @@ def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_
|
|
|
649
618
|
log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
|
|
650
619
|
sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
|
|
651
620
|
log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
|
|
652
|
-
return sumstats_filtered
|
|
621
|
+
return sumstats_filtered
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def check_path_and_header(sumstats=None,
|
|
625
|
+
fmt=None,
|
|
626
|
+
meta_data=None,
|
|
627
|
+
readargs=None,
|
|
628
|
+
usecols=None,
|
|
629
|
+
dtype_dictionary=None,
|
|
630
|
+
rename_dictionary=None,
|
|
631
|
+
log=None,
|
|
632
|
+
verbose=None):
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
if type(sumstats) is str:
|
|
636
|
+
## loading data from path #################################################
|
|
637
|
+
inpath = sumstats
|
|
638
|
+
|
|
639
|
+
try:
|
|
640
|
+
format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
|
|
641
|
+
|
|
642
|
+
except (FileNotFoundError, IndexError):
|
|
643
|
+
log.warning("Loading {} failed...Tesing if compressed/uncompressed...".format(inpath),verbose=verbose)
|
|
644
|
+
try:
|
|
645
|
+
if inpath[-3:]==".gz":
|
|
646
|
+
inpath = inpath[:-3]
|
|
647
|
+
log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
|
|
648
|
+
format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list =process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
|
|
649
|
+
else:
|
|
650
|
+
inpath = inpath+".gz"
|
|
651
|
+
log.write(" -Trying to load {}...".format(inpath),verbose=verbose)
|
|
652
|
+
format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list = process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose)
|
|
653
|
+
except:
|
|
654
|
+
raise ValueError("Please input a valid path, and make sure the separator is correct and the columns you specified are in the file.")
|
|
655
|
+
|
|
656
|
+
######################################################################################
|
|
657
|
+
elif type(sumstats) is pd.DataFrame:
|
|
658
|
+
inpath = None
|
|
659
|
+
format_cols = None
|
|
660
|
+
inpath_chr_list = None
|
|
661
|
+
inpath_chr_num_list = None
|
|
662
|
+
## loading data from dataframe
|
|
663
|
+
raw_cols = sumstats.columns
|
|
664
|
+
|
|
665
|
+
################################################
|
|
666
|
+
for key,value in rename_dictionary.items():
|
|
667
|
+
# check avaiable keys key->raw header
|
|
668
|
+
# usecols : a list of raw headers to load from file/DataFrame
|
|
669
|
+
if key in raw_cols:
|
|
670
|
+
usecols.append(key)
|
|
671
|
+
if value in ["EA","NEA"]:
|
|
672
|
+
dtype_dictionary[key]="category"
|
|
673
|
+
if value in ["STATUS"]:
|
|
674
|
+
dtype_dictionary[key]="string"
|
|
675
|
+
if value in ["CHR"]:
|
|
676
|
+
dtype_dictionary[key]="string"
|
|
677
|
+
|
|
678
|
+
return inpath, inpath_chr_list, inpath_chr_num_list, format_cols, raw_cols, usecols, dtype_dictionary
|
|
679
|
+
|
|
680
|
+
def process_inpath_and_load_header(inpath, fmt, meta_data, readargs, log, verbose):
|
|
681
|
+
|
|
682
|
+
format_cols = None
|
|
683
|
+
inpath_chr_list = None
|
|
684
|
+
inpath_chr_num_list = None
|
|
685
|
+
|
|
686
|
+
if "@" in inpath:
|
|
687
|
+
log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
|
|
688
|
+
inpath_chr_list=[]
|
|
689
|
+
inpath_chr_num_list=[]
|
|
690
|
+
|
|
691
|
+
# create a regex pattern for matching
|
|
692
|
+
pat = os.path.basename(inpath).replace("@","(\w+)")
|
|
693
|
+
|
|
694
|
+
# get dir
|
|
695
|
+
dirname = os.path.dirname(inpath)
|
|
696
|
+
|
|
697
|
+
# all files in the directory
|
|
698
|
+
files = os.listdir(dirname)
|
|
699
|
+
|
|
700
|
+
files.sort()
|
|
701
|
+
|
|
702
|
+
for file in files:
|
|
703
|
+
# match
|
|
704
|
+
result = re.match(pat, file)
|
|
705
|
+
if result:
|
|
706
|
+
# get chr
|
|
707
|
+
chr_matched = str(result.group(1))
|
|
708
|
+
inpath_chr_num_list.append(chr_matched)
|
|
709
|
+
inpath_chr_list.append(inpath.replace("@",str(chr_matched)) )
|
|
710
|
+
|
|
711
|
+
log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
|
|
712
|
+
|
|
713
|
+
#if inpath_chr_list is empty-> IndexError
|
|
714
|
+
readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
|
|
715
|
+
row_one = pd.read_table(inpath_chr_list[0],**readargs_header)
|
|
716
|
+
# columns in the sumstats
|
|
717
|
+
raw_cols = row_one.columns
|
|
718
|
+
else:
|
|
719
|
+
##### loading data from tabular file#################################################
|
|
720
|
+
#if file not found, FileNotFoundError
|
|
721
|
+
readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
|
|
722
|
+
row_one = pd.read_table(inpath,**readargs_header)
|
|
723
|
+
raw_cols = row_one.columns
|
|
724
|
+
|
|
725
|
+
if fmt=="vcf":
|
|
726
|
+
# expanded
|
|
727
|
+
format_cols = list(row_one["FORMAT"].str.split(":"))[0]
|
|
728
|
+
# fixed + study1 + expanded
|
|
729
|
+
raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
|
|
730
|
+
|
|
731
|
+
return format_cols, raw_cols, inpath_chr_list, inpath_chr_num_list
|
|
@@ -5,12 +5,12 @@ import scipy.stats as ss
|
|
|
5
5
|
import gzip
|
|
6
6
|
import os
|
|
7
7
|
import gc
|
|
8
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
9
|
-
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
10
|
-
from gwaslab.qc_fix_sumstats import _process_build
|
|
11
|
-
from gwaslab.qc_check_datatype_polars import
|
|
12
|
-
from gwaslab.qc_check_datatype_polars import quick_convert_datatype
|
|
13
|
-
from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
|
|
8
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
9
|
+
from gwaslab.qc.qc_fix_sumstats import sortcolumn
|
|
10
|
+
from gwaslab.qc.qc_fix_sumstats import _process_build
|
|
11
|
+
from gwaslab.qc.qc_check_datatype_polars import check_datatype_polars
|
|
12
|
+
from gwaslab.qc.qc_check_datatype_polars import quick_convert_datatype
|
|
13
|
+
from gwaslab.qc.qc_check_datatype_polars import check_dataframe_memory_usage
|
|
14
14
|
from gwaslab.g_headers import _check_overlap_with_reserved_keys
|
|
15
15
|
#20221030
|
|
16
16
|
def preformatp(sumstats,
|
|
@@ -433,7 +433,7 @@ def preformatp(sumstats,
|
|
|
433
433
|
#sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
|
|
434
434
|
sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
|
|
435
435
|
|
|
436
|
-
|
|
436
|
+
check_datatype_polars(sumstats,log=log,verbose=verbose)
|
|
437
437
|
#gc.collect()
|
|
438
438
|
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
|
439
439
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from gwaslab.g_Log import Log
|
|
3
|
-
from gwaslab.qc_check_datatype import check_datatype
|
|
4
|
-
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
3
|
+
from gwaslab.qc.qc_check_datatype import check_datatype
|
|
4
|
+
from gwaslab.qc.qc_check_datatype import check_dataframe_memory_usage
|
|
5
5
|
import re
|
|
6
6
|
import os
|
|
7
7
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from gwaslab.bd_common_data import get_formats_list
|
|
2
|
+
from gwaslab.bd.bd_common_data import get_formats_list
|
|
3
3
|
from gwaslab.g_Log import Log
|
|
4
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
4
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
5
5
|
|
|
6
6
|
def _read_tabular(path, fmt, **kwargs):
|
|
7
7
|
|
|
@@ -7,16 +7,19 @@ from pysam import tabix_compress
|
|
|
7
7
|
from pysam import tabix_index
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from datetime import date
|
|
10
|
-
from gwaslab.io_preformat_input import print_format_info
|
|
11
|
-
from gwaslab.bd_common_data import get_formats_list
|
|
12
10
|
from gwaslab.g_Log import Log
|
|
13
|
-
from gwaslab.bd_common_data import get_format_dict
|
|
14
|
-
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
11
|
from gwaslab.g_version import gwaslab_info
|
|
16
|
-
|
|
17
|
-
from gwaslab.
|
|
18
|
-
|
|
19
|
-
from gwaslab.
|
|
12
|
+
|
|
13
|
+
from gwaslab.io.io_preformat_input import print_format_info
|
|
14
|
+
|
|
15
|
+
from gwaslab.bd.bd_common_data import get_format_dict
|
|
16
|
+
from gwaslab.bd.bd_common_data import get_number_to_chr
|
|
17
|
+
from gwaslab.bd.bd_common_data import get_formats_list
|
|
18
|
+
from gwaslab.bd.bd_get_hapmap3 import gethapmap3
|
|
19
|
+
|
|
20
|
+
from gwaslab.util.util_in_filter_value import _exclude_hla
|
|
21
|
+
from gwaslab.util.util_in_filter_value import _exclude
|
|
22
|
+
from gwaslab.util.util_in_filter_value import _extract
|
|
20
23
|
# to vcf
|
|
21
24
|
# to fmt
|
|
22
25
|
## vcf
|
|
@@ -44,4 +44,19 @@ def load_data_from_pickle(path,usecols=None):
|
|
|
44
44
|
existing_cols.append(i)
|
|
45
45
|
data = data.loc[:,existing_cols]
|
|
46
46
|
gc.collect()
|
|
47
|
-
return data
|
|
47
|
+
return data
|
|
48
|
+
|
|
49
|
+
def _offload(df,path,log):
|
|
50
|
+
with open(path, 'wb') as file:
|
|
51
|
+
pickle.dump(df, file)
|
|
52
|
+
log.write("Dumpping dataframe to : ", path)
|
|
53
|
+
|
|
54
|
+
def _reload(path,log):
|
|
55
|
+
with open(path, 'rb') as file:
|
|
56
|
+
df = pickle.load(file)
|
|
57
|
+
log.write("Loaded dataframe back from : ", path)
|
|
58
|
+
try:
|
|
59
|
+
os.remove(path)
|
|
60
|
+
except:
|
|
61
|
+
pass
|
|
62
|
+
return df
|
|
@@ -56,7 +56,7 @@ dtype_dict ={
|
|
|
56
56
|
'P_RANDOM' :[pl.Float64()]
|
|
57
57
|
}
|
|
58
58
|
|
|
59
|
-
def
|
|
59
|
+
def check_datatype_polars(sumstats, verbose=True, log=Log()):
|
|
60
60
|
|
|
61
61
|
#try:
|
|
62
62
|
headers = []
|
|
@@ -112,7 +112,7 @@ def quick_convert_datatype(sumstats, log, verbose):
|
|
|
112
112
|
pass
|
|
113
113
|
return sumstats
|
|
114
114
|
|
|
115
|
-
def
|
|
115
|
+
def check_dataframe_shape_polars(sumstats, log, verbose):
|
|
116
116
|
memory_in_mb = sumstats.estimated_size(unit="mb")
|
|
117
117
|
try:
|
|
118
118
|
log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
|
|
@@ -7,22 +7,29 @@ from multiprocessing import Pool
|
|
|
7
7
|
from liftover import get_lifter
|
|
8
8
|
from liftover import ChainFile
|
|
9
9
|
from functools import partial
|
|
10
|
+
|
|
10
11
|
from gwaslab.g_vchange_status import vchange_status
|
|
11
12
|
from gwaslab.g_vchange_status import status_match
|
|
12
13
|
from gwaslab.g_vchange_status import change_status
|
|
13
14
|
from gwaslab.g_Log import Log
|
|
14
|
-
from gwaslab.bd_common_data import get_chr_to_number
|
|
15
|
-
from gwaslab.bd_common_data import get_number_to_chr
|
|
16
|
-
from gwaslab.bd_common_data import get_chr_list
|
|
17
|
-
from gwaslab.qc_check_datatype import check_datatype
|
|
18
|
-
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
19
|
-
from gwaslab.qc_build import _process_build
|
|
20
|
-
from gwaslab.qc_build import _set_build
|
|
21
15
|
from gwaslab.g_version import _get_version
|
|
22
|
-
from gwaslab.
|
|
23
|
-
|
|
24
|
-
from gwaslab.
|
|
25
|
-
from gwaslab.bd_common_data import
|
|
16
|
+
from gwaslab.g_vchange_status import STATUS_CATEGORIES
|
|
17
|
+
|
|
18
|
+
from gwaslab.bd.bd_common_data import get_chr_to_number
|
|
19
|
+
from gwaslab.bd.bd_common_data import get_number_to_chr
|
|
20
|
+
from gwaslab.bd.bd_common_data import get_chr_list
|
|
21
|
+
from gwaslab.bd.bd_common_data import get_chain
|
|
22
|
+
from gwaslab.bd.bd_common_data import NA_STRINGS
|
|
23
|
+
|
|
24
|
+
from gwaslab.qc.qc_check_datatype import check_datatype
|
|
25
|
+
from gwaslab.qc.qc_check_datatype import check_dataframe_shape
|
|
26
|
+
from gwaslab.qc.qc_build import _process_build
|
|
27
|
+
from gwaslab.qc.qc_build import _set_build
|
|
28
|
+
|
|
29
|
+
from gwaslab.util.util_in_fill_data import _convert_betase_to_mlog10p
|
|
30
|
+
from gwaslab.util.util_in_fill_data import _convert_betase_to_p
|
|
31
|
+
from gwaslab.util.util_in_fill_data import _convert_mlog10p_to_p
|
|
32
|
+
|
|
26
33
|
#process build
|
|
27
34
|
#setbuild
|
|
28
35
|
#fixID
|
|
@@ -69,7 +76,7 @@ from gwaslab.bd_common_data import get_chain
|
|
|
69
76
|
|
|
70
77
|
def fixID(sumstats,
|
|
71
78
|
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
|
|
72
|
-
fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False,
|
|
79
|
+
fixchrpos=False,fixid=False,fixeanea=False,fixeanea_flip=False,fixsep=False, reversea=False,
|
|
73
80
|
overwrite=False,verbose=True,forcefixid=False,log=Log()):
|
|
74
81
|
'''
|
|
75
82
|
1. fx SNPid
|
|
@@ -120,7 +127,21 @@ def fixID(sumstats,
|
|
|
120
127
|
except:
|
|
121
128
|
log.write(" -Force converting SNPID to pd.string data type...",verbose=verbose)
|
|
122
129
|
sumstats[snpid] = sumstats[snpid].astype("string")
|
|
130
|
+
############################ checking string NA ###################################################
|
|
131
|
+
log.write(" -Checking NA strings :{}".format(",".join(NA_STRINGS)),verbose=verbose)
|
|
132
|
+
if snpid in sumstats.columns:
|
|
133
|
+
log.write(" -Checking if SNPID contains NA strings...",verbose=verbose)
|
|
134
|
+
is_snpid_string_na = sumstats[snpid].isin(NA_STRINGS)
|
|
135
|
+
if sum(is_snpid_string_na) >0:
|
|
136
|
+
log.write(" -Converting {} NA strings in SNPID to pd.NA...".format(sum(is_snpid_string_na)),verbose=verbose)
|
|
137
|
+
sumstats.loc[is_snpid_string_na ,snpid] = pd.NA
|
|
123
138
|
|
|
139
|
+
if rsid in sumstats.columns:
|
|
140
|
+
log.write(" -Checking if rsID contains NA strings...",verbose=verbose)
|
|
141
|
+
is_rsid_string_na = sumstats[rsid].isin(NA_STRINGS)
|
|
142
|
+
if sum(is_rsid_string_na) >0:
|
|
143
|
+
log.write(" -Converting {} NA strings in rsID to pd.NA...".format(sum(is_rsid_string_na)),verbose=verbose)
|
|
144
|
+
sumstats.loc[is_rsid_string_na ,rsid] = pd.NA
|
|
124
145
|
############################ checking ###################################################
|
|
125
146
|
if snpid in sumstats.columns:
|
|
126
147
|
log.write(" -Checking if SNPID is CHR:POS:NEA:EA...(separator: - ,: , _)",verbose=verbose)
|
|
@@ -148,7 +169,15 @@ def fixID(sumstats,
|
|
|
148
169
|
log.write(" -A look at the unrecognized rsID :",set(sumstats.loc[(~is_rsid)&(~is_rs_chrpos),rsid].head()),"...", verbose=verbose)
|
|
149
170
|
|
|
150
171
|
############################ fixing chr pos###################################################
|
|
151
|
-
|
|
172
|
+
if reversea == True:
|
|
173
|
+
if snpid in sumstats.columns:
|
|
174
|
+
log.write(" -Reversing Alleles in SNPID...", verbose=verbose)
|
|
175
|
+
to_fix = is_chrposrefalt
|
|
176
|
+
to_fix_num = sum(to_fix)
|
|
177
|
+
if to_fix_num>0 and verbose: log.write(" -Number of variants could be reversed: "+str(to_fix_num)+" ...")
|
|
178
|
+
extracted = sumstats.loc[to_fix, snpid].str.extract(r'(^\w+[:_-]\d+[:_-])([ATCG]+)([:_-])([ATCG]+$)', flags=re.IGNORECASE)
|
|
179
|
+
sumstats.loc[to_fix, snpid] = extracted[0] + extracted[3] + extracted[2] + extracted[1]
|
|
180
|
+
|
|
152
181
|
if fixchrpos == True:
|
|
153
182
|
# from snpid or rsid, extract CHR:POS to fix CHR and POS
|
|
154
183
|
if snpid in sumstats.columns:
|
|
@@ -537,24 +566,24 @@ def removedup(sumstats,mode="dm",chrom="CHR",pos="POS",snpid="SNPID",ea="EA",nea
|
|
|
537
566
|
pre_number =len(sumstats)
|
|
538
567
|
specified_columns = []
|
|
539
568
|
if "d" in mode:
|
|
540
|
-
specified_columns.append(rsid)
|
|
541
|
-
specified_columns.append(snpid)
|
|
542
|
-
specified_columns.append(chrom)
|
|
543
|
-
specified_columns.append(pos)
|
|
544
|
-
specified_columns.append(ea)
|
|
545
|
-
specified_columns.append(nea)
|
|
569
|
+
if rsid in sumstats.columns: specified_columns.append(rsid)
|
|
570
|
+
if snpid in sumstats.columns: specified_columns.append(snpid)
|
|
571
|
+
if chrom in sumstats.columns: specified_columns.append(chrom)
|
|
572
|
+
if pos in sumstats.columns: specified_columns.append(pos)
|
|
573
|
+
if ea in sumstats.columns: specified_columns.append(ea)
|
|
574
|
+
if nea in sumstats.columns: specified_columns.append(nea)
|
|
546
575
|
if "r" in mode:
|
|
547
|
-
specified_columns.append(rsid)
|
|
576
|
+
if rsid in sumstats.columns:specified_columns.append(rsid)
|
|
548
577
|
if "s" in mode:
|
|
549
|
-
specified_columns.append(snpid)
|
|
578
|
+
if snpid in sumstats.columns:specified_columns.append(snpid)
|
|
550
579
|
if "m" in mode:
|
|
551
|
-
specified_columns.append(chrom)
|
|
552
|
-
specified_columns.append(pos)
|
|
580
|
+
if chrom in sumstats.columns:specified_columns.append(chrom)
|
|
581
|
+
if pos in sumstats.columns:specified_columns.append(pos)
|
|
553
582
|
if "c" in mode:
|
|
554
|
-
specified_columns.append(chrom)
|
|
555
|
-
specified_columns.append(pos)
|
|
556
|
-
specified_columns.append(ea)
|
|
557
|
-
specified_columns.append(nea)
|
|
583
|
+
if chrom in sumstats.columns:specified_columns.append(chrom)
|
|
584
|
+
if pos in sumstats.columns:specified_columns.append(pos)
|
|
585
|
+
if ea in sumstats.columns:specified_columns.append(ea)
|
|
586
|
+
if nea in sumstats.columns:specified_columns.append(nea)
|
|
558
587
|
sumstats = sumstats.loc[~sumstats[specified_columns].isna().any(axis=1),:]
|
|
559
588
|
after_number=len(sumstats)
|
|
560
589
|
log.write(" -Removed ",pre_number -after_number," variants with NA values in {} .".format(set(specified_columns)), verbose=verbose)
|
|
@@ -1123,19 +1152,17 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
|
|
|
1123
1152
|
cols_to_check.append(header)
|
|
1124
1153
|
if header=="STATUS":
|
|
1125
1154
|
log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
|
|
1126
|
-
|
|
1127
|
-
sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
|
|
1155
|
+
sumstats[header] = pd.Categorical(sumstats[header],categories=STATUS_CATEGORIES)
|
|
1128
1156
|
return sumstats
|
|
1129
1157
|
|
|
1130
1158
|
if dtype in ["Int64","Int32","int","int32","in64"]:
|
|
1131
1159
|
log.write(" -Checking if {} <= {} <= {} ...".format( var_range[0] ,header, var_range[1]), verbose=verbose)
|
|
1132
1160
|
sumstats[header] = np.floor(pd.to_numeric(sumstats[header], errors='coerce')).astype(dtype)
|
|
1133
|
-
|
|
1161
|
+
is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
|
|
1134
1162
|
elif dtype in ["Float64","Float32","float","float64","float32"]:
|
|
1135
1163
|
log.write(" -Checking if {} < {} < {} ...".format( var_range[0] ,header, var_range[1]),verbose=verbose)
|
|
1136
1164
|
sumstats[header] = pd.to_numeric(sumstats[header], errors='coerce').astype(dtype)
|
|
1137
|
-
|
|
1138
|
-
is_valid = (sumstats[header]>=var_range[0]) & (sumstats[header]<=var_range[1])
|
|
1165
|
+
is_valid = (sumstats[header]>var_range[0]) & (sumstats[header]<var_range[1])
|
|
1139
1166
|
is_valid = is_valid.fillna(False)
|
|
1140
1167
|
|
|
1141
1168
|
if header=="P":
|
|
@@ -1835,4 +1862,4 @@ def check_col(df_col_names, verbose=True, log=Log(), cols=None, function=None):
|
|
|
1835
1862
|
###############################################################################################################
|
|
1836
1863
|
def _df_split(dataframe, n):
|
|
1837
1864
|
k, m = divmod(len(dataframe), n)
|
|
1838
|
-
return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
|
|
1865
|
+
return [dataframe.iloc[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]
|