gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +165 -141
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +5 -13
- gwaslab/viz_plot_rg_heatmap.py +6 -1
- gwaslab/viz_plot_stackedregional.py +21 -6
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.6.dist-info/RECORD +0 -96
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import time
|
|
4
|
+
import copy
|
|
5
|
+
from gwaslab.g_Sumstats_summary import summarize
|
|
6
|
+
from gwaslab.g_Sumstats_summary import lookupstatus
|
|
7
|
+
from gwaslab.io_preformat_input_polars import preformatp
|
|
8
|
+
from gwaslab.io_to_formats import _to_format
|
|
9
|
+
from gwaslab.g_Log import Log
|
|
10
|
+
from gwaslab.qc_fix_sumstats import fixID
|
|
11
|
+
from gwaslab.qc_fix_sumstats import flipSNPID
|
|
12
|
+
from gwaslab.qc_fix_sumstats import stripSNPID
|
|
13
|
+
from gwaslab.qc_fix_sumstats import removedup
|
|
14
|
+
from gwaslab.qc_fix_sumstats import fixchr
|
|
15
|
+
from gwaslab.qc_fix_sumstats import fixpos
|
|
16
|
+
from gwaslab.qc_fix_sumstats import fixallele
|
|
17
|
+
from gwaslab.qc_fix_sumstats import parallelnormalizeallele
|
|
18
|
+
from gwaslab.qc_fix_sumstats import sanitycheckstats
|
|
19
|
+
from gwaslab.qc_fix_sumstats import parallelizeliftovervariant
|
|
20
|
+
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
21
|
+
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
22
|
+
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
23
|
+
from gwaslab.qc_fix_sumstats import _set_build
|
|
24
|
+
from gwaslab.qc_fix_sumstats import _process_build
|
|
25
|
+
from gwaslab.hm_harmonize_sumstats import parallelecheckaf
|
|
26
|
+
from gwaslab.hm_harmonize_sumstats import paralleleinferaf
|
|
27
|
+
from gwaslab.hm_harmonize_sumstats import checkref
|
|
28
|
+
from gwaslab.hm_harmonize_sumstats import oldcheckref
|
|
29
|
+
from gwaslab.hm_harmonize_sumstats import rsidtochrpos
|
|
30
|
+
from gwaslab.hm_harmonize_sumstats import parallelizeassignrsid
|
|
31
|
+
from gwaslab.hm_harmonize_sumstats import parallelinferstrand
|
|
32
|
+
from gwaslab.hm_harmonize_sumstats import parallelrsidtochrpos
|
|
33
|
+
from gwaslab.hm_harmonize_sumstats import _paralleleinferafwithmaf
|
|
34
|
+
from gwaslab.util_in_filter_value import filtervalues
|
|
35
|
+
from gwaslab.util_in_filter_value import filterout
|
|
36
|
+
from gwaslab.util_in_filter_value import filterin
|
|
37
|
+
from gwaslab.util_in_filter_value import filterregionin
|
|
38
|
+
from gwaslab.util_in_filter_value import filterregionout
|
|
39
|
+
from gwaslab.util_in_filter_value import _filter_indel
|
|
40
|
+
from gwaslab.util_in_filter_value import _filter_palindromic
|
|
41
|
+
from gwaslab.util_in_filter_value import _filter_snp
|
|
42
|
+
from gwaslab.util_in_filter_value import _filter_region
|
|
43
|
+
from gwaslab.util_in_filter_value import _exclude_hla
|
|
44
|
+
from gwaslab.util_in_filter_value import _search_variants
|
|
45
|
+
from gwaslab.util_in_filter_value import inferbuild
|
|
46
|
+
from gwaslab.util_in_filter_value import sampling
|
|
47
|
+
from gwaslab.util_in_filter_value import _get_flanking
|
|
48
|
+
from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
|
|
49
|
+
from gwaslab.util_in_filter_value import _get_flanking_by_id
|
|
50
|
+
from gwaslab.util_in_calculate_gc import lambdaGC
|
|
51
|
+
from gwaslab.util_in_convert_h2 import _get_per_snp_r2
|
|
52
|
+
from gwaslab.util_in_get_sig import getsig
|
|
53
|
+
from gwaslab.util_in_get_density import getsignaldensity
|
|
54
|
+
from gwaslab.util_in_get_density import assigndensity
|
|
55
|
+
from gwaslab.util_in_get_sig import annogene
|
|
56
|
+
from gwaslab.util_in_get_sig import getnovel
|
|
57
|
+
from gwaslab.util_in_get_sig import _check_cis
|
|
58
|
+
from gwaslab.util_in_get_sig import _check_novel_set
|
|
59
|
+
from gwaslab.util_in_fill_data import filldata
|
|
60
|
+
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
61
|
+
from gwaslab.bd_common_data import get_chr_list
|
|
62
|
+
from gwaslab.bd_common_data import get_number_to_chr
|
|
63
|
+
from gwaslab.bd_common_data import get_chr_to_number
|
|
64
|
+
from gwaslab.bd_common_data import get_high_ld
|
|
65
|
+
from gwaslab.bd_common_data import get_format_dict
|
|
66
|
+
from gwaslab.bd_common_data import get_formats_list
|
|
67
|
+
from gwaslab.g_version import _show_version
|
|
68
|
+
from gwaslab.g_version import gwaslab_info
|
|
69
|
+
from gwaslab.g_meta import _init_meta
|
|
70
|
+
from gwaslab.g_meta import _append_meta_record
|
|
71
|
+
from gwaslab.g_meta_update import _update_meta
|
|
72
|
+
from gwaslab.util_ex_run_clumping import _clump
|
|
73
|
+
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
|
|
74
|
+
from gwaslab.io_load_ld import tofinemapping_using_ld
|
|
75
|
+
from gwaslab.util_ex_calculate_prs import _calculate_prs
|
|
76
|
+
from gwaslab.viz_plot_mqqplot import mqqplot
|
|
77
|
+
from gwaslab.viz_plot_trumpetplot import plottrumpet
|
|
78
|
+
from gwaslab.viz_plot_compare_af import plotdaf
|
|
79
|
+
from gwaslab.util_ex_run_susie import _run_susie_rss
|
|
80
|
+
from gwaslab.util_ex_run_susie import _get_cs_lead
|
|
81
|
+
from gwaslab.qc_fix_sumstats import _check_data_consistency
|
|
82
|
+
from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
|
|
83
|
+
from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
|
|
84
|
+
from gwaslab.util_ex_ldsc import _estimate_h2_cts_by_ldsc
|
|
85
|
+
from gwaslab.util_ex_ldsc import _estimate_partitioned_h2_by_ldsc
|
|
86
|
+
from gwaslab.util_ex_ldproxyfinder import _extract_ld_proxy
|
|
87
|
+
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
88
|
+
from gwaslab.util_abf_finemapping import abf_finemapping
|
|
89
|
+
from gwaslab.util_abf_finemapping import make_cs
|
|
90
|
+
from gwaslab.io_read_pipcs import _read_pipcs
|
|
91
|
+
from gwaslab.util_in_estimate_ess import _get_ess
|
|
92
|
+
from gwaslab.viz_plot_credible_sets import _plot_cs
|
|
93
|
+
from gwaslab.hm_casting import _align_with_mold
|
|
94
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
95
|
+
import gc
|
|
96
|
+
from gwaslab.viz_plot_phe_heatmap import _gwheatmap
|
|
97
|
+
from gwaslab.util_ex_run_prscs import _run_prscs
|
|
98
|
+
|
|
99
|
+
#20220309
|
|
100
|
+
class Sumstatsp():
    """Container for GWAS summary statistics loaded through ``preformatp``.

    The constructor sets up empty result holders (LDSC, fine-mapping,
    clumping, PIP/credible sets), initialises the metadata dictionary and
    then stores whatever ``preformatp`` returns in ``self.data``.
    NOTE(review): ``preformatp`` comes from ``io_preformat_input_polars``,
    so ``self.data`` is presumably a polars frame -- confirm.
    """

    def __init__(self,
                 sumstats,
                 fmt=None,
                 tab_fmt="tsv",
                 snpid=None,
                 rsid=None,
                 chrom=None,
                 pos=None,
                 ea=None,
                 nea=None,
                 ref=None,
                 alt=None,
                 eaf=None,
                 neaf=None,
                 maf=None,
                 n=None,
                 beta=None,
                 se=None,
                 chisq=None,
                 z=None,
                 f=None,
                 t=None,
                 p=None,
                 q=None,
                 mlog10p=None,
                 test=None,
                 info=None,
                 OR=None,
                 OR_95L=None,
                 OR_95U=None,
                 beta_95L=None,
                 beta_95U=None,
                 HR=None,
                 HR_95L=None,
                 HR_95U=None,
                 ncase=None,
                 ncontrol=None,
                 neff=None,
                 i2=None,
                 phet=None,
                 dof=None,
                 snpr2=None,
                 status=None,
                 other=None,
                 chrom_pat=None,
                 snpid_pat=None,
                 usekeys=None,
                 direction=None,
                 verbose=True,
                 study="Study_1",
                 trait="Trait_1",
                 build="99",
                 species="homo sapiens",
                 build_infer=False,
                 **readargs):
        """Load ``sumstats`` and initialise all attributes.

        Parameters
        ----------
        sumstats :
            Input handed unchanged to ``preformatp``.
        fmt, tab_fmt :
            Format options forwarded to ``preformatp``.
        snpid ... status, direction, chrom_pat, snpid_pat, usekeys :
            Column-mapping and selection options, all forwarded unchanged
            to ``preformatp`` under the same keyword names.
        other : list, optional
            Forwarded to ``preformatp``. ``None`` (the default) is replaced
            by a fresh empty list on every call, so no list object is shared
            between instances.
        verbose : bool
            Forwarded to ``_show_version`` and ``preformatp``.
        study, species :
            Stored in ``self.meta["gwaslab"]`` as ``study_name`` and
            ``species``; ``study`` is also forwarded to ``preformatp``.
        trait, build :
            Forwarded to ``preformatp``; ``build`` is also kept as
            ``self.build``.
        build_infer : bool
            Accepted for signature compatibility; not used in this
            constructor.
        **readargs :
            Collected and passed to ``preformatp`` as ``readargs``.
        """
        # A literal ``[]`` default would be created once and shared by every
        # instance; build a fresh list per call instead.
        if other is None:
            other = []

        # basic attributes
        self.data = pd.DataFrame()
        self.log = Log()
        self.ldsc_h2 = None
        self.ldsc_h2_results = None
        self.ldsc_rg = pd.DataFrame()
        self.ldsc_h2_cts = None
        self.ldsc_partitioned_h2_summary = None
        self.ldsc_partitioned_h2_results = None

        # meta information
        self.meta = _init_meta()
        self.build = build
        self.meta["gwaslab"]["study_name"] = study
        self.meta["gwaslab"]["species"] = species

        # holders for fine-mapping results: path / file / plink_log
        self.finemapping = dict()

        # holders for clumping results: clumps / clumps_raw / plink_log
        self.clumps = dict()

        # PIP / credible-set table
        self.pipcs = pd.DataFrame()

        # print gwaslab version information
        _show_version(self.log, verbose=verbose)

        # preformat the data
        self.data = preformatp(
            sumstats=sumstats,
            fmt=fmt,
            tab_fmt=tab_fmt,
            snpid=snpid,
            rsid=rsid,
            chrom=chrom,
            pos=pos,
            ea=ea,
            nea=nea,
            ref=ref,
            alt=alt,
            eaf=eaf,
            neaf=neaf,
            maf=maf,
            n=n,
            beta=beta,
            se=se,
            chisq=chisq,
            z=z,
            f=f,
            t=t,
            p=p,
            q=q,
            mlog10p=mlog10p,
            test=test,
            info=info,
            OR=OR,
            OR_95L=OR_95L,
            OR_95U=OR_95U,
            beta_95L=beta_95L,
            beta_95U=beta_95U,
            HR=HR,
            HR_95L=HR_95L,
            HR_95U=HR_95U,
            i2=i2,
            phet=phet,
            dof=dof,
            snpr2=snpr2,
            ncase=ncase,
            ncontrol=ncontrol,
            neff=neff,
            direction=direction,
            study=study,
            build=build,
            trait=trait,
            status=status,
            other=other,
            usekeys=usekeys,
            chrom_pat=chrom_pat,
            snpid_pat=snpid_pat,
            verbose=verbose,
            readargs=readargs,
            log=self.log)

        # release temporaries created while loading
        gc.collect()
|
gwaslab/g_headers.py
CHANGED
|
@@ -33,9 +33,10 @@ dtype_dic={
|
|
|
33
33
|
'SNPR2' : 'float64' ,
|
|
34
34
|
'DOF' : 'Int64' ,
|
|
35
35
|
'P_HET' : 'float64' ,
|
|
36
|
-
'
|
|
36
|
+
'I2' : 'float64' ,
|
|
37
37
|
'DENSITY' : 'Int64' ,
|
|
38
38
|
'N' : 'Int64' ,
|
|
39
|
+
'N_EFF' : 'float64' ,
|
|
39
40
|
'N_CASE' : 'Int64' ,
|
|
40
41
|
'N_CONTROL' : 'Int64' ,
|
|
41
42
|
'GENENAME' : 'string' ,
|
|
@@ -92,9 +93,10 @@ description_dic={
|
|
|
92
93
|
'SNPR2' :' per variant R2 ',
|
|
93
94
|
'DOF' :' degree of freedom ',
|
|
94
95
|
'P_HET' :' heterogeneity test P value ',
|
|
95
|
-
'
|
|
96
|
+
'I2' :' heterogeneity I2 ',
|
|
96
97
|
'DENSITY' :' signal density ',
|
|
97
98
|
'N' :' total sample size ',
|
|
99
|
+
'N_EFF' :' Effective sample size ',
|
|
98
100
|
'N_CASE' :' number of cases ',
|
|
99
101
|
'N_CONTROL' :' number of controls ',
|
|
100
102
|
'GENENAME' :' nearest gene symbol ',
|
|
@@ -117,7 +119,14 @@ def _get_headers(mode="all"):
|
|
|
117
119
|
if mode=="info":
|
|
118
120
|
return ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]
|
|
119
121
|
elif mode=="stats":
|
|
120
|
-
return ["BETA","SE","P","MLOG10P",
|
|
122
|
+
return ["BETA","SE","P","MLOG10P",
|
|
123
|
+
"N","N_CASE","N_CONTROL","N_EFF",
|
|
124
|
+
"Z","T","F",
|
|
125
|
+
"OR","OR_95L","OR_95U",
|
|
126
|
+
"HR","HR_95L","HR_95U",
|
|
127
|
+
"MAF","EAF",
|
|
128
|
+
"BETA_95L","BETA_95U",
|
|
129
|
+
"P_HET","I2"]
|
|
121
130
|
else:
|
|
122
131
|
return description_dic.keys()
|
|
123
132
|
|
gwaslab/g_meta.py
CHANGED
|
@@ -1,54 +1,130 @@
|
|
|
1
1
|
from gwaslab.g_version import gwaslab_info
|
|
2
2
|
|
|
3
|
-
def _init_meta(object="Sumstats"):
    """Return a fresh metadata dictionary with placeholder values.

    Parameters
    ----------
    object : str
        One of ``"Sumstats"``, ``"SumstatsPair"`` or ``"SumstatsMulti"``.
        It selects both the content of the nested ``"gwaslab"`` section and
        the set of top-level keys merged in next to it.

    Returns
    -------
    dict
        ``{"gwaslab": {...}, <top-level keys>...}``. Every call builds new
        dictionaries, so callers may mutate the result freely.

    Raises
    ------
    ValueError
        If ``object`` is not one of the three supported names. (Previously
        this fell through to an ``UnboundLocalError`` at the return
        statement.)
    """
    supported = ("Sumstats", "SumstatsPair", "SumstatsMulti")
    if object not in supported:
        raise ValueError(
            "Unsupported object {!r}; expected one of {}.".format(object, ", ".join(supported))
        )

    # Top-level keys used for a single Sumstats object.
    metadata_ssf = {
        "genotyping_technology": "Unknown",
        "gwas_id": "Unknown",
        "samples": {
            "sample_size": "Unknown",
            "sample_ancestry": "European",
            "ancestry_method": "self-reported|genetically determined",
        },
        "trait_description": "Unknown",
        "minor_allele_freq_lower_limit": "Unknown",
        "data_file_name": "Unknown",
        "file_type": "Unknown",
        "data_file_md5sum": "Unknown",
        "is_harmonised": "Unchecked",
        "is_sorted": "Unchecked",
        "genome_assembly": "Unknown",
        "date_last_modified": "Unknown",
        "coordinate_system": "1-based",
        "sex": "M|F|combined",
    }
    # Top-level keys used for SumstatsPair / SumstatsMulti objects.
    metadata_multi = {
        "genome_assembly": "Unknown",
        "date_last_modified": "Unknown",
        "coordinate_system": "1-based",
    }

    # Keys are inserted in the same order as the original literals so that
    # serialised output keeps its layout.
    gwaslab_section = {
        "gwaslab_version": gwaslab_info()["version"],
        "gwaslab_object": "gwaslab." + object,
    }
    if object == "Sumstats":
        gwaslab_section.update({
            "study_name": "Sumstats1",
            "study_type": "Unknown",
            "species": "homo sapiens",
            "genome_build": "99",
            "sample_prevalence": "Unknown",
            "population_prevalence": "Unknown",
        })
        top_level = metadata_ssf
    else:
        gwaslab_section.update({
            "group_name": "Group1",
            "species": "homo sapiens",
            "genome_build": "99",
        })
        top_level = metadata_multi

    # These three sub-sections are identical for all object types.
    gwaslab_section["variants"] = {
        "variant_number": "Unknown",
        "min_P": "Unknown",
        "number_of_chromosomes": "Unknown",
    }
    gwaslab_section["samples"] = {
        "sample_size": "Unknown",
        "sample_size_case": "Unknown",
        "sample_size_control": "Unknown",
        "sample_size_median": "Unknown",
        "sample_size_min": "Unknown",
    }
    gwaslab_section["references"] = {
        "ref_rsid_tsv": "Unknown",
        "ref_rsid_vcf": "Unknown",
        "ref_seq": "Unknown",
        "ref_infer": "Unknown",
        "ref_infer_af": "Unknown",
        "ref_infer_daf": "Unknown",
        "ref_rsid_to_chrpos_tsv": "Unknown",
        "ref_rsid_to_chrpos_vcf": "Unknown",
    }

    metadata = {"gwaslab": gwaslab_section}
    # dict.update instead of ``|=`` keeps this importable on Python < 3.9.
    metadata.update(top_level)
    return metadata
|
|
53
129
|
|
|
54
130
|
def _append_meta_record(old, new):
|
gwaslab/g_meta_update.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from gwaslab.util_in_filter_value import inferbuild
|
|
3
|
+
from gwaslab.g_Log import Log
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
def _summarize_study_columns(sumstats, variants, samples, suffix=""):
    """Write P / EAF / N summaries for the columns ending in ``suffix``."""
    columns = sumstats.columns
    p_col = "P" + suffix
    eaf_col = "EAF" + suffix
    n_col = "N" + suffix

    if p_col in columns:
        variants["min_P"] = np.nanmin(sumstats[p_col])
    if eaf_col in columns:
        # The minor-allele frequency is min(EAF, 1 - EAF), so its minimum is
        # reached at one of the two extremes of EAF.
        variants["min_minor_allele_freq"] = min(np.min(sumstats[eaf_col]), 1 - np.max(sumstats[eaf_col]))
    if n_col in columns:
        samples["sample_size"] = int(sumstats[n_col].max())
        samples["sample_size_median"] = sumstats[n_col].median()
        samples["sample_size_min"] = int(sumstats[n_col].min())


def _update_meta(meta, sumstats, object="Sumstats", log=None, verbose=True):
    """Refresh the summary fields of ``meta`` from the current ``sumstats``.

    Parameters
    ----------
    meta : dict
        Metadata dictionary; it is updated in place and also returned. It
        must contain ``meta["gwaslab"]`` with the keys ``"variants"``,
        ``"samples"``, ``"gwaslab_object"`` and ``"genome_build"``.
    sumstats :
        Table supporting ``len()``, ``.columns`` and column access by name.
    object : str
        Not used by this function; the object type is read from
        ``meta["gwaslab"]["gwaslab_object"]`` instead.
    log : Log, optional
        Logger passed to ``inferbuild``. When ``None`` a new ``Log`` is
        created only if ``inferbuild`` has to run, so calls no longer share
        a single instance created at import time.
    verbose : bool
        Forwarded to ``inferbuild``.

    Returns
    -------
    dict
        The same ``meta`` object.

    Raises
    ------
    KeyError
        For ``gwaslab.SumstatsPair`` / ``gwaslab.SumstatsMulti`` metadata
        that lacks ``meta["gwaslab"]["number_of_studies"]``.
    """
    gwaslab_meta = meta["gwaslab"]
    variants = gwaslab_meta["variants"]
    samples = gwaslab_meta["samples"]

    variants["variant_number"] = len(sumstats)

    if "CHR" in sumstats.columns:
        variants["number_of_chromosomes"] = len(sumstats["CHR"].unique())

    object_type = gwaslab_meta["gwaslab_object"]
    if object_type == "gwaslab.Sumstats":
        _summarize_study_columns(sumstats, variants, samples)
    elif object_type in ("gwaslab.SumstatsMulti", "gwaslab.SumstatsPair"):
        # Per-study columns carry a 1-based "_<index>" suffix, and the
        # summaries are stored under that same integer index.
        for study_index in range(1, gwaslab_meta["number_of_studies"] + 1):
            variants[study_index] = dict()
            samples[study_index] = dict()
            _summarize_study_columns(
                sumstats,
                variants[study_index],
                samples[study_index],
                suffix="_{}".format(study_index),
            )

    # "99" is the placeholder for an unknown genome build.
    if gwaslab_meta["genome_build"] == "99":
        if log is None:
            log = Log()
        _, gwaslab_meta["genome_build"] = inferbuild(sumstats, change_status=False, log=log, verbose=verbose)

    meta["date_last_modified"] = str(time.strftime('%Y/%m/%d'))

    return meta
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import polars as pl
|
|
3
|
+
|
|
4
|
+
def vchange_statusp(sumstats, matched_index, status, digit, before, after):
    """Replace one character of the status code for the selected rows.

    Parameters
    ----------
    sumstats :
        Frame providing ``with_columns``; the ``status`` column is cast to
        ``pl.String`` before editing.
    matched_index :
        Condition given to ``pl.when``; rows where it does not hold keep
        their current status string.
    status : str
        Name of the status column.
    digit : int
        1-based position of the character to rewrite.
    before, after :
        Index-aligned sequences; ``before[i]`` is mapped to ``after[i]``.

    Returns
    -------
    The frame with the updated ``status`` column.
    """
    mapping = {old: after[idx] for idx, old in enumerate(before)}

    sumstats = sumstats.with_columns(pl.col(status).cast(pl.String).alias(status))
    code = pl.col(status)

    if digit > 1:
        # prefix + rewritten character + suffix
        prefix = code.str.slice(0, digit - 1)
        rewritten = code.str.slice(digit - 1, 1).str.replace_many(mapping)
        suffix = code.str.slice(digit)
        updated = prefix + rewritten + suffix
    else:
        # first character is the target, so there is no prefix
        rewritten = code.str.slice(0, 1).str.replace_many(mapping)
        updated = rewritten + code.str.slice(digit)

    return sumstats.with_columns(
        pl.when(matched_index).then(updated).otherwise(code).alias(status)
    )
|
|
26
|
+
|
|
27
|
+
def copy_statusp(sumstats, matched_index, from_status, to_status, digit):
    """Copy one status character from ``from_status`` into ``to_status``.

    Parameters
    ----------
    sumstats :
        Frame providing ``with_columns``; both status columns are cast to
        ``pl.String`` before editing.
    matched_index :
        Condition given to ``pl.when``; rows where it does not hold keep
        their current ``to_status`` value.
    from_status, to_status : str
        Names of the source and destination status columns.
    digit : int
        1-based position of the character to copy.

    Returns
    -------
    The frame with the updated ``to_status`` column.

    Notes
    -----
    The ``digit > 1`` branch used to take the prefix and suffix from
    ``from_status`` and the digit from ``to_status``, i.e. the reverse of
    the ``digit <= 1`` branch. Both branches now keep ``to_status`` and
    replace only the character at ``digit`` with the one from
    ``from_status``.
    """
    # Cast one column at a time so the call also works if both names refer
    # to the same column.
    sumstats = sumstats.with_columns(pl.col(from_status).cast(pl.String).alias(from_status))
    sumstats = sumstats.with_columns(pl.col(to_status).cast(pl.String).alias(to_status))

    source = pl.col(from_status)
    target = pl.col(to_status)

    if digit > 1:
        merged = (
            target.str.slice(0, digit - 1)
            + source.str.slice(digit - 1, 1)
            + target.str.slice(digit)
        )
    else:
        merged = source.str.slice(0, 1) + target.str.slice(digit)

    sumstats = sumstats.with_columns(
        pl.when(matched_index)
        .then(merged)
        .otherwise(target)
        .alias(to_status)
    )
    return sumstats
|