gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/adjacency_matrix.py +1 -1
- gsMap/GNN_VAE/model.py +5 -5
- gsMap/GNN_VAE/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/cauchy_combination_test.py +14 -36
- gsMap/config.py +473 -404
- gsMap/diagnosis.py +273 -0
- gsMap/find_latent_representation.py +22 -86
- gsMap/format_sumstats.py +79 -82
- gsMap/generate_ldscore.py +145 -78
- gsMap/latent_to_gene.py +65 -104
- gsMap/main.py +1 -9
- gsMap/report.py +160 -0
- gsMap/run_all_mode.py +195 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
- gsMap/templates/report_template.html +198 -0
- gsMap/utils/__init__.py +0 -0
- gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
- gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
- gsMap/utils/manhattan_plot.py +639 -0
- gsMap/{regression_read.py → utils/regression_read.py} +1 -1
- gsMap/visualize.py +100 -55
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
- gsmap-1.63.dist-info/RECORD +30 -0
- gsmap-1.62.dist-info/RECORD +0 -24
- /gsMap/{jackknife.py → utils/jackknife.py} +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0
@@ -11,7 +11,7 @@ import pandas as pd
|
|
11
11
|
import pyranges as pr
|
12
12
|
from progress.bar import IncrementalBar
|
13
13
|
|
14
|
-
from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
|
14
|
+
from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
|
15
15
|
|
16
16
|
|
17
17
|
logger = logging.getLogger(__name__)
|
@@ -52,7 +52,6 @@ class MakeAnnotationConfig:
|
|
52
52
|
raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
|
53
53
|
|
54
54
|
|
55
|
-
|
56
55
|
class Snp_Annotator:
|
57
56
|
"""
|
58
57
|
1. Annotate SNPs based on score of genes.
|
@@ -517,44 +516,3 @@ def run_make_annotation(args: MakeAnnotationConfig):
|
|
517
516
|
args, const_max_size
|
518
517
|
)
|
519
518
|
ldscore_generate.compute_ldscore()
|
520
|
-
|
521
|
-
|
522
|
-
if __name__ == '__main__':
|
523
|
-
parser = argparse.ArgumentParser(description='make_annotations.py',
|
524
|
-
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
525
|
-
add_make_annotation_args(parser)
|
526
|
-
|
527
|
-
# Store the Params
|
528
|
-
TEST = True
|
529
|
-
if TEST:
|
530
|
-
name = 'Cortex_151507'
|
531
|
-
TASK_ID = 2
|
532
|
-
test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
|
533
|
-
config = MakeAnnotationConfig(
|
534
|
-
input_feather_file=f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
|
535
|
-
sample_name=name,
|
536
|
-
output_dir=f'{test_dir}/{name}/snp_annotation/new_run',
|
537
|
-
gtf_file='/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf',
|
538
|
-
bfile_root='/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC',
|
539
|
-
baseline_annotation=None,
|
540
|
-
keep_snp_root='/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm',
|
541
|
-
chr=TASK_ID,
|
542
|
-
window_size=50000,
|
543
|
-
cells_per_chunk=500,
|
544
|
-
ld_wind=1,
|
545
|
-
ld_wind_unit='CM',
|
546
|
-
r2_cache_dir='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/r2_matrix',
|
547
|
-
use_gpu=True,
|
548
|
-
snps_per_chunk=100_000
|
549
|
-
)
|
550
|
-
|
551
|
-
else:
|
552
|
-
args = parser.parse_args()
|
553
|
-
config=MakeAnnotationConfig(**vars(args))
|
554
|
-
|
555
|
-
logger.info(f'Running make_annotation for {config.sample_name}')
|
556
|
-
pprint.pprint(config)
|
557
|
-
start_time = time.time()
|
558
|
-
run_make_annotation(config)
|
559
|
-
end_time = time.time()
|
560
|
-
logger.info(f'Make SNP annotation for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
|
@@ -0,0 +1,639 @@
|
|
1
|
+
'''
|
2
|
+
Modified from dash-bio ManhattanPlot (https://github.com/plotly/dash-bio/blob/master/dash_bio/component_factory/_manhattan.py)
|
3
|
+
'''
|
4
|
+
import numpy as np
|
5
|
+
import pandas as pd
|
6
|
+
from pandas.api.types import is_numeric_dtype
|
7
|
+
|
8
|
+
import plotly.graph_objects as go
|
9
|
+
import warnings
|
10
|
+
|
11
|
+
# %%
|
12
|
+
SUGGESTIVE_LINE_LABEL = "suggestive line"
|
13
|
+
GENOMEWIDE_LINE_LABEL = "genomewide line"
|
14
|
+
|
15
|
+
|
16
|
+
def _get_hover_text(df, snpname=None, genename=None, annotationname=None):
    """Build the per-point hover text used by the Manhattan plot traces.

    Each requested column that exists in ``df`` contributes one segment; the
    segments are joined with ``<br>`` so that they render on separate lines.

    :param (dataFrame) df: A pandas dataframe.
    :param (string) snpname: Column name holding the SNP identifier. When
        present it yields a ``SNP: <value>`` segment.
    :param (string) genename: Column name holding the gene name. When
        present it yields a ``GENE: <value>`` segment.
    :param (string) annotationname: Column name holding a free-form
        annotation. When present its string value is appended as-is.

    :return: A pandas Series of strings (one per row of ``df``), or the
        empty string ``''`` when none of the requested columns exist.
    """
    segments = []

    if snpname is not None and snpname in df.columns:
        segments.append('SNP: ' + df[snpname].astype(str))

    if genename is not None and genename in df.columns:
        segments.append('GENE: ' + df[genename].astype(str))

    if annotationname is not None and annotationname in df.columns:
        segments.append(df[annotationname].astype(str))

    if not segments:
        return ''

    # Join only the segments that exist so the text never starts with a
    # dangling '<br>' when the SNP column is missing.
    hover_text = segments[0]
    for segment in segments[1:]:
        hover_text = hover_text + '<br>' + segment

    return hover_text
|
49
|
+
|
50
|
+
|
51
|
+
def ManhattanPlot(
        dataframe,
        chrm="CHR",
        bp="BP",
        p="P",
        snp="SNP",
        gene="GENE",
        annotation=None,
        logp=True,
        title="Manhattan Plot",
        showgrid=True,
        xlabel=None,
        ylabel='-log10(p)',
        point_size=5,
        showlegend=True,
        col=None,
        suggestiveline_value=-np.log10(1e-8),
        suggestiveline_color='#636efa',
        suggestiveline_width=1,
        genomewideline_value=-np.log10(5e-8),
        genomewideline_color='#EF553B',
        genomewideline_width=1,
        highlight=True,
        highlight_color="red",
        highlight_gene_list=None,
):
    """Return a plotly figure for a Manhattan plot.

    The column-related arguments are handed to ``_ManhattanPlot`` and the
    remaining arguments to its ``figure`` method; this function adds no
    logic of its own.

    Column arguments:
    - dataframe (dataframe; required): A pandas dataframe containing at
        least the chromosome, base-pair position and plotted-quantity
        columns named below.
    - chrm (string; default 'CHR'): Column name of the chromosome. The
        column must be numeric (renumber X, Y, MT as 23, 24, 25, ...).
    - bp (string; default 'BP'): Column name of the chromosomal position.
    - p (string; default 'P'): Column name of the numeric quantity plotted
        on the y-axis. If it is not a p-value, set logp=False.
    - snp (string; default 'SNP'): Column name of the SNP identifier shown
        in the hover text.
    - gene (string; default 'GENE'): Column name of the gene name.
    - annotation (string; optional): Column name of an extra annotation
        shown in the hover text.
    - logp (bool; default True): If True, -log10 of the ``p`` column is
        plotted; otherwise the raw value is plotted.

    Figure arguments:
    - title (string; default 'Manhattan Plot'): Title of the graph.
    - showgrid (bool; default True): Whether gridlines are shown.
    - xlabel (string; optional): Label of the x axis.
    - ylabel (string; default '-log10(p)'): Label of the y axis.
    - point_size (number; default 5): Size of the scatter points.
    - showlegend (bool; default True): Whether legends are shown.
    - col (optional): Colour(s) of the scatter points, in any format
        accepted by plotly.graph_objects.
    - suggestiveline_value (bool | float; default -log10(1e-8)): False to
        disable the line, or the y value at which it is drawn.
    - suggestiveline_color (string; default '#636efa'): Colour of the
        suggestive line.
    - suggestiveline_width (number; default 1): Width of the suggestive
        line.
    - genomewideline_value (bool | float; default -log10(5e-8)): False to
        disable the line, or the y value at which it is drawn.
    - genomewideline_color (string; default '#EF553B'): Colour of the
        genome-wide line.
    - genomewideline_width (number; default 1): Width of the genome-wide
        line.
    - highlight (bool; default True): Turn highlighting of the genes in
        ``highlight_gene_list`` on or off.
    - highlight_color (string; default 'red'): Colour of the highlighted
        points.
    - highlight_gene_list (list; optional): Gene names whose SNPs are
        highlighted. Required by ``_ManhattanPlot.figure`` when
        ``highlight`` is True.

    Returns:
    - The figure produced by ``_ManhattanPlot.figure``.
    """
    # Arguments describing which dataframe columns to read.
    column_options = dict(
        chrm=chrm,
        bp=bp,
        p=p,
        snp=snp,
        gene=gene,
        annotation=annotation,
        logp=logp,
    )

    # Arguments describing how the figure is drawn.
    figure_options = dict(
        title=title,
        showgrid=showgrid,
        xlabel=xlabel,
        ylabel=ylabel,
        point_size=point_size,
        showlegend=showlegend,
        col=col,
        suggestiveline_value=suggestiveline_value,
        suggestiveline_color=suggestiveline_color,
        suggestiveline_width=suggestiveline_width,
        genomewideline_value=genomewideline_value,
        genomewideline_color=genomewideline_color,
        genomewideline_width=genomewideline_width,
        highlight=highlight,
        highlight_color=highlight_color,
        highlight_gene_list=highlight_gene_list,
    )

    plotter = _ManhattanPlot(dataframe, **column_options)
    return plotter.figure(**figure_options)
|
193
|
+
|
194
|
+
|
195
|
+
class _ManhattanPlot():
    """Hold the data of a Manhattan plot and render it as a plotly figure.

    ``__init__`` validates the input columns and computes a cumulative
    x position for every row; ``figure`` turns that into scatter traces,
    threshold lines and a layout.
    """

    def __init__(
            self,
            x,
            chrm="CHR",
            bp="BP",
            p="P",
            snp="SNP",
            gene="GENE",
            annotation=None,
            logp=True
    ):
        """Validate ``x`` and pre-compute plotting positions and axis ticks.

        Keyword arguments:
        - x (dataframe; required): A pandas dataframe which must contain at
            least the chromosome, base-pair position and plotted-quantity
            columns named below.
        - chrm (string; default 'CHR'): Column name of the chromosome. The
            column must be numeric (renumber X, Y, MT as 23, 24, 25, ...).
        - bp (string; default 'BP'): Column name of the chromosomal
            position. Must be numeric.
        - p (string; default 'P'): Column name of the numeric quantity
            plotted on the y-axis. If it is not a p-value, set logp=False.
        - snp (string; default 'SNP'): Column name of the SNP identifier;
            pass None to omit it.
        - gene (string; default 'GENE'): Column name of the gene name; pass
            None to omit it.
        - annotation (string; optional): Column name of an extra annotation
            shown in the hover text.
        - logp (bool; default True): If True, -log10 of the ``p`` column is
            plotted by ``figure``.

        Raises:
        - KeyError: a named column is not present in ``x``.
        - TypeError: the chrm, bp or p column is not numeric.
        """
        # The three mandatory columns must exist and be numeric.
        if chrm not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % chrm)
        if not is_numeric_dtype(x[chrm].dtype):
            raise TypeError("%s column should be numeric. Do you have "
                            "'X', 'Y', 'MT', etc? If so change to "
                            "numbers and try again." % chrm)

        for required in (bp, p):
            if required not in x.columns.values:
                raise KeyError(
                    "Column %s not found in 'x' data.frame" % required)
            if not is_numeric_dtype(x[required].dtype):
                raise TypeError(
                    "%s column should be numeric type" % required)

        # Work on a private frame so the caller's dataframe is not modified.
        self.data = x[[chrm, bp, p]].copy()

        # Optional columns: copied over when requested, error when missing.
        for role, column in (("snp", snp), ("gene", gene)):
            if column is None:
                continue
            if column not in x.columns.values:
                raise KeyError(
                    "%s argument specified as %s but column not found in "
                    "'x' data.frame" % (role, column))
            self.data[column] = x[column]

        if annotation is not None:
            if annotation not in x.columns.values:
                raise KeyError(
                    "annotation argument specified as %s but column not "
                    "found in 'x' data.frame" % annotation
                )
            self.data[annotation] = x[annotation]

        self.xlabel = ""
        self.ticks = []
        self.ticksLabels = []
        self.nChr = len(x[chrm].unique())
        self.chrName = chrm
        self.pName = p
        self.snpName = snp
        self.geneName = gene
        self.annotationName = annotation
        self.logp = logp

        # Names of the two helper columns added to self.data.
        self.index = 'INDEX'
        self.pos = 'POSITION'

        # Number the chromosomes 1..n in order of first appearance, so a
        # missing chromosome number does not leave a gap.
        for idx, chrom in enumerate(self.data[chrm].unique(), start=1):
            self.data.loc[self.data[chrm] == chrom, self.index] = idx
        # Set the type to be the same as provided for chrm column
        self.data[self.index] = \
            self.data[self.index].astype(self.data[chrm].dtype)

        # POSITION is the bp shifted by the extent of all preceding
        # chromosomes, e.g.
        #   chrm bp pos
        #   1    1  1
        #   1    2  2
        #   2    1  3
        #   2    2  4
        # One tick is placed in the middle of each chromosome.
        if self.nChr == 1:
            self.data[self.pos] = self.data[bp]
            self.ticks.append(int(len(self.data[self.pos]) / 2.) + 1)
            # Format the scalar, not the 1-element array, so the label reads
            # "Chromosome 1 position" rather than "Chromosome [1] position".
            self.xlabel = "Chromosome %s position" % (
                self.data[chrm].unique()[0])
            self.ticksLabels = self.ticks
        else:
            lastbase = 0
            for i in self.data[self.index].unique():
                in_chrom = self.data[self.index] == i
                if i == 1:
                    self.data.loc[in_chrom, self.pos] = \
                        self.data.loc[in_chrom, bp].values
                else:
                    prevbp = self.data.loc[self.data[self.index] == i - 1, bp]
                    # Shift by the largest bp of the previous chromosome.
                    # Using max() instead of the last row keeps chromosomes
                    # from overlapping when rows are not sorted by bp.
                    lastbase = lastbase + prevbp.max()

                    self.data.loc[in_chrom, self.pos] = \
                        self.data.loc[in_chrom, bp].values + lastbase

                tmin = min(self.data.loc[in_chrom, self.pos])
                tmax = max(self.data.loc[in_chrom, self.pos])
                self.ticks.append(int((tmin + tmax) / 2.) + 1)

            self.xlabel = 'Chromosome'
            self.data[self.pos] = self.data[self.pos].astype(
                self.data[bp].dtype)

            if self.nChr > 10:  # To avoid crowded labels
                self.ticksLabels = [
                    t if np.mod(int(t), 2)  # Only every two ticks
                    else ''
                    for t in self.data[chrm].unique()
                ]
            else:
                self.ticksLabels = self.data[chrm].unique()  # All the ticks

    def _y_values(self, frame):
        """Return the y values of ``frame``: -log10(p) if logp, else raw."""
        values = frame[self.pName].values
        return -np.log10(values) if self.logp else values

    def figure(
            self,
            title="Manhattan Plot",
            showgrid=True,
            xlabel=None,
            ylabel='-log10(p)',
            point_size=5,
            showlegend=True,
            col=None,
            suggestiveline_value=-np.log10(1e-8),
            suggestiveline_color='blue',
            suggestiveline_width=1,
            genomewideline_value=-np.log10(5e-8),
            genomewideline_color='red',
            genomewideline_width=1,
            highlight=True,
            highlight_color="red",
            highlight_gene_list=None
    ):
        """Build the plotly figure.

        Keyword arguments:
        - title (string; default 'Manhattan Plot'): Title of the graph.
        - showgrid (bool; default True): Whether gridlines are shown.
        - xlabel (string; optional): Label of the x axis; defaults to the
            label computed in ``__init__``.
        - ylabel (string; default '-log10(p)'): Label of the y axis.
        - point_size (number; default 5): Size of the scatter points.
            Highlighted points are drawn at twice this size.
        - showlegend (bool; default True): Whether legends are shown.
        - col (string | list; optional): Colour(s) of the scatter points.
            A single string is used for every chromosome; a list is cycled
            over the chromosomes. Defaults to black (single chromosome) or
            alternating grey/black.
        - suggestiveline_value (bool | float; default -log10(1e-8)): False
            to disable the line, or the y value at which it is drawn.
        - suggestiveline_color (string; default 'blue'): Colour of the
            suggestive line.
        - suggestiveline_width (number; default 1): Width of the
            suggestive line.
        - genomewideline_value (bool | float; default -log10(5e-8)): False
            to disable the line, or the y value at which it is drawn.
        - genomewideline_color (string; default 'red'): Colour of the
            genome-wide line.
        - genomewideline_width (number; default 1): Width of the
            genome-wide line.
        - highlight (bool; default True): Whether rows whose gene is in
            ``highlight_gene_list`` are drawn as a separate, larger trace.
        - highlight_color (string; default 'red'): Colour of the
            highlighted points.
        - highlight_gene_list (list; optional): Gene names to highlight.
            Required when ``highlight`` is True.

        Raises:
        - KeyError: ``highlight`` is True but no gene list was given, or
            ``highlight`` is a non-bool truthy value and the snp column is
            missing.
        - Warning: none of the genes in ``highlight_gene_list`` occur in
            the data.

        Returns:
        - A figure formatted for plotly.graph_objects.
        """
        xmin = min(self.data[self.pos].values)
        xmax = max(self.data[self.pos].values)

        # Horizontal threshold lines spanning the whole x range.
        horizontallines = []
        line_specs = (
            (SUGGESTIVE_LINE_LABEL, suggestiveline_value,
             suggestiveline_color, suggestiveline_width),
            (GENOMEWIDE_LINE_LABEL, genomewideline_value,
             genomewideline_color, genomewideline_width),
        )
        for label, level, color, width in line_specs:
            if not level:
                continue
            horizontallines.append(
                go.layout.Shape(
                    name=label,
                    type="line",
                    fillcolor=color,
                    line=dict(color=color, width=width),
                    x0=xmin, x1=xmax, xref="x",
                    y0=level, y1=level, yref="y"
                )
            )

        data_to_plot = []  # To contain the data traces
        highlight_tmp = pd.DataFrame()  # Rows drawn as highlighted points
        highlight_hover_text = None

        if highlight:
            if not isinstance(highlight, bool):
                if self.snpName not in self.data.columns.values:
                    raise KeyError(
                        "snp argument specified for highlight as %s but "
                        "column not found in the data.frame" % self.snpName
                    )
            else:
                if not highlight_gene_list:
                    raise KeyError(
                        "Please provide a list of genes to highlight"
                    )
                requested_genes = set(highlight_gene_list)
                common_genes = set(
                    self.data[self.geneName].values
                ).intersection(requested_genes)
                if len(common_genes) == 0:
                    raise Warning(
                        "No common genes found in the data to highlight"
                    )
                # Compare sets so duplicate entries in the requested list
                # do not trigger a spurious warning.
                missing_genes = requested_genes - common_genes
                if missing_genes:
                    warnings.warn(
                        "Some genes don't contain any SNP to highlight: "
                        f"{missing_genes}"
                    )

                highlight_tmp = self.data[
                    self.data[self.geneName].isin(common_genes)]

                highlight_hover_text = _get_hover_text(
                    highlight_tmp,
                    snpname=self.snpName,
                    genename=self.geneName,
                    annotationname=self.annotationName
                )

        # Remove the highlighted rows from the regular traces. The rows are
        # dropped by index label: highlight_tmp keeps the labels of
        # self.data, which are not necessarily 0..n-1 positions.
        if highlight_tmp.empty:
            data = self.data
        else:
            data = self.data.drop(index=highlight_tmp.index)

        # Accept a single colour string as documented, as well as a list.
        if isinstance(col, str):
            col = [col]

        if self.nChr == 1:

            if col is None or len(col) == 0:
                col = ['black']

            # If single chromosome, ticks and labels automatic.
            layout = go.Layout(
                title=title,
                xaxis={
                    'title': self.xlabel if xlabel is None else xlabel,
                    'showgrid': showgrid,
                    'range': [xmin, xmax],
                },
                yaxis={'title': ylabel},
                hovermode='closest'
            )

            hover_text = _get_hover_text(
                data,
                snpname=self.snpName,
                genename=self.geneName,
                annotationname=self.annotationName
            )

            data_to_plot.append(
                go.Scattergl(
                    x=data[self.pos].values,
                    y=self._y_values(data),
                    mode="markers",
                    showlegend=showlegend,
                    # Format the scalar chromosome, not a 1-element array.
                    name="chr%i" % self.data[self.chrName].unique()[0],
                    marker={
                        'color': col[0],
                        'size': point_size
                    },
                    text=hover_text
                )
            )
        else:
            # if multiple chrms, use the ticks and labels created in __init__.
            layout = go.Layout(
                title=title,
                xaxis={
                    'title': self.xlabel if xlabel is None else xlabel,
                    'showgrid': showgrid,
                    'range': [xmin, xmax],
                    'tickmode': "array",
                    'tickvals': self.ticks,
                    'ticktext': self.ticksLabels,
                    'ticks': "outside"
                },
                yaxis={'title': ylabel},
                hovermode='closest'
            )

            if col is None or len(col) == 0:
                col = [
                    'black' if np.mod(i, 2)
                    else 'grey' for i in range(self.nChr)
                ]

            for icol, i in enumerate(data[self.index].unique()):
                tmp = data[data[self.index] == i]

                chromo = tmp[self.chrName].unique()[0]  # Chromosome name

                hover_text = _get_hover_text(
                    tmp,
                    snpname=self.snpName,
                    genename=self.geneName,
                    annotationname=self.annotationName
                )

                data_to_plot.append(
                    go.Scattergl(
                        x=tmp[self.pos].values,
                        y=self._y_values(tmp),
                        mode="markers",
                        showlegend=showlegend,
                        name="Chr%i" % chromo,
                        marker={
                            # Cycle so a short colour list still works.
                            'color': col[icol % len(col)],
                            'size': point_size
                        },
                        text=hover_text
                    )
                )

        if not highlight_tmp.empty:
            data_to_plot.append(
                go.Scattergl(
                    x=highlight_tmp[self.pos].values,
                    y=self._y_values(highlight_tmp),
                    mode="markers",
                    text=highlight_hover_text,
                    marker=dict(
                        color=highlight_color,
                        size=point_size * 2,
                    ),
                    name="SNP-Gene Pairs of interest"
                )
            )

        layout.shapes = horizontallines

        return go.Figure(data=data_to_plot, layout=layout)
|