plot-misc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plot_misc/__init__.py +1 -0
- plot_misc/_version.py +1 -0
- plot_misc/barchart.py +523 -0
- plot_misc/constants.py +118 -0
- plot_misc/errors.py +328 -0
- plot_misc/example_data/__init__.py +1 -0
- plot_misc/example_data/example_datasets/bar_points.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/barchart.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/calibration_bins.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/calibration_data.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/forest_data.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/group_bar.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/heatmap_data.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/incidence_matrix_data.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/lollipop_data.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/mace_associations.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/net_benefit.tsv.gz +0 -0
- plot_misc/example_data/example_datasets/string_data.txt +1 -0
- plot_misc/example_data/example_datasets/volcano.tsv.gz +0 -0
- plot_misc/example_data/examples.py +637 -0
- plot_misc/forest.py +1478 -0
- plot_misc/heatmap.py +369 -0
- plot_misc/incidencematrix.py +394 -0
- plot_misc/machine_learning.py +1143 -0
- plot_misc/piechart.py +197 -0
- plot_misc/utils/__init__.py +1 -0
- plot_misc/utils/colour.py +171 -0
- plot_misc/utils/formatting.py +369 -0
- plot_misc/utils/utils.py +1151 -0
- plot_misc/volcano.py +203 -0
- plot_misc-2.0.2.dist-info/METADATA +107 -0
- plot_misc-2.0.2.dist-info/RECORD +35 -0
- plot_misc-2.0.2.dist-info/WHEEL +5 -0
- plot_misc-2.0.2.dist-info/licenses/LICENSE +18 -0
- plot_misc-2.0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
"""Provides centralised access to example data sets that can be used in tests
|
|
2
|
+
and also in example code and/or jupyter notebooks.
|
|
3
|
+
|
|
4
|
+
Notes
|
|
5
|
+
-----
|
|
6
|
+
Data can be "added" either through functions that generate the data on the fly
|
|
7
|
+
or via functions that load the data from a static file located in the
|
|
8
|
+
``example_data`` directory. The data files being added should be as small as
|
|
9
|
+
possible (i.e. kilobyte/megabyte range). The dataset functions should be
|
|
10
|
+
decorated with the ``@dataset`` decorator, so the example module knows about
|
|
11
|
+
them. If the function is loading a dataset from a file in the package, it
|
|
12
|
+
should look for the path in ``_ROOT_DATASETS_DIR``.
|
|
13
|
+
|
|
14
|
+
Examples
|
|
15
|
+
--------
|
|
16
|
+
|
|
17
|
+
Registering a function as a dataset providing function:
|
|
18
|
+
|
|
19
|
+
>>> @dataset
|
|
20
|
+
>>> def dummy_data(*args, **kwargs):
|
|
21
|
+
>>> \"\"\"A dummy dataset function that returns a small list.
|
|
22
|
+
>>>
|
|
23
|
+
>>> Returns
|
|
24
|
+
>>> -------
|
|
25
|
+
>>> data : `list`
|
|
26
|
+
>>> A list of length 3 with ``['A', 'B', 'C']``
|
|
27
|
+
>>>
|
|
28
|
+
>>> Notes
|
|
29
|
+
>>> -----
|
|
30
|
+
>>> This function is called ``dummy_data`` and has been decorated with a
|
|
31
|
+
>>> ``@dataset`` decorator which makes it available with the
|
|
32
|
+
>>> `example_data.get_data(<NAME>)` function and also
|
|
33
|
+
>>> `example_data.help(<NAME>)` functions.
|
|
34
|
+
>>> \"\"\"
|
|
35
|
+
>>> return ['A', 'B', 'C']
|
|
36
|
+
|
|
37
|
+
The dataset can then be used as follows:
|
|
38
|
+
|
|
39
|
+
>>> from plot_misc.example_data import examples
|
|
40
|
+
>>> examples.get_data('dummy_data')
|
|
41
|
+
>>> ['A', 'B', 'C']
|
|
42
|
+
|
|
43
|
+
A dataset function that loads a dataset from file, these functions should load
|
|
44
|
+
from the ``_ROOT_DATASETS_DIR``:
|
|
45
|
+
|
|
46
|
+
>>> @dataset
|
|
47
|
+
>>> def dummy_load_data(*args, **kwargs):
|
|
48
|
+
>>> \"\"\"A dummy dataset function that loads a string from a file.
|
|
49
|
+
>>>
|
|
50
|
+
>>> Returns
|
|
51
|
+
>>> -------
|
|
52
|
+
>>> str_data : `str`
|
|
53
|
+
>>> A string of data loaded from an example data file.
|
|
54
|
+
>>>
|
|
55
|
+
>>> Notes
|
|
56
|
+
>>> -----
|
|
57
|
+
>>> This function is called ``dummy_data`` and has been decorated with a
|
|
58
|
+
>>> ``@dataset`` decorator which makes it available with the
|
|
59
|
+
>>> `example_data.get_data(<NAME>)` function and also
|
|
60
|
+
>>> `example_data.help(<NAME>)` functions. The path to this dataset is
|
|
61
|
+
>>> built from ``_ROOT_DATASETS_DIR``.
|
|
62
|
+
>>> \"\"\"
|
|
63
|
+
>>> load_path = os.path.join(_ROOT_DATASETS_DIR, "string_data.txt")
|
|
64
|
+
>>> with open(load_path) as data_file:
|
|
65
|
+
>>> return data_file.read().strip()
|
|
66
|
+
|
|
67
|
+
The dataset can then be used as follows:
|
|
68
|
+
|
|
69
|
+
>>> from plot_misc.example_data import examples
|
|
70
|
+
>>> examples.get_data('dummy_load_data')
|
|
71
|
+
>>> 'an example data string'
|
|
72
|
+
"""
|
|
73
|
+
import os
|
|
74
|
+
import re
|
|
75
|
+
import pandas as pd
|
|
76
|
+
import numpy as np
|
|
77
|
+
from plot_misc.constants import (
|
|
78
|
+
UtilsNames,
|
|
79
|
+
ForestNames,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# The name of the example datasets directory
_EXAMPLE_DATASETS = "example_datasets"
"""The example dataset directory name (`str`)
"""

_ROOT_DATASETS_DIR = os.path.join(os.path.dirname(__file__), _EXAMPLE_DATASETS)
"""The root path to the dataset files that are available (`str`)
"""

_DATASETS = dict()
"""This will hold the registered dataset functions (`dict`)
"""


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def dataset(func):
    """Register a dataset generating function. This function should be used as
    a decorator.

    Parameters
    ----------
    func : `function`
        The function to register as a dataset. It is registered as under the
        function name.

    Returns
    -------
    func : `function`
        The function that has been registered.

    Raises
    ------
    KeyError
        If a function of the same name has already been registered.

    Notes
    -----
    The dataset function should accept ``*args`` and ``**kwargs`` and should be
    decorated with the ``@dataset`` decorator.

    Examples
    --------
    Create a dataset function that returns a dictionary.

    >>> @dataset
    >>> def get_dict(*args, **kwargs):
    >>>     \"\"\"A dictionary to test or use as an example.
    >>>
    >>>     Returns
    >>>     -------
    >>>     test_dict : `dict`
    >>>         A small dictionary of string keys and numeric values
    >>>     \"\"\"
    >>>     return {'A': 1, 'B': 2, 'C': 3}

    The dataset can then be used as follows:

    >>> from plot_misc.example_data import examples
    >>> examples.get_data('get_dict')
    >>> {'A': 1, 'B': 2, 'C': 3}
    """
    # BUG FIX: the previous implementation raised the "already registered"
    # KeyError *inside* a try block whose handler caught KeyError, so the
    # duplicate check was silently swallowed and re-registration always
    # succeeded. Test membership explicitly instead.
    if func.__name__ in _DATASETS:
        raise KeyError("function already registered")
    _DATASETS[func.__name__] = func
    return func
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
156
|
+
def get_data(name, *args, **kwargs):
    """Central point to get the datasets.

    Parameters
    ----------
    name : `str`
        A name for the dataset that should correspond to a registered
        dataset function.
    *args
        Arguments to the data generating functions
    **kwargs
        Keyword arguments to the data generating functions

    Returns
    -------
    dataset : `Any`
        The requested datasets

    Raises
    ------
    KeyError
        If ``name`` does not correspond to a registered dataset.
    """
    # BUG FIX: only the registry lookup is guarded now. Previously the
    # dataset *call* was inside the try block, so a KeyError raised by the
    # dataset function itself was misreported as "dataset not available".
    try:
        func = _DATASETS[name]
    except KeyError as e:
        raise KeyError("dataset not available: {0}".format(name)) from e
    return func(*args, **kwargs)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
181
|
+
def list_datasets():
    """List all the registered datasets.

    Returns
    -------
    datasets : `list` of `tuple`
        The registered datasets. Element [0] for each tuple is the dataset name
        and element [1] is a short description captured from the docstring.
    """
    summaries = []
    for name, func in _DATASETS.items():
        # Flatten the docstring and drop everything from the first numpydoc
        # section header onwards, keeping only the short description.
        flat_doc = func.__doc__.replace('\n', ' ')
        short_desc = re.sub(r'(Parameters|Returns).*$', '', flat_doc).strip()
        summaries.append((name, short_desc))
    return summaries
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
202
|
+
def help(name):
    """Central point to get help for the datasets.

    Parameters
    ----------
    name : `str`
        A name for the dataset that should correspond to a unique key in the
        DATASETS module level dictionary.

    Returns
    -------
    help : `str`
        The docstring for the function
    """
    # Header line followed by a dashed underline sized to the title.
    pieces = ["Dataset: {0}\n{1}\n\n".format(name, "-" * (len(name) + 9))]
    try:
        doc = _DATASETS[name].__doc__
    except KeyError as e:
        raise KeyError("dataset not available: {0}".format(name)) from e
    # De-indent each docstring line by one 4-space level.
    for row in doc.split("\n"):
        pieces.append("{0}\n".format(re.sub(r"^\s{4}", "", row)))
    return "".join(pieces)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
228
|
+
@dataset
def dummy_data():
    """A dummy dataset function that returns a small list.

    Returns
    -------
    data : `list`
        A list of length 3 with ``['A', 'B', 'C']``

    Notes
    -----
    This function is called ``dummy_data`` and has been decorated with a
    ``@dataset`` decorator which makes it available with the
    `example_data.get_data(<NAME>)` function and also
    `example_data.help(<NAME>)` functions.
    """
    return list('ABC')
|
|
245
|
+
|
|
246
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
247
|
+
@dataset
def dummy_load_data():
    """A dummy dataset function that loads a string from a file.

    Returns
    -------
    str_data : `str`
        A string of data loaded from an example data file.

    Notes
    -----
    This function is called ``dummy_load_data`` and has been decorated with a
    ``@dataset`` decorator which makes it available with the
    `example_data.get_data(<NAME>)` function and also
    `example_data.help(<NAME>)` functions. The path to this dataset is built
    from ``_ROOT_DATASETS_DIR``.
    """
    load_path = os.path.join(_ROOT_DATASETS_DIR, "string_data.txt")
    with open(load_path) as data_file:
        return data_file.read().strip()
|
|
267
|
+
|
|
268
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
269
|
+
@dataset
def load_forest_data(**kwargs):
    """Load data on the test performance of a number of polygenic scores.
    Estimates represent c-statistics with confidence intervals.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'forest_data.tsv.gz')
    df = pd.read_csv(source, sep='\t', index_col=0, **kwargs)
    # y-axis positions: six panels offset by 10, each containing three
    # repeats of the base triplet (0, 2, 4). Same 54 values as before.
    positions = []
    for offset in (0.0, 10.0, 20.0, 30.0, 40.0, 50.0):
        for _ in range(3):
            positions.extend(base + offset for base in (0.0, 2.0, 4.0))
    df[ForestNames.y_col] = positions
    return df
|
|
296
|
+
|
|
297
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
298
|
+
@dataset
def load_barchart_data(**kwargs):
    """Load data counting the number of associations between cardiac
    chambers (`LV`, `RV`, `LA`) and cardiac outcomes.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'barchart.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
315
|
+
|
|
316
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
317
|
+
@dataset
def load_groupbar_data(**kwargs):
    """Load data representing mean and SD percentage of sarcomere
    disruption per knockdown gene and control in iPS-CM.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'group_bar.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=None, **kwargs)
|
|
334
|
+
|
|
335
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
336
|
+
@dataset
def load_barpoints_data(**kwargs):
    """Load individual data points representing percentage of sarcomere
    disruption per knockdown gene and control in iPS-CM.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'bar_points.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=None, **kwargs)
|
|
353
|
+
|
|
354
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
355
|
+
@dataset
def load_heatmap_data(**kwargs):
    """Load data representing p-value times direction of exposures
    (columns) effects on outcomes (rows).

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'heatmap_data.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
372
|
+
|
|
373
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
374
|
+
@dataset
def load_lollipop_data(**kwargs):
    """Load a feature importance table. Can be used to test the
    `machine_learning` module.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'lollipop_data.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
391
|
+
|
|
392
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
393
|
+
@dataset
def load_net_benefit_data(**kwargs):
    """Load a table containing the predicted probabilities for two models,
    as well as the outcome data. Can be used to test the `machine_learning`
    module.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'net_benefit.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=False, **kwargs)
|
|
410
|
+
|
|
411
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
412
|
+
@dataset
def load_volcano_data(**kwargs):
    """Load a table with effect estimates and p-values. Can be used to
    test the `volcano` module.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'volcano.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
429
|
+
|
|
430
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
431
|
+
@dataset
def heatmap_data(**kwargs):
    """Create a dummy results pd.DF object to test the `make_heatmap`
    program.
    """
    records = {
        UtilsNames.mat_index: [
            'ldlc_glgc', 'hdlc_glgc', 'ldlc_glgc', 'hdlc_glgc', 'ldlc_glgc',
            'hdlc_glgc', 'ldlc_glgc', 'hdlc_glgc'
        ],
        UtilsNames.mat_exposure: [
            'SCF', 'SCF', 'TRAIL', 'TRAIL', 'IP10', 'IP10', 'IL2ra', 'IL2ra'
        ],
        UtilsNames.mat_outcome: [
            'LDL-C', 'HDL-C', 'LDL-C', 'HDL-C', 'LDL-C', 'HDL-C', 'LDL-C',
            'HDL-C'
        ],
        UtilsNames.mat_point: [
            np.nan, 0.0278005, np.nan, -0.15723944, 0.0321544, -0.02524,
            -0.2353, 0.023522
        ],
        UtilsNames.mat_pvalue: [
            np.nan, 0.000534346, np.nan, 0.20464, 0.0001, 0.95426, 0.0052353,
            0.25353
        ],
    }
    data = pd.DataFrame(records, **kwargs)
    # Re-use the model column as the index, labelled 'index'.
    data.index = data[UtilsNames.mat_index]
    data.index.name = 'index'
    return data
|
|
459
|
+
|
|
460
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
461
|
+
@dataset
def heatmap_point_matrix(**kwargs):
    """Create a dummy results pd.DF object to test the `make_heatmap`
    program. Includes point estimates.
    """
    # One column of point estimates per exposure, in registry order.
    columns = dict(zip(
        UtilsNames.mat_exposure_list[:4],
        [
            [0.023522, -0.233500],
            [-0.025240, 0.032154],
            [np.nan, np.nan],
            [-0.0157239, 0.027800],
        ],
    ))
    data = pd.DataFrame(columns, **kwargs)
    data.index = UtilsNames.mat_outcome_list
    data.index.name = UtilsNames.mat_outcome
    return data
|
|
484
|
+
|
|
485
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
486
|
+
@dataset
def heatmap_pvalue_matrix(**kwargs):
    """Create a dummy results pd.DF object to test the `make_heatmap`
    program. Includes p-values.
    """
    # One column of p-values per exposure, in registry order.
    columns = dict(zip(
        UtilsNames.mat_exposure_list[:4],
        [
            [0.253530, 0.005235],
            [0.95426, 0.00010],
            [np.nan, np.nan],
            [0.204640, 0.000534],
        ],
    ))
    data = pd.DataFrame(columns, **kwargs)
    data.index = UtilsNames.mat_outcome_list
    data.index.name = UtilsNames.mat_outcome
    return data
|
|
509
|
+
|
|
510
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
511
|
+
@dataset
def load_calibration_data(**kwargs):
    """Load a table with binary outcomes and predicted risk. Can be used
    to test the `machine_learning.calibration` function.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'calibration_data.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
528
|
+
|
|
529
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
530
|
+
@dataset
def load_calibration_bins(**kwargs):
    """Load a table with observed and predicted risk in 6 equally sized
    bins, with lower and upper 95% confidence intervals for the observed
    risk. Can be used to test the `machine_learning.calibration` function.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'calibration_bins.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
548
|
+
|
|
549
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
550
|
+
@dataset
def load_incidence_matrix_data(**kwargs):
    """Load a table linking genes to traits, represented by a `1` with a
    `0` for genes and traits without a potential association.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'incidence_matrix_data.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|
|
567
|
+
|
|
568
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
569
|
+
@dataset
def load_percentage_data(**kwargs):
    """Example data with counts, percentages, and group labels.

    Returns
    -------
    pd.DataFrame
        Columns: ``labels`` (gene name with its percentage appended),
        ``counts`` and ``percentages``.
    """
    counts = [10, 8, 5, 15, 13, 10, 5, 10, 8, 10, 6]
    labels = ["PKP2", "MYL2", "JUP", "DSC2", "DSG2", "TTN",
              "DES", "DSP", "PLN", "RBM20", "BAG3"]
    # Hoist the total and compute the percentage list once; the original
    # recomputed sum(counts) per element and built the same list twice.
    total = sum(counts)
    percentages = [c / total * 100 for c in counts]
    data = pd.DataFrame({
        "labels": [f"{l} ({p}%)" for l, p in zip(labels, percentages)],
        "counts": counts,
        "percentages": percentages,
    })
    # returns
    return data
|
|
583
|
+
|
|
584
|
+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
585
|
+
@dataset
def load_mace_associations(**kwargs):
    """Load a table with hazard ratios for the associations of one
    standard deviation change in LDL-C or Apo-B with the time to major
    adverse cardiovascular event (MACE).

    The following columns are included:
    1  index (model) : a string combining the exposure and the type of Cox
        regression model employed.
        Model 2 is simply adjusted for cardiovascular risk factors, where
        the remaining models are simply subgroup specific associations,
        with the relevant subgroups indicated by the `Model` column.
    2  covariate : the exposure.
        Either LDL-C or Apo-B, ignore the unit in brackets, all variables
        were standardised to a mean of zero and standard deviation of 1
        prior analysis.
    3  coef : the log hazard ratio.
    4  exp(coef) : the hazard ratio.
    5  se(coef) : the standard error of coef.
    6  coef lower 95% : the lower bound of the confidence interval.
    7  coef upper 95% : the upper bound of the confidence interval.
    8  p : the p-value of coef.
    9  PH p-value : the `proportional hazards` assumption p-value.
        Small p-values point towards possible violations of the
        proportional hazards assumption.
    10 Interaction p-value : The interaction p-value comparing the coef of
        two subgroups.
    11 events : the total number of incident MACE.
    12 total sample size : the total sample size.
    13 outcome : the outcome as a string.
    14 Model : the model as a string.
    15 Exposure : the exposure as a string.
    16 covariates : a comma delimited string of the covariates used in
        each model.
    17 col : the dot colour in hex code.
    18 Comparison : the comparison as a string.
    19 round : the necessary rounding.
    20 string_estimates : the hazard ratio and confidence interval as a
        formatted string.
    21 string_interaction_pval: the interaction p-value as a formatted
        string.

    Returns
    -------
    pd.DataFrame
    """
    source = os.path.join(_ROOT_DATASETS_DIR, 'mace_associations.tsv.gz')
    return pd.read_csv(source, sep='\t', index_col=0, **kwargs)
|