plot_misc-2.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. plot_misc/__init__.py +1 -0
  2. plot_misc/_version.py +1 -0
  3. plot_misc/barchart.py +523 -0
  4. plot_misc/constants.py +118 -0
  5. plot_misc/errors.py +328 -0
  6. plot_misc/example_data/__init__.py +1 -0
  7. plot_misc/example_data/example_datasets/bar_points.tsv.gz +0 -0
  8. plot_misc/example_data/example_datasets/barchart.tsv.gz +0 -0
  9. plot_misc/example_data/example_datasets/calibration_bins.tsv.gz +0 -0
  10. plot_misc/example_data/example_datasets/calibration_data.tsv.gz +0 -0
  11. plot_misc/example_data/example_datasets/forest_data.tsv.gz +0 -0
  12. plot_misc/example_data/example_datasets/group_bar.tsv.gz +0 -0
  13. plot_misc/example_data/example_datasets/heatmap_data.tsv.gz +0 -0
  14. plot_misc/example_data/example_datasets/incidence_matrix_data.tsv.gz +0 -0
  15. plot_misc/example_data/example_datasets/lollipop_data.tsv.gz +0 -0
  16. plot_misc/example_data/example_datasets/mace_associations.tsv.gz +0 -0
  17. plot_misc/example_data/example_datasets/net_benefit.tsv.gz +0 -0
  18. plot_misc/example_data/example_datasets/string_data.txt +1 -0
  19. plot_misc/example_data/example_datasets/volcano.tsv.gz +0 -0
  20. plot_misc/example_data/examples.py +637 -0
  21. plot_misc/forest.py +1478 -0
  22. plot_misc/heatmap.py +369 -0
  23. plot_misc/incidencematrix.py +394 -0
  24. plot_misc/machine_learning.py +1143 -0
  25. plot_misc/piechart.py +197 -0
  26. plot_misc/utils/__init__.py +1 -0
  27. plot_misc/utils/colour.py +171 -0
  28. plot_misc/utils/formatting.py +369 -0
  29. plot_misc/utils/utils.py +1151 -0
  30. plot_misc/volcano.py +203 -0
  31. plot_misc-2.0.2.dist-info/METADATA +107 -0
  32. plot_misc-2.0.2.dist-info/RECORD +35 -0
  33. plot_misc-2.0.2.dist-info/WHEEL +5 -0
  34. plot_misc-2.0.2.dist-info/licenses/LICENSE +18 -0
  35. plot_misc-2.0.2.dist-info/top_level.txt +1 -0
plot_misc/example_data/examples.py @@ -0,0 +1,637 @@
+ """Provides centralised access to example data sets that can be used in tests
+ and also in example code and/or Jupyter notebooks.
+
+ Notes
+ -----
+ Data can be "added" either through functions that generate the data on the fly
+ or via functions that load the data from a static file located in the
+ ``example_data`` directory. The data files being added should be as small as
+ possible (i.e. kilobyte/megabyte range). The dataset functions should be
+ decorated with the ``@dataset`` decorator, so the example module knows about
+ them. If the function is loading a dataset from a file in the package, it
+ should look for the path in ``_ROOT_DATASETS_DIR``.
+
+ Examples
+ --------
+
+ Registering a function as a dataset providing function:
+
+ >>> @dataset
+ >>> def dummy_data(*args, **kwargs):
+ >>>     \"\"\"A dummy dataset function that returns a small list.
+ >>>
+ >>>     Returns
+ >>>     -------
+ >>>     data : `list`
+ >>>         A list of length 3 with ``['A', 'B', 'C']``
+ >>>
+ >>>     Notes
+ >>>     -----
+ >>>     This function is called ``dummy_data`` and has been decorated with the
+ >>>     ``@dataset`` decorator, which makes it available through the
+ >>>     `example_data.get_data(<NAME>)` and `example_data.help(<NAME>)`
+ >>>     functions.
+ >>>     \"\"\"
+ >>>     return ['A', 'B', 'C']
+
+ The dataset can then be used as follows:
+
+ >>> from plot_misc.example_data import examples
+ >>> examples.get_data('dummy_data')
+ >>> ['A', 'B', 'C']
+
+ A dataset function that loads a dataset from a file; these functions should
+ load from ``_ROOT_DATASETS_DIR``:
+
+ >>> @dataset
+ >>> def dummy_load_data(*args, **kwargs):
+ >>>     \"\"\"A dummy dataset function that loads a string from a file.
+ >>>
+ >>>     Returns
+ >>>     -------
+ >>>     str_data : `str`
+ >>>         A string of data loaded from an example data file.
+ >>>
+ >>>     Notes
+ >>>     -----
+ >>>     This function is called ``dummy_load_data`` and has been decorated
+ >>>     with the ``@dataset`` decorator, which makes it available through the
+ >>>     `example_data.get_data(<NAME>)` and `example_data.help(<NAME>)`
+ >>>     functions. The path to this dataset is built from
+ >>>     ``_ROOT_DATASETS_DIR``.
+ >>>     \"\"\"
+ >>>     load_path = os.path.join(_ROOT_DATASETS_DIR, "string_data.txt")
+ >>>     with open(load_path) as data_file:
+ >>>         return data_file.read().strip()
+
+ The dataset can then be used as follows:
+
+ >>> from plot_misc.example_data import examples
+ >>> examples.get_data('dummy_load_data')
+ >>> 'an example data string'
+ """
+ import os
+ import re
+ import pandas as pd
+ import numpy as np
+ from plot_misc.constants import (
+     UtilsNames,
+     ForestNames,
+ )
+
+ # The name of the example datasets directory
+ _EXAMPLE_DATASETS = "example_datasets"
+ """The example dataset directory name (`str`)
+ """
+
+ _ROOT_DATASETS_DIR = os.path.join(os.path.dirname(__file__), _EXAMPLE_DATASETS)
+ """The root path to the dataset files that are available (`str`)
+ """
+
+ _DATASETS = dict()
+ """This will hold the registered dataset functions (`dict`)
+ """
+
+
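For orientation, ``_ROOT_DATASETS_DIR`` simply resolves the bundled ``example_datasets`` directory relative to the installed module; an illustrative check (not part of the packaged file):

>>> import os
>>> from plot_misc.example_data import examples
>>> os.path.basename(examples._ROOT_DATASETS_DIR)
'example_datasets'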
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ def dataset(func):
+     """Register a dataset generating function. This function should be used
+     as a decorator.
+
+     Parameters
+     ----------
+     func : `function`
+         The function to register as a dataset. It is registered under the
+         function name.
+
+     Returns
+     -------
+     func : `function`
+         The function that has been registered.
+
+     Raises
+     ------
+     KeyError
+         If a function of the same name has already been registered.
+
+     Notes
+     -----
+     The dataset function should accept ``*args`` and ``**kwargs`` and should
+     be decorated with the ``@dataset`` decorator.
+
+     Examples
+     --------
+     Create a dataset function that returns a dictionary.
+
+     >>> @dataset
+     >>> def get_dict(*args, **kwargs):
+     >>>     \"\"\"A dictionary to test or use as an example.
+     >>>
+     >>>     Returns
+     >>>     -------
+     >>>     test_dict : `dict`
+     >>>         A small dictionary of string keys and numeric values
+     >>>     \"\"\"
+     >>>     return {'A': 1, 'B': 2, 'C': 3}
+
+     The dataset can then be used as follows:
+
+     >>> from plot_misc.example_data import examples
+     >>> examples.get_data('get_dict')
+     >>> {'A': 1, 'B': 2, 'C': 3}
+
+     """
+     # refuse to overwrite an already registered dataset function
+     if func.__name__ in _DATASETS:
+         raise KeyError("function already registered")
+     _DATASETS[func.__name__] = func
+     return func
+
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ def get_data(name, *args, **kwargs):
+     """Central point to get the datasets.
+
+     Parameters
+     ----------
+     name : `str`
+         A name for the dataset that should correspond to a registered
+         dataset function.
+     *args
+         Arguments to the data generating functions.
+     **kwargs
+         Keyword arguments to the data generating functions.
+
+     Returns
+     -------
+     dataset : `Any`
+         The requested dataset.
+     """
+     try:
+         return _DATASETS[name](*args, **kwargs)
+     except KeyError as e:
+         raise KeyError("dataset not available: {0}".format(name)) from e
+
+
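A minimal usage sketch for ``get_data`` (illustrative, not part of the packaged file; ``missing_name`` is a made-up key used only to show the error path):

>>> from plot_misc.example_data import examples
>>> examples.get_data('dummy_data')
['A', 'B', 'C']
>>> examples.get_data('missing_name')
Traceback (most recent call last):
    ...
KeyError: 'dataset not available: missing_name'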
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ def list_datasets():
+     """List all the registered datasets.
+
+     Returns
+     -------
+     datasets : `list` of `tuple`
+         The registered datasets. Element [0] of each tuple is the dataset
+         name and element [1] is a short description captured from the
+         docstring.
+     """
+     datasets = []
+     for d in _DATASETS.keys():
+         desc = re.sub(
+             r'(Parameters|Returns).*$', '', _DATASETS[d].__doc__.replace(
+                 '\n', ' '
+             )
+         ).strip()
+         datasets.append((d, desc))
+     return datasets
+
+
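An illustrative call showing the ``(name, description)`` tuples returned by ``list_datasets``; the description is the docstring text up to the first ``Parameters``/``Returns`` heading:

>>> from plot_misc.example_data import examples
>>> [d for d in examples.list_datasets() if d[0] == 'dummy_data']
[('dummy_data', 'A dummy dataset function that returns a small list.')]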
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ def help(name):
+     """Central point to get help for the datasets.
+
+     Parameters
+     ----------
+     name : `str`
+         A name for the dataset that should correspond to a unique key in the
+         DATASETS module level dictionary.
+
+     Returns
+     -------
+     help : `str`
+         The docstring for the function
+     """
+     docs = ["Dataset: {0}\n{1}\n\n".format(name, "-" * (len(name) + 9))]
+     try:
+         docs.extend(
+             ["{0}\n".format(re.sub(r"^\s{4}", "", i))
+              for i in _DATASETS[name].__doc__.split("\n")]
+         )
+         return "".join(docs)
+     except KeyError as e:
+         raise KeyError("dataset not available: {0}".format(name)) from e
+
+
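A sketch of the formatted text that ``help`` assembles (abridged; the header and underline follow directly from the code above):

>>> from plot_misc.example_data import examples
>>> print(examples.help('dummy_data'))
Dataset: dummy_data
-------------------
A dummy dataset function that returns a small list.
...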
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def dummy_data():
+     """A dummy dataset function that returns a small list.
+
+     Returns
+     -------
+     data : `list`
+         A list of length 3 with ``['A', 'B', 'C']``
+
+     Notes
+     -----
+     This function is called ``dummy_data`` and has been decorated with a
+     ``@dataset`` decorator which makes it available with the
+     `example_data.get_data(<NAME>)` function and also
+     `example_data.help(<NAME>)` functions.
+     """
+     return ['A', 'B', 'C']
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def dummy_load_data():
+     """A dummy dataset function that loads a string from a file.
+
+     Returns
+     -------
+     str_data : `str`
+         A string of data loaded from an example data file.
+
+     Notes
+     -----
+     This function is called ``dummy_load_data`` and has been decorated with a
+     ``@dataset`` decorator which makes it available with the
+     `example_data.get_data(<NAME>)` function and also
+     `example_data.help(<NAME>)` functions. The path to this dataset is built
+     from ``_ROOT_DATASETS_DIR``.
+     """
+     load_path = os.path.join(_ROOT_DATASETS_DIR, "string_data.txt")
+     with open(load_path) as data_file:
+         return data_file.read().strip()
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_forest_data(**kwargs):
+     """
+     Loads data on the test performance of a number of polygenic scores.
+     Estimates represent c-statistics with confidence intervals.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'forest_data.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # add y-axis
+     df[ForestNames.y_col] = \
+         [
+             0.0, 2.0, 4.0, 0.0, 2.0, 4.0, 0.0, 2.0, 4.0, 10.0, 12.0,
+             14.0, 10.0, 12.0, 14.0, 10.0, 12.0, 14.0, 20.0, 22.0, 24.0,
+             20.0, 22.0, 24.0, 20.0, 22.0, 24.0, 30.0, 32.0, 34.0, 30.0,
+             32.0, 34.0, 30.0, 32.0, 34.0, 40.0, 42.0, 44.0, 40.0, 42.0,
+             44.0, 40.0, 42.0, 44.0, 50.0, 52.0, 54.0, 50.0, 52.0, 54.0,
+             50.0, 52.0, 54.0
+         ]
+     # return
+     return df
+
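The loader above attaches a ``ForestNames.y_col`` column that appears to provide fixed y-axis positions for the forest plot; an illustrative check of the first few values, which follow directly from the list assigned in the code:

>>> from plot_misc.example_data import examples
>>> from plot_misc.constants import ForestNames
>>> df = examples.get_data('load_forest_data')
>>> df[ForestNames.y_col].tolist()[:6]
[0.0, 2.0, 4.0, 0.0, 2.0, 4.0]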
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_barchart_data(**kwargs):
+     """
+     Loads data counting the number of associations between cardiac chambers
+     (`LV`, `RV`, `LA`) and cardiac outcomes.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'barchart.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_groupbar_data(**kwargs):
+     """
+     Loads data representing mean and SD percentage of sarcomere disruption
+     per knockdown gene and control in iPS-CM
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'group_bar.tsv.gz'),
+         sep='\t', index_col=None, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_barpoints_data(**kwargs):
+     """
+     Loads individual data points representing percentage of sarcomere
+     disruption per knockdown gene and control in iPS-CM
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'bar_points.tsv.gz'),
+         sep='\t', index_col=None, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_heatmap_data(**kwargs):
+     """
+     Loads data representing the p-value multiplied by the direction of effect
+     of exposures (columns) on outcomes (rows).
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'heatmap_data.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_lollipop_data(**kwargs):
+     """
+     Loads a feature importance table. Can be used to test the
+     `machine_learning` module.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'lollipop_data.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_net_benefit_data(**kwargs):
+     """
+     Loads a table containing the predicted probabilities for two models, as
+     well as the outcome data. Can be used to test the `machine_learning`
+     module.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'net_benefit.tsv.gz'),
+         sep='\t', index_col=False, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_volcano_data(**kwargs):
+     """
+     Loads a table with effect estimates and p-values. Can be used to test the
+     `volcano` module.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'volcano.tsv.gz'),
+         sep='\t', index_col=0, **kwargs
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def heatmap_data(**kwargs):
+     """
+     Creates a dummy results pd.DF object to test the `make_heatmap` program.
+     """
+     data = pd.DataFrame({
+         UtilsNames.mat_index: [
+             'ldlc_glgc', 'hdlc_glgc', 'ldlc_glgc', 'hdlc_glgc', 'ldlc_glgc',
+             'hdlc_glgc', 'ldlc_glgc', 'hdlc_glgc'
+         ],
+         UtilsNames.mat_exposure: [
+             'SCF', 'SCF', 'TRAIL', 'TRAIL', 'IP10', 'IP10', 'IL2ra', 'IL2ra'
+         ],
+         UtilsNames.mat_outcome: [
+             'LDL-C', 'HDL-C', 'LDL-C', 'HDL-C', 'LDL-C', 'HDL-C', 'LDL-C',
+             'HDL-C'
+         ],
+         UtilsNames.mat_point: [
+             np.nan, 0.0278005, np.nan, -0.15723944, 0.0321544, -0.02524,
+             -0.2353, 0.023522
+         ],
+         UtilsNames.mat_pvalue: [
+             np.nan, 0.000534346, np.nan, 0.20464, 0.0001, 0.95426, 0.0052353,
+             0.25353
+         ]
+     }, **kwargs)
+     data.index = data[UtilsNames.mat_index]
+     data.index.name = 'index'
+     return data
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def heatmap_point_matrix(**kwargs):
+     """
+     Creates a dummy results pd.DF object to test the `make_heatmap` program.
+     Includes point estimates.
+     """
+     data = pd.DataFrame({
+         UtilsNames.mat_exposure_list[0]: [
+             0.023522, -0.233500,
+         ],
+         UtilsNames.mat_exposure_list[1]: [
+             -0.025240, 0.032154,
+         ],
+         UtilsNames.mat_exposure_list[2]: [
+             np.nan, np.nan,
+         ],
+         UtilsNames.mat_exposure_list[3]: [
+             -0.0157239, 0.027800
+         ],
+     }, **kwargs)
+     data.index = UtilsNames.mat_outcome_list
+     data.index.name = UtilsNames.mat_outcome
+     return data
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def heatmap_pvalue_matrix(**kwargs):
+     """
+     Creates a dummy results pd.DF object to test the `make_heatmap` program.
+     Includes p-values.
+     """
+     data = pd.DataFrame({
+         UtilsNames.mat_exposure_list[0]: [
+             0.253530, 0.005235,
+         ],
+         UtilsNames.mat_exposure_list[1]: [
+             0.95426, 0.00010,
+         ],
+         UtilsNames.mat_exposure_list[2]: [
+             np.nan, np.nan,
+         ],
+         UtilsNames.mat_exposure_list[3]: [
+             0.204640, 0.000534,
+         ],
+     }, **kwargs)
+     data.index = UtilsNames.mat_outcome_list
+     data.index.name = UtilsNames.mat_outcome
+     return data
+
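``heatmap_point_matrix`` and ``heatmap_pvalue_matrix`` are companion frames: the same four exposure columns (taken from ``UtilsNames.mat_exposure_list``) and two outcome rows, holding point estimates and p-values respectively. An illustrative sketch of retrieving them together:

>>> from plot_misc.example_data import examples
>>> points = examples.get_data('heatmap_point_matrix')
>>> pvalues = examples.get_data('heatmap_pvalue_matrix')
>>> points.shape == pvalues.shape == (2, 4)
True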
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_calibration_data(**kwargs):
+     """
+     Loads a table with binary outcomes and predicted risk. Can be used to test
+     the `machine_learning.calibration` function.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'calibration_data.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_calibration_bins(**kwargs):
+     """
+     Loads a table with observed and predicted risk in 6 equally sized bins,
+     with lower and upper 95% confidence intervals for the observed risk. Can
+     be used to test the `machine_learning.calibration` function.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'calibration_bins.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_incidence_matrix_data(**kwargs):
+     """
+     Loads a table linking genes to traits, represented by a `1`, with a `0`
+     for genes and traits without a potential association.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'incidence_matrix_data.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
+
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_percentage_data(**kwargs):
+     """Example data with counts, percentages, and group labels"""
+     counts = [10, 8, 5, 15, 13, 10, 5, 10, 8, 10, 6]
+     labels = ["PKP2", "MYL2", "JUP", "DSC2", "DSG2", "TTN",
+               "DES", "DSP", "PLN", "RBM20", "BAG3"]
+     percentage = [c / sum(counts) * 100 for c in counts]
+     data = pd.DataFrame({
+         "labels": [f"{l} ({p}%)" for l, p in zip(labels, percentage)],
+         "counts": counts,
+         "percentages": percentage,
+     })
+     # returns
+     return data
+
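Since the example counts sum to 100, the derived percentages mirror the counts; an illustrative check:

>>> from plot_misc.example_data import examples
>>> df = examples.get_data('load_percentage_data')
>>> int(df['counts'].sum())
100
>>> df['percentages'].tolist()[:3]
[10.0, 8.0, 5.0]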
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ @dataset
+ def load_mace_associations(**kwargs):
+     """
+     Loads a table with hazard ratios for the associations of a one standard
+     deviation change in LDL-C or Apo-B with the time to major adverse
+     cardiovascular events (MACE).
+
+     The following columns are included:
+     1 index (model) : a string combining the exposure and the type of Cox
+         regression model employed.
+         Model 2 is adjusted for cardiovascular risk factors, while the
+         remaining models are subgroup-specific associations, with the
+         relevant subgroups indicated by the `Model` column.
+     2 covariate : the exposure.
+         Either LDL-C or Apo-B; ignore the unit in brackets, as all variables
+         were standardised to a mean of zero and a standard deviation of one
+         prior to analysis.
+     3 coef : the log hazard ratio.
+     4 exp(coef) : the hazard ratio.
+     5 se(coef) : the standard error of coef.
+     6 coef lower 95% : the lower bound of the confidence interval.
+     7 coef upper 95% : the upper bound of the confidence interval.
+     8 p : the p-value of coef.
+     9 PH p-value : the `proportional hazards` assumption p-value.
+         Small p-values point towards possible violations of the proportional
+         hazards assumption.
+     10 Interaction p-value : the interaction p-value comparing the coef of
+         two subgroups.
+     11 events : the total number of incident MACE.
+     12 total sample size : the total sample size.
+     13 outcome : the outcome as a string.
+     14 Model : the model as a string.
+     15 Exposure : the exposure as a string.
+     16 covariates : a comma-delimited string of the covariates used in each
+         model.
+     17 col : the dot colour as a hex code.
+     18 Comparison : the comparison as a string.
+     19 round : the necessary rounding.
+     20 string_estimates : the hazard ratio and confidence interval as a
+         formatted string.
+     21 string_interaction_pval : the interaction p-value as a formatted
+         string.
+
+     Returns
+     -------
+     pd.DataFrame
+     """
+     # files
+     df = pd.read_csv(
+         os.path.join(_ROOT_DATASETS_DIR, 'mace_associations.tsv.gz'),
+         sep='\t', index_col=0, **kwargs,
+     )
+     # return
+     return df
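Given the column layout documented above, a first look at the estimates might select the hazard ratio, its confidence bounds, and the p-value (illustrative only; assumes the column labels match the docstring text exactly):

>>> from plot_misc.example_data import examples
>>> df = examples.get_data('load_mace_associations')
>>> hr = df[['exp(coef)', 'coef lower 95%', 'coef upper 95%', 'p']]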