DeConveil 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deconveil/__version__.py +1 -0
- deconveil/dds.py +1 -1
- deconveil/utils_fit.py +148 -0
- deconveil/utils_processing.py +82 -0
- {deconveil-0.1.1.dist-info → deconveil-0.1.3.dist-info}/METADATA +1 -1
- {deconveil-0.1.1.dist-info → deconveil-0.1.3.dist-info}/RECORD +9 -8
- {deconveil-0.1.1.dist-info → deconveil-0.1.3.dist-info}/WHEEL +0 -0
- {deconveil-0.1.1.dist-info → deconveil-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {deconveil-0.1.1.dist-info → deconveil-0.1.3.dist-info}/top_level.txt +0 -0
deconveil/__version__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.3"
|
deconveil/dds.py
CHANGED
|
@@ -17,10 +17,10 @@ from deconveil.utils_fit import fit_rough_dispersions
|
|
|
17
17
|
from deconveil.utils_fit import fit_moments_dispersions2
|
|
18
18
|
from deconveil.utils_fit import grid_fit_beta
|
|
19
19
|
from deconveil.utils_fit import irls_glm
|
|
20
|
+
from deconveil.utils_fit import build_design_matrix
|
|
20
21
|
|
|
21
22
|
from pydeseq2.preprocessing import deseq2_norm_fit
|
|
22
23
|
from pydeseq2.preprocessing import deseq2_norm_transform
|
|
23
|
-
from pydeseq2.utils import build_design_matrix
|
|
24
24
|
from pydeseq2.utils import dispersion_trend
|
|
25
25
|
from pydeseq2.utils import mean_absolute_deviation
|
|
26
26
|
from pydeseq2.utils import n_or_more_replicates
|
deconveil/utils_fit.py
CHANGED
|
@@ -523,3 +523,151 @@ def nbinomFn(
|
|
|
523
523
|
).sum(0)
|
|
524
524
|
|
|
525
525
|
return prior - nll
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def build_design_matrix(
    metadata: pd.DataFrame,
    design_factors: Union[str, List[str]] = "condition",
    ref_level: Optional[List[str]] = None,
    continuous_factors: Optional[List[str]] = None,
    expanded: bool = False,
    intercept: bool = True,
) -> pd.DataFrame:
    """Build the design matrix for differential expression analysis.

    Categorical factors are one-hot encoded with ``pandas.get_dummies``;
    continuous factors are appended as numeric columns. Unless specified,
    the reference level of a categorical factor is the one dropped by
    ``get_dummies`` (the first level in sorted order).

    Note that ``metadata`` is modified in place: underscores in the string
    levels of the design factors are replaced by hyphens.

    Parameters
    ----------
    metadata : pandas.DataFrame
        DataFrame containing metadata information.
        Must be indexed by sample barcodes.

    design_factors : str or list
        Name of the columns of metadata to be used as design_matrix variables.
        (default: ``"condition"``).

    ref_level : list or None
        An optional list of two strings of the form ``["factor", "ref_level"]``
        specifying the factor of interest and the desired reference level, e.g.
        ``["condition", "A"]``. (default: ``None``).

    continuous_factors : list or None
        An optional list of continuous (as opposed to categorical) factors. Any factor
        not in ``continuous_factors`` will be considered categorical (default: ``None``).

    expanded : bool
        If true, use one column per category. Else, use n-1 columns, for each n-level
        categorical factor.
        (default: ``False``).

    intercept : bool
        If true, add an intercept (a column containing only ones). (default: ``True``).

    Returns
    -------
    pandas.DataFrame
        A DataFrame with experiment design information (to split cohorts).
        Indexed by sample barcodes.

    Raises
    ------
    ValueError
        If a design factor takes fewer than two distinct values.
    KeyError
        If ``ref_level`` does not have exactly two entries, or if the requested
        reference level is not present in the corresponding metadata column.
    """
    if isinstance(design_factors, str):
        # A single factor name is promoted to a singleton list
        design_factors = [design_factors]

    for factor in design_factors:
        # Check that each factor has at least 2 levels
        levels = np.unique(metadata[factor])
        if len(levels) < 2:
            raise ValueError(
                f"Factors should take at least two values, but {factor} "
                f"takes the single value '{levels}'."
            )

    # Check that level factors in the design don't contain underscores. If so,
    # convert them to hyphens. Only string values are inspected so that numeric
    # (e.g. continuous) factors do not raise a TypeError on the ``in`` test.
    warning_issued = False
    for factor in design_factors:
        if any(isinstance(value, str) and "_" in value for value in metadata[factor]):
            if not warning_issued:
                warnings.warn(
                    "Some factor levels in the design contain underscores ('_').\n"
                    "They will be converted to hyphens ('-').",
                    UserWarning,
                    stacklevel=2,
                )
                warning_issued = True
            metadata[factor] = metadata[factor].apply(
                lambda x: x.replace("_", "-") if isinstance(x, str) else x
            )

    if continuous_factors is not None:
        categorical_factors = [
            factor for factor in design_factors if factor not in continuous_factors
        ]
    else:
        categorical_factors = design_factors

    # Check that there is at least one categorical factor
    if len(categorical_factors) > 0:
        design_matrix = pd.get_dummies(
            metadata[categorical_factors], drop_first=not expanded
        )

        if ref_level is not None:
            if len(ref_level) != 2:
                raise KeyError("The reference level should contain 2 strings.")
            if ref_level[1] not in metadata[ref_level[0]].values:
                raise KeyError(
                    f"The metadata data should contain a '{ref_level[0]}' column"
                    f" with a '{ref_level[1]}' level."
                )

            # Check that the reference level is not in the matrix (if unexpanded design)
            ref_level_name = "_".join(ref_level)
            if (not expanded) and ref_level_name in design_matrix.columns:
                # Swap the dropped level back in and drop the requested reference.
                # Columns are matched on their exact dummy names, so that another
                # factor whose name merely starts with ``ref_level[0]`` is not
                # included in the sum.
                candidates = _dummy_column_names(metadata, ref_level[0])
                factor_cols = [
                    col for col in design_matrix.columns if col in candidates
                ]
                missing_level = next(
                    level
                    for level in np.unique(metadata[ref_level[0]])
                    if f"{ref_level[0]}_{level}" not in design_matrix.columns
                )
                design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
                    factor_cols
                ].sum(1)
                design_matrix.drop(ref_level_name, axis="columns", inplace=True)

        if not expanded:
            # Add reference level as column name suffix. The renaming map is built
            # from the un-suffixed column names and applied once, so one factor's
            # suffix can never be appended to another factor's columns.
            renamed = {}
            for factor in categorical_factors:
                if ref_level is None or factor != ref_level[0]:
                    # The reference is the unique level that is no longer there
                    ref = next(
                        level
                        for level in np.unique(metadata[factor])
                        if f"{factor}_{level}" not in design_matrix.columns
                    )
                else:
                    # The reference level is given as an argument
                    ref = ref_level[1]
                for col in _dummy_column_names(metadata, factor):
                    if col in design_matrix.columns:
                        renamed[col] = f"{col}_vs_{ref}"
            design_matrix.columns = [
                renamed.get(col, col) for col in design_matrix.columns
            ]
    else:
        # There is no categorical factor in the design
        design_matrix = pd.DataFrame(index=metadata.index)

    if intercept:
        design_matrix.insert(0, "intercept", 1)

    # Convert categorical factors one-hot encodings to int
    design_matrix = design_matrix.astype("int")

    # Add continuous factors
    if continuous_factors is not None:
        for factor in continuous_factors:
            # This factor should be numeric
            design_matrix[factor] = pd.to_numeric(metadata[factor])
    return design_matrix


def _dummy_column_names(metadata: pd.DataFrame, factor: str) -> List[str]:
    """Return the ``"{factor}_{level}"`` column names for every level of ``factor``."""
    return [f"{factor}_{level}" for level in np.unique(metadata[factor])]
|
|
673
|
+
|
deconveil/utils_processing.py
CHANGED
|
@@ -5,11 +5,93 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
+
import deconveil
|
|
8
9
|
|
|
9
10
|
from typing import List, Literal, Optional, Dict, Any, cast
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
def load_test_data(
    modality: Literal["rna", "cnv", "metadata", "cnv_tumor"] = "rna",
    dataset: Literal["tcga_brca"] = "tcga_brca",
    debug: bool = False,
    debug_seed: int = 42,
) -> pd.DataFrame:
    """Load TCGA-BRCA example data from the DeConveil package.

    Parameters
    ----------
    modality : {"rna", "cnv", "metadata", "cnv_tumor"}
        Type of data to load.

    dataset : {"tcga_brca"}
        Dataset name. Only "tcga_brca" is currently supported.

    debug : bool, optional
        If True, randomly subsample at most 10 rows and, for the "rna" and
        "cnv" modalities only, at most 100 columns. Default is False.

    debug_seed : int, optional
        Random seed for reproducibility of debug subsampling. Default is 42.

    Returns
    -------
    pandas.DataFrame
        The requested data modality as a DataFrame, indexed by the first
        column of the CSV file.

    Raises
    ------
    AssertionError
        If ``modality`` or ``dataset`` is not one of the supported values.
    FileNotFoundError
        If the CSV file for the requested modality does not exist.
    """
    # Validate with explicit raises rather than ``assert`` statements, which are
    # stripped under ``python -O``. AssertionError and the messages are kept so
    # that existing callers observe the same exception as before.
    if modality not in ["rna", "cnv", "metadata", "cnv_tumor"]:
        raise AssertionError(
            "modality must be one of: 'rna', 'cnv', 'metadata', 'cnv_tumor'"
        )
    if dataset not in ["tcga_brca"]:
        raise AssertionError("dataset must be one of: 'tcga_brca'")

    # The data directory is resolved two levels above this file, i.e. next to
    # the package directory rather than inside it.
    # NOTE(review): confirm the datasets are actually shipped at this location
    # in installed distributions.
    datasets_path = Path(__file__).resolve().parent.parent / "datasets" / dataset

    # Each modality is stored in a CSV file named after it
    data_path = datasets_path / f"{modality}.csv"
    if not data_path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    # Load the CSV
    df = pd.read_csv(data_path, index_col=0)

    # Apply debug mode subsampling
    if debug:
        df = df.sample(n=min(10, df.shape[0]), random_state=debug_seed)
        if modality in ["rna", "cnv"]:
            df = df.sample(n=min(100, df.shape[1]), axis=1, random_state=debug_seed)

    return df
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def replace_underscores(factors: List[str]):
    """Return a copy of ``factors`` with every underscore turned into a hyphen.

    Intended for design factor names, since parts of the code rely on
    ``str.split("_")`` and would otherwise mis-parse them.

    Parameters
    ----------
    factors : list
        Strings that may contain underscores.

    Returns
    -------
    list
        A new list, in the same order, with ``"_"`` replaced by ``"-"`` in
        each string.
    """
    hyphenated = []
    for name in factors:
        hyphenated.append(name.replace("_", "-"))
    return hyphenated
|
|
93
|
+
|
|
94
|
+
|
|
13
95
|
def filter_low_count_genes(
|
|
14
96
|
df: pd.DataFrame,
|
|
15
97
|
other_dfs: Optional[List[pd.DataFrame]] = None,
|
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
deconveil/__init__.py,sha256=_6FL_AYiycv9nP3mKJiQ4zl4aU83YSWnV2YoIZr9Mv0,188
|
|
2
|
-
deconveil/
|
|
2
|
+
deconveil/__version__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
|
|
3
|
+
deconveil/dds.py,sha256=0MNwtDzCjqjoJR-rrCmVu3JOaDd3gXuToOzTBXJMxak,49039
|
|
3
4
|
deconveil/default_inference.py,sha256=J40O0-qZChLnLrLGmhwxjaTVsV7REWAUQOTf8qSwWk0,9466
|
|
4
5
|
deconveil/ds.py,sha256=Vb9p152U1KXltrXFpMoBxY6YRW25dP4CO26_osbz6Aw,29476
|
|
5
6
|
deconveil/grid_search.py,sha256=iOHR8ur10MyrrfEZHr409lGulGxODufsjG6j7lQ7tWs,5181
|
|
6
7
|
deconveil/inference.py,sha256=B3zf3q_mbCTX3gHJwuXnTuy9uyXOxEjuWyaSR6VtVEo,10429
|
|
7
8
|
deconveil/utils_clustering.py,sha256=twspPvXQ6pvw_NaY1ebyvswuH3ZvVBGn7DeOpZ1XatI,5939
|
|
8
|
-
deconveil/utils_fit.py,sha256=
|
|
9
|
+
deconveil/utils_fit.py,sha256=SdGcBQjN3cyzbSFessufYOOOJAQCOjNcy3etbwmodsM,21583
|
|
9
10
|
deconveil/utils_plot.py,sha256=1JQthYXaEUKUWa0fy8owkyJ1CTkQxlrSRAqPkXMk7Us,9857
|
|
10
|
-
deconveil/utils_processing.py,sha256=
|
|
11
|
-
deconveil-0.1.
|
|
12
|
-
deconveil-0.1.
|
|
13
|
-
deconveil-0.1.
|
|
14
|
-
deconveil-0.1.
|
|
15
|
-
deconveil-0.1.
|
|
11
|
+
deconveil/utils_processing.py,sha256=9j35FAfQ7oNjdH1FWHP90DBTyL5RwlgdVbbW9de10VI,6560
|
|
12
|
+
deconveil-0.1.3.dist-info/licenses/LICENSE,sha256=BJ0f3JRteiF7tjiARi8syxiu4yKmckc0nWlHCKXttKQ,1078
|
|
13
|
+
deconveil-0.1.3.dist-info/METADATA,sha256=yL6AwQ5ziGhrI5lE4FDCHOadT81W6yEIDWGsHni6Q5w,1097
|
|
14
|
+
deconveil-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
deconveil-0.1.3.dist-info/top_level.txt,sha256=yAWZbw0eg8XpbMsswoq-VzBGfQHrfWOqNHnu2qQ2xO4,10
|
|
16
|
+
deconveil-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|