DeConveil 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.3"
deconveil/dds.py CHANGED
@@ -17,10 +17,10 @@ from deconveil.utils_fit import fit_rough_dispersions
17
17
  from deconveil.utils_fit import fit_moments_dispersions2
18
18
  from deconveil.utils_fit import grid_fit_beta
19
19
  from deconveil.utils_fit import irls_glm
20
+ from deconveil.utils_fit import build_design_matrix
20
21
 
21
22
  from pydeseq2.preprocessing import deseq2_norm_fit
22
23
  from pydeseq2.preprocessing import deseq2_norm_transform
23
- from pydeseq2.utils import build_design_matrix
24
24
  from pydeseq2.utils import dispersion_trend
25
25
  from pydeseq2.utils import mean_absolute_deviation
26
26
  from pydeseq2.utils import n_or_more_replicates
deconveil/utils_fit.py CHANGED
@@ -523,3 +523,151 @@ def nbinomFn(
523
523
  ).sum(0)
524
524
 
525
525
  return prior - nll
526
+
527
+
528
def build_design_matrix(
    metadata: pd.DataFrame,
    design_factors: Union[str, List[str]] = "condition",
    ref_level: Optional[List[str]] = None,
    continuous_factors: Optional[List[str]] = None,
    expanded: bool = False,
    intercept: bool = True,
) -> pd.DataFrame:
    """Build the design matrix for DEA.

    Unless specified, the reference level of each categorical factor is the
    one dropped by ``pandas.get_dummies(..., drop_first=True)``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        DataFrame containing metadata information.
        Must be indexed by sample barcodes.
        NOTE: string levels of the design factors that contain underscores are
        rewritten with hyphens **in place** in this DataFrame.

    design_factors : str or list
        Name of the columns of metadata to be used as design_matrix variables.
        (default: ``"condition"``).

    ref_level : list or None
        An optional list of two strings of the form ``["factor", "ref_level"]``
        specifying the factor of interest and the desired reference level, e.g.
        ``["condition", "A"]``. (default: ``None``).

    continuous_factors : list or None
        An optional list of continuous (as opposed to categorical) factors. Any factor
        not in ``continuous_factors`` will be considered categorical (default: ``None``).

    expanded : bool
        If true, use one column per category. Else, use n-1 columns, for each n-level
        categorical factor.
        (default: ``False``).

    intercept : bool
        If true, add an intercept (a column containing only ones). (default: ``True``).

    Returns
    -------
    pandas.DataFrame
        The design matrix, indexed like ``metadata``. In the unexpanded case,
        categorical columns are named ``"{factor}_{level}_vs_{reference}"``.

    Raises
    ------
    ValueError
        If a design factor takes fewer than two distinct values.
    KeyError
        If ``ref_level`` does not have exactly two entries, or if the requested
        reference level is absent from the corresponding metadata column.
    """
    if isinstance(design_factors, str):
        # A single factor was given: normalise to a singleton list.
        design_factors = [design_factors]

    for factor in design_factors:
        # Check that each factor has at least 2 levels
        levels = np.unique(metadata[factor])
        if len(levels) < 2:
            raise ValueError(
                f"Factors should take at least two values, but {factor} "
                f"takes the single value '{levels}'."
            )

    # Column names are built as "<factor>_<level>", so underscores inside string
    # levels are converted to hyphens. Non-string values (e.g. numeric continuous
    # factors) are skipped: testing `"_" in value` on them would raise TypeError.
    warning_issued = False
    for factor in design_factors:
        has_underscore = any(
            isinstance(value, str) and "_" in value for value in metadata[factor]
        )
        if has_underscore:
            if not warning_issued:
                warnings.warn(
                    """Some factor levels in the design contain underscores ('_').
                    They will be converted to hyphens ('-').""",
                    UserWarning,
                    stacklevel=2,
                )
                warning_issued = True
            # Deliberately written back into `metadata` (caller-visible).
            metadata[factor] = metadata[factor].apply(
                lambda x: x.replace("_", "-") if isinstance(x, str) else x
            )

    if continuous_factors is not None:
        categorical_factors = [
            factor for factor in design_factors if factor not in continuous_factors
        ]
    else:
        categorical_factors = design_factors

    # Check that there is at least one categorical factor
    if len(categorical_factors) > 0:
        design_matrix = pd.get_dummies(
            metadata[categorical_factors], drop_first=not expanded
        )

        if ref_level is not None:
            if len(ref_level) != 2:
                raise KeyError("The reference level should contain 2 strings.")
            if ref_level[1] not in metadata[ref_level[0]].values:
                raise KeyError(
                    f"The metadata data should contain a '{ref_level[0]}' column"
                    f" with a '{ref_level[1]}' level."
                )

            # Check that the reference level is not in the matrix (if unexpanded design)
            ref_level_name = "_".join(ref_level)
            if (not expanded) and ref_level_name in design_matrix.columns:
                # Swap levels: re-create the column of the level that was dropped
                # by `drop_first`, then drop the requested reference level.
                # Match on "<factor>_" so that factors sharing a name prefix
                # (e.g. "cond" and "condition") are not mixed up.
                ref_prefix = f"{ref_level[0]}_"
                factor_cols = [
                    col for col in design_matrix.columns if col.startswith(ref_prefix)
                ]
                missing_level = next(
                    level
                    for level in np.unique(metadata[ref_level[0]])
                    if f"{ref_level[0]}_{level}" not in design_matrix.columns
                )
                # A sample is in the missing level iff it is in none of the others.
                design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
                    factor_cols
                ].sum(axis=1)
                design_matrix.drop(ref_level_name, axis="columns", inplace=True)

        if not expanded:
            # Add reference level as column name suffix. Only categorical factors
            # have dummy columns at this point; continuous ones are added later.
            for factor in categorical_factors:
                if ref_level is None or factor != ref_level[0]:
                    # The reference is the unique level that is no longer there
                    ref = next(
                        level
                        for level in np.unique(metadata[factor])
                        if f"{factor}_{level}" not in design_matrix.columns
                    )
                else:
                    # The reference level is given as an argument
                    ref = ref_level[1]
                factor_prefix = f"{factor}_"
                design_matrix.columns = [
                    f"{col}_vs_{ref}" if col.startswith(factor_prefix) else col
                    for col in design_matrix.columns
                ]
    else:
        # There is no categorical factor in the design
        design_matrix = pd.DataFrame(index=metadata.index)

    if intercept:
        design_matrix.insert(0, "intercept", 1)

    # Convert categorical factors one-hot encodings to int
    design_matrix = design_matrix.astype("int")

    # Add continuous factors (after the int cast, so they keep their values)
    if continuous_factors is not None:
        for factor in continuous_factors:
            # This factor should be numeric
            design_matrix[factor] = pd.to_numeric(metadata[factor])
    return design_matrix
673
+
@@ -5,11 +5,93 @@ from pathlib import Path
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
+ import deconveil
8
9
 
9
10
  from typing import List, Literal, Optional, Dict, Any, cast
10
11
 
11
12
 
12
13
 
14
def load_test_data(
    modality: Literal["rna", "cnv", "metadata", "cnv_tumor"] = "rna",
    dataset: Literal["tcga_brca"] = "tcga_brca",
    debug: bool = False,
    debug_seed: int = 42,
) -> pd.DataFrame:
    """Load TCGA-BRCA example data from the DeConveil package.

    The CSV file is looked up at ``<two directories above this module>/
    datasets/<dataset>/<modality>.csv`` and read with its first column as index.

    Parameters
    ----------
    modality : {"rna", "cnv", "metadata", "cnv_tumor"}
        Type of data to load.

    dataset : {"tcga_brca"}
        Dataset name. Only "tcga_brca" is currently supported.

    debug : bool, optional
        If True, randomly subsample at most 10 rows and, for the ``"rna"`` and
        ``"cnv"`` modalities only, at most 100 columns. Default is False.

    debug_seed : int, optional
        Random seed for reproducibility of debug subsampling. Default is 42.

    Returns
    -------
    pandas.DataFrame
        The requested data modality as a DataFrame.

    Raises
    ------
    ValueError
        If ``modality`` or ``dataset`` is not one of the supported values.
    FileNotFoundError
        If the expected CSV file does not exist on disk.
    """
    # Explicit checks rather than `assert`, which is stripped under `python -O`.
    if modality not in ("rna", "cnv", "metadata", "cnv_tumor"):
        raise ValueError(
            "modality must be one of: 'rna', 'cnv', 'metadata', 'cnv_tumor'"
        )
    if dataset not in ("tcga_brca",):
        raise ValueError("dataset must be one of: 'tcga_brca'")

    # Locate data relative to this module's location.
    # NOTE(review): the data files must actually be shipped at this path for
    # installed copies to work — confirm against the packaging configuration.
    datasets_path = Path(__file__).resolve().parent.parent / "datasets" / dataset

    # Each modality is stored in a CSV file named after it.
    data_path = datasets_path / f"{modality}.csv"
    if not data_path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    # Load the CSV
    df = pd.read_csv(data_path, index_col=0)

    # Apply debug mode subsampling
    if debug:
        df = df.sample(n=min(10, df.shape[0]), random_state=debug_seed)
        if modality in ["rna", "cnv"]:
            df = df.sample(n=min(100, df.shape[1]), axis=1, random_state=debug_seed)

    return df
74
+
75
+
76
def replace_underscores(factors: List[str]):
    """Return a copy of ``factors`` with every underscore turned into a hyphen.

    Intended for design factor names, since parts of the code rely on
    ``str.split("_")`` and would misparse names containing underscores.

    Parameters
    ----------
    factors : list
        Strings that may contain underscores.

    Returns
    -------
    list
        New list, same order and length, with ``"_"`` replaced by ``"-"``.
    """
    hyphenated = []
    for name in factors:
        hyphenated.append(name.replace("_", "-"))
    return hyphenated
93
+
94
+
13
95
  def filter_low_count_genes(
14
96
  df: pd.DataFrame,
15
97
  other_dfs: Optional[List[pd.DataFrame]] = None,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -1,15 +1,16 @@
1
1
  deconveil/__init__.py,sha256=_6FL_AYiycv9nP3mKJiQ4zl4aU83YSWnV2YoIZr9Mv0,188
2
- deconveil/dds.py,sha256=ccYi6o6c6yw_5AXWdpbs0MIALp1xLsUvtp-QhEzH3cQ,49034
2
+ deconveil/__version__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
3
+ deconveil/dds.py,sha256=0MNwtDzCjqjoJR-rrCmVu3JOaDd3gXuToOzTBXJMxak,49039
3
4
  deconveil/default_inference.py,sha256=J40O0-qZChLnLrLGmhwxjaTVsV7REWAUQOTf8qSwWk0,9466
4
5
  deconveil/ds.py,sha256=Vb9p152U1KXltrXFpMoBxY6YRW25dP4CO26_osbz6Aw,29476
5
6
  deconveil/grid_search.py,sha256=iOHR8ur10MyrrfEZHr409lGulGxODufsjG6j7lQ7tWs,5181
6
7
  deconveil/inference.py,sha256=B3zf3q_mbCTX3gHJwuXnTuy9uyXOxEjuWyaSR6VtVEo,10429
7
8
  deconveil/utils_clustering.py,sha256=twspPvXQ6pvw_NaY1ebyvswuH3ZvVBGn7DeOpZ1XatI,5939
8
- deconveil/utils_fit.py,sha256=ODtIwFKKKchQBiwdNhPSOCt5wsPcpLHCKRENf6JmF18,15785
9
+ deconveil/utils_fit.py,sha256=SdGcBQjN3cyzbSFessufYOOOJAQCOjNcy3etbwmodsM,21583
9
10
  deconveil/utils_plot.py,sha256=1JQthYXaEUKUWa0fy8owkyJ1CTkQxlrSRAqPkXMk7Us,9857
10
- deconveil/utils_processing.py,sha256=CB99CwQst7eUiIgE58yl7_3E6uD9CgQoU_Qmprjyt-s,4141
11
- deconveil-0.1.1.dist-info/licenses/LICENSE,sha256=BJ0f3JRteiF7tjiARi8syxiu4yKmckc0nWlHCKXttKQ,1078
12
- deconveil-0.1.1.dist-info/METADATA,sha256=zuxmrMBmMhAz8bcoDdMZhZcs38oomC-mNhue4hsiVY4,1097
13
- deconveil-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- deconveil-0.1.1.dist-info/top_level.txt,sha256=yAWZbw0eg8XpbMsswoq-VzBGfQHrfWOqNHnu2qQ2xO4,10
15
- deconveil-0.1.1.dist-info/RECORD,,
11
+ deconveil/utils_processing.py,sha256=9j35FAfQ7oNjdH1FWHP90DBTyL5RwlgdVbbW9de10VI,6560
12
+ deconveil-0.1.3.dist-info/licenses/LICENSE,sha256=BJ0f3JRteiF7tjiARi8syxiu4yKmckc0nWlHCKXttKQ,1078
13
+ deconveil-0.1.3.dist-info/METADATA,sha256=yL6AwQ5ziGhrI5lE4FDCHOadT81W6yEIDWGsHni6Q5w,1097
14
+ deconveil-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ deconveil-0.1.3.dist-info/top_level.txt,sha256=yAWZbw0eg8XpbMsswoq-VzBGfQHrfWOqNHnu2qQ2xO4,10
16
+ deconveil-0.1.3.dist-info/RECORD,,