DeConveil 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/PKG-INFO +3 -2
  2. deconveil-0.1.2/DeConveil.egg-info/SOURCES.txt +19 -0
  3. deconveil-0.1.2/DeConveil.egg-info/top_level.txt +1 -0
  4. {deconveil-0.1.0 → deconveil-0.1.2}/PKG-INFO +3 -2
  5. deconveil-0.1.2/README.md +69 -0
  6. deconveil-0.1.2/deconveil/__version__.py +1 -0
  7. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/dds.py +7 -11
  8. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/default_inference.py +9 -17
  9. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/ds.py +1 -3
  10. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/grid_search.py +2 -2
  11. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/inference.py +2 -9
  12. deconveil-0.1.2/deconveil/utils_clustering.py +201 -0
  13. deconveil-0.1.0/DeConveil/utils_CNaware.py → deconveil-0.1.2/deconveil/utils_fit.py +132 -268
  14. deconveil-0.1.2/deconveil/utils_plot.py +308 -0
  15. deconveil-0.1.2/deconveil/utils_processing.py +132 -0
  16. {deconveil-0.1.0 → deconveil-0.1.2}/setup.py +2 -2
  17. deconveil-0.1.0/DeConveil.egg-info/SOURCES.txt +0 -15
  18. deconveil-0.1.0/DeConveil.egg-info/top_level.txt +0 -1
  19. deconveil-0.1.0/README.md +0 -40
  20. {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/dependency_links.txt +0 -0
  21. {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/requires.txt +0 -0
  22. {deconveil-0.1.0 → deconveil-0.1.2}/LICENSE +0 -0
  23. {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/__init__.py +0 -0
  24. {deconveil-0.1.0 → deconveil-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -29,6 +29,7 @@ Dynamic: author
29
29
  Dynamic: author-email
30
30
  Dynamic: home-page
31
31
  Dynamic: license
32
+ Dynamic: license-file
32
33
  Dynamic: provides-extra
33
34
  Dynamic: requires-dist
34
35
  Dynamic: requires-python
@@ -0,0 +1,19 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ DeConveil.egg-info/PKG-INFO
5
+ DeConveil.egg-info/SOURCES.txt
6
+ DeConveil.egg-info/dependency_links.txt
7
+ DeConveil.egg-info/requires.txt
8
+ DeConveil.egg-info/top_level.txt
9
+ deconveil/__init__.py
10
+ deconveil/__version__.py
11
+ deconveil/dds.py
12
+ deconveil/default_inference.py
13
+ deconveil/ds.py
14
+ deconveil/grid_search.py
15
+ deconveil/inference.py
16
+ deconveil/utils_clustering.py
17
+ deconveil/utils_fit.py
18
+ deconveil/utils_plot.py
19
+ deconveil/utils_processing.py
@@ -0,0 +1 @@
1
+ deconveil
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -29,6 +29,7 @@ Dynamic: author
29
29
  Dynamic: author-email
30
30
  Dynamic: home-page
31
31
  Dynamic: license
32
+ Dynamic: license-file
32
33
  Dynamic: provides-extra
33
34
  Dynamic: requires-dist
34
35
  Dynamic: requires-python
@@ -0,0 +1,69 @@
1
+ # DeConveil
2
+
3
+ <img src="docs/deconveil_logo.png" align="right" width="300">
4
+
5
+ #
6
+ [![pypi version](https://img.shields.io/pypi/v/DeConveil)](https://pypi.org/project/DeConveil)
7
+
8
+ The goal of *DeConveil* is the extension of Differential Gene Expression testing by accounting for genome aneuploidy.
9
+ This computational framework extends traditional DGE analysis by integrating DNA Copy Number Variation (CNV) data.
10
+ This approach adjusts for dosage effects and categorizes genes as *dosage-sensitive (DSG)*, *dosage-insensitive (DIG)*, and *dosage-compensated (DCG)*, separating the expression changes caused by CNVs from other alterations in transcriptional regulation.
11
+ To perform this gene separation we need to carry out DGE testing using both *PyDESeq2 (CN-naive)* and *DeConveil (CN-aware)* methods.
12
+
13
+ You can download the results of our analysis from [deconveilCaseStudies](https://github.com/kdavydzenka/deconveilCaseStudies)
14
+
15
+
16
+ ### Installation
17
+
18
+ **Pre-required installations before running DeConveil**
19
+
20
+ The following Python library must be installed first: *pydeseq2*
21
+
22
+ `pip install pydeseq2`
23
+
24
+ `pip install DeConveil`
25
+
26
+ or `git clone https://github.com/caravagnalab/DeConveil.git`
27
+
28
+
29
+ **Input data**
30
+
31
+ DeConveil requires the following input matrices:
32
+
33
+ - matched mRNA read counts (normal and tumor samples) and absolute CN values (for normal diploid samples we assign CN=2), structured as an N × G matrix, where N represents the number of samples and G represents the number of genes;
34
+
35
+ - a design matrix structured as an N × F matrix, where N is the number of samples and F is the number of features or covariates.
36
+
37
+ Example of CN data for a given gene *g*:
38
+ CN = [1, 2, 3, 4, 5, 6].
39
+
40
+ An example of the input data can be found in the *test_deconveil* Jupyter Notebook.
41
+
42
+
43
+ **Output data**
44
+
45
+ `res_CNnaive.csv` (for *PyDESeq2* method) and `res_CNaware.csv` (for *DeConveil*) data frames reporting *log2FC* and *p.adjust* values for both methods.
46
+
47
+ These data frames are further processed to separate gene groups using `define_gene_groups()` function included in DeConveil framework.
48
+
49
+ A tutorial of the analysis workflow is available in `test_deconveil.ipynb`
50
+
51
+
52
+ #### Citation
53
+
54
+ [![](http://img.shields.io/badge/doi-10.1101/2025.03.29.646108-red.svg)](https://doi.org/10.1101/2025.03.29.646108)
55
+
56
+ If you use `DeConveil`, cite:
57
+
58
+ K. Davydzenka, G. Caravagna, G. Sanguinetti. Extending differential gene expression testing to handle genome aneuploidy in cancer. [bioRxiv preprint](https://doi.org/10.1101/2025.03.29.646108), 2025.
59
+
60
+
61
+ #### Copyright and contacts
62
+
63
+ Katsiaryna Davydzenka, Cancer Data Science (CDS) Laboratory.
64
+
65
+ [![](https://img.shields.io/badge/CDS%20Lab%20Github-caravagnalab-seagreen.svg)](https://github.com/caravagnalab)
66
+ [![](https://img.shields.io/badge/CDS%20Lab%20webpage-https://www.caravagnalab.org/-red.svg)](https://www.caravagnalab.org/)
67
+
68
+
69
+
@@ -0,0 +1 @@
1
+ __version__ = "0.1.2"
@@ -1,11 +1,7 @@
1
1
  import sys
2
2
  import time
3
3
  import warnings
4
- from typing import List
5
- from typing import Literal
6
- from typing import Optional
7
- from typing import Union
8
- from typing import cast
4
+ from typing import List, Literal, Optional, Union, cast
9
5
 
10
6
  import numpy as np
11
7
  import pandas as pd
@@ -16,15 +12,15 @@ from scipy.stats import trim_mean # type: ignore
16
12
 
17
13
  from deconveil.default_inference import DefInference
18
14
  from deconveil.inference import Inference
19
- from deconveil import utils_CNaware
20
- from deconveil.utils_CNaware import fit_rough_dispersions
21
- from deconveil.utils_CNaware import fit_moments_dispersions2
22
- from deconveil.utils_CNaware import grid_fit_beta
23
- from deconveil.utils_CNaware import irls_glm
15
+ from deconveil import utils_fit
16
+ from deconveil.utils_fit import fit_rough_dispersions
17
+ from deconveil.utils_fit import fit_moments_dispersions2
18
+ from deconveil.utils_fit import grid_fit_beta
19
+ from deconveil.utils_fit import irls_glm
20
+ from deconveil.utils_fit import build_design_matrix
24
21
 
25
22
  from pydeseq2.preprocessing import deseq2_norm_fit
26
23
  from pydeseq2.preprocessing import deseq2_norm_transform
27
- from pydeseq2.utils import build_design_matrix
28
24
  from pydeseq2.utils import dispersion_trend
29
25
  from pydeseq2.utils import mean_absolute_deviation
30
26
  from pydeseq2.utils import n_or_more_replicates
@@ -1,17 +1,13 @@
1
- from typing import Literal
2
- from typing import Optional
3
- from typing import Tuple
1
+ from typing import Literal, Optional, Tuple
4
2
 
5
3
  import numpy as np
6
4
  import pandas as pd
7
- from joblib import Parallel # type: ignore
8
- from joblib import delayed
9
- from joblib import parallel_backend
5
+ from joblib import Parallel, delayed, parallel_backend # type: ignore
10
6
  from scipy.optimize import minimize # type: ignore
11
7
 
12
8
  from deconveil import inference
13
- from deconveil import utils_CNaware
14
- from deconveil.utils_CNaware import fit_lin_mu
9
+ from deconveil import utils_fit
10
+ from deconveil.utils_fit import fit_lin_mu
15
11
 
16
12
  from pydeseq2 import utils
17
13
  from pydeseq2.utils import get_num_processes
@@ -42,8 +38,8 @@ class DefInference(inference.Inference):
42
38
  Joblib backend.
43
39
  """
44
40
 
45
- fit_rough_dispersions = staticmethod(utils_CNaware.fit_rough_dispersions) # type: ignore
46
- fit_moments_dispersions2 = staticmethod(utils_CNaware.fit_moments_dispersions2) # type: ignore
41
+ fit_rough_dispersions = staticmethod(utils_fit.fit_rough_dispersions) # type: ignore
42
+ fit_moments_dispersions2 = staticmethod(utils_fit.fit_moments_dispersions2) # type: ignore
47
43
 
48
44
  def __init__(
49
45
  self,
@@ -79,7 +75,7 @@ class DefInference(inference.Inference):
79
75
  verbose=self._joblib_verbosity,
80
76
  batch_size=self._batch_size,
81
77
  )(
82
- delayed(utils_CNaware.fit_lin_mu)(
78
+ delayed(utils_fit.fit_lin_mu)(
83
79
  counts=counts[:, i],
84
80
  size_factors=size_factors,
85
81
  design_matrix=design_matrix,
@@ -110,7 +106,7 @@ class DefInference(inference.Inference):
110
106
  verbose=self._joblib_verbosity,
111
107
  batch_size=self._batch_size,
112
108
  )(
113
- delayed(utils_CNaware.irls_glm)(
109
+ delayed(utils_fit.irls_glm)(
114
110
  counts=counts[:, i],
115
111
  size_factors=size_factors,
116
112
  design_matrix=design_matrix,
@@ -262,7 +258,7 @@ class DefInference(inference.Inference):
262
258
  verbose=self._joblib_verbosity,
263
259
  batch_size=self._batch_size,
264
260
  )(
265
- delayed(utils_CNaware.nbinomGLM)(
261
+ delayed(utils_fit.nbinomGLM)(
266
262
  design_matrix=design_matrix,
267
263
  counts=counts[:, i],
268
264
  cnv=cnv[:, i],
@@ -278,7 +274,3 @@ class DefInference(inference.Inference):
278
274
  res = zip(*res)
279
275
  lfcs, inv_hessians, l_bfgs_b_converged_ = (np.array(m) for m in res)
280
276
  return lfcs, inv_hessians, l_bfgs_b_converged_
281
-
282
-
283
-
284
-
@@ -1,8 +1,6 @@
1
1
  import sys
2
2
  import time
3
- from typing import List
4
- from typing import Literal
5
- from typing import Optional
3
+ from typing import List, Literal, Optional
6
4
 
7
5
  import numpy as np
8
6
  import pandas as pd
@@ -3,7 +3,7 @@ from typing import Optional
3
3
  import numpy as np
4
4
  from scipy.special import gammaln # type: ignore
5
5
 
6
- from deconveil import utils_CNaware
6
+ from deconveil import utils_fit
7
7
 
8
8
 
9
9
  def grid_fit_beta(
@@ -156,7 +156,7 @@ def grid_fit_shrink_beta(
156
156
  def loss(beta: np.ndarray) -> float:
157
157
  # closure to minimize
158
158
  return (
159
- utils_CNaware.nbinomFn(
159
+ utils_fit.nbinomFn(
160
160
  beta,
161
161
  design_matrix,
162
162
  counts,
@@ -1,8 +1,6 @@
1
1
  from abc import ABC
2
2
  from abc import abstractmethod
3
- from typing import Literal
4
- from typing import Optional
5
- from typing import Tuple
3
+ from typing import Literal, Optional, Tuple
6
4
 
7
5
  import numpy as np
8
6
  import pandas as pd
@@ -365,9 +363,4 @@ class Inference(ABC):
365
363
  converged: ndarray
366
364
  Whether L-BFGS-B converged for each optimization problem.
367
365
  """
368
-
369
-
370
-
371
-
372
-
373
-
366
+
@@ -0,0 +1,201 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.decomposition import PCA
4
+ from sklearn.cluster import KMeans, AgglomerativeClustering
5
+ from sklearn.metrics import silhouette_score
6
+ from scipy.spatial.distance import pdist, squareform
7
+ import random
8
+
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+
12
+
13
def pca_cluster_cn(
    gene_cn: pd.DataFrame,
    n_components: int = 20,
    k: int = 2,
    method: str = "kmeans",
    random_state: int = 0,
) -> dict:
    """
    Perform PCA on gene-level CN and cluster patients in PCA space.

    Parameters
    ----------
    gene_cn : DataFrame
        Gene x Sample matrix of CN values (log2 ratios).
    n_components : int
        Number of PCA components to keep (capped at the number of genes).
    k : int
        Number of clusters.
    method : str
        'kmeans' or 'hierarchical'.
    random_state : int
        For reproducibility (used by k-means only).

    Returns
    -------
    dict with:
        - labels: pd.Series (sample -> cluster)
        - pca_coords: DataFrame of PCA coords
        - explained_var: explained variance ratios
    """
    # Samples become rows; missing CN entries are treated as neutral (0).
    sample_matrix = gene_cn.fillna(0).T

    pca = PCA(n_components=min(n_components, sample_matrix.shape[1]))
    coords = pca.fit_transform(sample_matrix)
    pc_names = [f"PC{idx + 1}" for idx in range(coords.shape[1])]
    coords_df = pd.DataFrame(coords, index=sample_matrix.index, columns=pc_names)

    # Choose the clustering backend; clustering happens in PCA space.
    if method == "kmeans":
        clusterer = KMeans(n_clusters=k, n_init=20, random_state=random_state)
    elif method == "hierarchical":
        clusterer = AgglomerativeClustering(n_clusters=k)
    else:
        raise ValueError("method must be 'kmeans' or 'hierarchical'")

    cluster_labels = pd.Series(
        clusterer.fit_predict(coords), index=sample_matrix.index, name="cluster"
    )

    return {
        "labels": cluster_labels,
        "pca_coords": coords_df,
        "explained_var": pca.explained_variance_ratio_,
    }
65
+
66
+
67
def consensus_cluster_cn(
    gene_cn: pd.DataFrame,
    k: int = 2,
    n_resamples: int = 50,
    sample_fraction: float = 0.8,
    feature_fraction: float = 0.8,
    top_genes: int = 2000,
    random_state: int = 0,
) -> dict:
    """
    Consensus clustering of patients based on CN profiles.

    Repeatedly subsamples patients and genes, clusters each subsample with
    k-means, and records how often every pair of patients co-clusters. The
    resulting co-clustering frequency (consensus) matrix is then clustered
    hierarchically to produce the final labels.

    Parameters
    ----------
    gene_cn : DataFrame
        Gene x Sample CN matrix.
    k : int
        Number of clusters.
    n_resamples : int
        Number of resampling iterations.
    sample_fraction : float
        Fraction of patients sampled each iteration.
    feature_fraction : float
        Fraction of genes sampled each iteration.
    top_genes : int
        Use top variable genes only.
    random_state : int
        For reproducibility.

    Returns
    -------
    dict with:
        - labels: pd.Series (sample -> cluster) from consensus
        - consensus_matrix: DataFrame (samples x samples) with co-clustering frequencies
    """
    rng = np.random.RandomState(random_state)

    # Restrict to the most variable genes to reduce noise and computation.
    var_genes = gene_cn.var(axis=1).sort_values(ascending=False).index[:top_genes]
    data = gene_cn.loc[var_genes].fillna(0).values  # genes x samples
    samples = gene_cn.columns.tolist()
    n = len(samples)

    co_mat = np.zeros((n, n))
    counts = np.zeros((n, n))

    for _ in range(n_resamples):
        samp_idx = rng.choice(n, size=int(sample_fraction * n), replace=False)
        feat_idx = rng.choice(
            data.shape[0], size=int(feature_fraction * data.shape[0]), replace=False
        )
        X = data[feat_idx][:, samp_idx].T  # subsampled patients x genes

        # k-means in subsample
        km = KMeans(n_clusters=k, n_init=10, random_state=rng).fit(X)
        labels_sub = km.labels_

        # Update pairwise co-sampling and co-clustering counts.
        for i, si in enumerate(samp_idx):
            for j, sj in enumerate(samp_idx):
                counts[si, sj] += 1
                if labels_sub[i] == labels_sub[j]:
                    co_mat[si, sj] += 1

    # Fraction of co-samplings in which each pair co-clustered; pairs never
    # sampled together stay 0 instead of dividing by zero.
    consensus = np.divide(co_mat, counts, out=np.zeros_like(co_mat), where=counts > 0)
    consensus_df = pd.DataFrame(consensus, index=samples, columns=samples)

    # Cluster the consensus matrix; 1 - consensus is a precomputed dissimilarity.
    dist = 1 - consensus
    try:
        # scikit-learn >= 1.2 renamed `affinity` to `metric`; `affinity` was
        # removed in 1.4, where it raises TypeError.
        agg = AgglomerativeClustering(
            n_clusters=k, metric="precomputed", linkage="average"
        )
        labels = agg.fit_predict(dist)
    except TypeError:
        # Fallback for scikit-learn < 1.2, which only accepts `affinity`.
        agg = AgglomerativeClustering(
            n_clusters=k, affinity="precomputed", linkage="average"
        )
        labels = agg.fit_predict(dist)
    labels = pd.Series(labels, index=samples, name="cluster")

    return {"labels": labels, "consensus_matrix": consensus_df}
141
+
142
+
143
def consensus_cdf_range(
    gene_cn, k_values=(2, 3, 4, 5, 6),
    n_resamples=50, sample_fraction=0.8, feature_fraction=0.8,
    top_genes=2000, random_state=0
):
    """
    Run consensus clustering across multiple k and plot CDFs.

    For each k, computes the empirical CDF of the off-diagonal consensus
    values and its area under the curve (AUC); a sharper CDF / larger AUC
    indicates a more stable clustering. Displays one matplotlib figure
    with all CDF curves as a side effect.

    Parameters
    ----------
    gene_cn : DataFrame
        Gene x Sample CN matrix.
    k_values : list/tuple
        Range of k to test.
    n_resamples, sample_fraction, feature_fraction, top_genes, random_state
        Passed to consensus_cluster_cn().

    Returns
    -------
    dict
        {k: {"labels", "consensus_matrix", "auc"}}
    """
    # np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid;
    # pick whichever this NumPy version provides.
    _trapezoid = getattr(np, "trapezoid", None)
    if _trapezoid is None:
        _trapezoid = np.trapz

    results = {}

    plt.figure(figsize=(7, 5))

    for k in k_values:
        res = consensus_cluster_cn(
            gene_cn, k=k,
            n_resamples=n_resamples,
            sample_fraction=sample_fraction,
            feature_fraction=feature_fraction,
            top_genes=top_genes,
            random_state=random_state
        )

        # Only off-diagonal entries are informative (diagonal is trivially 1).
        mat = res["consensus_matrix"].values
        mask = ~np.eye(mat.shape[0], dtype=bool)
        vals = mat[mask]

        sorted_vals = np.sort(vals)
        cdf = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)

        # Compute area under CDF (AUC)
        auc = _trapezoid(cdf, sorted_vals)
        res["auc"] = auc
        results[k] = res

        plt.plot(sorted_vals, cdf, lw=2, label=f"k={k} (AUC={auc:.3f})")

    plt.xlabel("Consensus value")
    plt.ylabel("Cumulative fraction")
    plt.title("Consensus CDF across k", fontsize=14)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return results