DeConveil 0.1.0__tar.gz → 0.1.2__tar.gz
- {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/PKG-INFO +3 -2
- deconveil-0.1.2/DeConveil.egg-info/SOURCES.txt +19 -0
- deconveil-0.1.2/DeConveil.egg-info/top_level.txt +1 -0
- {deconveil-0.1.0 → deconveil-0.1.2}/PKG-INFO +3 -2
- deconveil-0.1.2/README.md +69 -0
- deconveil-0.1.2/deconveil/__version__.py +1 -0
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/dds.py +7 -11
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/default_inference.py +9 -17
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/ds.py +1 -3
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/grid_search.py +2 -2
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/inference.py +2 -9
- deconveil-0.1.2/deconveil/utils_clustering.py +201 -0
- deconveil-0.1.0/DeConveil/utils_CNaware.py → deconveil-0.1.2/deconveil/utils_fit.py +132 -268
- deconveil-0.1.2/deconveil/utils_plot.py +308 -0
- deconveil-0.1.2/deconveil/utils_processing.py +132 -0
- {deconveil-0.1.0 → deconveil-0.1.2}/setup.py +2 -2
- deconveil-0.1.0/DeConveil.egg-info/SOURCES.txt +0 -15
- deconveil-0.1.0/DeConveil.egg-info/top_level.txt +0 -1
- deconveil-0.1.0/README.md +0 -40
- {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/dependency_links.txt +0 -0
- {deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/requires.txt +0 -0
- {deconveil-0.1.0 → deconveil-0.1.2}/LICENSE +0 -0
- {deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/__init__.py +0 -0
- {deconveil-0.1.0 → deconveil-0.1.2}/setup.cfg +0 -0
{deconveil-0.1.0 → deconveil-0.1.2}/DeConveil.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: DeConveil
-Version: 0.1.0
+Version: 0.1.2
 Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
 Home-page: https://github.com/caravagnalab/DeConveil
 Author: Katsiaryna Davydzenka

@@ -29,6 +29,7 @@ Dynamic: author
 Dynamic: author-email
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
deconveil-0.1.2/DeConveil.egg-info/SOURCES.txt

@@ -0,0 +1,19 @@
+LICENSE
+README.md
+setup.py
+DeConveil.egg-info/PKG-INFO
+DeConveil.egg-info/SOURCES.txt
+DeConveil.egg-info/dependency_links.txt
+DeConveil.egg-info/requires.txt
+DeConveil.egg-info/top_level.txt
+deconveil/__init__.py
+deconveil/__version__.py
+deconveil/dds.py
+deconveil/default_inference.py
+deconveil/ds.py
+deconveil/grid_search.py
+deconveil/inference.py
+deconveil/utils_clustering.py
+deconveil/utils_fit.py
+deconveil/utils_plot.py
+deconveil/utils_processing.py
deconveil-0.1.2/DeConveil.egg-info/top_level.txt

@@ -0,0 +1 @@
+deconveil
{deconveil-0.1.0 → deconveil-0.1.2}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: DeConveil
-Version: 0.1.0
+Version: 0.1.2
 Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
 Home-page: https://github.com/caravagnalab/DeConveil
 Author: Katsiaryna Davydzenka

@@ -29,6 +29,7 @@ Dynamic: author
 Dynamic: author-email
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
deconveil-0.1.2/README.md

@@ -0,0 +1,69 @@
+# DeConveil
+
+<img src="docs/deconveil_logo.png" align="right" width="300">
+
+#
+[](https://pypi.org/project/DeConveil)
+
+*DeConveil* extends Differential Gene Expression (DGE) testing to account for genome aneuploidy.
+This computational framework extends traditional DGE analysis by integrating DNA Copy Number Variation (CNV) data.
+The approach adjusts for dosage effects and categorizes genes as *dosage-sensitive (DSG)*, *dosage-insensitive (DIG)*, and *dosage-compensated (DCG)*, separating the expression changes caused by CNVs from other alterations in transcriptional regulation.
+To perform this gene separation, DGE testing is carried out with both the *PyDESeq2 (CN-naive)* and the *DeConveil (CN-aware)* method.
+
+The results of our analysis can be downloaded from [deconveilCaseStudies](https://github.com/kdavydzenka/deconveilCaseStudies).
+
+
+### Installation
+
+**Prerequisites**
+
+The *pydeseq2* Python library must be installed first:
+
+`pip install pydeseq2`
+
+`pip install DeConveil`
+
+or `git clone https://github.com/caravagnalab/DeConveil.git`
+
+
+**Input data**
+
+DeConveil requires the following input matrices:
+
+- matched mRNA read counts (normal and tumor samples) and absolute CN values (normal diploid samples are assigned CN = 2), structured as an N × G matrix, where N is the number of samples and G is the number of genes;
+
+- a design matrix structured as an N × F matrix, where N is the number of samples and F is the number of features or covariates.
+
+Example of CN data for a given gene *g*:
+CN = [1, 2, 3, 4, 5, 6].
+
+An example of the input data can be found in the *test_deconveil* Jupyter notebook.
+
+
+**Output data**
+
+`res_CNnaive.csv` (for the *PyDESeq2* method) and `res_CNaware.csv` (for *DeConveil*): data frames reporting *log2FC* and *p.adjust* values for each method.
+
+These data frames are further processed to separate gene groups using the `define_gene_groups()` function included in the DeConveil framework.
+
+A tutorial of the analysis workflow is available in `test_deconveil.ipynb`.
+
+
+#### Citation
+
+[](https://doi.org/10.1101/2025.03.29.646108)
+
+If you use `DeConveil`, please cite:
+
+K. Davydzenka, G. Caravagna, G. Sanguinetti. Extending differential gene expression testing to handle genome aneuploidy in cancer. [bioRxiv preprint](https://doi.org/10.1101/2025.03.29.646108), 2025.
+
+
+#### Copyright and contacts
+
+Katsiaryna Davydzenka, Cancer Data Science (CDS) Laboratory.
+
+[](https://github.com/caravagnalab)
+[](https://www.caravagnalab.org/)
+
+
+
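To make the input layout described in the new README concrete, below is a minimal sketch (not part of the package diff) of how the three matrices could be assembled with pandas. Sample names, gene names, and the toy values are invented for illustration only; the authoritative format is the one shown in the *test_deconveil* notebook.

```python
import numpy as np
import pandas as pd

# Toy dimensions: N samples (3 normal + 3 tumor), G genes.
N, G = 6, 4
samples = [f"sample_{i}" for i in range(N)]
genes = [f"gene_{g}" for g in range(G)]
rng = np.random.default_rng(0)

# N x G matrix of raw mRNA read counts (normal and tumor samples together).
counts = pd.DataFrame(rng.poisson(100, size=(N, G)), index=samples, columns=genes)

# N x G matrix of absolute copy numbers: normal diploid samples get CN = 2,
# tumor samples take integer CN values such as 1..6 (as in the README example).
cn = pd.DataFrame(2, index=samples, columns=genes)
cn.iloc[3:] = rng.integers(1, 7, size=(3, G))

# N x F design matrix; here a single covariate distinguishing tumor vs normal.
design = pd.DataFrame({"condition": ["normal"] * 3 + ["tumor"] * 3}, index=samples)
```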
deconveil-0.1.2/deconveil/__version__.py

@@ -0,0 +1 @@
+__version__ = "0.1.2"
{deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/dds.py

@@ -1,11 +1,7 @@
 import sys
 import time
 import warnings
-from typing import List
-from typing import Literal
-from typing import Optional
-from typing import Union
-from typing import cast
+from typing import List, Literal, Optional, Union, cast
 
 import numpy as np
 import pandas as pd

@@ -16,15 +12,15 @@ from scipy.stats import trim_mean  # type: ignore
 
 from deconveil.default_inference import DefInference
 from deconveil.inference import Inference
-from deconveil import
-from deconveil.
-from deconveil.
-from deconveil.
-from deconveil.
+from deconveil import utils_fit
+from deconveil.utils_fit import fit_rough_dispersions
+from deconveil.utils_fit import fit_moments_dispersions2
+from deconveil.utils_fit import grid_fit_beta
+from deconveil.utils_fit import irls_glm
+from deconveil.utils_fit import build_design_matrix
 
 from pydeseq2.preprocessing import deseq2_norm_fit
 from pydeseq2.preprocessing import deseq2_norm_transform
-from pydeseq2.utils import build_design_matrix
 from pydeseq2.utils import dispersion_trend
 from pydeseq2.utils import mean_absolute_deviation
 from pydeseq2.utils import n_or_more_replicates
{deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/default_inference.py

@@ -1,17 +1,13 @@
-from typing import Literal
-from typing import Optional
-from typing import Tuple
+from typing import Literal, Optional, Tuple
 
 import numpy as np
 import pandas as pd
-from joblib import Parallel  # type: ignore
-from joblib import delayed
-from joblib import parallel_backend
+from joblib import Parallel, delayed, parallel_backend  # type: ignore
 from scipy.optimize import minimize  # type: ignore
 
 from deconveil import inference
-from deconveil import
-from deconveil.
+from deconveil import utils_fit
+from deconveil.utils_fit import fit_lin_mu
 
 from pydeseq2 import utils
 from pydeseq2.utils import get_num_processes

@@ -42,8 +38,8 @@ class DefInference(inference.Inference):
         Joblib backend.
     """
 
-    fit_rough_dispersions = staticmethod(
-    fit_moments_dispersions2 = staticmethod(
+    fit_rough_dispersions = staticmethod(utils_fit.fit_rough_dispersions)  # type: ignore
+    fit_moments_dispersions2 = staticmethod(utils_fit.fit_moments_dispersions2)  # type: ignore
 
     def __init__(
         self,

@@ -79,7 +75,7 @@ class DefInference(inference.Inference):
             verbose=self._joblib_verbosity,
             batch_size=self._batch_size,
         )(
-            delayed(
+            delayed(utils_fit.fit_lin_mu)(
                 counts=counts[:, i],
                 size_factors=size_factors,
                 design_matrix=design_matrix,

@@ -110,7 +106,7 @@ class DefInference(inference.Inference):
             verbose=self._joblib_verbosity,
             batch_size=self._batch_size,
         )(
-            delayed(
+            delayed(utils_fit.irls_glm)(
                 counts=counts[:, i],
                 size_factors=size_factors,
                 design_matrix=design_matrix,

@@ -262,7 +258,7 @@ class DefInference(inference.Inference):
             verbose=self._joblib_verbosity,
             batch_size=self._batch_size,
         )(
-            delayed(
+            delayed(utils_fit.nbinomGLM)(
                 design_matrix=design_matrix,
                 counts=counts[:, i],
                 cnv=cnv[:, i],

@@ -278,7 +274,3 @@ class DefInference(inference.Inference):
         res = zip(*res)
         lfcs, inv_hessians, l_bfgs_b_converged_ = (np.array(m) for m in res)
         return lfcs, inv_hessians, l_bfgs_b_converged_
-
-
-
-
{deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/grid_search.py

@@ -3,7 +3,7 @@ from typing import Optional
 import numpy as np
 from scipy.special import gammaln  # type: ignore
 
-from deconveil import
+from deconveil import utils_fit
 
 
 def grid_fit_beta(

@@ -156,7 +156,7 @@ def grid_fit_shrink_beta(
     def loss(beta: np.ndarray) -> float:
         # closure to minimize
         return (
-
+            utils_fit.nbinomFn(
                 beta,
                 design_matrix,
                 counts,
{deconveil-0.1.0/DeConveil → deconveil-0.1.2/deconveil}/inference.py

@@ -1,8 +1,6 @@
 from abc import ABC
 from abc import abstractmethod
-from typing import Literal
-from typing import Optional
-from typing import Tuple
+from typing import Literal, Optional, Tuple
 
 import numpy as np
 import pandas as pd

@@ -365,9 +363,4 @@ class Inference(ABC):
         converged: ndarray
             Whether L-BFGS-B converged for each optimization problem.
         """
-
-
-
-
-
-
+
deconveil-0.1.2/deconveil/utils_clustering.py

@@ -0,0 +1,201 @@
+import numpy as np
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans, AgglomerativeClustering
+from sklearn.metrics import silhouette_score
+from scipy.spatial.distance import pdist, squareform
+import random
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def pca_cluster_cn(
+    gene_cn: pd.DataFrame,
+    n_components: int = 20,
+    k: int = 2,
+    method: str = "kmeans",
+    random_state: int = 0,
+) -> dict:
+    """
+    Perform PCA on gene-level CN and cluster patients in PCA space.
+
+    Parameters
+    ----------
+    gene_cn : DataFrame
+        Gene x Sample matrix of CN values (log2 ratios).
+    n_components : int
+        Number of PCA components to keep.
+    k : int
+        Number of clusters.
+    method : str
+        'kmeans' or 'hierarchical'.
+    random_state : int
+        For reproducibility.
+
+    Returns
+    -------
+    dict with:
+        - labels: pd.Series (sample -> cluster)
+        - pca_coords: DataFrame of PCA coords
+        - explained_var: explained variance ratios
+    """
+    X = gene_cn.fillna(0).T  # samples × genes
+    pca = PCA(n_components=min(n_components, X.shape[1]))
+    coords = pca.fit_transform(X)
+    coords_df = pd.DataFrame(
+        coords, index=X.index, columns=[f"PC{i+1}" for i in range(coords.shape[1])]
+    )
+
+    if method == "kmeans":
+        model = KMeans(n_clusters=k, n_init=20, random_state=random_state)
+        labels = model.fit_predict(coords)
+    elif method == "hierarchical":
+        model = AgglomerativeClustering(n_clusters=k)
+        labels = model.fit_predict(coords)
+    else:
+        raise ValueError("method must be 'kmeans' or 'hierarchical'")
+
+    labels = pd.Series(labels, index=X.index, name="cluster")
+    return {
+        "labels": labels,
+        "pca_coords": coords_df,
+        "explained_var": pca.explained_variance_ratio_,
+    }
+
+
+def consensus_cluster_cn(
+    gene_cn: pd.DataFrame,
+    k: int = 2,
+    n_resamples: int = 50,
+    sample_fraction: float = 0.8,
+    feature_fraction: float = 0.8,
+    top_genes: int = 2000,
+    random_state: int = 0,
+) -> dict:
+    """
+    Consensus clustering of patients based on CN profiles.
+
+    Parameters
+    ----------
+    gene_cn : DataFrame
+        Gene x Sample CN matrix.
+    k : int
+        Number of clusters.
+    n_resamples : int
+        Number of resampling iterations.
+    sample_fraction : float
+        Fraction of patients sampled each iteration.
+    feature_fraction : float
+        Fraction of genes sampled each iteration.
+    top_genes : int
+        Use top variable genes only.
+    random_state : int
+        For reproducibility.
+
+    Returns
+    -------
+    dict with:
+        - labels: pd.Series (sample -> cluster) from consensus
+        - consensus_matrix: DataFrame (samples × samples) with co-clustering frequencies
+    """
+    rng = np.random.RandomState(random_state)
+
+    # Select top variable genes
+    var_genes = gene_cn.var(axis=1).sort_values(ascending=False).index[:top_genes]
+    data = gene_cn.loc[var_genes].fillna(0).values  # genes × samples
+    samples = gene_cn.columns.tolist()
+    n = len(samples)
+
+    co_mat = np.zeros((n, n))
+    counts = np.zeros((n, n))
+
+    for r in range(n_resamples):
+        samp_idx = rng.choice(n, size=int(sample_fraction * n), replace=False)
+        feat_idx = rng.choice(
+            data.shape[0], size=int(feature_fraction * data.shape[0]), replace=False
+        )
+        X = data[feat_idx][:, samp_idx].T  # subsampled patients × genes
+
+        # k-means in subsample
+        km = KMeans(n_clusters=k, n_init=10, random_state=rng).fit(X)
+        labels_sub = km.labels_
+
+        # update co-occurrence
+        for i, si in enumerate(samp_idx):
+            for j, sj in enumerate(samp_idx):
+                counts[si, sj] += 1
+                if labels_sub[i] == labels_sub[j]:
+                    co_mat[si, sj] += 1
+
+    consensus = np.divide(co_mat, counts, out=np.zeros_like(co_mat), where=counts > 0)
+    consensus_df = pd.DataFrame(consensus, index=samples, columns=samples)
+
+    # Cluster consensus matrix
+    dist = 1 - consensus
+    agg = AgglomerativeClustering(n_clusters=k, affinity="precomputed", linkage="average")
+    labels = agg.fit_predict(dist)
+    labels = pd.Series(labels, index=samples, name="cluster")
+
+    return {"labels": labels, "consensus_matrix": consensus_df}
+
+
+def consensus_cdf_range(
+    gene_cn, k_values=(2, 3, 4, 5, 6),
+    n_resamples=50, sample_fraction=0.8, feature_fraction=0.8,
+    top_genes=2000, random_state=0
+):
+    """
+    Run consensus clustering across multiple k and plot CDFs.
+
+    Parameters
+    ----------
+    gene_cn : DataFrame
+        Gene × Sample CN matrix.
+    k_values : list/tuple
+        Range of k to test.
+    n_resamples, sample_fraction, feature_fraction, top_genes, random_state
+        Passed to consensus_cluster_cn().
+
+    Returns
+    -------
+    dict
+        {k: {"labels", "consensus_matrix", "auc"}}
+    """
+    results = {}
+
+    plt.figure(figsize=(7, 5))
+
+    for k in k_values:
+        res = consensus_cluster_cn(
+            gene_cn, k=k,
+            n_resamples=n_resamples,
+            sample_fraction=sample_fraction,
+            feature_fraction=feature_fraction,
+            top_genes=top_genes,
+            random_state=random_state
+        )
+
+        mat = res["consensus_matrix"].values
+        mask = ~np.eye(mat.shape[0], dtype=bool)
+        vals = mat[mask]
+
+        sorted_vals = np.sort(vals)
+        cdf = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)
+
+        # Compute area under CDF (AUC)
+        auc = np.trapz(cdf, sorted_vals)
+        res["auc"] = auc
+        results[k] = res
+
+        plt.plot(sorted_vals, cdf, lw=2, label=f"k={k} (AUC={auc:.3f})")
+
+    plt.xlabel("Consensus value")
+    plt.ylabel("Cumulative fraction")
+    plt.title("Consensus CDF across k", fontsize=14)
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.show()
+
+    return results
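The clustering utilities added in this new module are self-contained; a minimal usage sketch (not part of the diff, with toy random data standing in for a real gene × sample CN matrix) might look like the following. Note that `consensus_cluster_cn` passes `affinity="precomputed"` to `AgglomerativeClustering`, an argument that scikit-learn renamed to `metric` and removed in version 1.4, so running it may require an older scikit-learn.

```python
import numpy as np
import pandas as pd

from deconveil.utils_clustering import pca_cluster_cn, consensus_cdf_range

# Toy gene x sample CN matrix standing in for real copy-number calls.
rng = np.random.default_rng(0)
genes = [f"gene_{g}" for g in range(500)]
samples = [f"patient_{s}" for s in range(40)]
gene_cn = pd.DataFrame(rng.normal(0, 0.5, size=(500, 40)), index=genes, columns=samples)

# PCA + k-means clustering of patients in CN space.
pca_res = pca_cluster_cn(gene_cn, n_components=10, k=2, method="kmeans")
print(pca_res["labels"].value_counts())

# Consensus clustering across several k: plots the consensus CDFs and returns
# per-k labels, consensus matrices, and the area under each CDF.
cdf_res = consensus_cdf_range(gene_cn, k_values=(2, 3, 4), n_resamples=20, top_genes=300)
print({k: round(cdf_res[k]["auc"], 3) for k in cdf_res})
```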