DeConveil 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.2"
@@ -1,11 +1,7 @@
1
1
  import sys
2
2
  import time
3
3
  import warnings
4
- from typing import List
5
- from typing import Literal
6
- from typing import Optional
7
- from typing import Union
8
- from typing import cast
4
+ from typing import List, Literal, Optional, Union, cast
9
5
 
10
6
  import numpy as np
11
7
  import pandas as pd
@@ -16,15 +12,15 @@ from scipy.stats import trim_mean # type: ignore
16
12
 
17
13
  from deconveil.default_inference import DefInference
18
14
  from deconveil.inference import Inference
19
- from deconveil import utils_CNaware
20
- from deconveil.utils_CNaware import fit_rough_dispersions
21
- from deconveil.utils_CNaware import fit_moments_dispersions2
22
- from deconveil.utils_CNaware import grid_fit_beta
23
- from deconveil.utils_CNaware import irls_glm
15
+ from deconveil import utils_fit
16
+ from deconveil.utils_fit import fit_rough_dispersions
17
+ from deconveil.utils_fit import fit_moments_dispersions2
18
+ from deconveil.utils_fit import grid_fit_beta
19
+ from deconveil.utils_fit import irls_glm
20
+ from deconveil.utils_fit import build_design_matrix
24
21
 
25
22
  from pydeseq2.preprocessing import deseq2_norm_fit
26
23
  from pydeseq2.preprocessing import deseq2_norm_transform
27
- from pydeseq2.utils import build_design_matrix
28
24
  from pydeseq2.utils import dispersion_trend
29
25
  from pydeseq2.utils import mean_absolute_deviation
30
26
  from pydeseq2.utils import n_or_more_replicates
@@ -1,17 +1,13 @@
1
- from typing import Literal
2
- from typing import Optional
3
- from typing import Tuple
1
+ from typing import Literal, Optional, Tuple
4
2
 
5
3
  import numpy as np
6
4
  import pandas as pd
7
- from joblib import Parallel # type: ignore
8
- from joblib import delayed
9
- from joblib import parallel_backend
5
+ from joblib import Parallel, delayed, parallel_backend # type: ignore
10
6
  from scipy.optimize import minimize # type: ignore
11
7
 
12
8
  from deconveil import inference
13
- from deconveil import utils_CNaware
14
- from deconveil.utils_CNaware import fit_lin_mu
9
+ from deconveil import utils_fit
10
+ from deconveil.utils_fit import fit_lin_mu
15
11
 
16
12
  from pydeseq2 import utils
17
13
  from pydeseq2.utils import get_num_processes
@@ -42,8 +38,8 @@ class DefInference(inference.Inference):
42
38
  Joblib backend.
43
39
  """
44
40
 
45
- fit_rough_dispersions = staticmethod(utils_CNaware.fit_rough_dispersions) # type: ignore
46
- fit_moments_dispersions2 = staticmethod(utils_CNaware.fit_moments_dispersions2) # type: ignore
41
+ fit_rough_dispersions = staticmethod(utils_fit.fit_rough_dispersions) # type: ignore
42
+ fit_moments_dispersions2 = staticmethod(utils_fit.fit_moments_dispersions2) # type: ignore
47
43
 
48
44
  def __init__(
49
45
  self,
@@ -79,7 +75,7 @@ class DefInference(inference.Inference):
79
75
  verbose=self._joblib_verbosity,
80
76
  batch_size=self._batch_size,
81
77
  )(
82
- delayed(utils_CNaware.fit_lin_mu)(
78
+ delayed(utils_fit.fit_lin_mu)(
83
79
  counts=counts[:, i],
84
80
  size_factors=size_factors,
85
81
  design_matrix=design_matrix,
@@ -110,7 +106,7 @@ class DefInference(inference.Inference):
110
106
  verbose=self._joblib_verbosity,
111
107
  batch_size=self._batch_size,
112
108
  )(
113
- delayed(utils_CNaware.irls_glm)(
109
+ delayed(utils_fit.irls_glm)(
114
110
  counts=counts[:, i],
115
111
  size_factors=size_factors,
116
112
  design_matrix=design_matrix,
@@ -262,7 +258,7 @@ class DefInference(inference.Inference):
262
258
  verbose=self._joblib_verbosity,
263
259
  batch_size=self._batch_size,
264
260
  )(
265
- delayed(utils_CNaware.nbinomGLM)(
261
+ delayed(utils_fit.nbinomGLM)(
266
262
  design_matrix=design_matrix,
267
263
  counts=counts[:, i],
268
264
  cnv=cnv[:, i],
@@ -278,7 +274,3 @@ class DefInference(inference.Inference):
278
274
  res = zip(*res)
279
275
  lfcs, inv_hessians, l_bfgs_b_converged_ = (np.array(m) for m in res)
280
276
  return lfcs, inv_hessians, l_bfgs_b_converged_
281
-
282
-
283
-
284
-
@@ -1,8 +1,6 @@
1
1
  import sys
2
2
  import time
3
- from typing import List
4
- from typing import Literal
5
- from typing import Optional
3
+ from typing import List, Literal, Optional
6
4
 
7
5
  import numpy as np
8
6
  import pandas as pd
@@ -3,7 +3,7 @@ from typing import Optional
3
3
  import numpy as np
4
4
  from scipy.special import gammaln # type: ignore
5
5
 
6
- from deconveil import utils_CNaware
6
+ from deconveil import utils_fit
7
7
 
8
8
 
9
9
  def grid_fit_beta(
@@ -156,7 +156,7 @@ def grid_fit_shrink_beta(
156
156
  def loss(beta: np.ndarray) -> float:
157
157
  # closure to minimize
158
158
  return (
159
- utils_CNaware.nbinomFn(
159
+ utils_fit.nbinomFn(
160
160
  beta,
161
161
  design_matrix,
162
162
  counts,
@@ -1,8 +1,6 @@
1
1
  from abc import ABC
2
2
  from abc import abstractmethod
3
- from typing import Literal
4
- from typing import Optional
5
- from typing import Tuple
3
+ from typing import Literal, Optional, Tuple
6
4
 
7
5
  import numpy as np
8
6
  import pandas as pd
@@ -365,9 +363,4 @@ class Inference(ABC):
365
363
  converged: ndarray
366
364
  Whether L-BFGS-B converged for each optimization problem.
367
365
  """
368
-
369
-
370
-
371
-
372
-
373
-
366
+
@@ -0,0 +1,201 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.decomposition import PCA
4
+ from sklearn.cluster import KMeans, AgglomerativeClustering
5
+ from sklearn.metrics import silhouette_score
6
+ from scipy.spatial.distance import pdist, squareform
7
+ import random
8
+
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+
12
+
13
+ def pca_cluster_cn(
14
+ gene_cn: pd.DataFrame,
15
+ n_components: int = 20,
16
+ k: int = 2,
17
+ method: str = "kmeans",
18
+ random_state: int = 0,
19
+ ) -> dict:
20
+ """
21
+ Perform PCA on gene-level CN and cluster patients in PCA space.
22
+
23
+ Parameters
24
+ ----------
25
+ gene_cn : DataFrame
26
+ Gene x Sample matrix of CN values (log2 ratios).
27
+ n_components : int
28
+ Number of PCA components to keep.
29
+ k : int
30
+ Number of clusters.
31
+ method : str
32
+ 'kmeans' or 'hierarchical'.
33
+ random_state : int
34
+ For reproducibility.
35
+
36
+ Returns
37
+ -------
38
+ dict with:
39
+ - labels: pd.Series (sample -> cluster)
40
+ - pca_coords: DataFrame of PCA coords
41
+ - explained_var: explained variance ratios
42
+ """
43
+ X = gene_cn.fillna(0).T # samples × genes
44
+ pca = PCA(n_components=min(n_components, X.shape[1]))
45
+ coords = pca.fit_transform(X)
46
+ coords_df = pd.DataFrame(
47
+ coords, index=X.index, columns=[f"PC{i+1}" for i in range(coords.shape[1])]
48
+ )
49
+
50
+ if method == "kmeans":
51
+ model = KMeans(n_clusters=k, n_init=20, random_state=random_state)
52
+ labels = model.fit_predict(coords)
53
+ elif method == "hierarchical":
54
+ model = AgglomerativeClustering(n_clusters=k)
55
+ labels = model.fit_predict(coords)
56
+ else:
57
+ raise ValueError("method must be 'kmeans' or 'hierarchical'")
58
+
59
+ labels = pd.Series(labels, index=X.index, name="cluster")
60
+ return {
61
+ "labels": labels,
62
+ "pca_coords": coords_df,
63
+ "explained_var": pca.explained_variance_ratio_,
64
+ }
65
+
66
+
67
+ def consensus_cluster_cn(
68
+ gene_cn: pd.DataFrame,
69
+ k: int = 2,
70
+ n_resamples: int = 50,
71
+ sample_fraction: float = 0.8,
72
+ feature_fraction: float = 0.8,
73
+ top_genes: int = 2000,
74
+ random_state: int = 0,
75
+ ) -> dict:
76
+ """
77
+ Consensus clustering of patients based on CN profiles.
78
+
79
+ Parameters
80
+ ----------
81
+ gene_cn : DataFrame
82
+ Gene x Sample CN matrix.
83
+ k : int
84
+ Number of clusters.
85
+ n_resamples : int
86
+ Number of resampling iterations.
87
+ sample_fraction : float
88
+ Fraction of patients sampled each iteration.
89
+ feature_fraction : float
90
+ Fraction of genes sampled each iteration.
91
+ top_genes : int
92
+ Use top variable genes only.
93
+ random_state : int
94
+ For reproducibility.
95
+
96
+ Returns
97
+ -------
98
+ dict with:
99
+ - labels: pd.Series (sample -> cluster) from consensus
100
+ - consensus_matrix: DataFrame (samples × samples) with co-clustering frequencies
101
+ """
102
+ rng = np.random.RandomState(random_state)
103
+
104
+ # Select top variable genes
105
+ var_genes = gene_cn.var(axis=1).sort_values(ascending=False).index[:top_genes]
106
+ data = gene_cn.loc[var_genes].fillna(0).values # genes × samples
107
+ samples = gene_cn.columns.tolist()
108
+ n = len(samples)
109
+
110
+ co_mat = np.zeros((n, n))
111
+ counts = np.zeros((n, n))
112
+
113
+ for r in range(n_resamples):
114
+ samp_idx = rng.choice(n, size=int(sample_fraction * n), replace=False)
115
+ feat_idx = rng.choice(
116
+ data.shape[0], size=int(feature_fraction * data.shape[0]), replace=False
117
+ )
118
+ X = data[feat_idx][:, samp_idx].T # subsampled patients × genes
119
+
120
+ # k-means in subsample
121
+ km = KMeans(n_clusters=k, n_init=10, random_state=rng).fit(X)
122
+ labels_sub = km.labels_
123
+
124
+ # update co-occurrence
125
+ for i, si in enumerate(samp_idx):
126
+ for j, sj in enumerate(samp_idx):
127
+ counts[si, sj] += 1
128
+ if labels_sub[i] == labels_sub[j]:
129
+ co_mat[si, sj] += 1
130
+
131
+ consensus = np.divide(co_mat, counts, out=np.zeros_like(co_mat), where=counts > 0)
132
+ consensus_df = pd.DataFrame(consensus, index=samples, columns=samples)
133
+
134
+ # Cluster consensus matrix
135
+ dist = 1 - consensus
136
+ agg = AgglomerativeClustering(n_clusters=k, affinity="precomputed", linkage="average")
137
+ labels = agg.fit_predict(dist)
138
+ labels = pd.Series(labels, index=samples, name="cluster")
139
+
140
+ return {"labels": labels, "consensus_matrix": consensus_df}
141
+
142
+
143
+ def consensus_cdf_range(
144
+ gene_cn, k_values=(2,3,4,5,6),
145
+ n_resamples=50, sample_fraction=0.8, feature_fraction=0.8,
146
+ top_genes=2000, random_state=0
147
+ ):
148
+ """
149
+ Run consensus clustering across multiple k and plot CDFs.
150
+
151
+ Parameters
152
+ ----------
153
+ gene_cn : DataFrame
154
+ Gene × Sample CN matrix.
155
+ k_values : list/tuple
156
+ Range of k to test.
157
+ n_resamples, sample_fraction, feature_fraction, top_genes, random_state
158
+ Passed to consensus_cluster_cn().
159
+
160
+ Returns
161
+ -------
162
+ dict
163
+ {k: {"labels", "consensus_matrix", "auc"}}
164
+ """
165
+ results = {}
166
+
167
+ plt.figure(figsize=(7,5))
168
+
169
+ for k in k_values:
170
+ res = consensus_cluster_cn(
171
+ gene_cn, k=k,
172
+ n_resamples=n_resamples,
173
+ sample_fraction=sample_fraction,
174
+ feature_fraction=feature_fraction,
175
+ top_genes=top_genes,
176
+ random_state=random_state
177
+ )
178
+
179
+ mat = res["consensus_matrix"].values
180
+ mask = ~np.eye(mat.shape[0], dtype=bool)
181
+ vals = mat[mask]
182
+
183
+ sorted_vals = np.sort(vals)
184
+ cdf = np.arange(1, len(sorted_vals)+1) / len(sorted_vals)
185
+
186
+ # Compute area under CDF (AUC)
187
+ auc = np.trapz(cdf, sorted_vals)
188
+ res["auc"] = auc
189
+ results[k] = res
190
+
191
+ plt.plot(sorted_vals, cdf, lw=2, label=f"k={k} (AUC={auc:.3f})")
192
+
193
+ plt.xlabel("Consensus value")
194
+ plt.ylabel("Cumulative fraction")
195
+ plt.title("Consensus CDF across k", fontsize=14)
196
+ plt.legend()
197
+ plt.grid(True, alpha=0.3)
198
+ plt.tight_layout()
199
+ plt.show()
200
+
201
+ return results
@@ -1,30 +1,20 @@
1
1
  import os
2
2
  import multiprocessing
3
3
  import warnings
4
- from math import ceil
5
- from math import floor
4
+ from math import ceil, floor
6
5
  from pathlib import Path
7
- from typing import List
8
- from typing import Literal
9
- from typing import Optional
10
- from typing import Tuple
11
- from typing import Union
12
- from typing import cast
6
+ from typing import List, Literal, Optional, Tuple, Union, cast, Dict, Any
13
7
 
14
8
  import numpy as np
15
9
  import pandas as pd
16
- from matplotlib import pyplot as plt
17
10
  from scipy.linalg import solve # type: ignore
18
11
  from scipy.optimize import minimize # type: ignore
19
12
  from scipy.special import gammaln # type: ignore
20
13
  from scipy.special import polygamma # type: ignore
21
14
  from scipy.stats import norm # type: ignore
22
15
  from sklearn.linear_model import LinearRegression # type: ignore
23
- import matplotlib.pyplot as plt
24
- import seaborn as sns
25
16
 
26
17
  from deconveil.grid_search import grid_fit_beta
27
-
28
18
  from pydeseq2.utils import fit_alpha_mle
29
19
  from pydeseq2.utils import get_num_processes
30
20
  from pydeseq2.grid_search import grid_fit_alpha
@@ -209,7 +199,6 @@ def fit_rough_dispersions(
209
199
  return np.maximum(alpha_rde, 0)
210
200
 
211
201
 
212
-
213
202
  def fit_moments_dispersions2(
214
203
  normed_counts: np.ndarray, size_factors: np.ndarray
215
204
  ) -> np.ndarray:
@@ -470,6 +459,7 @@ def nbinomGLM(
470
459
  inv_hessian = np.linalg.inv(ddf(beta, 1))
471
460
 
472
461
  return beta, inv_hessian, converged
462
+
473
463
 
474
464
  def nbinomFn(
475
465
  beta: np.ndarray,
@@ -535,275 +525,149 @@ def nbinomFn(
535
525
  return prior - nll
536
526
 
537
527
 
528
+ def build_design_matrix(
529
+ metadata: pd.DataFrame,
530
+ design_factors: Union[str, List[str]] = "condition",
531
+ ref_level: Optional[List[str]] = None,
532
+ continuous_factors: Optional[List[str]] = None,
533
+ expanded: bool = False,
534
+ intercept: bool = True,
535
+ ) -> pd.DataFrame:
536
+ """Build design_matrix matrix for DEA.
538
537
 
539
- def process_results(file_path, method, lfc_cut = 1.0, pval_cut = 0.05):
540
- df = pd.read_csv(file_path, index_col=0)
541
- df['isDE'] = (np.abs(df['log2FoldChange']) >= lfc_cut) & (df['padj'] <= pval_cut)
542
- df['DEtype'] = np.where(
543
- ~df['isDE'],
544
- "n.s.",
545
- np.where(df['log2FoldChange'] > 0, "Up-reg", "Down-reg")
546
- )
547
- df['method'] = method
548
- return df[['log2FoldChange', 'padj', 'isDE', 'DEtype', 'method']]
549
-
538
+ Unless specified, the reference factor is chosen alphabetically.
550
539
 
551
- def define_gene_groups(res_joint):
552
- DSGs = res_joint[
553
- ((res_joint['DEtype_naive'] == "Up-reg") & (res_joint['DEtype_aware'] == "n.s.")) |
554
- ((res_joint['DEtype_naive'] == "Down-reg") & (res_joint['DEtype_aware'] == "n.s."))
555
- ].assign(gene_category='DSGs')
556
-
557
- DIGs = res_joint[
558
- ((res_joint['DEtype_naive'] == "Up-reg") & (res_joint['DEtype_aware'] == "Up-reg")) |
559
- ((res_joint['DEtype_naive'] == "Down-reg") & (res_joint['DEtype_aware'] == "Down-reg"))
560
- ].assign(gene_category='DIGs')
561
-
562
- DCGs = res_joint[
563
- ((res_joint['DEtype_naive'] == "n.s.") & (res_joint['DEtype_aware'] == "Up-reg")) |
564
- ((res_joint['DEtype_naive'] == "n.s.") & (res_joint['DEtype_aware'] == "Down-reg"))
565
- ].assign(gene_category='DCGs')
566
-
567
- non_DEGs = res_joint[
568
- (res_joint['DEtype_naive'] == "n.s.") & (res_joint['DEtype_aware'] == "n.s.")
569
- ].assign(gene_category='non-DEGs')
570
-
571
- return {
572
- "DSGs": DSGs,
573
- "DIGs": DIGs,
574
- "DCGs": DCGs,
575
- "non_DEGs": non_DEGs
576
- }
577
-
578
-
579
- def generate_volcano_plot(plot_data, lfc_cut=1.0, pval_cut=0.05, xlim=None, ylim=None):
580
- plot_data['gene_group'] = plot_data['gene_group'].astype('category')
581
-
582
- # Define gene group colors
583
- gene_group_colors = {
584
- "DIGs": "#8F3931FF",
585
- "DSGs": "#FFB977",
586
- "DCGs": "#FFC300"
587
- }
588
-
589
- # Create a FacetGrid for faceted plots
590
- g = sns.FacetGrid(
591
- plot_data,
592
- col="method",
593
- margin_titles=True,
594
- hue="gene_group",
595
- palette=gene_group_colors,
596
- sharey=False,
597
- sharex=True
598
- )
540
+ Parameters
541
+ ----------
542
+ metadata : pandas.DataFrame
543
+ DataFrame containing metadata information.
544
+ Must be indexed by sample barcodes.
599
545
 
600
-
601
- # Add points for "DIGs"
602
- g.map_dataframe(
603
- sns.scatterplot,
604
- x="log2FC",
605
- y="-log10(padj)",
606
- alpha=0.1,
607
- size=0.5,
608
- legend=False,
609
- data=plot_data[plot_data['gene_group'].isin(["DIGs"])]
610
- )
546
+ design_factors : str or list
547
+ Name of the columns of metadata to be used as design_matrix variables.
548
+ (default: ``"condition"``).
611
549
 
612
- # Add points for "DSGs" and "DCGs"
613
- g.map_dataframe(
614
- sns.scatterplot,
615
- x="log2FC",
616
- y="-log10(padj)",
617
- alpha=1.0,
618
- size=3.0,
619
- legend=False,
620
- data=plot_data[plot_data['gene_group'].isin(["DSGs", "DCGs"])]
621
- )
622
-
623
- # Add vertical and horizontal dashed lines
624
- for ax in g.axes.flat:
625
- ax.axvline(x=-lfc_cut, color="gray", linestyle="dashed")
626
- ax.axvline(x=lfc_cut, color="gray", linestyle="dashed")
627
- ax.axhline(y=-np.log10(pval_cut), color="gray", linestyle="dashed")
628
-
629
- if xlim:
630
- ax.set_xlim(xlim)
631
- if ylim:
632
- ax.set_ylim(ylim)
633
-
634
- # Set axis labels
635
- g.set_axis_labels("Log2 FC", "-Log10 P-value")
636
-
637
- # Add titles, legends, and customize
638
- g.add_legend(title="Gene category")
639
- g.set_titles(row_template="{row_name}", col_template="{col_name}")
640
- g.tight_layout()
641
-
642
- # Adjust font sizes for better readability
643
- for ax in g.axes.flat:
644
- ax.tick_params(axis='both', labelsize=12)
645
- ax.set_xlabel("Log2 FC", fontsize=14)
646
- ax.set_ylabel("-Log10 P-value", fontsize=14)
647
-
648
- # Save or display the plot
649
- plt.show()
550
+ ref_level : dict or None
551
+ An optional list of two strings of the form ``["factor", "ref_level"]``
552
+ specifying the factor of interest and the desired reference level, e.g.
553
+ ``["condition", "A"]``. (default: ``None``).
650
554
 
555
+ continuous_factors : list or None
556
+ An optional list of continuous (as opposed to categorical) factors. Any factor
557
+ not in ``continuous_factors`` will be considered categorical (default: ``None``).
651
558
 
652
- def plot_cnv_hist(cnv_mean, binwidth=0.2):
653
- """
654
- Plots a histogram of the CNV mean distribution.
559
+ expanded : bool
560
+ If true, use one column per category. Else, use n-1 columns, for each n-level
561
+ categorical factor.
562
+ (default: ``False``).
655
563
 
656
- Parameters:
657
- cnv_mean (pd.Series or list): The CNV mean values to plot.
658
- binwidth (float): The bin width for the histogram.
564
+ intercept : bool
565
+ If true, add an intercept (a column containing only ones). (default: ``True``).
566
+
567
+ Returns
568
+ -------
569
+ pandas.DataFrame
570
+ A DataFrame with experiment design information (to split cohorts).
571
+ Indexed by sample barcodes.
659
572
  """
660
- # Convert to a DataFrame if it's not already
661
- if isinstance(cnv_mean, list):
662
- cnv_mean = pd.DataFrame({'cnv_mean': cnv_mean})
663
- elif isinstance(cnv_mean, pd.Series):
664
- cnv_mean = cnv_mean.to_frame(name='cnv_mean')
665
-
666
- # Create the histogram plot
667
- plt.figure(figsize=(5, 5))
668
- sns.histplot(
669
- cnv_mean['cnv_mean'],
670
- bins=int((cnv_mean['cnv_mean'].max() - cnv_mean['cnv_mean'].min()) / binwidth),
671
- kde=False,
672
- color="#F39B7F",
673
- edgecolor="black",
674
- alpha=0.7
675
- )
573
+ if isinstance(
574
+ design_factors, str
575
+ ): # if there is a single factor, convert to singleton list
576
+ design_factors = [design_factors]
577
+
578
+ for factor in design_factors:
579
+ # Check that each factor has at least 2 levels
580
+ if len(np.unique(metadata[factor])) < 2:
581
+ raise ValueError(
582
+ f"Factors should take at least two values, but {factor} "
583
+ f"takes the single value '{np.unique(metadata[factor])}'."
584
+ )
676
585
 
677
- # Add labels and titles
678
- plt.title("", fontsize=14)
679
- plt.xlabel("CN state", fontsize=14, labelpad=8)
680
- plt.ylabel("Frequency", fontsize=14, labelpad=8)
586
+ # Check that level factors in the design don't contain underscores. If so, convert
587
+ # them to hyphens
588
+ warning_issued = False
589
+ for factor in design_factors:
590
+ if np.any(["_" in value for value in metadata[factor]]):
591
+ if not warning_issued:
592
+ warnings.warn(
593
+ """Some factor levels in the design contain underscores ('_').
594
+ They will be converted to hyphens ('-').""",
595
+ UserWarning,
596
+ stacklevel=2,
597
+ )
598
+ warning_issued = True
599
+ metadata[factor] = metadata[factor].apply(lambda x: x.replace("_", "-"))
681
600
 
682
- # Customize the appearance of axes
683
- plt.xticks(fontsize=12, color="black", rotation=45, ha="right")
684
- plt.yticks(fontsize=12, color="black")
685
- plt.gca().spines["top"].set_visible(False)
686
- plt.gca().spines["right"].set_visible(False)
687
- plt.gca().spines["left"].set_linewidth(1)
688
- plt.gca().spines["bottom"].set_linewidth(1)
601
+ if continuous_factors is not None:
602
+ categorical_factors = [
603
+ factor for factor in design_factors if factor not in continuous_factors
604
+ ]
605
+ else:
606
+ categorical_factors = design_factors
689
607
 
690
- # Add a grid
691
- plt.grid(visible=False)
608
+ # Check that there is at least one categorical factor
609
+ if len(categorical_factors) > 0:
610
+ design_matrix = pd.get_dummies(
611
+ metadata[categorical_factors], drop_first=not expanded
612
+ )
692
613
 
693
- # Show the plot
694
- plt.tight_layout()
695
- plt.show()
614
+ if ref_level is not None:
615
+ if len(ref_level) != 2:
616
+ raise KeyError("The reference level should contain 2 strings.")
617
+ if ref_level[1] not in metadata[ref_level[0]].values:
618
+ raise KeyError(
619
+ f"The metadata data should contain a '{ref_level[0]}' column"
620
+ f" with a '{ref_level[1]}' level."
621
+ )
696
622
 
623
+ # Check that the reference level is not in the matrix (if unexpanded design)
624
+ ref_level_name = "_".join(ref_level)
625
+ if (not expanded) and ref_level_name in design_matrix.columns:
626
+ # Remove the reference level and add one
627
+ factor_cols = [
628
+ col for col in design_matrix.columns if col.startswith(ref_level[0])
629
+ ]
630
+ missing_level = next(
631
+ level
632
+ for level in np.unique(metadata[ref_level[0]])
633
+ if f"{ref_level[0]}_{level}" not in design_matrix.columns
634
+ )
635
+ design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
636
+ factor_cols
637
+ ].sum(1)
638
+ design_matrix.drop(ref_level_name, axis="columns", inplace=True)
639
+
640
+ if not expanded:
641
+ # Add reference level as column name suffix
642
+ for factor in design_factors:
643
+ if ref_level is None or factor != ref_level[0]:
644
+ # The reference is the unique level that is no longer there
645
+ ref = next(
646
+ level
647
+ for level in np.unique(metadata[factor])
648
+ if f"{factor}_{level}" not in design_matrix.columns
649
+ )
650
+ else:
651
+ # The reference level is given as an argument
652
+ ref = ref_level[1]
653
+ design_matrix.columns = [
654
+ f"{col}_vs_{ref}" if col.startswith(factor) else col
655
+ for col in design_matrix.columns
656
+ ]
657
+ else:
658
+ # There is no categorical factor in the design
659
+ design_matrix = pd.DataFrame(index=metadata.index)
697
660
 
698
- def plot_stacked_bar(combined_data):
699
- """
700
- Creates a stacked bar plot of gene counts by CNV group for each tumor type.
701
-
702
- Parameters:
703
- - combined_data: DataFrame containing the data to plot.
704
- """
705
- # Define CNV colors inside the function
706
- cnv_colors = {
707
- "loss": "#0000FF",
708
- "neutral": "#808080",
709
- "gain": "#00FF00",
710
- "amplification": "#FF0000"
711
- }
712
-
713
- tumor_types = combined_data['tumor_type'].unique()
714
-
715
- # Create subplots for each tumor type
716
- fig, axes = plt.subplots(1, len(tumor_types), figsize=(5, 5), sharey=True)
717
-
718
- # If there's only one tumor type, axes will not be an array, so we convert it into a list
719
- if len(tumor_types) == 1:
720
- axes = [axes]
721
-
722
- for idx, tumor_type in enumerate(tumor_types):
723
- ax = axes[idx]
724
- tumor_data = combined_data[combined_data['tumor_type'] == tumor_type]
725
-
726
- # Create a table of counts for CNV group vs gene group
727
- counts = pd.crosstab(tumor_data['gene_group'], tumor_data['cnv_group'])
728
-
729
- # Plot stacked bars
730
- counts.plot(kind='bar', stacked=True, ax=ax, color=[cnv_colors[group] for group in counts.columns], width=0.6)
661
+ if intercept:
662
+ design_matrix.insert(0, "intercept", 1)
731
663
 
732
- ax.set_title(tumor_type, fontsize=16)
733
- ax.set_xlabel("")
734
- ax.set_ylabel("Gene Counts", fontsize=16)
735
-
736
- # Customize axis labels and tick marks
737
- ax.tick_params(axis='x', labelsize=16, labelcolor="black")
738
- ax.tick_params(axis='y', labelsize=16, labelcolor="black")
739
-
740
- # Overall settings for layout and labels
741
- plt.xticks(fontsize=12, color="black", rotation=45, ha="right")
742
- plt.tight_layout()
743
- plt.show()
744
-
664
+ # Convert categorical factors one-hot encodings to int
665
+ design_matrix = design_matrix.astype("int")
745
666
 
746
- def plot_percentage_bar(barplot_data):
747
- """
748
- Creates a bar plot showing the percentage of genes for each gene group across tumor types.
749
-
750
- Parameters:
751
- - barplot_data: DataFrame containing 'gene_group', 'percentage', and 'Count' columns.
752
- """
753
- # Define the gene group colors inside the function
754
- gene_group_colors = {
755
- "DIGs": "#8F3931FF",
756
- "DSGs": "#FFB977",
757
- "DCGs": "#FFC300"
758
- }
759
-
760
- tumor_types = barplot_data['tumor_type'].unique()
761
-
762
- plt.figure(figsize=(5, 5))
763
- sns.set(style="whitegrid")
667
+ # Add continuous factors
668
+ if continuous_factors is not None:
669
+ for factor in continuous_factors:
670
+ # This factor should be numeric
671
+ design_matrix[factor] = pd.to_numeric(metadata[factor])
672
+ return design_matrix
764
673
 
765
- # Create subplots for each tumor type
766
- fig, axes = plt.subplots(1, len(tumor_types), figsize=(5, 5), sharey=True)
767
-
768
- # If only one tumor type, ensure axes is a list
769
- if len(tumor_types) == 1:
770
- axes = [axes]
771
-
772
- for idx, tumor_type in enumerate(tumor_types):
773
- ax = axes[idx]
774
- tumor_data = barplot_data[barplot_data['tumor_type'] == tumor_type]
775
-
776
- # Plot the percentage bar plot
777
- sns.barplot(data=tumor_data, x="gene_group", y="percentage", hue="gene_group",
778
- palette=gene_group_colors, ax=ax, width=0.6)
779
-
780
- # Add counts and percentages as labels
781
- for p in ax.patches:
782
- height = p.get_height()
783
- gene_group = p.get_x() + p.get_width() / 2 # Get the x position of the patch (bar)
784
-
785
- # Find the gene_group in the data based on its position
786
- group_name = tumor_data.iloc[int(gene_group)]['gene_group']
787
- count = tumor_data.loc[tumor_data['gene_group'] == group_name, 'Count'].values[0]
788
- percentage = tumor_data.loc[tumor_data['gene_group'] == group_name, 'percentage'].values[0]
789
-
790
- # Position the labels slightly above the bars
791
- ax.text(p.get_x() + p.get_width() / 2, height + 0.5, f'{count} ({round(percentage, 1)}%)',
792
- ha='center', va='bottom', fontsize=12, color="black")
793
-
794
- ax.set_title(tumor_type, fontsize=16)
795
- ax.set_xlabel("")
796
- ax.set_ylabel("Percentage of Genes", fontsize=16)
797
-
798
- # Customize axis labels and tick marks
799
- ax.tick_params(axis='x', labelsize=16, labelcolor="black", rotation=45)
800
- ax.tick_params(axis='y', labelsize=16, labelcolor="black")
801
-
802
- # Explicitly set the x-tick labels with proper rotation and alignment
803
- for tick in ax.get_xticklabels():
804
- tick.set_horizontalalignment('right') # This ensures proper alignment for x-ticks
805
- tick.set_rotation(45)
806
-
807
- # Overall settings for layout and labels
808
- plt.tight_layout()
809
- plt.show()
@@ -0,0 +1,308 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+
7
+
8
def plot_volcano(plot_data, lfc_cut=1.0, pval_cut=0.05, xlim=None, ylim=None):
    """Faceted volcano plot (log2 FC vs -log10 adjusted p-value), one panel per method.

    Parameters
    ----------
    plot_data : pd.DataFrame
        Must contain 'log2FC', '-log10(padj)', 'gene_group' and 'method' columns.
    lfc_cut : float
        Absolute log2 fold-change threshold (vertical dashed lines).
    pval_cut : float
        Adjusted p-value threshold (horizontal dashed line at -log10(pval_cut)).
    xlim, ylim : tuple or None
        Optional axis limits applied to every facet.
    """
    # Work on a copy so the caller's frame is not mutated by the
    # category conversion below.
    plot_data = plot_data.copy()
    plot_data['gene_group'] = plot_data['gene_group'].astype('category')

    # Define gene group colors
    gene_group_colors = {
        "DIGs": "#8F3931FF",
        "DSGs": "#FFB977",
        "DCGs": "#FFC300"
    }

    # Create a FacetGrid for faceted plots
    g = sns.FacetGrid(
        plot_data,
        col="method",
        margin_titles=True,
        hue="gene_group",
        palette=gene_group_colors,
        sharey=False,
        sharex=True
    )

    # NOTE: FacetGrid.map_dataframe overrides any user-supplied `data=`
    # kwarg with the facet subset, so passing `data=plot_data[...]`
    # (as the original did) is silently ignored. Subset *inside* the
    # mapped function instead.
    def _scatter_group(data, group, zorder=1, **kwargs):
        subset = data[data['gene_group'] == group]
        sns.scatterplot(data=subset, x="log2FC", y="-log10(padj)",
                        legend=False, zorder=zorder, **kwargs)

    # Background layer (DIGs, faint), then DSGs, then DCGs forced to front.
    g.map_dataframe(_scatter_group, group="DIGs", alpha=0.2, s=3.0)
    g.map_dataframe(_scatter_group, group="DSGs", alpha=0.8, s=3.0)
    g.map_dataframe(_scatter_group, group="DCGs", alpha=1.0, s=3.0, zorder=5)

    # Threshold guide lines and optional shared limits
    for ax in g.axes.flat:
        ax.axvline(x=-lfc_cut, color="gray", linestyle="dashed")
        ax.axvline(x=lfc_cut, color="gray", linestyle="dashed")
        ax.axhline(y=-np.log10(pval_cut), color="gray", linestyle="dashed")

        if xlim:
            ax.set_xlim(xlim)
        if ylim:
            ax.set_ylim(ylim)

    # Labels and legend
    g.set_axis_labels("Log2 FC", "-Log10 P-value")
    g.add_legend(title="Gene category")
    g.set_titles(row_template="{row_name}", col_template="{col_name}")
    g.tight_layout()

    # Axis formatting
    for ax in g.axes.flat:
        ax.tick_params(axis='both', labelsize=12)
        ax.set_xlabel("Log2 FC", fontsize=14)
        ax.set_ylabel("-Log10 P-value", fontsize=14)

    # Display the plot
    plt.show()
88
+
89
+
90
def plot_cnv_hist(cnv_mean, binwidth=0.2, title="CNV Mean Distribution"):
    """
    Plots a histogram of the CNV mean distribution.

    Parameters:
        cnv_mean (pd.Series, list or pd.DataFrame): The CNV mean values to plot.
        binwidth (float): The bin width for the histogram.
        title (str): The title of the plot.
    """
    # Normalize input to a single-column DataFrame named 'cnv_mean'.
    if isinstance(cnv_mean, list):
        cnv_mean = pd.DataFrame({'cnv_mean': cnv_mean})
    elif isinstance(cnv_mean, pd.Series):
        cnv_mean = cnv_mean.to_frame(name='cnv_mean')

    values = cnv_mean['cnv_mean']
    # Guard against a non-positive bin count: a constant series, or a value
    # range smaller than binwidth, would make int(range / binwidth) == 0 and
    # crash histplot. Always request at least one bin.
    span = values.max() - values.min()
    n_bins = max(1, int(span / binwidth))

    # Create the histogram plot
    plt.figure(figsize=(5, 5))
    sns.histplot(
        values,
        bins=n_bins,
        kde=False,
        color="#F39B7F",
        edgecolor="black",
        alpha=0.7
    )

    # Add labels and titles
    plt.title(title, fontsize=14, pad=12)
    plt.xlabel("CN state", fontsize=14, labelpad=8)
    plt.ylabel("Frequency", fontsize=14, labelpad=8)

    # Customize the appearance of axes
    plt.xticks(fontsize=12, color="black", rotation=45, ha="right")
    plt.yticks(fontsize=12, color="black")
    plt.gca().spines["top"].set_visible(False)
    plt.gca().spines["right"].set_visible(False)
    plt.gca().spines["left"].set_linewidth(1)
    plt.gca().spines["bottom"].set_linewidth(1)

    # No grid
    plt.grid(visible=False)

    # Show the plot
    plt.tight_layout()
    plt.show()
135
+
136
+
137
def plot_stacked_bar(combined_data):
    """
    Creates a stacked bar plot of gene counts by CNV group for each tumor type.

    Parameters:
    - combined_data: DataFrame with 'tumor_type', 'gene_group' and
      'cnv_group' columns.
    """
    # Define CNV colors inside the function
    cnv_colors = {
        "loss": "dodgerblue",
        "neutral": "gray",
        "gain": "yellowgreen",
        "amplification": "coral"
    }

    tumor_types = combined_data['tumor_type'].unique()

    # Create subplots for each tumor type
    fig, axes = plt.subplots(1, len(tumor_types), figsize=(5, 5), sharey=True)

    # If there's only one tumor type, axes is a single Axes, not an array
    if len(tumor_types) == 1:
        axes = [axes]

    for idx, tumor_type in enumerate(tumor_types):
        ax = axes[idx]
        tumor_data = combined_data[combined_data['tumor_type'] == tumor_type]

        # Create a table of counts for CNV group vs gene group
        counts = pd.crosstab(tumor_data['gene_group'], tumor_data['cnv_group'])

        # .get() guards against CNV categories missing from the palette,
        # which would otherwise raise KeyError during plotting.
        bar_colors = [cnv_colors.get(group, "lightgray") for group in counts.columns]
        counts.plot(kind='bar', stacked=True, ax=ax, color=bar_colors, width=0.6)

        ax.set_title(tumor_type, fontsize=16)
        ax.set_xlabel("")
        ax.set_ylabel("Gene Counts", fontsize=16)

        # Rotate x ticks per axis: the original called plt.xticks() once
        # after the loop, which only affects the most recently active axes.
        ax.tick_params(axis='x', labelsize=16, labelcolor="black", rotation=45)
        ax.tick_params(axis='y', labelsize=16, labelcolor="black")
        for tick in ax.get_xticklabels():
            tick.set_horizontalalignment('right')

    # Overall settings for layout and labels
    plt.tight_layout()
    plt.show()
183
+
184
+
185
def plot_percentage_bar(barplot_data):
    """
    Creates a bar plot showing the percentage of genes for each gene group
    across tumor types.

    Parameters:
    - barplot_data: DataFrame with 'tumor_type', 'gene_group', 'percentage'
      and 'Count' columns.
    """
    # Define the gene group colors inside the function
    gene_group_colors = {
        "DIGs": "#8F3931FF",
        "DSGs": "#FFB977",
        "DCGs": "#FFC300"
    }

    tumor_types = barplot_data['tumor_type'].unique()

    sns.set(style="whitegrid")

    # One subplot per tumor type. (The original also opened a separate
    # plt.figure() before plt.subplots(), leaking an unused empty figure.)
    fig, axes = plt.subplots(1, len(tumor_types), figsize=(5, 5), sharey=True)

    # If only one tumor type, ensure axes is a list
    if len(tumor_types) == 1:
        axes = [axes]

    for idx, tumor_type in enumerate(tumor_types):
        ax = axes[idx]
        tumor_data = barplot_data[barplot_data['tumor_type'] == tumor_type]

        # Plot the percentage bar plot
        sns.barplot(data=tumor_data, x="gene_group", y="percentage", hue="gene_group",
                    palette=gene_group_colors, ax=ax, width=0.6)

        # Label bars in tick order: the i-th patch belongs to the i-th x
        # category, so pair them directly instead of deriving a positional
        # row index from the patch's x coordinate (the original's
        # tumor_data.iloc[int(bar_center)] breaks for non-unit category
        # spacing and unsorted data).
        categories = [t.get_text() for t in ax.get_xticklabels()]
        for p, group_name in zip(ax.patches, categories):
            sel = tumor_data['gene_group'] == group_name
            if not sel.any():
                continue
            count = tumor_data.loc[sel, 'Count'].values[0]
            percentage = tumor_data.loc[sel, 'percentage'].values[0]

            # Position the labels slightly above the bars
            ax.text(p.get_x() + p.get_width() / 2, p.get_height() + 0.5,
                    f'{count} ({round(percentage, 1)}%)',
                    ha='center', va='bottom', fontsize=12, color="black")

        ax.set_title(tumor_type, fontsize=16)
        ax.set_xlabel("")
        ax.set_ylabel("Percentage of Genes", fontsize=16)

        # Customize axis labels and tick marks
        ax.tick_params(axis='x', labelsize=16, labelcolor="black", rotation=45)
        ax.tick_params(axis='y', labelsize=16, labelcolor="black")

        # Explicitly set the x-tick labels with proper rotation and alignment
        for tick in ax.get_xticklabels():
            tick.set_horizontalalignment('right')
            tick.set_rotation(45)

    # Overall settings for layout and labels
    plt.tight_layout()
    plt.show()
249
+
250
+
251
def plot_pca_clusters(pca_coords, labels, explained_var, title="PCA Clustering"):
    """Scatter the first two principal components, colored by cluster.

    Parameters
    ----------
    pca_coords : DataFrame
        PCA coordinates (samples × PCs), from pca_cluster_cn().
    labels : pd.Series
        Cluster assignments (index must match pca_coords).
    explained_var : array-like
        Explained variance ratio for each PC; the first two entries are
        used to annotate the axes.
    title : str
        Plot title.
    """
    scatter_df = pca_coords.copy()
    scatter_df["cluster"] = labels.astype(str)

    plt.figure(figsize=(7, 6))
    sns.scatterplot(
        data=scatter_df,
        x="PC1",
        y="PC2",
        hue="cluster",
        palette="Set2",
        s=70,
        alpha=0.9,
        edgecolor="k",
    )

    # Annotate each axis with the percentage of variance it explains.
    plt.xlabel(f"PC1 ({explained_var[0] * 100:.1f}% variance)")
    plt.ylabel(f"PC2 ({explained_var[1] * 100:.1f}% variance)")

    plt.title(title, fontsize=14)
    plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.show()
285
+
286
+
287
def plot_consensus_matrix(consensus_matrix, labels, title="Consensus Matrix"):
    """Draw a heatmap of the consensus matrix, samples grouped by cluster.

    Parameters
    ----------
    consensus_matrix : pd.DataFrame
        Sample × Sample consensus values.
    labels : pd.Series
        Final cluster assignments (same sample index).
    title : str
        Plot title.
    """
    # Reorder rows and columns so members of each cluster sit together.
    order = labels.sort_values().index
    reordered = consensus_matrix.loc[order, order]

    plt.figure(figsize=(7, 6))
    sns.heatmap(reordered, cmap="viridis", square=True,
                cbar_kws={"label": "Consensus"})
    plt.title(title, fontsize=14)
    plt.xlabel("Samples")
    plt.ylabel("Samples")
    plt.tight_layout()
    plt.show()
@@ -0,0 +1,132 @@
1
+ import os
2
+ import warnings
3
+ from math import ceil, floor
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from typing import List, Literal, Optional, Dict, Any, cast
10
+
11
+
12
+
13
def filter_low_count_genes(
    df: pd.DataFrame,
    other_dfs: Optional[List[pd.DataFrame]] = None,
    min_count: int = 10,
    min_samples: Optional[int] = 3,
    min_frac: Optional[float] = None,
    return_mask: bool = False
) -> Dict[str, Any]:
    """
    Filter genes (columns) by expression thresholds.

    Parameters
    ----------
    df : pd.DataFrame
        Main dataframe (genes as columns, samples as rows).
    other_dfs : list of pd.DataFrame, optional
        Other dataframes with the same columns to filter in parallel.
    min_count : int, default=10
        Minimum expression/count threshold.
    min_samples : int, default=3
        Minimum number of samples meeting the threshold.
        If None and ``min_frac`` is also None, defaults to 1.
    min_frac : float, optional
        Fraction of samples that must meet the threshold.
        If provided, overrides min_samples.
    return_mask : bool, default=False
        If True, also return the boolean mask of kept genes.

    Returns
    -------
    result : dict
        {
            "filtered_df": pd.DataFrame,
            "other_filtered": list[pd.DataFrame] or None,
            "mask": pd.Series (if return_mask),
            "stats": dict with counts
        }
    """
    # compute required min_samples; min_frac takes precedence
    if min_frac is not None:
        min_samples = max(1, int(round(min_frac * df.shape[0])))
    elif min_samples is None:
        # Neither threshold given: require at least one qualifying sample
        # instead of crashing on a `>= None` comparison.
        min_samples = 1

    # gene-wise filter mask: gene kept if enough samples reach min_count
    mask = (df >= min_count).sum(axis=0) >= min_samples

    # apply mask to the main frame and any parallel frames
    filtered_df = df.loc[:, mask]
    filtered_others = [odf.loc[:, mask] for odf in other_dfs] if other_dfs else None

    # collect stats
    stats = {
        "n_total": df.shape[1],
        "n_kept": int(mask.sum()),
        "n_removed": int((~mask).sum()),
        "min_count": min_count,
        "min_samples": min_samples,
    }

    result = {
        "filtered_df": filtered_df,
        "other_filtered": filtered_others,
        "stats": stats,
    }
    if return_mask:
        result["mask"] = mask

    return result
79
+
80
+
81
def process_results(file_path, method, lfc_cut=1.0, pval_cut=0.05):
    """Load a differential-expression results table and annotate DE calls.

    Parameters
    ----------
    file_path : str, Path or pd.DataFrame
        Path to a CSV of results (genes in the index column), or an
        already-loaded results DataFrame (a copy is taken so the caller's
        frame is not mutated). Must contain 'log2FoldChange' and 'padj'.
    method : str
        Label identifying the method that produced the results.
    lfc_cut : float, default=1.0
        Minimum absolute log2 fold-change to call a gene DE.
    pval_cut : float, default=0.05
        Maximum adjusted p-value to call a gene DE.

    Returns
    -------
    pd.DataFrame
        Columns: 'log2FoldChange', 'padj', 'isDE', 'DEtype', 'method'.
    """
    # Generalization: accept an in-memory frame as well as a CSV path.
    if isinstance(file_path, pd.DataFrame):
        df = file_path.copy()
    else:
        df = pd.read_csv(file_path, index_col=0)

    df['isDE'] = (np.abs(df['log2FoldChange']) >= lfc_cut) & (df['padj'] <= pval_cut)
    # Genes failing either threshold are "n.s."; otherwise the sign of the
    # fold change decides up- vs down-regulation.
    df['DEtype'] = np.where(
        ~df['isDE'],
        "n.s.",
        np.where(df['log2FoldChange'] > 0, "Up-reg", "Down-reg")
    )
    df['method'] = method
    return df[['log2FoldChange', 'padj', 'isDE', 'DEtype', 'method']]
91
+
92
+
93
def define_gene_groups(res_joint):
    """Partition joint DE results into gene categories.

    Compares the CN-naive and CN-aware calls per gene:
      - DSGs: DE in the naive analysis only (aware says "n.s.").
      - DIGs: DE in both analyses with the *same* direction.
      - DCGs: DE in the aware analysis only (naive says "n.s.").
      - non_DEGs: not significant in either analysis.

    Genes with opposite directions in the two analyses fall in no group.

    Parameters
    ----------
    res_joint : pd.DataFrame
        Must contain 'DEtype_naive' and 'DEtype_aware' columns.

    Returns
    -------
    dict
        Keys 'DSGs', 'DIGs', 'DCGs', 'non_DEGs'; each value is the
        corresponding subset with a 'gene_category' column added.
    """
    naive = res_joint['DEtype_naive']
    aware = res_joint['DEtype_aware']
    de_calls = ("Up-reg", "Down-reg")

    masks = {
        "DSGs": naive.isin(de_calls) & (aware == "n.s."),
        "DIGs": ((naive == "Up-reg") & (aware == "Up-reg"))
                | ((naive == "Down-reg") & (aware == "Down-reg")),
        "DCGs": (naive == "n.s.") & aware.isin(de_calls),
        "non_DEGs": (naive == "n.s.") & (aware == "n.s."),
    }
    # Dict key "non_DEGs" intentionally differs from the category label
    # "non-DEGs" stored in the column (kept for backward compatibility).
    labels = {"DSGs": "DSGs", "DIGs": "DIGs", "DCGs": "DCGs", "non_DEGs": "non-DEGs"}

    return {
        key: res_joint[mask].assign(gene_category=labels[key])
        for key, mask in masks.items()
    }
119
+
120
+
121
def clean_gene_group(df, mode="naive"):
    """Rename mode-suffixed columns to canonical names and subset them.

    Parameters
    ----------
    df : pd.DataFrame
        Gene-group frame whose statistics columns carry a ``_{mode}`` suffix
        (e.g. 'logFC_naive'), plus a 'gene_category' column.
    mode : str, default="naive"
        Suffix selecting which analysis's columns to keep.

    Returns
    -------
    pd.DataFrame
        Columns: 'log2FC', 'padj', 'isDE', 'DEtype', 'method', 'gene_group'.
    """
    canonical = ["log2FC", "padj", "isDE", "DEtype", "method", "gene_group"]
    renamed = df.rename(columns={
        f"logFC_{mode}": "log2FC",
        f"padj_{mode}": "padj",
        f"isDE_{mode}": "isDE",
        f"DEtype_{mode}": "DEtype",
        f"method_{mode}": "method",
        "gene_category": "gene_group",
    })
    return renamed[canonical]
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -29,6 +29,7 @@ Dynamic: author
29
29
  Dynamic: author-email
30
30
  Dynamic: home-page
31
31
  Dynamic: license
32
+ Dynamic: license-file
32
33
  Dynamic: provides-extra
33
34
  Dynamic: requires-dist
34
35
  Dynamic: requires-python
@@ -0,0 +1,16 @@
1
+ deconveil/__init__.py,sha256=_6FL_AYiycv9nP3mKJiQ4zl4aU83YSWnV2YoIZr9Mv0,188
2
+ deconveil/__version__.py,sha256=K5SiDdEGYMpdqXThrqwTqECJJBOQNTQDrnpc2K5mzKs,21
3
+ deconveil/dds.py,sha256=0MNwtDzCjqjoJR-rrCmVu3JOaDd3gXuToOzTBXJMxak,49039
4
+ deconveil/default_inference.py,sha256=J40O0-qZChLnLrLGmhwxjaTVsV7REWAUQOTf8qSwWk0,9466
5
+ deconveil/ds.py,sha256=Vb9p152U1KXltrXFpMoBxY6YRW25dP4CO26_osbz6Aw,29476
6
+ deconveil/grid_search.py,sha256=iOHR8ur10MyrrfEZHr409lGulGxODufsjG6j7lQ7tWs,5181
7
+ deconveil/inference.py,sha256=B3zf3q_mbCTX3gHJwuXnTuy9uyXOxEjuWyaSR6VtVEo,10429
8
+ deconveil/utils_clustering.py,sha256=twspPvXQ6pvw_NaY1ebyvswuH3ZvVBGn7DeOpZ1XatI,5939
9
+ deconveil/utils_fit.py,sha256=SdGcBQjN3cyzbSFessufYOOOJAQCOjNcy3etbwmodsM,21583
10
+ deconveil/utils_plot.py,sha256=1JQthYXaEUKUWa0fy8owkyJ1CTkQxlrSRAqPkXMk7Us,9857
11
+ deconveil/utils_processing.py,sha256=CB99CwQst7eUiIgE58yl7_3E6uD9CgQoU_Qmprjyt-s,4141
12
+ deconveil-0.1.2.dist-info/licenses/LICENSE,sha256=BJ0f3JRteiF7tjiARi8syxiu4yKmckc0nWlHCKXttKQ,1078
13
+ deconveil-0.1.2.dist-info/METADATA,sha256=JqHZYXo0lLvPjoj_cDT-IwHADSKdESJQxorDbpsk3-k,1097
14
+ deconveil-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ deconveil-0.1.2.dist-info/top_level.txt,sha256=yAWZbw0eg8XpbMsswoq-VzBGfQHrfWOqNHnu2qQ2xO4,10
16
+ deconveil-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1 @@
1
+ deconveil
@@ -1,12 +0,0 @@
1
- DeConveil/__init__.py,sha256=_6FL_AYiycv9nP3mKJiQ4zl4aU83YSWnV2YoIZr9Mv0,188
2
- DeConveil/dds.py,sha256=gJDe6uoolAaS6v7C9DIRM_05LDuhlQO3u6WbMamMMbA,49126
3
- DeConveil/default_inference.py,sha256=Ym4mOq4yG7CbvtWF_HqibJrWQLTpUHRgsqGo2cKE5OA,9570
4
- DeConveil/ds.py,sha256=WUpSTCxMt1yeQf6zORmNUnz-whwyTaizmg0i_09v73c,29512
5
- DeConveil/grid_search.py,sha256=Gy9wLBe1YrJdimpTz4oFrTGv6pvhK3kVPa6FrI2vq8o,5189
6
- DeConveil/inference.py,sha256=fcvoL7KHuuLVxF8sz7QIT9A1o80sIRpOcIvyyF_rJGc,10480
7
- DeConveil/utils_CNaware.py,sha256=cRyPMNHW0JjXEonDHlRNAuzHrP9egJ2aERSRvp5ugGI,25182
8
- DeConveil-0.1.0.dist-info/LICENSE,sha256=BJ0f3JRteiF7tjiARi8syxiu4yKmckc0nWlHCKXttKQ,1078
9
- DeConveil-0.1.0.dist-info/METADATA,sha256=SNyxZkjZxPh2ZtIsAkGDlbgibe-AqGsPta6a10n65oc,1075
10
- DeConveil-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
11
- DeConveil-0.1.0.dist-info/top_level.txt,sha256=LI9aJqnkaO3AawK0hV1oHW0dd6n16Jv3uZyQdC2n0bg,10
12
- DeConveil-0.1.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- DeConveil
File without changes