sclab 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sclab might be problematic. Click here for more details.
- {sclab-0.3.1 → sclab-0.3.3}/PKG-INFO +5 -5
- {sclab-0.3.1 → sclab-0.3.3}/README.md +4 -4
- {sclab-0.3.1 → sclab-0.3.3}/pyproject.toml +1 -1
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/__init__.py +1 -1
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/_sclab.py +2 -1
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/_results_panel.py +26 -12
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_differential_expression.py +2 -1
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/__init__.py +6 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_cca.py +26 -4
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_cca_integrate.py +4 -4
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_normalize_weighted.py +5 -1
- sclab-0.3.3/src/sclab/preprocess/_pca.py +51 -0
- sclab-0.3.3/src/sclab/preprocess/_preprocess.py +155 -0
- sclab-0.3.3/src/sclab/preprocess/_qc.py +38 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_transfer_metadata.py +6 -5
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/pseudotime/_pseudotime.py +5 -1
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/differential_expression/__init__.py +2 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/differential_expression/_pseudobulk_edger.py +24 -21
- sclab-0.3.3/src/sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
- {sclab-0.3.1 → sclab-0.3.3}/LICENSE +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/_io.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/_methods_registry.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/_dataset.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/_exceptions.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/plotter/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/plotter/_controls.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/plotter/_plotter.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/plotter/_utils.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/_processor.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/step/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/step/_basic_processor_step.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/dataset/processor/step/_processor_step_base.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/event/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/event/_broker.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/event/_client.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/event/_utils.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_cluster.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_doublet_detection.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_gene_expression.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_integration.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_neighbors.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_pca.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_preprocess.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_qc.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/examples/processor_steps/_umap.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/gui/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/gui/components/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/gui/components/_guided_pseudotime.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/gui/components/_transfer_metadata.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/methods/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_filter_obs.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_harmony.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_harmony_integrate.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_subset.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_transform.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/preprocess/_utils.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/_compat.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/_settings.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/logging.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/plotting/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/plotting/_rcmod.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/plotting/palettes.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/scanpy/readwrite.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/density_dynamics/_density_dynamics.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/pseudotime/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/pseudotime/timeseries.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/utils/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/utils/density_nd.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/utils/interpolate.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/utils/smoothen.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/cellflow/utils/times.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/differential_expression/_pseudobulk_helpers.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/doublet_detection/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/doublet_detection/_scrublet.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/labeling/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/tools/labeling/sctype.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/utils/__init__.py +0 -0
- {sclab-0.3.1 → sclab-0.3.3}/src/sclab/utils/_write_excel.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sclab
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: sclab
|
|
5
5
|
Author-email: Argenis Arriojas <ArriojasMaldonado001@umb.edu>
|
|
6
6
|
Requires-Python: >=3.10,<3.13
|
|
@@ -65,7 +65,6 @@ Open a Jupyter Notebook and run the following:
|
|
|
65
65
|
```python
|
|
66
66
|
from IPython.display import display
|
|
67
67
|
from sclab import SCLabDashboard
|
|
68
|
-
from sclab.examples.processor_steps import QC, Preprocess, PCA, Neighbors, UMAP, Cluster
|
|
69
68
|
import scanpy as sc
|
|
70
69
|
|
|
71
70
|
# Load your data
|
|
@@ -73,8 +72,6 @@ adata = sc.read_10x_h5("your_data.h5")
|
|
|
73
72
|
|
|
74
73
|
# Create dashboard
|
|
75
74
|
dashboard = SCLabDashboard(adata, name="My Analysis")
|
|
76
|
-
# Add desired processing steps to the interface
|
|
77
|
-
dashboard.pr.add_steps({"Processing": [QC, Preprocess, PCA, Neighbors, UMAP, Cluster]})
|
|
78
75
|
|
|
79
76
|
# Display dashboard
|
|
80
77
|
display(dashboard)
|
|
@@ -84,8 +81,10 @@ display(dashboard)
|
|
|
84
81
|
# dashboard.pl # Plotter
|
|
85
82
|
# dashboard.pr # Processor
|
|
86
83
|
|
|
87
|
-
# the
|
|
84
|
+
# the active AnnData object is found within the dataset object:
|
|
88
85
|
# dashboard.ds.adata
|
|
86
|
+
|
|
87
|
+
# by default, the dashboard will update the loaded AnnData object in-place
|
|
89
88
|
```
|
|
90
89
|
|
|
91
90
|
## Components
|
|
@@ -94,6 +93,7 @@ display(dashboard)
|
|
|
94
93
|
|
|
95
94
|
The main interface that integrates all components with a tabbed layout:
|
|
96
95
|
- Main graph for visualizations
|
|
96
|
+
- Results panel
|
|
97
97
|
- Observations table
|
|
98
98
|
- Genes table
|
|
99
99
|
- Event logs
|
|
@@ -24,7 +24,6 @@ Open a Jupyter Notebook and run the following:
|
|
|
24
24
|
```python
|
|
25
25
|
from IPython.display import display
|
|
26
26
|
from sclab import SCLabDashboard
|
|
27
|
-
from sclab.examples.processor_steps import QC, Preprocess, PCA, Neighbors, UMAP, Cluster
|
|
28
27
|
import scanpy as sc
|
|
29
28
|
|
|
30
29
|
# Load your data
|
|
@@ -32,8 +31,6 @@ adata = sc.read_10x_h5("your_data.h5")
|
|
|
32
31
|
|
|
33
32
|
# Create dashboard
|
|
34
33
|
dashboard = SCLabDashboard(adata, name="My Analysis")
|
|
35
|
-
# Add desired processing steps to the interface
|
|
36
|
-
dashboard.pr.add_steps({"Processing": [QC, Preprocess, PCA, Neighbors, UMAP, Cluster]})
|
|
37
34
|
|
|
38
35
|
# Display dashboard
|
|
39
36
|
display(dashboard)
|
|
@@ -43,8 +40,10 @@ display(dashboard)
|
|
|
43
40
|
# dashboard.pl # Plotter
|
|
44
41
|
# dashboard.pr # Processor
|
|
45
42
|
|
|
46
|
-
# the
|
|
43
|
+
# the active AnnData object is found within the dataset object:
|
|
47
44
|
# dashboard.ds.adata
|
|
45
|
+
|
|
46
|
+
# by default, the dashboard will update the loaded AnnData object in-place
|
|
48
47
|
```
|
|
49
48
|
|
|
50
49
|
## Components
|
|
@@ -53,6 +52,7 @@ display(dashboard)
|
|
|
53
52
|
|
|
54
53
|
The main interface that integrates all components with a tabbed layout:
|
|
55
54
|
- Main graph for visualizations
|
|
55
|
+
- Results panel
|
|
56
56
|
- Observations table
|
|
57
57
|
- Genes table
|
|
58
58
|
- Event logs
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
import tempfile
|
|
5
4
|
|
|
6
5
|
from anndata import AnnData
|
|
7
6
|
from IPython.display import display
|
|
@@ -238,6 +237,8 @@ class DataLoader(VBox):
|
|
|
238
237
|
self.adata = adata
|
|
239
238
|
|
|
240
239
|
def on_upload(self, *args, **kwargs):
|
|
240
|
+
import tempfile
|
|
241
|
+
|
|
241
242
|
from .scanpy.readwrite import read_10x_h5, read_h5ad
|
|
242
243
|
|
|
243
244
|
files = self.upload.value
|
|
@@ -1,14 +1,24 @@
|
|
|
1
|
-
from ipywidgets import
|
|
1
|
+
from ipywidgets import Box, Dropdown, Layout, Stack, VBox, link
|
|
2
2
|
|
|
3
3
|
from sclab.event import EventBroker, EventClient
|
|
4
4
|
|
|
5
|
+
# Create a layout with a bottom border to act as the horizontal line
|
|
6
|
+
hr_layout = Layout(
|
|
7
|
+
border="1px solid black", # 1px width, solid style, black color
|
|
8
|
+
margin="10px 0", # Add margin for spacing above and below
|
|
9
|
+
width="100%", # Extend the line across the full width
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Create a Box widget with the styled layout
|
|
13
|
+
hr = Box(layout=hr_layout)
|
|
14
|
+
|
|
5
15
|
|
|
6
16
|
class _Results:
|
|
7
17
|
namespace: str
|
|
8
18
|
|
|
9
19
|
|
|
10
|
-
class ResultsPanel(GridBox, EventClient):
|
|
11
|
-
available_results:
|
|
20
|
+
class ResultsPanel(VBox, EventClient):
|
|
21
|
+
available_results: Dropdown
|
|
12
22
|
results_stack: Stack
|
|
13
23
|
|
|
14
24
|
events: list[str] = [
|
|
@@ -22,7 +32,7 @@ class ResultsPanel(GridBox, EventClient):
|
|
|
22
32
|
):
|
|
23
33
|
EventClient.__init__(self, broker)
|
|
24
34
|
|
|
25
|
-
self.available_results =
|
|
35
|
+
self.available_results = Dropdown(options={}, description="Category")
|
|
26
36
|
self.results_stack = Stack([])
|
|
27
37
|
|
|
28
38
|
link(
|
|
@@ -30,15 +40,19 @@ class ResultsPanel(GridBox, EventClient):
|
|
|
30
40
|
(self.results_stack, "selected_index"),
|
|
31
41
|
)
|
|
32
42
|
|
|
33
|
-
|
|
43
|
+
VBox.__init__(
|
|
34
44
|
self,
|
|
35
|
-
[
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
45
|
+
[
|
|
46
|
+
self.available_results,
|
|
47
|
+
hr,
|
|
48
|
+
self.results_stack,
|
|
49
|
+
],
|
|
50
|
+
# layout=Layout(
|
|
51
|
+
# width="100%",
|
|
52
|
+
# grid_template_columns="150px auto",
|
|
53
|
+
# grid_template_areas=""" "available-results selected-results_stack" """,
|
|
54
|
+
# border="0px solid black",
|
|
55
|
+
# ),
|
|
42
56
|
)
|
|
43
57
|
|
|
44
58
|
def add_result(self, results: _Results):
|
|
@@ -25,7 +25,7 @@ class DifferentialExpressionResults(VBox):
|
|
|
25
25
|
|
|
26
26
|
def __init__(self, dataset: SCLabDataset):
|
|
27
27
|
self.dataset = dataset
|
|
28
|
-
self.result_selector = Dropdown()
|
|
28
|
+
self.result_selector = Dropdown(description="Analysis Name")
|
|
29
29
|
self.group_selector = ToggleButtons()
|
|
30
30
|
self.table_output = Output()
|
|
31
31
|
|
|
@@ -198,6 +198,7 @@ class DifferentialExpression(ProcessorStepBase):
|
|
|
198
198
|
reference=reference,
|
|
199
199
|
layer=layer,
|
|
200
200
|
key_added=key_added,
|
|
201
|
+
pts=True,
|
|
201
202
|
)
|
|
202
203
|
|
|
203
204
|
self.results.sync_results_list(focus_result=key_added)
|
|
@@ -2,6 +2,9 @@ from ._cca_integrate import cca_integrate, cca_integrate_pair
|
|
|
2
2
|
from ._filter_obs import filter_obs
|
|
3
3
|
from ._harmony_integrate import harmony_integrate
|
|
4
4
|
from ._normalize_weighted import normalize_weighted
|
|
5
|
+
from ._pca import pca
|
|
6
|
+
from ._preprocess import preprocess
|
|
7
|
+
from ._qc import qc
|
|
5
8
|
from ._subset import subset_obs, subset_var
|
|
6
9
|
from ._transfer_metadata import transfer_metadata
|
|
7
10
|
from ._transform import pool_neighbors
|
|
@@ -12,7 +15,10 @@ __all__ = [
|
|
|
12
15
|
"filter_obs",
|
|
13
16
|
"harmony_integrate",
|
|
14
17
|
"normalize_weighted",
|
|
18
|
+
"pca",
|
|
15
19
|
"pool_neighbors",
|
|
20
|
+
"preprocess",
|
|
21
|
+
"qc",
|
|
16
22
|
"subset_obs",
|
|
17
23
|
"subset_var",
|
|
18
24
|
"transfer_metadata",
|
|
@@ -1,24 +1,31 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import os
|
|
2
3
|
from typing import Literal
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
6
|
+
from joblib import Parallel, delayed
|
|
5
7
|
from numpy import matrix
|
|
6
8
|
from numpy.typing import NDArray
|
|
7
9
|
from scipy.linalg import svd
|
|
8
10
|
from scipy.sparse import csc_matrix, csr_matrix, issparse
|
|
11
|
+
from scipy.sparse import vstack as sparse_vstack
|
|
9
12
|
from scipy.sparse.linalg import svds
|
|
10
13
|
from sklearn.utils.extmath import randomized_svd
|
|
11
14
|
|
|
12
15
|
logger = logging.getLogger(__name__)
|
|
13
16
|
|
|
14
17
|
|
|
18
|
+
N_CPUS = os.cpu_count()
|
|
19
|
+
|
|
20
|
+
|
|
15
21
|
def cca(
|
|
16
22
|
X: NDArray | csr_matrix | csc_matrix,
|
|
17
23
|
Y: NDArray | csr_matrix | csc_matrix,
|
|
18
24
|
n_components=None,
|
|
19
|
-
svd_solver: Literal["full", "partial", "randomized"] = "
|
|
25
|
+
svd_solver: Literal["full", "partial", "randomized"] = "randomized",
|
|
20
26
|
normalize: bool = False,
|
|
21
27
|
random_state=42,
|
|
28
|
+
n_jobs: int = N_CPUS,
|
|
22
29
|
) -> tuple[NDArray, NDArray, NDArray]:
|
|
23
30
|
"""
|
|
24
31
|
CCA-style integration for two single-cell matrices with unequal numbers of cells.
|
|
@@ -50,7 +57,7 @@ def cca(
|
|
|
50
57
|
k = n_components or min(n1, n2)
|
|
51
58
|
|
|
52
59
|
if issparse(X):
|
|
53
|
-
C = _cross_covariance_sparse(X, Y)
|
|
60
|
+
C = _cross_covariance_sparse(X, Y, n_jobs=n_jobs)
|
|
54
61
|
else:
|
|
55
62
|
C = _cross_covariance_dense(X, Y)
|
|
56
63
|
|
|
@@ -103,7 +110,7 @@ def _svd_decomposition(
|
|
|
103
110
|
return Uc, s, Vct
|
|
104
111
|
|
|
105
112
|
|
|
106
|
-
def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix) -> NDArray:
|
|
113
|
+
def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix, n_jobs=N_CPUS) -> NDArray:
|
|
107
114
|
_, p1 = X.shape
|
|
108
115
|
_, p2 = Y.shape
|
|
109
116
|
if p1 != p2:
|
|
@@ -118,7 +125,7 @@ def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix) -> NDArray:
|
|
|
118
125
|
mux: matrix = X.mean(axis=0)
|
|
119
126
|
muy: matrix = Y.mean(axis=0)
|
|
120
127
|
|
|
121
|
-
XYt: csr_matrix = X
|
|
128
|
+
XYt: csr_matrix = _spmm_parallel(X, Y.T, n_jobs=n_jobs)
|
|
122
129
|
Xmuyt: matrix = X.dot(muy.T)
|
|
123
130
|
muxYt: matrix = Y.dot(mux.T).T
|
|
124
131
|
muxmuyt: float = (mux @ muy.T)[0, 0]
|
|
@@ -152,3 +159,18 @@ def _dense_scale(A: NDArray) -> NDArray:
|
|
|
152
159
|
A = np.asarray(A)
|
|
153
160
|
eps = np.finfo(A.dtype).eps
|
|
154
161
|
return A / (A.std(axis=0, ddof=1, keepdims=True) + eps)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _spmm_chunk(A_csr, X, start, stop):
|
|
165
|
+
return A_csr[start:stop, :] @ X
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _spmm_parallel(A_csr: csr_matrix, X_csc: csc_matrix, n_jobs=N_CPUS):
|
|
169
|
+
n = A_csr.shape[0]
|
|
170
|
+
|
|
171
|
+
bounds = np.linspace(0, n, n_jobs + 1, dtype=int)
|
|
172
|
+
Ys = Parallel(n_jobs=n_jobs, prefer="processes")(
|
|
173
|
+
delayed(_spmm_chunk)(A_csr, X_csc, bounds[i], bounds[i + 1])
|
|
174
|
+
for i in range(n_jobs)
|
|
175
|
+
)
|
|
176
|
+
return sparse_vstack(Ys) # result is sparse if X is sparse, dense otherwise
|
|
@@ -13,8 +13,8 @@ def cca_integrate(
|
|
|
13
13
|
reference_batch: str | list[str] | None = None,
|
|
14
14
|
mask_var: str | None = None,
|
|
15
15
|
n_components: int = 30,
|
|
16
|
-
svd_solver: str = "
|
|
17
|
-
normalize: bool =
|
|
16
|
+
svd_solver: str = "randomized",
|
|
17
|
+
normalize: bool = True,
|
|
18
18
|
random_state: int | None = None,
|
|
19
19
|
):
|
|
20
20
|
n_groups = adata.obs[key].nunique()
|
|
@@ -46,8 +46,8 @@ def cca_integrate_pair(
|
|
|
46
46
|
adjusted_basis: str | None = None,
|
|
47
47
|
mask_var: str | None = None,
|
|
48
48
|
n_components: int = 30,
|
|
49
|
-
svd_solver: str = "
|
|
50
|
-
normalize: bool =
|
|
49
|
+
svd_solver: str = "randomized",
|
|
50
|
+
normalize: bool = True,
|
|
51
51
|
random_state: int | None = None,
|
|
52
52
|
):
|
|
53
53
|
if basis is None:
|
|
@@ -9,6 +9,7 @@ def normalize_weighted(
|
|
|
9
9
|
adata: AnnData,
|
|
10
10
|
target_scale: float | None = None,
|
|
11
11
|
batch_key: str | None = None,
|
|
12
|
+
q: float = 0.99,
|
|
12
13
|
) -> None:
|
|
13
14
|
if batch_key is not None:
|
|
14
15
|
for _, idx in adata.obs.groupby(batch_key, observed=True).groups.items():
|
|
@@ -22,6 +23,8 @@ def normalize_weighted(
|
|
|
22
23
|
|
|
23
24
|
return
|
|
24
25
|
|
|
26
|
+
target_scale = None
|
|
27
|
+
|
|
25
28
|
X: csr_matrix
|
|
26
29
|
Y: csr_matrix
|
|
27
30
|
Z: csr_matrix
|
|
@@ -38,6 +41,7 @@ def normalize_weighted(
|
|
|
38
41
|
Y.eliminate_zeros()
|
|
39
42
|
Y.data = -Y.data * np.log(Y.data)
|
|
40
43
|
entropy = Y.sum(axis=0)
|
|
44
|
+
entropy[:, entropy.A1 < np.quantile(entropy.A1, q)] *= 0.0
|
|
41
45
|
|
|
42
46
|
Z = X.multiply(entropy)
|
|
43
47
|
Z = Z.tocsr()
|
|
@@ -48,7 +52,7 @@ def normalize_weighted(
|
|
|
48
52
|
"ignore", category=RuntimeWarning, message="divide by zero"
|
|
49
53
|
)
|
|
50
54
|
scale = Z.sum(axis=1)
|
|
51
|
-
Z =
|
|
55
|
+
Z = X.multiply(1 / scale)
|
|
52
56
|
Z = Z.tocsr()
|
|
53
57
|
|
|
54
58
|
if target_scale is None:
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from anndata import AnnData
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def pca(
|
|
5
|
+
adata: AnnData,
|
|
6
|
+
layer: str | None = None,
|
|
7
|
+
n_comps: int = 30,
|
|
8
|
+
mask_var: str | None = None,
|
|
9
|
+
batch_key: str | None = None,
|
|
10
|
+
reference_batch: str | None = None,
|
|
11
|
+
zero_center: bool = False,
|
|
12
|
+
):
|
|
13
|
+
import scanpy as sc
|
|
14
|
+
|
|
15
|
+
pca_kwargs = dict(
|
|
16
|
+
n_comps=n_comps,
|
|
17
|
+
layer=layer,
|
|
18
|
+
mask_var=mask_var,
|
|
19
|
+
svd_solver="arpack",
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
if reference_batch:
|
|
23
|
+
obs_mask = adata.obs[batch_key] == reference_batch
|
|
24
|
+
adata_ref = adata[obs_mask].copy()
|
|
25
|
+
if mask_var == "highly_variable":
|
|
26
|
+
sc.pp.highly_variable_genes(
|
|
27
|
+
adata_ref, layer=f"{layer if layer else 'X'}_log1p", flavor="seurat"
|
|
28
|
+
)
|
|
29
|
+
hvg_seurat = adata_ref.var["highly_variable"]
|
|
30
|
+
sc.pp.highly_variable_genes(
|
|
31
|
+
adata_ref,
|
|
32
|
+
layer=layer,
|
|
33
|
+
flavor="seurat_v3_paper",
|
|
34
|
+
n_top_genes=hvg_seurat.sum(),
|
|
35
|
+
)
|
|
36
|
+
hvg_seurat_v3 = adata_ref.var["highly_variable"]
|
|
37
|
+
adata_ref.var["highly_variable"] = hvg_seurat | hvg_seurat_v3
|
|
38
|
+
|
|
39
|
+
sc.pp.pca(adata_ref, **pca_kwargs)
|
|
40
|
+
uns_pca = adata_ref.uns["pca"]
|
|
41
|
+
uns_pca["reference_batch"] = reference_batch
|
|
42
|
+
PCs = adata_ref.varm["PCs"]
|
|
43
|
+
adata.obsm["X_pca"] = adata.X.dot(PCs)
|
|
44
|
+
adata.uns["pca"] = uns_pca
|
|
45
|
+
adata.varm["PCs"] = PCs
|
|
46
|
+
else:
|
|
47
|
+
sc.pp.pca(adata, **pca_kwargs)
|
|
48
|
+
adata.obsm["X_pca"] = adata.X.dot(adata.varm["PCs"])
|
|
49
|
+
|
|
50
|
+
if zero_center:
|
|
51
|
+
adata.obsm["X_pca"] -= adata.obsm["X_pca"].mean(axis=0, keepdims=True)
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from anndata import AnnData, ImplicitModificationWarning
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preprocess(
|
|
10
|
+
adata: AnnData,
|
|
11
|
+
counts_layer: str = "counts",
|
|
12
|
+
group_by: str | None = None,
|
|
13
|
+
min_cells: int = 5,
|
|
14
|
+
min_genes: int = 5,
|
|
15
|
+
compute_hvg: bool = True,
|
|
16
|
+
regress_total_counts: bool = False,
|
|
17
|
+
regress_n_genes: bool = False,
|
|
18
|
+
normalization_method: Literal["library", "weighted", "none"] = "library",
|
|
19
|
+
target_scale: float = 1e4,
|
|
20
|
+
weighted_norm_quantile: float = 0.9,
|
|
21
|
+
log1p: bool = True,
|
|
22
|
+
scale: bool = True,
|
|
23
|
+
):
|
|
24
|
+
import scanpy as sc
|
|
25
|
+
|
|
26
|
+
from ._normalize_weighted import normalize_weighted
|
|
27
|
+
|
|
28
|
+
with tqdm(total=100, bar_format="{percentage:3.0f}%|{bar}|") as pbar:
|
|
29
|
+
if counts_layer not in adata.layers:
|
|
30
|
+
adata.layers[counts_layer] = adata.X.copy()
|
|
31
|
+
|
|
32
|
+
if f"{counts_layer}_log1p" not in adata.layers:
|
|
33
|
+
adata.layers[f"{counts_layer}_log1p"] = sc.pp.log1p(
|
|
34
|
+
adata.layers[counts_layer].copy()
|
|
35
|
+
)
|
|
36
|
+
pbar.update(10)
|
|
37
|
+
|
|
38
|
+
adata.X = adata.layers[counts_layer].copy()
|
|
39
|
+
sc.pp.calculate_qc_metrics(
|
|
40
|
+
adata,
|
|
41
|
+
percent_top=None,
|
|
42
|
+
log1p=False,
|
|
43
|
+
inplace=True,
|
|
44
|
+
)
|
|
45
|
+
sc.pp.filter_cells(adata, min_genes=min_genes)
|
|
46
|
+
sc.pp.filter_genes(adata, min_cells=min_cells)
|
|
47
|
+
pbar.update(10)
|
|
48
|
+
|
|
49
|
+
sc.pp.calculate_qc_metrics(
|
|
50
|
+
adata,
|
|
51
|
+
percent_top=None,
|
|
52
|
+
log1p=False,
|
|
53
|
+
inplace=True,
|
|
54
|
+
)
|
|
55
|
+
pbar.update(10)
|
|
56
|
+
|
|
57
|
+
if compute_hvg:
|
|
58
|
+
if group_by is not None:
|
|
59
|
+
adata.var["highly_variable"] = False
|
|
60
|
+
for name, idx in adata.obs.groupby(
|
|
61
|
+
group_by, observed=True
|
|
62
|
+
).groups.items():
|
|
63
|
+
hvg_seurat = sc.pp.highly_variable_genes(
|
|
64
|
+
adata[idx],
|
|
65
|
+
layer=f"{counts_layer}_log1p",
|
|
66
|
+
flavor="seurat",
|
|
67
|
+
inplace=False,
|
|
68
|
+
)["highly_variable"]
|
|
69
|
+
|
|
70
|
+
hvg_seurat_v3 = sc.pp.highly_variable_genes(
|
|
71
|
+
adata[idx],
|
|
72
|
+
layer=counts_layer,
|
|
73
|
+
flavor="seurat_v3_paper",
|
|
74
|
+
n_top_genes=hvg_seurat.sum(),
|
|
75
|
+
inplace=False,
|
|
76
|
+
)["highly_variable"]
|
|
77
|
+
|
|
78
|
+
adata.var[f"highly_variable_{name}"] = hvg_seurat | hvg_seurat_v3
|
|
79
|
+
adata.var["highly_variable"] |= adata.var[f"highly_variable_{name}"]
|
|
80
|
+
|
|
81
|
+
else:
|
|
82
|
+
sc.pp.highly_variable_genes(
|
|
83
|
+
adata, layer=f"{counts_layer}_log1p", flavor="seurat"
|
|
84
|
+
)
|
|
85
|
+
hvg_seurat = adata.var["highly_variable"]
|
|
86
|
+
|
|
87
|
+
sc.pp.highly_variable_genes(
|
|
88
|
+
adata,
|
|
89
|
+
layer=counts_layer,
|
|
90
|
+
flavor="seurat_v3_paper",
|
|
91
|
+
n_top_genes=hvg_seurat.sum(),
|
|
92
|
+
)
|
|
93
|
+
hvg_seurat_v3 = adata.var["highly_variable"]
|
|
94
|
+
|
|
95
|
+
adata.var["highly_variable"] = hvg_seurat | hvg_seurat_v3
|
|
96
|
+
|
|
97
|
+
pbar.update(10)
|
|
98
|
+
pbar.update(10)
|
|
99
|
+
|
|
100
|
+
new_layer = counts_layer
|
|
101
|
+
if normalization_method == "library":
|
|
102
|
+
new_layer += "_normt"
|
|
103
|
+
sc.pp.normalize_total(adata, target_sum=target_scale)
|
|
104
|
+
elif normalization_method == "weighted":
|
|
105
|
+
new_layer += "_normw"
|
|
106
|
+
normalize_weighted(
|
|
107
|
+
adata,
|
|
108
|
+
target_scale=target_scale,
|
|
109
|
+
batch_key=group_by,
|
|
110
|
+
q=weighted_norm_quantile,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
pbar.update(10)
|
|
114
|
+
pbar.update(10)
|
|
115
|
+
|
|
116
|
+
if log1p:
|
|
117
|
+
new_layer += "_log1p"
|
|
118
|
+
adata.uns.pop("log1p", None)
|
|
119
|
+
sc.pp.log1p(adata)
|
|
120
|
+
pbar.update(10)
|
|
121
|
+
|
|
122
|
+
vars_to_regress = []
|
|
123
|
+
if regress_n_genes:
|
|
124
|
+
vars_to_regress.append("n_genes_by_counts")
|
|
125
|
+
|
|
126
|
+
if regress_total_counts and log1p:
|
|
127
|
+
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])
|
|
128
|
+
vars_to_regress.append("log1p_total_counts")
|
|
129
|
+
elif regress_total_counts:
|
|
130
|
+
vars_to_regress.append("total_counts")
|
|
131
|
+
|
|
132
|
+
if vars_to_regress:
|
|
133
|
+
new_layer += "_regr"
|
|
134
|
+
sc.pp.regress_out(adata, keys=vars_to_regress, n_jobs=1)
|
|
135
|
+
pbar.update(10)
|
|
136
|
+
|
|
137
|
+
if scale:
|
|
138
|
+
new_layer += "_scale"
|
|
139
|
+
if group_by is not None:
|
|
140
|
+
for _, idx in adata.obs.groupby(group_by, observed=True).groups.items():
|
|
141
|
+
with warnings.catch_warnings():
|
|
142
|
+
warnings.filterwarnings(
|
|
143
|
+
"ignore",
|
|
144
|
+
category=ImplicitModificationWarning,
|
|
145
|
+
message="Modifying `X` on a view results in data being overridden",
|
|
146
|
+
)
|
|
147
|
+
adata[idx].X = sc.pp.scale(adata[idx].X, zero_center=False)
|
|
148
|
+
else:
|
|
149
|
+
sc.pp.scale(adata, zero_center=False)
|
|
150
|
+
|
|
151
|
+
adata.layers[new_layer] = adata.X.copy()
|
|
152
|
+
|
|
153
|
+
pbar.update(10)
|
|
154
|
+
|
|
155
|
+
adata.X = adata.X.astype(np.float32)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from anndata import AnnData
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def qc(
|
|
6
|
+
adata: AnnData,
|
|
7
|
+
counts_layer: str = "counts",
|
|
8
|
+
min_counts: int = 50,
|
|
9
|
+
min_genes: int = 5,
|
|
10
|
+
min_cells: int = 5,
|
|
11
|
+
max_rank: int = 0,
|
|
12
|
+
):
|
|
13
|
+
import scanpy as sc
|
|
14
|
+
|
|
15
|
+
if counts_layer not in adata.layers:
|
|
16
|
+
adata.layers[counts_layer] = adata.X.copy()
|
|
17
|
+
|
|
18
|
+
adata.layers["qc_tmp_current_X"] = adata.X
|
|
19
|
+
adata.X = adata.layers[counts_layer].copy()
|
|
20
|
+
rowsums = np.asarray(adata.X.sum(axis=1)).squeeze()
|
|
21
|
+
|
|
22
|
+
obs_idx = adata.obs_names[rowsums >= min_counts]
|
|
23
|
+
adata._inplace_subset_obs(obs_idx)
|
|
24
|
+
|
|
25
|
+
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
|
|
26
|
+
|
|
27
|
+
sc.pp.filter_cells(adata, min_genes=min_genes)
|
|
28
|
+
sc.pp.filter_genes(adata, min_cells=min_cells)
|
|
29
|
+
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
|
|
30
|
+
adata.obs["barcode_rank"] = adata.obs["total_counts"].rank(ascending=False)
|
|
31
|
+
|
|
32
|
+
# Restore original X
|
|
33
|
+
adata.X = adata.layers.pop("qc_tmp_current_X")
|
|
34
|
+
|
|
35
|
+
if max_rank > 0:
|
|
36
|
+
series = adata.obs["barcode_rank"]
|
|
37
|
+
index = series.loc[series < max_rank].index
|
|
38
|
+
adata._inplace_subset_obs(index)
|
|
@@ -23,18 +23,19 @@ def transfer_metadata(
|
|
|
23
23
|
min_neighs: int = 5,
|
|
24
24
|
weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
|
|
25
25
|
):
|
|
26
|
-
D: csr_matrix = adata.obsp["distances"]
|
|
27
|
-
C: csr_matrix = adata.obsp["connectivities"]
|
|
26
|
+
D: csr_matrix = adata.obsp["distances"].copy()
|
|
27
|
+
C: csr_matrix = adata.obsp["connectivities"].copy()
|
|
28
28
|
D = D.tocsr()
|
|
29
|
+
W: csr_matrix
|
|
29
30
|
|
|
30
31
|
match weight_by:
|
|
31
32
|
case "connectivity":
|
|
32
|
-
W = C.tocsr()
|
|
33
|
+
W = C.tocsr().copy()
|
|
33
34
|
case "distance":
|
|
34
|
-
W = D.tocsr()
|
|
35
|
+
W = D.tocsr().copy()
|
|
35
36
|
W.data = 1.0 / W.data
|
|
36
37
|
case "constant":
|
|
37
|
-
W = D.tocsr()
|
|
38
|
+
W = D.tocsr().copy()
|
|
38
39
|
W.data[:] = 1.0
|
|
39
40
|
case _:
|
|
40
41
|
raise ValueError(f"Unsupported weight_by {weight_by}")
|
|
@@ -280,6 +280,7 @@ def estimate_periodic_pseudotime_start(
|
|
|
280
280
|
time_key: str = "pseudotime",
|
|
281
281
|
bandwidth: float = 1 / 64,
|
|
282
282
|
show_plot: bool = False,
|
|
283
|
+
nth_root: int = 1,
|
|
283
284
|
):
|
|
284
285
|
# TODO: Test implementation
|
|
285
286
|
pseudotime = adata.obs[time_key].values.copy()
|
|
@@ -316,7 +317,10 @@ def estimate_periodic_pseudotime_start(
|
|
|
316
317
|
roots = (x[idx] + x[1:][idx]) / 2
|
|
317
318
|
heights = yp[idx]
|
|
318
319
|
|
|
319
|
-
|
|
320
|
+
roots = roots[heights.argsort()]
|
|
321
|
+
heights = heights[heights.argsort()]
|
|
322
|
+
|
|
323
|
+
max_peak_x = roots[nth_root - 1]
|
|
320
324
|
|
|
321
325
|
if show_plot:
|
|
322
326
|
plt.hist(
|
|
@@ -12,9 +12,9 @@ def pseudobulk_edger(
|
|
|
12
12
|
cell_identity_key: str | None = None,
|
|
13
13
|
batch_key: str | None = None,
|
|
14
14
|
layer: str | None = None,
|
|
15
|
-
replicas_per_group: int =
|
|
15
|
+
replicas_per_group: int = 5,
|
|
16
16
|
min_cells_per_group: int = 30,
|
|
17
|
-
bootstrap_sampling: bool =
|
|
17
|
+
bootstrap_sampling: bool = False,
|
|
18
18
|
use_cells: dict[str, list[str]] | None = None,
|
|
19
19
|
aggregate: bool = True,
|
|
20
20
|
verbosity: int = 0,
|
|
@@ -134,7 +134,7 @@ def pseudobulk_edger(
|
|
|
134
134
|
|
|
135
135
|
try:
|
|
136
136
|
R(f"""
|
|
137
|
-
outs <-
|
|
137
|
+
outs <- fit_edger_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
|
|
138
138
|
fit <- outs$fit
|
|
139
139
|
y <- outs$y
|
|
140
140
|
""")
|
|
@@ -214,33 +214,20 @@ suppressPackageStartupMessages({
|
|
|
214
214
|
library(MAST)
|
|
215
215
|
})
|
|
216
216
|
|
|
217
|
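-
fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){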
|
|
217
|
+
fit_edger_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
|
|
218
218
|
|
|
219
219
|
if (verbosity > 0){
|
|
220
220
|
cat("Group key:", group_key, "\n")
|
|
221
221
|
cat("Cell identity key:", cell_identity_key, "\n")
|
|
222
222
|
}
|
|
223
223
|
|
|
224
|
-
# create an edgeR object with counts and grouping factor
|
|
225
|
-
y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
|
|
226
|
-
# filter out genes with low counts
|
|
227
|
-
if (verbosity > 1){
|
|
228
|
-
cat("Dimensions before subsetting:", dim(y), "\n")
|
|
229
|
-
}
|
|
230
|
-
keep <- filterByExpr(y)
|
|
231
|
-
y <- y[keep, , keep.lib.sizes=FALSE]
|
|
232
|
-
if (verbosity > 1){
|
|
233
|
-
cat("Dimensions after subsetting:", dim(y), "\n")
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
# normalize
|
|
237
|
-
y <- calcNormFactors(y)
|
|
238
224
|
# create a vector that is concatentation of condition and cell type that we will later use with contrasts
|
|
239
225
|
if (cell_identity_key == "None"){
|
|
240
226
|
group <- colData(adata_)[[group_key]]
|
|
241
227
|
} else {
|
|
242
228
|
group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
|
|
243
229
|
}
|
|
230
|
+
|
|
244
231
|
if (verbosity > 1){
|
|
245
232
|
cat("Group(s):", group, "\n")
|
|
246
233
|
}
|
|
@@ -255,10 +242,28 @@ fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key =
|
|
|
255
242
|
design <- model.matrix(~ 0 + group + replica + batch)
|
|
256
243
|
}
|
|
257
244
|
|
|
245
|
+
# create an edgeR object with counts and grouping factor
|
|
246
|
+
y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
|
|
247
|
+
|
|
248
|
+
# filter out genes with low counts
|
|
249
|
+
if (verbosity > 1){
|
|
250
|
+
cat("Dimensions before subsetting:", dim(y), "\n")
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
keep <- filterByExpr(y)
|
|
254
|
+
y <- y[keep, , keep.lib.sizes=FALSE]
|
|
255
|
+
if (verbosity > 1){
|
|
256
|
+
cat("Dimensions after subsetting:", dim(y), "\n")
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
# normalize
|
|
260
|
+
y <- calcNormFactors(y)
|
|
261
|
+
|
|
258
262
|
# estimate dispersion
|
|
259
263
|
y <- estimateDisp(y, design = design)
|
|
260
264
|
# fit the model
|
|
261
265
|
fit <- glmQLFit(y, design)
|
|
266
|
+
|
|
262
267
|
return(list("fit"=fit, "design"=design, "y"=y))
|
|
263
268
|
}
|
|
264
269
|
"""
|
|
@@ -282,9 +287,7 @@ def _try_imports():
|
|
|
282
287
|
except ModuleNotFoundError:
|
|
283
288
|
message = (
|
|
284
289
|
"edger_pseudobulk requires rpy2 and anndata2ri to be installed.\n"
|
|
285
|
-
"
|
|
286
|
-
"$ pip install rpy2 sclab-tools[r]\n"
|
|
287
|
-
"or\n"
|
|
290
|
+
"please install with one of the following:\n"
|
|
288
291
|
"$ pip install rpy2 anndata2ri\n"
|
|
289
292
|
"or\n"
|
|
290
293
|
"$ conda install -c conda-forge rpy2 anndata2ri\n"
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from anndata import AnnData
|
|
3
|
+
|
|
4
|
+
from ._pseudobulk_helpers import aggregate_and_filter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def pseudobulk_limma(
    adata_: AnnData,
    group_key: str,
    condition_group: str | list[str] | None = None,
    reference_group: str | None = None,
    cell_identity_key: str | None = None,
    batch_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 5,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = False,
    use_cells: dict[str, list[str]] | None = None,
    aggregate: bool = True,
    verbosity: int = 0,
) -> dict[str, pd.DataFrame]:
    """Run a pseudobulk limma-voom differential expression analysis via rpy2.

    The (optionally aggregated) data is handed to R, the ``fit_limma_model``
    R function defined in ``_fit_model_r_script`` is fitted once per condition
    group, and one ``topTable`` result is collected per contrast.

    Parameters
    ----------
    adata_
        Input data. When ``aggregate`` is true it is passed to
        ``aggregate_and_filter``; otherwise a copy is used as-is.
    group_key
        Column of ``.obs`` holding the condition labels.
    condition_group
        Condition(s) to test. ``None`` tests every unique value of
        ``aggr_adata.obs[group_key]``; a string tests that single condition.
    reference_group
        When given, every condition is contrasted against this group (the
        reference itself is skipped). When ``None``, each condition is
        contrasted against the ``not``-prefixed level of the
        ``f"{group_key}_{condition}"`` column -- presumably created by
        ``aggregate_and_filter``; TODO confirm.
    cell_identity_key
        Optional ``.obs`` column; when given, one contrast is computed per
        unique cell identity.
    batch_key
        Optional ``.obs`` column forwarded to the R function, which adds it to
        the design matrix as a ``batch`` factor.
    layer, replicas_per_group, min_cells_per_group, bootstrap_sampling, use_cells
        Forwarded unchanged to ``aggregate_and_filter``.
    aggregate
        Whether to call ``aggregate_and_filter`` before fitting.
    verbosity
        Values above 0 print progress; the value is also passed to R.

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps ``"{prefix}{cnd}_vs_{ref}"`` (prefixed with
        ``"{cell_identity_key}:{cid}|"`` when a cell identity is used) to the
        limma ``topTable`` frame, extended with ``pct_expr_*``, ``num_expr_*``,
        ``tot_expr_*`` and ``mean_*`` columns read from ``aggr_adata.var``.

    Notes
    -----
    A condition whose model fit raises ``RRuntimeError`` is reported with
    ``print`` and skipped; it produces no entries in the result.
    """
    _try_imports()
    import anndata2ri  # noqa: F401
    import rpy2.robjects as robjects
    from rpy2.rinterface_lib.embedded import RRuntimeError  # noqa: F401
    from rpy2.robjects import pandas2ri  # noqa: F401
    from rpy2.robjects.conversion import localconverter  # noqa: F401

    R = robjects.r

    if aggregate:
        aggr_adata = aggregate_and_filter(
            adata_,
            group_key,
            cell_identity_key,
            layer,
            replicas_per_group,
            min_cells_per_group,
            bootstrap_sampling,
            use_cells,
        )
    else:
        aggr_adata = adata_.copy()

    with localconverter(anndata2ri.converter):
        R.assign("aggr_adata", aggr_adata)

    # defines the R function for fitting the model with limma
    R(_fit_model_r_script)

    if condition_group is None:
        condition_group_list = aggr_adata.obs[group_key].unique()
    elif isinstance(condition_group, str):
        condition_group_list = [condition_group]
    else:
        condition_group_list = condition_group

    if cell_identity_key is not None:
        cids = aggr_adata.obs[cell_identity_key].unique()
    else:
        # a single empty identity keeps the contrast loops below uniform
        cids = [""]

    tt_dict = {}
    for current_group in condition_group_list:
        if reference_group is not None and current_group == reference_group:
            continue

        if verbosity > 0:
            print(f"Fitting model for {current_group}...")

        if reference_group is not None:
            gk = group_key
        else:
            gk = f"{group_key}_{current_group}"

        try:
            # A Python ``None`` is interpolated as the string "None", which is
            # the sentinel the R function checks for. ``batch_key`` is passed
            # positionally, matching the R signature
            # (adata_, group_key, cell_identity_key, batch_key, verbosity).
            R(f"""
            outs <- fit_limma_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
            fit <- outs$fit
            v <- outs$v
            """)

        except RRuntimeError as e:
            print("Error fitting model for", current_group)
            print("Error:", e)
            print("Skipping...", flush=True)
            continue

        if reference_group is None:
            new_contrasts_tuples = [
                (
                    current_group,  # common prefix
                    "",  # condition group
                    "not",  # reference group
                    cid,  # cell identity
                )
                for cid in cids
            ]

        else:
            new_contrasts_tuples = [
                (
                    "",  # common prefix
                    current_group,  # condition group
                    reference_group,  # reference group
                    cid,  # cell identity
                )
                for cid in cids
            ]

        # "group" is the factor name used in the R design formula, so the
        # design columns are named "group<level>".
        new_contrasts = [
            f"group{cnd}{prefix}_{cid}".strip("_")
            + "-"
            + f"group{ref}{prefix}_{cid}".strip("_")
            for prefix, cnd, ref, cid in new_contrasts_tuples
        ]

        for contrast, contrast_tuple in zip(new_contrasts, new_contrasts_tuples):
            prefix, cnd, ref, cid = contrast_tuple

            if ref == "not":
                cnd, ref = "", "rest"

            contrast_key = f"{prefix}{cnd}_vs_{ref}"
            if cid:
                contrast_key = f"{cell_identity_key}:{cid}|{contrast_key}"

            if verbosity > 0:
                print(f"Computing contrast: {contrast_key}... ({contrast})")

            R(f"myContrast <- makeContrasts('{contrast}', levels = v$design)")
            R("fit2 <- contrasts.fit(fit, myContrast)")
            R("fit2 <- eBayes(fit2)")
            R("tt <- topTable(fit2, n = Inf)")
            tt: pd.DataFrame = pandas2ri.rpy2py(R("tt"))
            tt.index.name = "gene_ids"

            genes = tt.index
            # drop the leading "group" (5 characters) from each contrast side
            # to recover the level names used in the aggr_adata.var columns
            cnd, ref = [c[5:] for c in contrast.split("-")]
            tt["pct_expr_cnd"] = aggr_adata.var[f"pct_expr_{cnd}"].loc[genes]
            tt["pct_expr_ref"] = aggr_adata.var[f"pct_expr_{ref}"].loc[genes]
            tt["num_expr_cnd"] = aggr_adata.var[f"num_expr_{cnd}"].loc[genes]
            tt["num_expr_ref"] = aggr_adata.var[f"num_expr_{ref}"].loc[genes]
            tt["tot_expr_cnd"] = aggr_adata.var[f"tot_expr_{cnd}"].loc[genes]
            tt["tot_expr_ref"] = aggr_adata.var[f"tot_expr_{ref}"].loc[genes]
            tt["mean_cnd"] = tt["tot_expr_cnd"] / tt["num_expr_cnd"]
            tt["mean_ref"] = tt["tot_expr_ref"] / tt["num_expr_ref"]
            tt_dict[contrast_key] = tt

    return tt_dict
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
_fit_model_r_script = """
|
|
154
|
+
suppressPackageStartupMessages({
|
|
155
|
+
library(edgeR)
|
|
156
|
+
library(limma)
|
|
157
|
+
library(MAST)
|
|
158
|
+
})
|
|
159
|
+
|
|
160
|
+
fit_limma_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
|
|
161
|
+
|
|
162
|
+
if (verbosity > 0){
|
|
163
|
+
cat("Group key:", group_key, "\n")
|
|
164
|
+
cat("Cell identity key:", cell_identity_key, "\n")
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
# create a vector that is concatentation of condition and cell type that we will later use with contrasts
|
|
168
|
+
if (cell_identity_key == "None"){
|
|
169
|
+
group <- colData(adata_)[[group_key]]
|
|
170
|
+
} else {
|
|
171
|
+
group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
if (verbosity > 1){
|
|
175
|
+
cat("Group(s):", group, "\n")
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
group <- factor(group)
|
|
179
|
+
replica <- factor(colData(adata_)$replica)
|
|
180
|
+
|
|
181
|
+
# create a design matrix
|
|
182
|
+
if (batch_key == "None"){
|
|
183
|
+
design <- model.matrix(~ 0 + group + replica)
|
|
184
|
+
} else {
|
|
185
|
+
batch <- factor(colData(adata_)[[batch_key]])
|
|
186
|
+
design <- model.matrix(~ 0 + group + replica + batch)
|
|
187
|
+
}
|
|
188
|
+
colnames(design) <- make.names(colnames(design))
|
|
189
|
+
|
|
190
|
+
# create an edgeR object with counts and grouping factor
|
|
191
|
+
y <- DGEList(assay(adata_, "X"), group = group)
|
|
192
|
+
|
|
193
|
+
# filter out genes with low counts
|
|
194
|
+
if (verbosity > 1){
|
|
195
|
+
cat("Dimensions before subsetting:", dim(y), "\n")
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
keep <- filterByExpr(y, design = design)
|
|
199
|
+
y <- y[keep, , keep.lib.sizes=FALSE]
|
|
200
|
+
if (verbosity > 1){
|
|
201
|
+
cat("Dimensions after subsetting:", dim(y), "\n")
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
# normalize
|
|
205
|
+
y <- calcNormFactors(y)
|
|
206
|
+
|
|
207
|
+
# Apply voom transformation to prepare for linear modeling
|
|
208
|
+
v <- voom(y, design, plot = verbosity > 1)
|
|
209
|
+
|
|
210
|
+
# Fit the linear model
|
|
211
|
+
fit <- lmFit(v, design)
|
|
212
|
+
ne <- limma::nonEstimable(design)
|
|
213
|
+
if (!is.null(ne) && verbosity > 0) cat("Non-estimable:", ne, "\n")
|
|
214
|
+
fit <- eBayes(fit)
|
|
215
|
+
|
|
216
|
+
return(list("fit"=fit, "design"=design, "v"=v))
|
|
217
|
+
}
|
|
218
|
+
"""
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _try_imports():
|
|
222
|
+
try:
|
|
223
|
+
import rpy2.robjects as robjects
|
|
224
|
+
from rpy2.robjects.packages import PackageNotInstalledError, importr
|
|
225
|
+
|
|
226
|
+
robjects.r("options(warn=-1)")
|
|
227
|
+
import anndata2ri # noqa: F401
|
|
228
|
+
from rpy2.rinterface_lib.embedded import RRuntimeError # noqa: F401
|
|
229
|
+
from rpy2.robjects import numpy2ri, pandas2ri # noqa: F401
|
|
230
|
+
from rpy2.robjects.conversion import localconverter # noqa: F401
|
|
231
|
+
|
|
232
|
+
importr("edgeR")
|
|
233
|
+
importr("limma")
|
|
234
|
+
importr("MAST")
|
|
235
|
+
importr("SingleCellExperiment")
|
|
236
|
+
|
|
237
|
+
except ModuleNotFoundError:
|
|
238
|
+
message = (
|
|
239
|
+
"pseudobulk_limma requires rpy2 and anndata2ri to be installed.\n"
|
|
240
|
+
"please install with one of the following:\n"
|
|
241
|
+
"$ pip install rpy2 anndata2ri\n"
|
|
242
|
+
"or\n"
|
|
243
|
+
"$ conda install -c conda-forge rpy2 anndata2ri\n"
|
|
244
|
+
)
|
|
245
|
+
print(message)
|
|
246
|
+
raise ModuleNotFoundError(message)
|
|
247
|
+
|
|
248
|
+
except PackageNotInstalledError:
|
|
249
|
+
message = (
|
|
250
|
+
"pseudobulk_limma requires the following R packages to be installed: limma, edgeR, MAST, and SingleCellExperiment.\n"
|
|
251
|
+
"> \n"
|
|
252
|
+
"> if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager');\n"
|
|
253
|
+
"> BiocManager::install(c('limma', 'edgeR', 'MAST', 'SingleCellExperiment'));\n"
|
|
254
|
+
"> \n"
|
|
255
|
+
)
|
|
256
|
+
print(message)
|
|
257
|
+
raise ImportError(message)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|