scez 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scez-0.2.1/.devcontainer/Dockerfile +22 -0
- scez-0.2.1/.devcontainer/devcontainer.json +12 -0
- scez-0.2.1/.github/dependabot.yml +12 -0
- scez-0.2.1/.github/workflows/main.yml +54 -0
- scez-0.2.1/.github/workflows/python-publish.yml +62 -0
- scez-0.2.1/.gitignore +10 -0
- scez-0.2.1/LICENSE +21 -0
- scez-0.2.1/PKG-INFO +45 -0
- scez-0.2.1/README.md +30 -0
- scez-0.2.1/environment.yml +25 -0
- scez-0.2.1/pyproject.toml +42 -0
- scez-0.2.1/scez/__init__.py +38 -0
- scez-0.2.1/scez/diffexp.py +200 -0
- scez-0.2.1/scez/preprocess.py +74 -0
- scez-0.2.1/scez/representation.py +44 -0
- scez-0.2.1/scez/tests/__init__.py +0 -0
- scez-0.2.1/scez/tests/test_scez.py +19 -0
- scez-0.2.1/scez/utils.py +64 -0
- scez-0.2.1/setup-miniconda-patched-environment.yml +26 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
FROM continuumio/miniconda3
|
|
2
|
+
|
|
3
|
+
# Install system dependencies
|
|
4
|
+
USER root
|
|
5
|
+
RUN apt-get update \
|
|
6
|
+
&& apt-get install -y build-essential git tree curl sudo vim wget \
|
|
7
|
+
&& apt-get clean \
|
|
8
|
+
&& apt-get purge
|
|
9
|
+
|
|
10
|
+
# Install conda env
|
|
11
|
+
COPY environment.yml /tmp/env.yaml
|
|
12
|
+
|
|
13
|
+
# Configure the conda channels (conda-forge and bioconda)
|
|
14
|
+
RUN conda config --add channels conda-forge \
|
|
15
|
+
&& conda config --add channels bioconda \
|
|
16
|
+
&& conda config --set channel_priority false
|
|
17
|
+
|
|
18
|
+
# Install mamba in the base environment, then create the dev environment with it
|
|
19
|
+
RUN conda install -y -n base -c conda-forge mamba \
|
|
20
|
+
&& mamba env create -n dev -f /tmp/env.yaml \
|
|
21
|
+
&& conda clean --all --yes \
|
|
22
|
+
&& rm -f /tmp/env.yaml
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# To get started with Dependabot version updates, you'll need to specify which
|
|
2
|
+
# package ecosystems to update and where the package manifests are located.
|
|
3
|
+
# Please see the documentation for more information:
|
|
4
|
+
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
|
5
|
+
# https://containers.dev/guide/dependabot
|
|
6
|
+
|
|
7
|
+
version: 2
|
|
8
|
+
updates:
|
|
9
|
+
- package-ecosystem: "devcontainers"
|
|
10
|
+
directory: "/"
|
|
11
|
+
schedule:
|
|
12
|
+
interval: weekly
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
name: Python package
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ "main" ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ "main" ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
|
|
12
|
+
runs-on: ${{ matrix.os-version }}
|
|
13
|
+
name: ${{ matrix.os-version }} (${{ matrix.python-version }})
|
|
14
|
+
|
|
15
|
+
strategy:
|
|
16
|
+
fail-fast: false
|
|
17
|
+
max-parallel: 5
|
|
18
|
+
matrix:
|
|
19
|
+
os-version: ["ubuntu-latest"]
|
|
20
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v3
|
|
24
|
+
- name: "Set up Python ${{ matrix.python-version }}"
|
|
25
|
+
uses: actions/setup-python@v3
|
|
26
|
+
with:
|
|
27
|
+
python-version: ${{ matrix.python-version }}
|
|
28
|
+
- name: "Install flake8"
|
|
29
|
+
run: |
|
|
30
|
+
pip install flake8 tomli
|
|
31
|
+
- name: "Lint with flake8"
|
|
32
|
+
run: |
|
|
33
|
+
# stop the build if there are Python syntax errors or undefined names
|
|
34
|
+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
|
35
|
+
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
|
36
|
+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
|
37
|
+
- name: "Install miniconda"
|
|
38
|
+
uses: conda-incubator/setup-miniconda@v3
|
|
39
|
+
with:
|
|
40
|
+
miniconda-version: "latest"
|
|
41
|
+
auto-update-conda: true
|
|
42
|
+
python-version: ${{ matrix.python-version }}
|
|
43
|
+
channels: conda-forge,bioconda
|
|
44
|
+
environment-file: environment.yml
|
|
45
|
+
- name: "Install pytest"
|
|
46
|
+
shell: bash -l {0}
|
|
47
|
+
run: |
|
|
48
|
+
python -m pip install --upgrade pip
|
|
49
|
+
pip install uv
|
|
50
|
+
uv pip install --system build pytest tomli
|
|
51
|
+
- name: "Test with pytest"
|
|
52
|
+
shell: bash -l {0}
|
|
53
|
+
run: |
|
|
54
|
+
pytest -s
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
name: Publish PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
|
|
11
|
+
runs-on: ${{ matrix.os-version }}
|
|
12
|
+
name: ${{ matrix.os-version }} (${{ matrix.python-version }})
|
|
13
|
+
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
max-parallel: 5
|
|
17
|
+
matrix:
|
|
18
|
+
os-version: ["ubuntu-latest"]
|
|
19
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- uses: actions/checkout@v3
|
|
23
|
+
- name: "Set up Python ${{ matrix.python-version }}"
|
|
24
|
+
uses: actions/setup-python@v3
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
- name: "Install flake8"
|
|
28
|
+
run: |
|
|
29
|
+
pip install flake8 tomli
|
|
30
|
+
- name: "Lint with flake8"
|
|
31
|
+
run: |
|
|
32
|
+
# stop the build if there are Python syntax errors or undefined names
|
|
33
|
+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
|
34
|
+
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
|
35
|
+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
|
36
|
+
- name: "Install miniconda"
|
|
37
|
+
uses: conda-incubator/setup-miniconda@v3
|
|
38
|
+
with:
|
|
39
|
+
miniconda-version: "latest"
|
|
40
|
+
auto-update-conda: true
|
|
41
|
+
python-version: ${{ matrix.python-version }}
|
|
42
|
+
channels: conda-forge,bioconda
|
|
43
|
+
environment-file: environment.yml
|
|
44
|
+
- name: "Install pytest"
|
|
45
|
+
shell: bash -l {0}
|
|
46
|
+
run: |
|
|
47
|
+
python -m pip install --upgrade pip
|
|
48
|
+
pip install uv
|
|
49
|
+
uv pip install --system build pytest tomli
|
|
50
|
+
- name: "Test with pytest"
|
|
51
|
+
shell: bash -l {0}
|
|
52
|
+
run: |
|
|
53
|
+
pytest -s
|
|
54
|
+
- name: Build package
|
|
55
|
+
shell: bash -l {0}
|
|
56
|
+
run: |
|
|
57
|
+
python -m build
|
|
58
|
+
- name: Publish package
|
|
59
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
60
|
+
with:
|
|
61
|
+
user: __token__
|
|
62
|
+
password: ${{ secrets.PYPI_TOKEN }}
|
scez-0.2.1/.gitignore
ADDED
scez-0.2.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Abolfazl (Abe)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scez-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scez
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Single Cell Analysis, Easy Mode!
|
|
5
|
+
Project-URL: Source, https://github.com/abearab/scez
|
|
6
|
+
Author-email: Abe Arab <abarbiology@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Requires-Python: <4.0,>=3.9
|
|
11
|
+
Provides-Extra: test
|
|
12
|
+
Requires-Dist: pytest; extra == 'test'
|
|
13
|
+
Requires-Dist: tomli; extra == 'test'
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
## scez – single cell, easy mode
|
|
17
|
+
[](https://github.com/abearab/scez/actions/workflows/main.yml)
|
|
18
|
+
[](https://badge.fury.io/py/scez)
|
|
19
|
+
[](https://pepy.tech/project/scez)
|
|
20
|
+
[](https://pepy.tech/project/scez)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
### Description
|
|
24
|
+
There are many tools available for single-cell RNA-seq analysis, but they often require a lot of understanding of the underlying algorithms, reading of documentation, and setting up analysis environments. This takes time and effort, and can be a barrier to entry for many projects. [Single-Cell Best Practices](https://github.com/theislab/single-cell-best-practices) is a great resource for learning about the best practices for single-cell analysis. `scez` aims to provide functionalities for single-cell analysis through definitions of analysis "tasks" and implementation of these "best practices" in a user-friendly way.
|
|
25
|
+
|
|
26
|
+
This is more of a personal effort to streamline my own analysis workflows, but I hope it can be useful to others as well.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
### Installation
|
|
30
|
+
|
|
31
|
+
First, create a new conda environment with the provided `environment.yml` file:
|
|
32
|
+
```bash
|
|
33
|
+
conda env create -f https://raw.githubusercontent.com/abearab/scez/main/environment.yml
|
|
34
|
+
conda activate scez
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Then, install scez using uv / pip:
|
|
38
|
+
```bash
|
|
39
|
+
uv pip install scez
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or, to install the latest version from the repository:
|
|
43
|
+
```bash
|
|
44
|
+
uv pip install git+https://github.com/abearab/scez.git
|
|
45
|
+
```
|
scez-0.2.1/README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
## scez – single cell, easy mode
|
|
2
|
+
[](https://github.com/abearab/scez/actions/workflows/main.yml)
|
|
3
|
+
[](https://badge.fury.io/py/scez)
|
|
4
|
+
[](https://pepy.tech/project/scez)
|
|
5
|
+
[](https://pepy.tech/project/scez)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Description
|
|
9
|
+
There are many tools available for single-cell RNA-seq analysis, but they often require a lot of understanding of the underlying algorithms, reading of documentation, and setting up analysis environments. This takes time and effort, and can be a barrier to entry for many projects. [Single-Cell Best Practices](https://github.com/theislab/single-cell-best-practices) is a great resource for learning about the best practices for single-cell analysis. `scez` aims to provide functionalities for single-cell analysis through definitions of analysis "tasks" and implementation of these "best practices" in a user-friendly way.
|
|
10
|
+
|
|
11
|
+
This is more of a personal effort to streamline my own analysis workflows, but I hope it can be useful to others as well.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
### Installation
|
|
15
|
+
|
|
16
|
+
First, create a new conda environment with the provided `environment.yml` file:
|
|
17
|
+
```bash
|
|
18
|
+
conda env create -f https://raw.githubusercontent.com/abearab/scez/main/environment.yml
|
|
19
|
+
conda activate scez
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Then, install scez using uv / pip:
|
|
23
|
+
```bash
|
|
24
|
+
uv pip install scez
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Or, to install the latest version from the repository:
|
|
28
|
+
```bash
|
|
29
|
+
uv pip install git+https://github.com/abearab/scez.git
|
|
30
|
+
```
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: scez
|
|
2
|
+
channels:
|
|
3
|
+
- anaconda
|
|
4
|
+
- conda-forge
|
|
5
|
+
- bioconda
|
|
6
|
+
dependencies:
|
|
7
|
+
- python>=3.9
|
|
8
|
+
- scanpy
|
|
9
|
+
- pertpy
|
|
10
|
+
- python-igraph
|
|
11
|
+
- leidenalg
|
|
12
|
+
- anndata
|
|
13
|
+
- scipy
|
|
14
|
+
- scar
|
|
15
|
+
- scikit-learn
|
|
16
|
+
- matplotlib
|
|
17
|
+
- ipykernel
|
|
18
|
+
- mscorefonts
|
|
19
|
+
- pip
|
|
20
|
+
- pip:
|
|
21
|
+
- numba
|
|
22
|
+
- adpbulk
|
|
23
|
+
- pydeseq2
|
|
24
|
+
- adjustText
|
|
25
|
+
- watermark
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "scez"
|
|
3
|
+
version = "0.2.1"
|
|
4
|
+
description = "Single Cell Analysis, Easy Mode!"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "Abe Arab", email = "abarbiology@gmail.com" }
|
|
7
|
+
]
|
|
8
|
+
license = { text = "MIT" }
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9,<4.0"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"License :: OSI Approved :: MIT License"
|
|
13
|
+
]
|
|
14
|
+
# dependencies = [
|
|
15
|
+
# "numpy",
|
|
16
|
+
# "pandas",
|
|
17
|
+
# "bottleneck",
|
|
18
|
+
# "tqdm",
|
|
19
|
+
# "tomli",
|
|
20
|
+
# "matplotlib",
|
|
21
|
+
# "seaborn",
|
|
22
|
+
# "adjustText",
|
|
23
|
+
# "scanpy",
|
|
24
|
+
# "anndata",
|
|
25
|
+
# "pertpy",
|
|
26
|
+
# "adpbulk",
|
|
27
|
+
# "pydeseq2",
|
|
28
|
+
# "blitzgsea",
|
|
29
|
+
# ]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Source = "https://github.com/abearab/scez"
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
test = [
|
|
36
|
+
"pytest",
|
|
37
|
+
"tomli",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["hatchling"]
|
|
42
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""scez – Single Cell Analysis, Easy Mode!"""
|
|
2
|
+
|
|
3
|
+
from . import diffexp as de
|
|
4
|
+
from . import preprocess as pp
|
|
5
|
+
from . import representation as rp
|
|
6
|
+
from . import utils
|
|
7
|
+
import scanpy as sc
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
|
|
10
|
+
import tomli
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# Resolve the package version.
# A source checkout keeps pyproject.toml next to the package directory, but that
# file is not expected to sit next to an installed copy of the package, so fall
# back to the installed distribution metadata instead of failing at import time.
_pyproject_path = Path(__file__).parent.parent / 'pyproject.toml'
if _pyproject_path.is_file():
    with open(_pyproject_path, 'rb') as f:
        toml_dict = tomli.load(f)
    __version__ = toml_dict['project']['version']
else:
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as _installed_version

    toml_dict = None  # kept defined for backward compatibility; no file was read
    try:
        __version__ = _installed_version('scez')
    except PackageNotFoundError:
        # Neither a checkout nor an installed distribution could be found.
        __version__ = 'unknown'
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---- scanpy defaults applied on import ----
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 1
sc.settings.set_figure_params(
    dpi=100,
    dpi_save=300,
    frameon=False,
    figsize=(5, 5),
    facecolor='white',
)
sc.logging.print_header()

# ---- matplotlib defaults applied on import ----
# Disable the "too many open figures" warning and close any open figure.
# https://stackoverflow.com/questions/21884271/warning-about-too-many-open-figures
plt.rcParams.update({'figure.max_open_warning': 0})
plt.close('all')

# Font sizes.
# https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot
SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

for _rc_group, _rc_options in (
    ('font', {'size': SMALL_SIZE}),          # controls default text sizes
    ('axes', {'titlesize': SMALL_SIZE}),     # font size of the axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),    # font size of the x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),    # font size of the tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),    # font size of the tick labels
    ('legend', {'fontsize': SMALL_SIZE}),    # legend font size
    ('figure', {'titlesize': BIGGER_SIZE}),  # font size of the figure title
):
    plt.rc(_rc_group, **_rc_options)
del _rc_group, _rc_options
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import seaborn as sns
|
|
5
|
+
import anndata as ad
|
|
6
|
+
|
|
7
|
+
from pydeseq2.dds import DeseqDataSet
|
|
8
|
+
from pydeseq2.default_inference import DefaultInference
|
|
9
|
+
from pydeseq2.ds import DeseqStats
|
|
10
|
+
from .utils import run_adjust_text
|
|
11
|
+
from adpbulk import ADPBulk
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def pseudobulk_by_clusters(adt, condition, cluster_col='leiden', method="mean"):
    """Build a pseudobulk AnnData grouped by cluster and condition.

    Parameters
    ----------
    adt
        Single-cell object handed to ``ADPBulk`` (presumably an AnnData -- confirm).
    condition
        Grouping key passed to ``ADPBulk`` together with ``cluster_col``.
    cluster_col
        Cluster grouping key (default ``'leiden'``).
    method
        Aggregation method forwarded to ``ADPBulk`` (default ``"mean"``).

    Returns
    -------
    anndata.AnnData
        ``X`` is the matrix returned by ``ADPBulk.fit_transform`` and ``obs`` is
        the table returned by ``ADPBulk.get_meta``, indexed by its
        ``SampleName`` column.
    """
    aggregator = ADPBulk(adt, [cluster_col, condition], method=method)

    # Pseudobulking must run before the sample metadata is requested.
    matrix = aggregator.fit_transform()

    # Sample metadata (useful for easy incorporation with edgeR).
    meta = aggregator.get_meta().set_index('SampleName')

    return ad.AnnData(X=matrix, obs=meta)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):
    """Run a PyDESeq2 analysis and return its results table.

    Parameters
    ----------
    adata
        Object providing ``to_df()`` (counts; cast to ``int`` here) and ``obs``
        (sample metadata).
    design
        Name used both as the design factor and as the contrast variable.
    tested_level, ref_level
        Levels of ``design`` compared in the contrast (tested vs. reference).
    n_cpus
        Number of CPUs given to ``DefaultInference``.

    Returns
    -------
    The ``results_df`` attribute of the ``DeseqStats`` object after ``summary()``.
    """
    inference = DefaultInference(n_cpus=n_cpus)

    dataset_options = dict(
        counts=adata.to_df().astype(int),
        metadata=adata.obs,
        design_factors=design,  # compare samples based on the "condition"
        refit_cooks=True,
        inference=inference,
    )
    dds = DeseqDataSet(**dataset_options)
    dds.deseq2()

    contrast = [design, tested_level, ref_level]
    stats = DeseqStats(dds, contrast=contrast, inference=inference)
    stats.summary()

    return stats.results_df
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def plot_volcano(df, title=None, labels=None, n_genes=False, side='both',
                 font_scale=1, dot_size = 5,
                 color = '#1f77b4', color_highlight = '#FFA500',
                 ax = None, **kwargs):
    """Draw a volcano plot (log2 fold change vs. -log10 p-value).

    Parameters
    ----------
    df
        DataFrame with ``log2FoldChange`` and ``pvalue`` columns. It is modified
        in place: a ``name`` column (copied from the index) is added when missing
        and ``-log10(pvalue)`` is (re)computed.
    title
        Plot title; ``'Volcano Plot'`` when falsy.
    labels
        Index labels of ``df`` to highlight and annotate. Mutually exclusive
        with ``n_genes``.
    n_genes
        Number of top genes (largest ``-log10(pvalue)``) to highlight.
    side
        ``'positive'``, ``'negative'`` or ``'both'``: which sign of
        ``log2FoldChange`` the top genes are taken from. Any other value
        highlights nothing.
    font_scale, dot_size, color, color_highlight
        Appearance controls.
    ax
        Axes to draw on. When omitted a new 3x3 inch figure is created and,
        once drawing is finished, laid out and shown.
    **kwargs
        Forwarded to the main ``ax.scatter`` call.

    Raises
    ------
    ValueError
        If both ``labels`` and ``n_genes`` are given.
    """
    dot_size_highlight = dot_size * 1.1
    annotate_font_size = 5 * font_scale
    scatter_font_size = 8 * font_scale
    label_font_size = 9 * font_scale
    title_font_size = 10 * font_scale

    if 'name' not in df.columns:
        df['name'] = df.index.to_list()
    df['-log10(pvalue)'] = - np.log10(df.pvalue)

    # Remember whether the figure is ours. Testing ``ax`` again at the end would
    # never be true because ``ax`` is always set from here on.
    owns_figure = not ax
    if owns_figure:
        _, ax = plt.subplots(figsize=(3, 3))

    # Scatter plot
    ax.scatter(
        df['log2FoldChange'],
        df['-log10(pvalue)'],
        alpha=0.9, s=dot_size, c=color,
        **kwargs
    )

    # Set background color to transparent
    ax.set_facecolor('none')

    # Set smaller font size
    ax.tick_params(axis='both', which='both', labelsize=scatter_font_size)

    # Set labels
    ax.set_xlabel('log2FoldChange', fontsize=label_font_size)
    ax.set_ylabel('-log10(pvalue)', fontsize=label_font_size)

    # Set plot title
    ax.set_title(title if title else 'Volcano Plot', fontsize=title_font_size)

    ax.grid(False)

    # `labels` and `n_genes` are two alternative ways to choose the highlights.
    if labels and n_genes:
        raise ValueError('Provide either labels or n_genes, not both!')

    # Rows are always located through the index; the text shown is either the
    # user-supplied label or the row's `name`.
    highlight_index = labels
    if n_genes and side in ('positive', 'negative', 'both'):
        if side == 'positive':
            candidates = df.query('log2FoldChange > 0')
        elif side == 'negative':
            candidates = df.query('log2FoldChange < 0')
        else:
            candidates = df
        top_genes = candidates.nlargest(n_genes, '-log10(pvalue)')
        highlight_index = top_genes.index.to_list()
        labels = top_genes['name'].to_list()

    # Highlight and annotate the selected points
    if labels:
        highlight_index = list(highlight_index)
        highlight_x = df.loc[highlight_index, 'log2FoldChange']
        highlight_y = df.loc[highlight_index, '-log10(pvalue)']
        ax.scatter(
            highlight_x, highlight_y,
            s=dot_size_highlight, c=color_highlight
        )
        # Plain arrays are passed so the helper's positional access does not
        # depend on how pandas treats integer keys on a label index.
        run_adjust_text(
            highlight_x.to_numpy(),
            highlight_y.to_numpy(),
            list(labels),
            font_size=annotate_font_size, ax=ax, use_arrow=False
        )

    if owns_figure:
        plt.tight_layout()
        plt.show()
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def plot_top_DEG_violinplot(adata, df, layer=None, title=None, labels=None, n_genes=False, side='both', font_scale=1, figsize=(10, 4), **kwargs):
    """Violin + strip plot of expression for selected differentially expressed genes.

    Parameters
    ----------
    adata
        Single-cell object; its ``obs`` must contain a ``sample`` column, which
        is used as the hue.
    df
        Differential-expression table indexed by gene. Modified in place: a
        ``name`` column is added when missing, and ``-log10(pvalue)`` is computed
        from ``pvalue`` when it is missing and ``n_genes`` is used.
    layer
        Layer passed to ``to_df`` when extracting expression values.
    title
        Plot title; a default title is used when falsy.
    labels
        Index labels of ``df`` to plot. Mutually exclusive with ``n_genes``.
    n_genes
        Number of top genes (largest ``-log10(pvalue)``) to plot.
    side
        ``'positive'``, ``'negative'`` or ``'both'``.
    font_scale, figsize
        Appearance controls.
    **kwargs
        Forwarded to both ``sns.violinplot`` and ``sns.stripplot``.

    Raises
    ------
    ValueError
        If both or neither of ``labels`` / ``n_genes`` are given, or if ``side``
        is not one of the accepted values.
    """
    label_font_size = 9 * font_scale
    title_font_size = 10 * font_scale

    if 'name' not in df.columns:
        df['name'] = df.index.to_list()

    if labels and n_genes:
        raise ValueError('Provide either labels or n_genes, not both!')

    if not labels and not n_genes:
        raise ValueError('Provide either labels or n_genes!')

    if labels:
        # Plot exactly the requested genes
        selected_genes = df.loc[labels]
    else:
        # Ranking needs the -log10(pvalue) column; do not assume another
        # function has already added it to `df`.
        if '-log10(pvalue)' not in df.columns:
            df['-log10(pvalue)'] = - np.log10(df.pvalue)

        if side == 'positive':
            candidates = df.query('log2FoldChange > 0')
        elif side == 'negative':
            candidates = df.query('log2FoldChange < 0')
        elif side == 'both':
            candidates = df
        else:
            raise ValueError("side must be 'positive', 'negative' or 'both'!")
        selected_genes = candidates.nlargest(n_genes, '-log10(pvalue)')

    # Filter the single-cell dataset for the selected genes
    subset_adata = adata[:, selected_genes.index].copy()
    # Keep only the part of each variable name before the first underscore
    subset_adata.var.index = subset_adata.var.index.str.split('_').str[0]

    # Convert the subset of adata to a DataFrame
    subset_df = subset_adata.to_df(layer=layer)

    # Merge the DataFrame with .obs to include the 'sample' information
    merged_df = pd.merge(subset_df, adata.obs[['sample']], left_index=True, right_index=True)

    # Melt the DataFrame to prepare for violin plot
    melted_df = pd.melt(merged_df, id_vars='sample', var_name='Gene', value_name='Counts')

    # Create a violin plot with the individual observations overlaid
    plt.figure(figsize=figsize)
    sns.violinplot(x='Gene', y='Counts', hue='sample', data=melted_df, split=True, inner='quartile', palette='Set2', **kwargs)
    sns.stripplot(x='Gene', y='Counts', hue='sample', data=melted_df, dodge=True, jitter=True, color='black', size=1, alpha=0.3, **kwargs)

    plt.xticks(rotation=45, ha='right', fontsize=label_font_size)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=label_font_size)

    if not title:
        plt.title('Top Differentially Expressed Genes', fontsize=title_font_size)
    else:
        plt.title(title, fontsize=title_font_size)
    plt.show()
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def write_top_DEGs(df, sample_id, result_dir='.', n_hits=200):
    """Write the ``n_hits`` most significant rows of ``df`` to a CSV file.

    The file is ``{result_dir}/{sample_id}_top_{n_hits}.csv``. ``result_dir`` is
    created when it does not exist yet.

    Parameters
    ----------
    df
        DataFrame with a ``pvalue`` column. A ``-log10(pvalue)`` column is
        (re)computed on it in place and used for ranking.
    sample_id
        Prefix of the output file name.
    result_dir
        Output directory (default: current directory).
    n_hits
        Number of rows to keep (default 200).
    """
    import os

    df['-log10(pvalue)'] = - np.log10(df.pvalue)

    # Writing into a directory that does not exist would raise; create it first.
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    top_hits = df.nlargest(n_hits, '-log10(pvalue)')
    top_hits.to_csv(f'{result_dir}/{sample_id}_top_{n_hits}.csv')
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import scanpy as sc
|
|
3
|
+
import scar
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def normalization(adata, target_sum=1e4, max_value=10, final_layer='scaled', keep_initial_layer=True):
    """Populate normalization layers on ``adata`` and point ``X`` at one of them.

    Layers written: the initial ``X`` (optional), ``log1p_norm`` and ``scaled``.
    ``adata`` is modified in place; nothing is returned.

    Parameters
    ----------
    adata
        Object with ``X`` and ``layers``.
    target_sum
        Passed to ``sc.pp.normalize_total`` (default 1e4).
    max_value
        Passed to ``sc.pp.scale`` (default 10).
    final_layer
        Name of the layer assigned to ``adata.X`` at the end.
    keep_initial_layer
        ``True`` stores a copy of the incoming ``X`` in ``layers['raw_counts']``;
        a string stores it under that name; anything else stores nothing.
    """
    # Decide under which name (if any) the incoming X is preserved.
    if keep_initial_layer == True:
        backup_name = 'raw_counts'
    elif type(keep_initial_layer) == str:
        backup_name = keep_initial_layer
    else:
        backup_name = None
    if backup_name is not None:
        adata.layers[backup_name] = adata.X.copy()

    # Normalize counts to target_sum; inplace=False leaves adata.X untouched.
    normalized = sc.pp.normalize_total(adata, target_sum=target_sum, inplace=False)

    # log1p transform of the normalized values
    adata.layers["log1p_norm"] = sc.pp.log1p(normalized["X"], copy=True)

    # NOTE(review): scaling is applied to a copy of `adata`, whose X still holds
    # the incoming values rather than the log1p-normalized ones -- confirm that
    # this is intended.
    scaled_copy = sc.pp.scale(adata, max_value=max_value, copy=True)
    adata.layers['scaled'] = scaled_copy.X

    # Expose the requested layer as X
    adata.X = adata.layers[final_layer]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def remove_ambient_rna(adata_filtered_feature_bc, adata_raw_feature_bc,
                       prob=0.995, epochs=200, batch_size=64, inference_batch_size=256):
    """Denoise counts with scAR and return a copy carrying the denoised layer.

    Parameters
    ----------
    adata_filtered_feature_bc
        Filtered (cell-containing) object; its counts are denoised.
    adata_raw_feature_bc
        Raw (unfiltered) object handed to ``scar.setup_anndata``.
    prob
        ``prob`` argument of ``scar.setup_anndata`` (default 0.995).
    epochs, batch_size
        Training settings passed to ``train`` (defaults 200 and 64).
    inference_batch_size
        Batch size passed to ``inference`` (default 256); set a batch size if
        inference runs into memory issues.

    Returns
    -------
    A copy of ``adata_filtered_feature_bc`` with two extra layers:
    ``raw_counts`` (the copy's ``X``) and ``scar_denoised_counts``.
    """
    scar.setup_anndata(
        adata = adata_filtered_feature_bc,
        raw_adata = adata_raw_feature_bc,
        prob = prob,
        kneeplot = True
    )

    # NOTE(review): a DataFrame (not the AnnData) is passed as raw_count and no
    # ambient_profile is given, so the profile estimated by setup_anndata above
    # is presumably not picked up by the model -- confirm against the scAR docs.
    adata_scar = scar.model(
        raw_count=adata_filtered_feature_bc.to_df(),
        # ambient_profile=adata_filtered_feature_bc.uns['ambient_profile_Gene Expression'],
        feature_type='mRNA',
        sparsity=1,
        # device=device # Both cpu and cuda are supported.
    )

    adata_scar.train(
        epochs=epochs,
        batch_size=batch_size,
        verbose=True
    )

    # After training, we can infer the native true signal
    adata_scar.inference(batch_size=inference_batch_size)

    # Label the denoised matrix with the cell and gene names of the input.
    denoised_count = pd.DataFrame(
        adata_scar.native_counts,
        index=adata_filtered_feature_bc.obs_names,
        columns=adata_filtered_feature_bc.var_names
    )

    adata = adata_filtered_feature_bc.copy()
    adata.layers['raw_counts'] = adata.X
    adata.layers['scar_denoised_counts'] = denoised_count.to_numpy()

    return adata
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def clustering(adata):
    """Placeholder for a clustering pipeline: does nothing and returns ``None``."""
    # Sketch of the intended implementation, kept for reference:
    # , n_pcs=50, n_neighbors=30, use_highly_variable='Yes',
    # use_rep=None, resolution=None

    # if use_highly_variable == 'Yes':
    #     sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    #     sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
    # else:
    #     sc.pp.pca(adata, n_comps=n_pcs)
    # sc.pp.neighbors(adata, use_rep=use_rep, n_neighbors=n_neighbors)#, n_pcs=n_pcs)
    # sc.tl.umap(adata)
    # sc.tl.leiden(adata, resolution=resolution)
    return None
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from itertools import product
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
import scanpy as sc
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def optimising_umap_layout(adata, cluster_key='leiden', MIN_DISTS=(0.1, 1, 2), SPREADS=(0.5, 1, 5)):
    """Plot a grid of UMAPs computed with different ``min_dist`` / ``spread`` values.

    Rows correspond to ``MIN_DISTS`` and columns to ``SPREADS``. The UMAPs are
    computed on a copy, so the embedding stored in ``adata`` is left untouched.

    Parameters
    ----------
    adata
        Object accepted by ``sc.tl.umap`` / ``sc.pl.umap``.
    cluster_key
        Key used to colour the cells (default ``'leiden'``).
    MIN_DISTS, SPREADS
        Sequences of values to try; every combination is plotted.
    """
    # https://scanpy-tutorials.readthedocs.io/en/latest/plotting/advanced.html
    # Copy adata not to modify UMAP in the original adata object
    adata_temp = adata.copy()

    # Create grid of plots, with a little extra room for the legends.
    # squeeze=False keeps `axes` two-dimensional even when only a single
    # min_dist or spread value is given, so axes[i][j] is always valid.
    fig, axes = plt.subplots(
        len(MIN_DISTS), len(SPREADS),
        figsize=(len(SPREADS) * 3 + 2, len(MIN_DISTS) * 3),
        squeeze=False,
    )

    # Loop through different umap parameters, recomputing and replotting UMAP for each of them
    for (i, min_dist), (j, spread) in product(enumerate(MIN_DISTS), enumerate(SPREADS)):
        ax = axes[i][j]
        param_str = " ".join(["min_dist =", str(min_dist), "and spread =", str(spread)])
        # Recompute UMAP with new parameters
        sc.tl.umap(adata_temp, min_dist=min_dist, spread=spread)
        # Create plot, placing it in grid
        sc.pl.umap(
            adata_temp,
            color=[cluster_key],
            title=param_str,
            s=40,
            ax=ax,
            show=False,
        )
    plt.tight_layout()
    plt.show()
    plt.close(fig)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def random_ordering(adata):
    """Return a reproducible random permutation of the row indices of ``adata``.

    The permutation has ``adata.shape[0]`` entries and is the same on every
    call. A private generator seeded with 0 is used, so NumPy's global random
    state is not reseeded as a side effect.

    Parameters
    ----------
    adata
        Any object with a ``shape`` attribute whose first entry is the number
        of rows (cells).

    Returns
    -------
    numpy.ndarray
        Row indices in random order, usable to subset the object.
    """
    # RandomState(0) reproduces the sequence of np.random.seed(0) without
    # touching the global generator.
    rng = np.random.RandomState(0)
    random_indices = rng.permutation(list(range(adata.shape[0])))

    return random_indices
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
import scanpy as sc
|
|
4
|
+
import scez
|
|
5
|
+
import tomli
|
|
6
|
+
|
|
7
|
+
# Read the expected version from pyproject.toml.
# The file is located relative to this test module (scez/tests/ -> project root)
# so the test does not depend on the directory pytest is launched from.
from pathlib import Path

with open(Path(__file__).resolve().parents[2] / 'pyproject.toml', 'rb') as f:
    toml_dict = tomli.load(f)
version = toml_dict['project']['version']
|
|
10
|
+
|
|
11
|
+
class TestScezConfig(unittest.TestCase):
    """Checks on the state that importing ``scez`` leaves behind."""

    def test_version(self):
        """``scez.__version__`` equals the version read from pyproject.toml."""
        expected = version
        self.assertEqual(scez.__version__, expected)

    def test_scanpy_settings(self):
        """scanpy's verbosity setting is 1 after ``scez`` has been imported."""
        current = sc.settings.verbosity
        self.assertEqual(current, 1)


if __name__ == '__main__':
    unittest.main()
|
scez-0.2.1/scez/utils.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from matplotlib import pyplot as plt
|
|
3
|
+
from adjustText import adjust_text
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def rank_genes_to_df(adata, n=50):
    """Tabulate the first ``n`` ranked genes per group.

    Reads ``adata.uns['rank_genes_groups']`` and builds one ``<group>_names``
    and one ``<group>_scores`` column for every group found in the field names
    of the ``names`` record array.

    Parameters
    ----------
    adata
        Object whose ``uns['rank_genes_groups']`` holds ``names`` and ``scores``
        record arrays.
    n
        Number of leading rows to keep (default 50).

    Returns
    -------
    pandas.DataFrame
    """
    ranked = adata.uns['rank_genes_groups']

    columns = {}
    for group in ranked['names'].dtype.names:
        for key in ('names', 'scores'):
            columns[group + '_' + key] = ranked[key][group]

    return pd.DataFrame(columns).head(n)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def add_marker_feature(adata, marker, marker_name, clusters_name, thr = 0, figsize=(10, 4)):
    """Label cells as marker-positive/negative and plot the counts per cluster.

    A new ``adata.obs[marker_name]`` column is written: ``'<marker>+'`` where
    the marker value is above ``thr`` and ``'<marker>-'`` where it is at or
    below it. A stacked bar chart (log y-axis) of the number of cells per
    cluster is drawn, ordered by the percentage of positive cells, which is
    printed above the bars.

    Parameters
    ----------
    adata
        Object providing ``to_df()`` and ``obs``.
    marker
        Column of ``adata.to_df()`` to threshold.
    marker_name
        Name of the ``obs`` column to create.
    clusters_name
        Name of the ``obs`` column holding the cluster assignment.
    thr
        Threshold separating negative (<=) from positive (>) cells.
    figsize
        Figure size of the bar chart.

    Returns
    -------
    The matplotlib Axes of the bar chart.
    """
    positive = f'{marker}+'
    negative = f'{marker}-'

    # Convert to a DataFrame once and reuse the marker column for both masks.
    expression = adata.to_df().loc[:, marker]

    adata.obs[marker_name] = ''
    adata.obs.loc[expression <= thr, marker_name] = negative
    adata.obs.loc[expression > thr, marker_name] = positive

    # Cells per cluster and label. reindex guarantees both columns exist (filled
    # with 0) even when no cell falls on one side of the threshold.
    df = (
        adata.obs.groupby([clusters_name, marker_name]).size()
        .unstack(level=-1, fill_value=0)
        .reindex(columns=[positive, negative], fill_value=0)
    )
    df.columns.name = None

    # Make some labels: percentage of positive cells, highest first.
    labels = df[positive] / df.sum(axis=1) * 100
    labels = labels.round(decimals=1)
    labels.sort_values(ascending=False,inplace=True)
    df = df.loc[labels.index,]

    ax = df.plot.bar(stacked=True,rot=0,figsize=figsize)

    rects = ax.patches

    # zip stops after len(labels) patches, i.e. one text per cluster.
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(
            rect.get_x() + rect.get_width() / 2, height + 5, str(label) + "%",
            ha="center", va="bottom", fontsize=8
        )

    ax.set_yscale('log')
    ax.set_ylabel('# of cells')
    return ax
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def run_adjust_text(x, y, labels, ax=None, use_arrow=True, font_weight='bold', font_size=8):
    """Place text labels at the given points and let adjustText de-overlap them.

    Parameters
    ----------
    x, y
        Coordinates of the points to annotate (any iterables of equal length,
        e.g. lists, arrays or pandas Series).
    labels
        Text for each point, in the same order as ``x`` and ``y``.
    ax
        Axes to draw on. When omitted, pyplot's current axes is used.
    use_arrow
        If true, a red arrow is drawn from each moved label to its point.
    font_weight, font_size
        Font settings of the labels.
    """
    # Draw on the axes that was asked for; plt.text always targets pyplot's
    # current axes, which need not be `ax`.
    draw_text = plt.text if ax is None else ax.text

    # Iterate the values directly: indexing a label-indexed pandas Series with
    # integer positions (x[i]) is not reliable.
    texts = [
        draw_text(
            x_pos, y_pos,
            label,
            fontdict={'weight': font_weight, 'size': font_size},
            ha='center', va='center'
        )
        for x_pos, y_pos, label in zip(x, y, labels)
    ]

    if use_arrow:
        adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'), ax = ax)
    else:
        adjust_text(texts, ax = ax)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: scez
|
|
2
|
+
channels:
|
|
3
|
+
- anaconda
|
|
4
|
+
- conda-forge
|
|
5
|
+
- bioconda
|
|
6
|
+
dependencies:
|
|
7
|
+
- python>=3.9
|
|
8
|
+
- scanpy
|
|
9
|
+
- pertpy
|
|
10
|
+
- python-igraph
|
|
11
|
+
- leidenalg
|
|
12
|
+
- anndata
|
|
13
|
+
- scipy
|
|
14
|
+
- scar
|
|
15
|
+
- scikit-learn
|
|
16
|
+
- matplotlib
|
|
17
|
+
- ipykernel
|
|
18
|
+
- mscorefonts
|
|
19
|
+
- pip
|
|
20
|
+
- pip:
|
|
21
|
+
- numba
|
|
22
|
+
- adpbulk
|
|
23
|
+
- pydeseq2
|
|
24
|
+
- adjustText
|
|
25
|
+
- watermark
|
|
26
|
+
- python=3.12
|