scez 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
FROM continuumio/miniconda3

# Install system dependencies.
# The apt package lists are removed in the same layer so they do not end up in
# the image (`apt-get purge` without package arguments removes nothing).
USER root
RUN apt-get update \
    && apt-get install -y build-essential git tree curl sudo vim wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Stage the conda environment specification inside the image
COPY environment.yml /tmp/env.yaml

# Configure the conda channels used to resolve the environment
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda \
    && conda config --set channel_priority false

# Install mamba in the base environment, use it to create the `dev`
# environment, then drop caches and the staged spec to keep the layer small
RUN conda install -y -n base -c conda-forge mamba \
    && mamba env create -n dev -f /tmp/env.yaml \
    && conda clean --all --yes \
    && rm -f /tmp/env.yaml
@@ -0,0 +1,12 @@
1
+ {
2
+ "name": "scez",
3
+ "build": {
4
+ "dockerfile": "Dockerfile",
5
+ "context": "../"
6
+ },
7
+ "features": {
8
+ "ghcr.io/devcontainers/features/docker-in-docker:2": {
9
+ "version": "latest"
10
+ }
11
+ }
12
+ }
@@ -0,0 +1,12 @@
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for more information:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+ # https://containers.dev/guide/dependabot
6
+
7
+ version: 2
8
+ updates:
9
+ - package-ecosystem: "devcontainers"
10
+ directory: "/"
11
+ schedule:
12
+ interval: weekly
@@ -0,0 +1,54 @@
1
+ name: Python package
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+ pull_request:
7
+ branches: [ "main" ]
8
+
9
+ jobs:
10
+ build:
11
+
12
+ runs-on: ${{ matrix.os-version }}
13
+ name: ${{ matrix.os-version }} (${{ matrix.python-version }})
14
+
15
+ strategy:
16
+ fail-fast: false
17
+ max-parallel: 5
18
+ matrix:
19
+ os-version: ["ubuntu-latest"]
20
+ python-version: ["3.11", "3.12", "3.13"]
21
+
22
+ steps:
23
# NOTE: indentation is not preserved in this view; nest these items under `steps:`.
# v3 of both actions runs on the deprecated Node 16 runtime; v4/v5 are the
# maintained drop-in replacements.
- uses: actions/checkout@v4
- name: "Set up Python ${{ matrix.python-version }}"
  uses: actions/setup-python@v5
  with:
    python-version: ${{ matrix.python-version }}
28
+ - name: "Install flake8"
29
+ run: |
30
+ pip install flake8 tomli
31
+ - name: "Lint with flake8"
32
+ run: |
33
+ # stop the build if there are Python syntax errors or undefined names
34
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37
+ - name: "Install miniconda"
38
+ uses: conda-incubator/setup-miniconda@v3
39
+ with:
40
+ miniconda-version: "latest"
41
+ auto-update-conda: true
42
+ python-version: ${{ matrix.python-version }}
43
+ channels: conda-forge,bioconda
44
+ environment-file: environment.yml
45
+ - name: "Install pytest"
46
+ shell: bash -l {0}
47
+ run: |
48
+ python -m pip install --upgrade pip
49
+ pip install uv
50
+ uv pip install --system build pytest tomli
51
+ - name: "Test with pytest"
52
+ shell: bash -l {0}
53
+ run: |
54
+ pytest -s
@@ -0,0 +1,62 @@
1
+ name: Publish PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+
8
+ jobs:
9
+ build:
10
+
11
+ runs-on: ${{ matrix.os-version }}
12
+ name: ${{ matrix.os-version }} (${{ matrix.python-version }})
13
+
14
+ strategy:
15
+ fail-fast: false
16
+ max-parallel: 5
17
+ matrix:
18
+ os-version: ["ubuntu-latest"]
19
+ python-version: ["3.11", "3.12", "3.13"]
20
+
21
+ steps:
22
# NOTE: indentation is not preserved in this view; nest these items under `steps:`.
# v3 of both actions runs on the deprecated Node 16 runtime; v4/v5 are the
# maintained drop-in replacements.
- uses: actions/checkout@v4
- name: "Set up Python ${{ matrix.python-version }}"
  uses: actions/setup-python@v5
  with:
    python-version: ${{ matrix.python-version }}
27
+ - name: "Install flake8"
28
+ run: |
29
+ pip install flake8 tomli
30
+ - name: "Lint with flake8"
31
+ run: |
32
+ # stop the build if there are Python syntax errors or undefined names
33
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
34
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
35
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
36
+ - name: "Install miniconda"
37
+ uses: conda-incubator/setup-miniconda@v3
38
+ with:
39
+ miniconda-version: "latest"
40
+ auto-update-conda: true
41
+ python-version: ${{ matrix.python-version }}
42
+ channels: conda-forge,bioconda
43
+ environment-file: environment.yml
44
+ - name: "Install pytest"
45
+ shell: bash -l {0}
46
+ run: |
47
+ python -m pip install --upgrade pip
48
+ pip install uv
49
+ uv pip install --system build pytest tomli
50
+ - name: "Test with pytest"
51
+ shell: bash -l {0}
52
+ run: |
53
+ pytest -s
54
# NOTE: indentation is not preserved in this view; nest these items under `steps:`.
# The job runs once per matrix entry, but the package is pure Python, so every
# entry would build and upload identically named artifacts and the later
# uploads would be rejected as duplicates. Lint and tests still run on the
# whole matrix; building and publishing happen from a single entry.
- name: Build package
  if: matrix.python-version == '3.12'
  shell: bash -l {0}
  run: |
    python -m build
- name: Publish package
  if: matrix.python-version == '3.12'
  uses: pypa/gh-action-pypi-publish@release/v1
  with:
    user: __token__
    password: ${{ secrets.PYPI_TOKEN }}
scez-0.2.1/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ .idea/*
2
+ dist/*
3
+ build/*
4
+ actions-runner/*
5
+ *.lock
6
+ **pyc
7
+ .DS_Store
8
+ uv.lock
9
+ .python-version
10
+ .venv/
scez-0.2.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Abolfazl (Abe)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
scez-0.2.1/PKG-INFO ADDED
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: scez
3
+ Version: 0.2.1
4
+ Summary: Single Cell Analysis, Easy Mode!
5
+ Project-URL: Source, https://github.com/abearab/scez
6
+ Author-email: Abe Arab <abarbiology@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: <4.0,>=3.9
11
+ Provides-Extra: test
12
+ Requires-Dist: pytest; extra == 'test'
13
+ Requires-Dist: tomli; extra == 'test'
14
+ Description-Content-Type: text/markdown
15
+
16
+ ## scez – single cell, easy mode
17
+ [![package](https://github.com/abearab/scez/actions/workflows/main.yml/badge.svg)](https://github.com/abearab/scez/actions/workflows/main.yml)
18
+ [![PyPI version](https://badge.fury.io/py/scez.svg)](https://badge.fury.io/py/scez)
19
+ [![Downloads](https://static.pepy.tech/badge/scez)](https://pepy.tech/project/scez)
20
+ [![Downloads](https://static.pepy.tech/badge/scez/month)](https://pepy.tech/project/scez)
21
+
22
+
23
+ ### Description
24
+ There are many tools available for single-cell RNA-seq analysis, but they often require a lot of understanding of the underlying algorithms, reading of documentation, and setting up analysis environments. This takes time and effort, and can be a barrier to entry for many projects. [Single-Cell Best Practices](https://github.com/theislab/single-cell-best-practices) is a great resource for learning about the best practices for single-cell analysis. `scez` aims to provide functionalities for single-cell analysis through definitions of analysis "tasks" and implementation of these "best practices" in a user-friendly way.
25
+
26
+ This is more of a personal effort to streamline my own analysis workflows, but I hope it can be useful to others as well.
27
+
28
+
29
+ ### Installation
30
+
31
+ First, create a new conda environment with the provided `environment.yml` file:
32
+ ```bash
33
+ conda env create -f https://raw.githubusercontent.com/abearab/scez/main/environment.yml
34
+ conda activate scez
35
+ ```
36
+
37
+ Then, install scez using uv / pip:
38
+ ```bash
39
+ uv pip install scez
40
+ ```
41
+
42
+ Or, to install the latest version from the repository:
43
+ ```bash
44
+ uv pip install git+https://github.com/abearab/scez.git
45
+ ```
scez-0.2.1/README.md ADDED
@@ -0,0 +1,30 @@
1
+ ## scez – single cell, easy mode
2
+ [![package](https://github.com/abearab/scez/actions/workflows/main.yml/badge.svg)](https://github.com/abearab/scez/actions/workflows/main.yml)
3
+ [![PyPI version](https://badge.fury.io/py/scez.svg)](https://badge.fury.io/py/scez)
4
+ [![Downloads](https://static.pepy.tech/badge/scez)](https://pepy.tech/project/scez)
5
+ [![Downloads](https://static.pepy.tech/badge/scez/month)](https://pepy.tech/project/scez)
6
+
7
+
8
+ ### Description
9
+ There are many tools available for single-cell RNA-seq analysis, but they often require a lot of understanding of the underlying algorithms, reading of documentation, and setting up analysis environments. This takes time and effort, and can be a barrier to entry for many projects. [Single-Cell Best Practices](https://github.com/theislab/single-cell-best-practices) is a great resource for learning about the best practices for single-cell analysis. `scez` aims to provide functionalities for single-cell analysis through definitions of analysis "tasks" and implementation of these "best practices" in a user-friendly way.
10
+
11
+ This is more of a personal effort to streamline my own analysis workflows, but I hope it can be useful to others as well.
12
+
13
+
14
+ ### Installation
15
+
16
+ First, create a new conda environment with the provided `environment.yml` file:
17
+ ```bash
18
+ conda env create -f https://raw.githubusercontent.com/abearab/scez/main/environment.yml
19
+ conda activate scez
20
+ ```
21
+
22
+ Then, install scez using uv / pip:
23
+ ```bash
24
+ uv pip install scez
25
+ ```
26
+
27
+ Or, to install the latest version from the repository:
28
+ ```bash
29
+ uv pip install git+https://github.com/abearab/scez.git
30
+ ```
@@ -0,0 +1,25 @@
1
+ name: scez
2
+ channels:
3
+ - anaconda
4
+ - conda-forge
5
+ - bioconda
6
+ dependencies:
7
+ - python>=3.9
8
+ - scanpy
9
+ - pertpy
10
+ - python-igraph
11
+ - leidenalg
12
+ - anndata
13
+ - scipy
14
+ - scar
15
+ - scikit-learn
16
+ - matplotlib
17
+ - ipykernel
18
+ - mscorefonts
19
+ - pip
20
+ - pip:
21
+ - numba
22
+ - adpbulk
23
+ - pydeseq2
24
+ - adjustText
25
+ - watermark
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "scez"
3
+ version = "0.2.1"
4
+ description = "Single Cell Analysis, Easy Mode!"
5
+ authors = [
6
+ { name = "Abe Arab", email = "abarbiology@gmail.com" }
7
+ ]
8
+ license = { text = "MIT" }
9
+ readme = "README.md"
10
+ requires-python = ">=3.9,<4.0"
11
+ classifiers = [
12
+ "License :: OSI Approved :: MIT License"
13
+ ]
14
+ # dependencies = [
15
+ # "numpy",
16
+ # "pandas",
17
+ # "bottleneck",
18
+ # "tqdm",
19
+ # "tomli",
20
+ # "matplotlib",
21
+ # "seaborn",
22
+ # "adjustText",
23
+ # "scanpy",
24
+ # "anndata",
25
+ # "pertpy",
26
+ # "adpbulk",
27
+ # "pydeseq2",
28
+ # "blitzgsea",
29
+ # ]
30
+
31
+ [project.urls]
32
+ Source = "https://github.com/abearab/scez"
33
+
34
+ [project.optional-dependencies]
35
+ test = [
36
+ "pytest",
37
+ "tomli",
38
+ ]
39
+
40
+ [build-system]
41
+ requires = ["hatchling"]
42
+ build-backend = "hatchling.build"
@@ -0,0 +1,38 @@
1
+ """scez – Single Cell Analysis, Easy Mode!"""
2
+
3
from . import diffexp as de
from . import preprocess as pp
from . import representation as rp
from . import utils
import scanpy as sc
import matplotlib.pyplot as plt

from importlib.metadata import PackageNotFoundError, version as _distribution_version
from pathlib import Path


def _resolve_version():
    """Return the scez version string without assuming a source checkout.

    Resolution order:
      1. ``pyproject.toml`` one directory above this package, when that file
         exists and declares ``project.name == "scez"`` (source checkout).
      2. The metadata of the installed ``scez`` distribution.
      3. The literal string ``'unknown'`` when neither is available.

    An installed wheel does not ship ``pyproject.toml``, so reading it
    unconditionally makes ``import scez`` fail outside a checkout.
    """
    pyproject_path = Path(__file__).parent.parent / 'pyproject.toml'
    if pyproject_path.is_file():
        try:
            try:
                import tomllib as _toml  # standard library on Python >= 3.11
            except ImportError:
                import tomli as _toml  # backport; only needed on older Pythons
            with open(pyproject_path, 'rb') as f:
                project = _toml.load(f).get('project', {})
            # Guard against picking up an unrelated pyproject.toml
            if project.get('name') == 'scez' and 'version' in project:
                return project['version']
        except (ImportError, OSError, ValueError):
            # No TOML parser available, unreadable file or malformed TOML:
            # fall through to the installed-distribution metadata.
            pass
    try:
        return _distribution_version('scez')
    except PackageNotFoundError:
        return 'unknown'


__version__ = _resolve_version()


sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=100, dpi_save=300, frameon=False, figsize=(5, 5), facecolor='white')
sc.logging.print_header()

# https://stackoverflow.com/questions/21884271/warning-about-too-many-open-figures
plt.rcParams.update({'figure.max_open_warning': 0})
plt.close('all')

# https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # font size of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # font size of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # font size of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # font size of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend font size
plt.rc('figure', titlesize=BIGGER_SIZE)  # font size of the figure title
@@ -0,0 +1,200 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import anndata as ad
6
+
7
+ from pydeseq2.dds import DeseqDataSet
8
+ from pydeseq2.default_inference import DefaultInference
9
+ from pydeseq2.ds import DeseqStats
10
+ from .utils import run_adjust_text
11
+ from adpbulk import ADPBulk
12
+
13
+
14
def pseudobulk_by_clusters(adt, condition, cluster_col='leiden', method="mean"):
    """Build a pseudobulk AnnData grouped by cluster and condition.

    Args:
        adt: object handed to ``ADPBulk`` as the data to aggregate.
        condition: name of the grouping column combined with ``cluster_col``.
        cluster_col: name of the cluster grouping column.
        method: aggregation method forwarded to ``ADPBulk``.

    Returns:
        An ``AnnData`` whose ``X`` is the matrix returned by
        ``ADPBulk.fit_transform()`` and whose ``obs`` is the ADPBulk sample
        metadata indexed by its ``SampleName`` column.
    """
    aggregator = ADPBulk(adt, [cluster_col, condition], method=method)

    # Aggregate first; the sample metadata is fetched from the fitted object
    bulk_matrix = aggregator.fit_transform()
    sample_table = aggregator.get_meta().set_index('SampleName')

    return ad.AnnData(X=bulk_matrix, obs=sample_table)
30
+
31
+
32
def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):
    """Run the PyDESeq2 pipeline on ``adata`` and return its results table.

    Args:
        adata: AnnData whose ``to_df()`` values are cast to ``int`` and used as
            counts, and whose ``obs`` is used as the sample metadata.
        design: design factor; also the first element of the tested contrast.
        tested_level: level of ``design`` that is tested.
        ref_level: level of ``design`` used as reference.
        n_cpus: number of CPUs given to ``DefaultInference``.

    Returns:
        ``DeseqStats.results_df`` for the contrast
        ``[design, tested_level, ref_level]``.
    """
    engine = DefaultInference(n_cpus=n_cpus)

    dataset = DeseqDataSet(
        counts=adata.to_df().astype(int),
        metadata=adata.obs,
        design_factors=design,
        refit_cooks=True,
        inference=engine,
    )
    dataset.deseq2()

    stats = DeseqStats(
        dataset,
        contrast=[design, tested_level, ref_level],
        inference=engine,
    )
    # summary() is called before results_df is read, as in the original flow
    stats.summary()

    return stats.results_df
56
+
57
+
58
def plot_volcano(df, title=None, labels=None, n_genes=False, side='both',
                 font_scale=1, dot_size = 5,
                 color = '#1f77b4', color_highlight = '#FFA500',
                 ax = None, **kwargs):
    """Draw a volcano plot (log2FoldChange vs. -log10(pvalue)).

    Args:
        df: DataFrame with ``log2FoldChange`` and ``pvalue`` columns. It is
            modified in place: a ``name`` column (copied from the index) is
            added when missing and ``-log10(pvalue)`` is (re)computed.
        title: plot title; ``'Volcano Plot'`` when falsy.
        labels: index labels of ``df`` to highlight and annotate.
        n_genes: number of top rows (largest ``-log10(pvalue)``) to highlight
            instead of ``labels``.
        side: with ``n_genes``, restrict candidates to ``'positive'`` or
            ``'negative'`` log2FoldChange, or use ``'both'``. Any other value
            highlights nothing.
        font_scale: multiplier applied to all font sizes.
        dot_size: marker size of the background points; highlighted points
            use ``dot_size * 1.1``.
        color: colour of the background points.
        color_highlight: colour of the highlighted points.
        ax: axes to draw on. When ``None`` a new 3x3 inch figure is created
            and shown before returning.
        **kwargs: forwarded to the background ``ax.scatter`` call.

    Raises:
        ValueError: if both ``labels`` and ``n_genes`` are provided.
    """
    dot_size_highlight = dot_size * 1.1
    annotate_font_size = 5 * font_scale
    scatter_font_size = 8 * font_scale
    label_font_size = 9 * font_scale
    title_font_size = 10 * font_scale

    # Normalise to a list so the truthiness checks also work for arrays / Index
    labels = list(labels) if labels is not None else []

    if labels and n_genes:
        raise ValueError('Provide either labels or n_genes, not both!')

    if 'name' not in df.columns:
        df['name'] = df.index.to_list()
    df['-log10(pvalue)'] = - np.log10(df.pvalue)

    # Remember whether the figure is ours: `ax` is always set after this point,
    # so it cannot be used later to decide whether to call plt.show().
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(3, 3))

    # Scatter plot
    ax.scatter(
        df['log2FoldChange'],
        df['-log10(pvalue)'],
        alpha=0.9, s=dot_size, c=color,
        **kwargs
    )

    # Set background color to transparent
    ax.set_facecolor('none')

    # Set smaller font size
    ax.tick_params(axis='both', which='both', labelsize=scatter_font_size)

    # Set labels
    ax.set_xlabel('log2FoldChange', fontsize=label_font_size)
    ax.set_ylabel('-log10(pvalue)', fontsize=label_font_size)

    # Set plot title
    ax.set_title(title if title else 'Volcano Plot', fontsize=title_font_size)

    ax.grid(False)

    # Derive `labels` from `n_genes` and `side` when no explicit labels are given
    if n_genes:
        if side == 'positive':
            candidates = df.query('log2FoldChange > 0')
        elif side == 'negative':
            candidates = df.query('log2FoldChange < 0')
        elif side == 'both':
            candidates = df
        else:
            candidates = None  # unknown side: nothing is highlighted
        if candidates is not None:
            labels = candidates.nlargest(n_genes, '-log10(pvalue)')['name'].to_list()

    # Highlight and annotate the selected points
    if labels:
        highlighted = df.loc[labels]
        ax.scatter(
            highlighted['log2FoldChange'],
            highlighted['-log10(pvalue)'],
            s=dot_size_highlight, c=color_highlight
        )
        # Pass plain arrays: the helper indexes its inputs by integer position
        run_adjust_text(
            highlighted['log2FoldChange'].to_numpy(),
            highlighted['-log10(pvalue)'].to_numpy(),
            labels,
            font_size=annotate_font_size, ax=ax, use_arrow=False
        )

    if owns_figure:
        plt.tight_layout()
        plt.show()
137
+
138
+
139
def plot_top_DEG_violinplot(adata, df, layer=None, title=None, labels=None, n_genes=False, side='both', font_scale=1, figsize=(10, 4), **kwargs):
    """Violin + strip plot of per-cell values for selected genes, split by sample.

    Args:
        adata: AnnData providing the expression values and an ``obs['sample']``
            column used as hue.
        df: differential-expression table indexed by the variable names of
            ``adata``. Modified in place: a ``name`` column is added when
            missing, and ``-log10(pvalue)`` is added when it is needed for
            ranking and not present yet.
        layer: layer forwarded to ``adata.to_df``.
        title: plot title; a default title is used when falsy.
        labels: index labels of ``df`` to plot.
        n_genes: number of top rows (largest ``-log10(pvalue)``) to plot
            instead of ``labels``.
        side: with ``n_genes``, one of ``'positive'``, ``'negative'``, ``'both'``.
        font_scale: multiplier applied to the font sizes.
        figsize: figure size.
        **kwargs: forwarded to both ``sns.violinplot`` and ``sns.stripplot``.

    Raises:
        ValueError: if both or neither of ``labels`` / ``n_genes`` are given,
            or if ``side`` is not a supported value when ranking by ``n_genes``.
    """
    label_font_size = 9 * font_scale
    title_font_size = 10 * font_scale

    # Normalise to a list so the truthiness checks also work for arrays / Index
    labels = list(labels) if labels is not None else []

    if labels and n_genes:
        raise ValueError('Provide either labels or n_genes, not both!')

    if not labels and not n_genes:
        raise ValueError('Provide either labels or n_genes!')

    if not labels and side not in ('positive', 'negative', 'both'):
        # Previously this fell through every branch and failed later with an
        # unbound local variable.
        raise ValueError(f"side must be 'positive', 'negative' or 'both', got {side!r}")

    if 'name' not in df.columns:
        df['name'] = df.index.to_list()

    if labels:
        # Use the explicitly requested genes
        selected_genes = df.loc[labels]
    else:
        # Ranking needs '-log10(pvalue)'; compute it here instead of relying
        # on another function having added it to `df` beforehand.
        if '-log10(pvalue)' not in df.columns:
            df['-log10(pvalue)'] = - np.log10(df.pvalue)

        if side == 'positive':
            candidates = df.query('log2FoldChange > 0')
        elif side == 'negative':
            candidates = df.query('log2FoldChange < 0')
        else:
            candidates = df
        selected_genes = candidates.nlargest(n_genes, '-log10(pvalue)')

    # Filter the single-cell dataset for the selected genes
    subset_adata = adata[:, selected_genes.index].copy()
    # Keep only the part of each variable name before the first underscore
    subset_adata.var.index = subset_adata.var.index.str.split('_').str[0]

    # Convert the subset of adata to a DataFrame
    subset_df = subset_adata.to_df(layer=layer)

    # Merge the DataFrame with .obs to include the 'sample' information
    merged_df = pd.merge(subset_df, adata.obs[['sample']], left_index=True, right_index=True)

    # Melt the DataFrame to prepare for violin plot
    melted_df = pd.melt(merged_df, id_vars='sample', var_name='Gene', value_name='Counts')

    # Create a violin plot with the individual observations drawn on top
    plt.figure(figsize=figsize)
    sns.violinplot(x='Gene', y='Counts', hue='sample', data=melted_df, split=True, inner='quartile', palette='Set2', **kwargs)
    sns.stripplot(x='Gene', y='Counts', hue='sample', data=melted_df, dodge=True, jitter=True, color='black', size=1, alpha=0.3, **kwargs)

    plt.xticks(rotation=45, ha='right', fontsize=label_font_size)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=label_font_size)

    plt.title(title if title else 'Top Differentially Expressed Genes', fontsize=title_font_size)
    plt.show()
196
+
197
+
198
def write_top_DEGs(df, sample_id, result_dir='.', n_hits=200):
    """Write the ``n_hits`` rows with the largest ``-log10(pvalue)`` to a CSV file.

    Args:
        df: DataFrame with a ``pvalue`` column. Modified in place: the
            ``-log10(pvalue)`` column is (re)computed.
        sample_id: prefix of the output file name.
        result_dir: output directory; created if it does not exist.
        n_hits: number of rows to keep.

    Returns:
        Path of the written file, ``<result_dir>/<sample_id>_top_<n_hits>.csv``.
    """
    import os  # local import: only this function needs it

    df['-log10(pvalue)'] = - np.log10(df.pvalue)

    # Create the target directory up front instead of failing inside to_csv
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    out_path = os.path.join(result_dir, f'{sample_id}_top_{n_hits}.csv')

    df.nlargest(n_hits, '-log10(pvalue)').to_csv(out_path)
    return out_path
@@ -0,0 +1,74 @@
1
+ import pandas as pd
2
+ import scanpy as sc
3
+ import scar
4
+
5
+
6
def normalization(adata, target_sum=1e4, max_value=10, final_layer='scaled', keep_initial_layer=True):
    """Populate normalised layers on ``adata`` and point ``adata.X`` at one of them.

    Args:
        adata: AnnData modified in place.
        target_sum: forwarded to ``sc.pp.normalize_total``.
        max_value: forwarded to ``sc.pp.scale``.
        final_layer: name of the layer assigned to ``adata.X`` at the end.
        keep_initial_layer: ``True`` stores a copy of the incoming ``adata.X``
            in ``layers['raw_counts']``; a ``str`` stores it under that layer
            name; any other value stores nothing.

    Side effects:
        Writes ``layers['log1p_norm']`` and ``layers['scaled']`` and replaces
        ``adata.X``. Returns ``None``.
    """
    # Decide under which layer name (if any) the incoming matrix is preserved
    if keep_initial_layer == True:
        backup_name = 'raw_counts'
    elif type(keep_initial_layer) is str:
        backup_name = keep_initial_layer
    else:
        backup_name = None

    if backup_name is not None:
        adata.layers[backup_name] = adata.X.copy()

    # Normalise counts to target_sum without touching adata.X (inplace=False)
    normalised = sc.pp.normalize_total(adata, target_sum=target_sum, inplace=False)

    # log1p transform of the normalised matrix
    adata.layers["log1p_norm"] = sc.pp.log1p(normalised["X"], copy=True)

    # NOTE(review): scaling is applied to `adata` itself, whose X has not been
    # replaced by the normalised values above — confirm this is intended.
    scaled_copy = sc.pp.scale(adata, max_value=max_value, copy=True)
    adata.layers['scaled'] = scaled_copy.X

    # Expose the requested layer as the main matrix
    adata.X = adata.layers[final_layer]
20
+
21
+
22
def remove_ambient_rna(adata_filtered_feature_bc, adata_raw_feature_bc):
    """Denoise the filtered matrix with scar and return an annotated copy.

    Args:
        adata_filtered_feature_bc: filtered AnnData; passed to
            ``scar.setup_anndata`` and used as the model input.
        adata_raw_feature_bc: raw AnnData passed to ``scar.setup_anndata``.

    Returns:
        A copy of ``adata_filtered_feature_bc`` with two extra layers:
        ``raw_counts`` (the copy's ``X``) and ``scar_denoised_counts`` (the
        model's ``native_counts``).
    """
    filtered = adata_filtered_feature_bc

    scar.setup_anndata(
        adata=filtered,
        raw_adata=adata_raw_feature_bc,
        prob=0.995,
        kneeplot=True,
    )

    # NOTE(review): the input is handed over as a DataFrame, while the original
    # comment says scar picks up the ambient profile from adata.uns when given
    # an AnnData — confirm the estimated profile is actually used here.
    denoiser = scar.model(
        raw_count=filtered.to_df(),
        # ambient_profile=adata_filtered_feature_bc.uns['ambient_profile_Gene Expression'],
        feature_type='mRNA',
        sparsity=1,
        # device=device # Both cpu and cuda are supported.
    )

    denoiser.train(epochs=200, batch_size=64, verbose=True)

    # Infer the native signal after training; by default batch_size = None,
    # a batch size is set here to limit memory use
    denoiser.inference(batch_size=256)

    denoised_table = pd.DataFrame(
        denoiser.native_counts,
        index=filtered.obs_names,
        columns=filtered.var_names,
    )

    result = filtered.copy()
    result.layers['raw_counts'] = result.X
    result.layers['scar_denoised_counts'] = denoised_table.to_numpy()

    return result
58
+
59
+
60
def clustering(
        adata
):
    """Placeholder for a clustering step; currently does nothing and returns None."""
    # Draft implementation kept for reference (not active):
    # , n_pcs=50, n_neighbors=30, use_highly_variable='Yes',
    # use_rep=None, resolution=None

    # if use_highly_variable == 'Yes':
    #     sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    #     sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
    # else:
    #     sc.pp.pca(adata, n_comps=n_pcs)
    # sc.pp.neighbors(adata, use_rep=use_rep, n_neighbors=n_neighbors)#, n_pcs=n_pcs)
    # sc.tl.umap(adata)
    # sc.tl.leiden(adata, resolution=resolution)
    return None
@@ -0,0 +1,44 @@
1
+ from itertools import product
2
+ import matplotlib.pyplot as plt
3
+ import scanpy as sc
4
+ import numpy as np
5
+
6
+
7
def optimising_umap_layout(adata, cluster_key='leiden',MIN_DISTS = [0.1, 1, 2], SPREADS = [0.5, 1, 5]):
    """Plot a grid of UMAPs, one per (min_dist, spread) combination.

    Based on https://scanpy-tutorials.readthedocs.io/en/latest/plotting/advanced.html

    Args:
        adata: AnnData to embed; it is copied, so the UMAP stored in the
            caller's object is not modified.
        cluster_key: ``obs`` key used to colour the cells.
        MIN_DISTS: ``min_dist`` values, one grid row each.
        SPREADS: ``spread`` values, one grid column each.

    The figure is shown and then closed; nothing is returned.
    """
    # Copy adata not to modify UMAP in the original adata object
    adata_temp = adata.copy()

    # Create grid of plots, with a little extra room for the legends.
    # squeeze=False keeps `axes` two-dimensional even when a parameter list
    # has a single entry, so axes[i][j] is always valid.
    fig, axes = plt.subplots(
        len(MIN_DISTS), len(SPREADS),
        figsize=(len(SPREADS) * 3 + 2, len(MIN_DISTS) * 3),
        squeeze=False,
    )

    # Loop through the UMAP parameters, recomputing and replotting UMAP for each pair
    for (i, min_dist), (j, spread) in product(enumerate(MIN_DISTS), enumerate(SPREADS)):
        ax = axes[i][j]
        param_str = " ".join(["min_dist =", str(min_dist), "and spread =", str(spread)])
        # Recompute UMAP with new parameters
        sc.tl.umap(adata_temp, min_dist=min_dist, spread=spread)
        # Create plot, placing it in grid
        sc.pl.umap(
            adata_temp,
            color=[cluster_key],
            title=param_str,
            s=40,
            ax=ax,
            show=False,
        )
    plt.tight_layout()
    plt.show()
    plt.close()
    del adata_temp
36
+
37
+
38
def random_ordering(adata, seed=0):
    """Return a reproducible random permutation of the row indices of ``adata``.

    Args:
        adata: object whose ``shape[0]`` gives the number of rows (cells).
        seed: seed of the private random generator. The default ``0`` yields
            the same permutation as seeding the global NumPy generator with 0.

    Returns:
        1-D integer array containing each index in ``range(adata.shape[0])``
        exactly once, in random order.
    """
    # A private legacy generator gives the same stream as np.random.seed(seed)
    # without resetting the global NumPy random state for the caller.
    rng = np.random.RandomState(seed)
    random_indices = rng.permutation(np.arange(adata.shape[0]))

    return random_indices
File without changes
@@ -0,0 +1,19 @@
1
+ import unittest
2
+ import matplotlib.pyplot as plt
3
+ import scanpy as sc
4
+ import scez
5
+ import tomli
6
+
7
+ with open('pyproject.toml', 'rb') as f:
8
+ toml_dict = tomli.load(f)
9
+ version = toml_dict['project']['version']
10
+
11
class TestScezConfig(unittest.TestCase):
    """Checks on the package-level configuration applied when scez is imported."""

    def test_version(self):
        """The package version matches the one read from pyproject.toml."""
        reported = scez.__version__
        self.assertEqual(reported, version)

    def test_scanpy_settings(self):
        """The scanpy verbosity setting equals 1."""
        current_verbosity = sc.settings.verbosity
        self.assertEqual(current_verbosity, 1)
17
+
18
+ if __name__ == '__main__':
19
+ unittest.main()
@@ -0,0 +1,64 @@
1
+ import pandas as pd
2
+ from matplotlib import pyplot as plt
3
+ from adjustText import adjust_text
4
+
5
+
6
def rank_genes_to_df(adata, n=50):
    """Tabulate the first ``n`` ranked genes and scores of every group.

    Args:
        adata: object whose ``uns['rank_genes_groups']`` holds ``names`` and
            ``scores`` record arrays with one field per group.
        n: number of leading rows to keep.

    Returns:
        DataFrame with columns ``<group>_names`` and ``<group>_scores`` for
        each group, in group order, truncated to the first ``n`` rows.
    """
    ranking = adata.uns['rank_genes_groups']

    # Group names are the field names of the structured 'names' array
    columns = {}
    for group in ranking['names'].dtype.names:
        for field in ('names', 'scores'):
            columns[group + '_' + field] = ranking[field][group]

    return pd.DataFrame(columns).head(n)
16
+
17
+
18
def add_marker_feature(adata, marker, marker_name, clusters_name, thr = 0, figsize=(10, 4)):
    """Label cells as marker-positive/negative and plot the counts per cluster.

    Args:
        adata: AnnData; ``adata.obs[marker_name]`` is (over)written in place.
        marker: variable (column of ``adata.to_df()``) used for thresholding.
        marker_name: name of the ``obs`` column receiving ``'<marker>+'`` /
            ``'<marker>-'``.
        clusters_name: ``obs`` column holding the cluster assignment.
        thr: cells with a value ``> thr`` are positive, ``<= thr`` negative.
        figsize: size of the bar plot.

    Returns:
        The matplotlib axes of the stacked bar plot (log-scaled y axis), with
        clusters sorted by decreasing percentage of positive cells and that
        percentage written above the first bars.
    """
    positive = f'{marker}+'
    negative = f'{marker}-'

    # Convert to a DataFrame once; both thresholds use the same column
    values = adata.to_df().loc[:, marker]

    adata.obs[marker_name] = ''
    adata.obs.loc[values <= thr, marker_name] = negative
    adata.obs.loc[values > thr, marker_name] = positive

    # Cluster x marker-class count table. Missing combinations become 0 and
    # both columns always exist, so a marker with no positive (or no negative)
    # cells no longer raises a KeyError.
    df = (
        adata.obs.groupby([marker_name, clusters_name]).size()
        .unstack(level=0, fill_value=0)
        .reindex(columns=[positive, negative], fill_value=0)
        .rename_axis(columns=None)
    )

    # Percentage of positive cells per cluster, used as bar labels
    labels = df[positive] / df.sum(axis=1) * 100
    labels = labels.round(decimals=1)
    labels.sort_values(ascending=False, inplace=True)
    df = df.loc[labels.index,]

    ax = df.plot.bar(stacked=True, rot=0, figsize=figsize)

    rects = ax.patches

    # zip stops after len(labels) patches, so only the first bar of each
    # cluster receives a label
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(
            rect.get_x() + rect.get_width() / 2, height + 5, str(label) + "%",
            ha="center", va="bottom", fontsize=8
        )

    ax.set_yscale('log')
    ax.set_ylabel('# of cells')
    return ax
49
+
50
+
51
def run_adjust_text(x, y, labels, ax=None, use_arrow=True, font_weight='bold', font_size=8):
    """Add text labels at (x, y) and let adjustText move them apart.

    Args:
        x: x coordinates (any iterable, including a pandas Series).
        y: y coordinates, same length as ``x``.
        labels: label strings, at least as many as there are points.
        ax: axes to annotate; the current axes are used when ``None``.
        use_arrow: whether to draw a red arrow for each label.
        font_weight: font weight of the labels.
        font_size: font size of the labels.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length or there are fewer
            labels than points.
    """
    # Materialise as lists so access is positional, whatever index a pandas
    # Series argument carries
    xs, ys, names = list(x), list(y), list(labels)
    if len(xs) != len(ys) or len(names) < len(xs):
        raise ValueError('x, y and labels must describe the same points')

    # Draw on the requested axes rather than on whatever pyplot considers current
    target_ax = ax if ax is not None else plt.gca()

    texts = [
        target_ax.text(
            x_pos, y_pos,
            name,
            fontdict={'weight': font_weight, 'size': font_size},
            ha='center', va='center'
        ) for x_pos, y_pos, name in zip(xs, ys, names)
    ]

    if use_arrow:
        adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'), ax = target_ax)
    else:
        adjust_text(texts, ax = target_ax)
@@ -0,0 +1,26 @@
1
+ name: scez
2
+ channels:
3
+ - anaconda
4
+ - conda-forge
5
+ - bioconda
6
+ dependencies:
7
+ - python>=3.9
8
+ - scanpy
9
+ - pertpy
10
+ - python-igraph
11
+ - leidenalg
12
+ - anndata
13
+ - scipy
14
+ - scar
15
+ - scikit-learn
16
+ - matplotlib
17
+ - ipykernel
18
+ - mscorefonts
19
+ - pip
20
+ - pip:
21
+ - numba
22
+ - adpbulk
23
+ - pydeseq2
24
+ - adjustText
25
+ - watermark
26
+ - python=3.12