fastccc 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastccc-0.1.1/LICENSE +21 -0
- fastccc-0.1.1/PKG-INFO +91 -0
- fastccc-0.1.1/README.md +70 -0
- fastccc-0.1.1/fastccc/0.01index_mat.npy +0 -0
- fastccc-0.1.1/fastccc/__init__.py +8 -0
- fastccc-0.1.1/fastccc/build_reference.py +455 -0
- fastccc-0.1.1/fastccc/cauchy_combine.py +55 -0
- fastccc-0.1.1/fastccc/ccc_utils.py +75 -0
- fastccc-0.1.1/fastccc/core.py +933 -0
- fastccc-0.1.1/fastccc/dist_complex.py +91 -0
- fastccc-0.1.1/fastccc/dist_iid_set.py +61 -0
- fastccc-0.1.1/fastccc/dist_lr.py +43 -0
- fastccc-0.1.1/fastccc/distrib.py +543 -0
- fastccc-0.1.1/fastccc/distrib_digit.py +266 -0
- fastccc-0.1.1/fastccc/hk_genes.py +3806 -0
- fastccc-0.1.1/fastccc/infer_query.py +735 -0
- fastccc-0.1.1/fastccc/preproc_utils.py +14 -0
- fastccc-0.1.1/fastccc/preprocess.py +274 -0
- fastccc-0.1.1/fastccc/score.py +185 -0
- fastccc-0.1.1/fastccc/visualize.py +104 -0
- fastccc-0.1.1/pyproject.toml +30 -0
fastccc-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 S. Hou
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fastccc-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fastccc
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Author: Svvord
|
|
7
|
+
Author-email: cchousiyu@163.com
|
|
8
|
+
Requires-Python: >=3.11,<3.13
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Dist: gseapy (>=1.1.4,<2.0.0)
|
|
13
|
+
Requires-Dist: loguru (>=0.7.2,<0.8.0)
|
|
14
|
+
Requires-Dist: numpy (>=1.24,<2.0)
|
|
15
|
+
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
16
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
17
|
+
Requires-Dist: psutil (>=6.1.1,<7.0.0)
|
|
18
|
+
Requires-Dist: scanpy (>=1.10.3,<2.0.0)
|
|
19
|
+
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# FastCCC: A permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
[](https://doi.org/10.1038/s41467-025-66272-z)
|
|
26
|
+
[](https://svvord.github.io/FastCCC/)
|
|
27
|
+
[](https://github.com/Svvord/FastCCC/blob/main/LICENSE)
|
|
28
|
+
|
|
29
|
+
**[2025.02.01]** Update: To minimize the size of transmitted panel data, we leverage FastCCC’s speed to compute essential reference data during first-time usage. This process incurs only an additional 1–2 minutes during initial activation. Meanwhile, the storage requirement for uploading the panel data has been significantly reduced (from 3GB to 5MB per tissue panel).
|
|
30
|
+
|
|
31
|
+
**[2025.01.23]** We have provided a comprehensive [tutorial](https://svvord.github.io/FastCCC/) on the usage of FastCCC, which includes detailed instructions on installation, usage, and more. We highly recommend referring to this [tutorial](https://svvord.github.io/FastCCC/) for a step-by-step guide.
|
|
32
|
+
|
|
33
|
+
## Overview
|
|
34
|
+

|
|
35
|
+
<p align="justify"> Detecting cell-cell communications (CCCs) in single-cell transcriptomics studies is fundamental for understanding the function of multicellular organisms. Here, we introduce FastCCC, a permutation-free framework that enables scalable, robust, and reference-based analysis for identifying critical CCCs and uncovering biological insights. FastCCC relies on fast Fourier transformation-based convolution to compute $p$-values analytically without permutations, introduces a modular algebraic operation framework to capture a broad spectrum of CCC patterns, and can leverage atlas-scale single cell references to enhance CCC analysis on user-collected datasets. To support routine reference-based CCC analysis, we constructed the first human CCC reference panel, encompassing 19 distinct tissue types, over 450 unique cell types, and approximately 16 million cells. We demonstrate the advantages of FastCCC across multiple datasets, most of which exceed the analytical capabilities of existing CCC methods. In real datasets, FastCCC reliably captures biologically meaningful CCCs, even in highly complex tissue environments, including differential interactions between endothelial and immune cells linked to COVID-19 severity, dynamic communications in thymic tissue during T-cell development, as well as distinct interactions in reference-based CCC analysis. </p>
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
### Method 1: Installing via conda
|
|
39
|
+
You can install the environment using Conda by following the steps:
|
|
40
|
+
```bash
|
|
41
|
+
conda create -n FastCCC python=3.11
|
|
42
|
+
conda activate FastCCC
|
|
43
|
+
```
|
|
44
|
+
Get FastCCC from github:
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/Svvord/FastCCC.git
|
|
47
|
+
```
|
|
48
|
+
Go to the folder `FastCCC` and install:
|
|
49
|
+
```bash
|
|
50
|
+
cd ./FastCCC
|
|
51
|
+
pip install -e .
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Method 2: Installing via pip
|
|
55
|
+
We are currently organizing the code and packaging functionalities to enhance user convenience. Once the code is finalized, we will upload it to PyPI to support installation via pip install. At this stage, please use the code available on GitHub and install it using Conda or Poetry.
|
|
56
|
+
```bash
|
|
57
|
+
pip install # coming soon.
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Method 3: Installing developing version via Poetry
|
|
61
|
+
For development, we use the [Poetry](https://python-poetry.org/) package manager. To install Poetry, follow the instructions [here](https://python-poetry.org/docs/#installing-with-pipx).
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/Svvord/FastCCC.git
|
|
64
|
+
cd ./FastCCC
|
|
65
|
+
poetry install
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## How to use `FastCCC`
|
|
69
|
+
Check our [vignettes](https://svvord.github.io/FastCCC/).
|
|
70
|
+
|
|
71
|
+
## Citing the work
|
|
72
|
+
If you find the `FastCCC` package or any of the source code in this repository useful for your work, please [cite](https://www.biorxiv.org/content/10.1101/2025.01.27.635115v1):
|
|
73
|
+
|
|
74
|
+
> Hou, S., Ma, W. & Zhou, X. FastCCC: a permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies. Nat Commun 16, 11428 (2025). https://doi.org/10.1038/s41467-025-66272-z
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
@article{hou_fastccc_2025,
|
|
78
|
+
title = {{FastCCC}: a permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies},
|
|
79
|
+
author = {Hou, Siyu and Ma, Wenjing and Zhou, Xiang},
|
|
80
|
+
journal = {Nature Communications},
|
|
81
|
+
volume = {16},
|
|
82
|
+
year = {2025},
|
|
83
|
+
eid = {11428},
|
|
84
|
+
doi = {10.1038/s41467-025-66272-z},
|
|
85
|
+
url = {https://www.nature.com/articles/s41467-025-66272-z}
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
Visit our [group website](https://xiangzhou.github.io/) for more statistical
|
|
91
|
+
tools on analyzing genetics, genomics and transcriptomics data.
|
fastccc-0.1.1/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# FastCCC: A permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
[](https://doi.org/10.1038/s41467-025-66272-z)
|
|
5
|
+
[](https://svvord.github.io/FastCCC/)
|
|
6
|
+
[](https://github.com/Svvord/FastCCC/blob/main/LICENSE)
|
|
7
|
+
|
|
8
|
+
**[2025.02.01]** Update: To minimize the size of transmitted panel data, we leverage FastCCC’s speed to compute essential reference data during first-time usage. This process incurs only an additional 1–2 minutes during initial activation. Meanwhile, the storage requirement for uploading the panel data has been significantly reduced (from 3GB to 5MB per tissue panel).
|
|
9
|
+
|
|
10
|
+
**[2025.01.23]** We have provided a comprehensive [tutorial](https://svvord.github.io/FastCCC/) on the usage of FastCCC, which includes detailed instructions on installation, usage, and more. We highly recommend referring to this [tutorial](https://svvord.github.io/FastCCC/) for a step-by-step guide.
|
|
11
|
+
|
|
12
|
+
## Overview
|
|
13
|
+

|
|
14
|
+
<p align="justify"> Detecting cell-cell communications (CCCs) in single-cell transcriptomics studies is fundamental for understanding the function of multicellular organisms. Here, we introduce FastCCC, a permutation-free framework that enables scalable, robust, and reference-based analysis for identifying critical CCCs and uncovering biological insights. FastCCC relies on fast Fourier transformation-based convolution to compute $p$-values analytically without permutations, introduces a modular algebraic operation framework to capture a broad spectrum of CCC patterns, and can leverage atlas-scale single cell references to enhance CCC analysis on user-collected datasets. To support routine reference-based CCC analysis, we constructed the first human CCC reference panel, encompassing 19 distinct tissue types, over 450 unique cell types, and approximately 16 million cells. We demonstrate the advantages of FastCCC across multiple datasets, most of which exceed the analytical capabilities of existing CCC methods. In real datasets, FastCCC reliably captures biologically meaningful CCCs, even in highly complex tissue environments, including differential interactions between endothelial and immune cells linked to COVID-19 severity, dynamic communications in thymic tissue during T-cell development, as well as distinct interactions in reference-based CCC analysis. </p>
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
### Method 1: Installing via conda
|
|
18
|
+
You can install the environment using Conda by following the steps:
|
|
19
|
+
```bash
|
|
20
|
+
conda create -n FastCCC python=3.11
|
|
21
|
+
conda activate FastCCC
|
|
22
|
+
```
|
|
23
|
+
Get FastCCC from github:
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/Svvord/FastCCC.git
|
|
26
|
+
```
|
|
27
|
+
Go to the folder `FastCCC` and install:
|
|
28
|
+
```bash
|
|
29
|
+
cd ./FastCCC
|
|
30
|
+
pip install -e .
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Method 2: Installing via pip
|
|
34
|
+
We are currently organizing the code and packaging functionalities to enhance user convenience. Once the code is finalized, we will upload it to PyPI to support installation via pip install. At this stage, please use the code available on GitHub and install it using Conda or Poetry.
|
|
35
|
+
```bash
|
|
36
|
+
pip install # coming soon.
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Method 3: Installing developing version via Poetry
|
|
40
|
+
For development, we use the [Poetry](https://python-poetry.org/) package manager. To install Poetry, follow the instructions [here](https://python-poetry.org/docs/#installing-with-pipx).
|
|
41
|
+
```bash
|
|
42
|
+
git clone https://github.com/Svvord/FastCCC.git
|
|
43
|
+
cd ./FastCCC
|
|
44
|
+
poetry install
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## How to use `FastCCC`
|
|
48
|
+
Check our [vignettes](https://svvord.github.io/FastCCC/).
|
|
49
|
+
|
|
50
|
+
## Citing the work
|
|
51
|
+
If you find the `FastCCC` package or any of the source code in this repository useful for your work, please [cite](https://www.biorxiv.org/content/10.1101/2025.01.27.635115v1):
|
|
52
|
+
|
|
53
|
+
> Hou, S., Ma, W. & Zhou, X. FastCCC: a permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies. Nat Commun 16, 11428 (2025). https://doi.org/10.1038/s41467-025-66272-z
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
@article{hou_fastccc_2025,
|
|
57
|
+
title = {{FastCCC}: a permutation-free framework for scalable, robust, and reference-based cell-cell communication analysis in single cell transcriptomics studies},
|
|
58
|
+
author = {Hou, Siyu and Ma, Wenjing and Zhou, Xiang},
|
|
59
|
+
journal = {Nature Communications},
|
|
60
|
+
volume = {16},
|
|
61
|
+
year = {2025},
|
|
62
|
+
eid = {11428},
|
|
63
|
+
doi = {10.1038/s41467-025-66272-z},
|
|
64
|
+
url = {https://www.nature.com/articles/s41467-025-66272-z}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
Visit our [group website](https://xiangzhou.github.io/) for more statistical
|
|
70
|
+
tools on analyzing genetics, genomics and transcriptomics data.
|
|
Binary file
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import sys
from loguru import logger

# Package-wide logging setup: drop loguru's default sink and route everything
# to stdout at INFO level with a uniform timestamped, colorized format.
logger.remove()
logger.add(sys.stdout, level="INFO", format='<cyan>{time:YYYY-MM-DD HH:mm:ss}</cyan> | <level>{level: <8}</level> | <level>{message}</level>')

# Public entry points re-exported at package level.
from .core import Cauchy_combination_of_statistical_analysis_methods
from .core import statistical_analysis_method
|
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
import scanpy as sc
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from scipy.sparse import issparse
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from .preprocess import get_interactions
|
|
8
|
+
from . import preproc_utils
|
|
9
|
+
from .core import calculate_cluster_percents, analyze_interactions_percents
|
|
10
|
+
from .distrib_digit import Distribution_digit, get_pmf_array_from_samples_for_digitized_bins, get_minimum_distribution_for_digit
|
|
11
|
+
from . import dist_complex
|
|
12
|
+
from . import dist_lr
|
|
13
|
+
from . import score
|
|
14
|
+
import itertools
|
|
15
|
+
from scipy.signal import fftconvolve
|
|
16
|
+
from collections import Counter
|
|
17
|
+
import os
|
|
18
|
+
import pickle
|
|
19
|
+
|
|
20
|
+
def digitize_transform(x, n_bins=50):
    """Map values to quantile-bin ranks with randomized tie-breaking.

    The input ``x`` is expected to already contain only the nonzero entries
    (e.g. the ``.data`` array of a CSR/COO sparse row), so zeros never enter
    the quantile computation.

    Parameters
    ----------
    x : 1-D np.ndarray of expression values.
    n_bins : number of rank bins; ``n_bins - 1`` quantile edges are used.

    Returns
    -------
    np.ndarray of int64 bin indices, one per input value.
    """
    def _rank_bins(values: np.ndarray, edges: np.ndarray, side="both") -> np.ndarray:
        assert values.ndim == 1 and edges.ndim == 1
        lo = np.digitize(values, edges)
        if side == "one":
            return lo
        # Ties spanning several identical edges get a uniformly random rank
        # between the left and right insertion points.
        hi = np.digitize(values, edges, right=True)
        jitter = np.random.rand(len(values))
        return np.ceil(jitter * (hi - lo) + lo).astype(np.int64)

    edges = np.quantile(x, np.linspace(0, 1, n_bins - 1))
    return _rank_bins(x, edges)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def calculate_L_R_and_IS_score(mean_counts, interactions):
    """Compute ligand/receptor cluster means and the interaction-strength score.

    For every ordered pair of clusters (sender|receiver), looks up the sender's
    mean for the ligand (``multidata_1_id``) and the receiver's mean for the
    receptor (``multidata_2_id``); the score is the arithmetic mean of the two,
    zeroed whenever either side is not expressed.

    Returns (p1, p2, interactions_strength), each indexed by "sender|receiver"
    with one column per interaction.
    """
    clusters = sorted(mean_counts.index)
    sender_idx, receiver_idx, pair_labels = [], [], []
    for sender, receiver in itertools.product(clusters, clusters):
        sender_idx.append(sender)
        receiver_idx.append(receiver)
        pair_labels.append('|'.join((sender, receiver)))

    p1 = mean_counts.loc[sender_idx, interactions['multidata_1_id']]
    p2 = mean_counts.loc[receiver_idx, interactions['multidata_2_id']]
    p1.columns = interactions.index
    p2.columns = interactions.index
    p1.index = pair_labels
    p2.index = pair_labels

    # Arithmetic mean, masked to zero when either partner's mean is 0.
    interactions_strength = (p1 + p2) / 2 * (p1 > 0) * (p2 > 0)
    return p1, p2, interactions_strength
|
|
58
|
+
|
|
59
|
+
def calculate_L_R_and_IS_percents(cluster_percents, interactions, threshold=0.1, sep='|'):
    """Build the percent-expressed mask for every cluster pair and interaction.

    For each ordered cluster pair, an interaction passes only when both the
    ligand (in the sender) and the receptor (in the receiver) are expressed in
    more than ``threshold`` of the cluster's cells.

    Returns (p1, p2, interactions_strength) where ``interactions_strength`` is
    the boolean pass/fail mask, indexed by "sender{sep}receiver".
    """
    clusters = sorted(cluster_percents.index)
    lig_idx, rec_idx, pair_labels = [], [], []
    for pair in itertools.product(clusters, clusters):
        lig_idx.append(pair[0])
        rec_idx.append(pair[1])
        pair_labels.append(sep.join(pair))

    p1 = cluster_percents.loc[lig_idx, interactions['multidata_1_id']]
    p2 = cluster_percents.loc[rec_idx, interactions['multidata_2_id']]
    p1.columns = interactions.index
    p2.columns = interactions.index
    p1.index = pair_labels
    p2.index = pair_labels

    # Both partners must clear the expression-percentage threshold.
    interactions_strength = (p1 > threshold) * (p2 > threshold)
    return p1, p2, interactions_strength
|
|
80
|
+
|
|
81
|
+
def calculate_mean_pmfs(counts_df, labels_df, complex_table, gene_pmf_dict, n_fft=100):
    """Assemble the per-cluster null distribution of each gene's cluster mean.

    Small clusters (fewer than ``n_fft`` cells) use the precomputed exact
    n-fold convolution stored in ``gene_pmf_dict[gene][n]``; larger clusters
    fall back to the closed-form power/rescale on the single-cell distribution.
    Complex subunits are then combined via the minimum-distribution rule.
    """
    cluster_sizes = Counter(labels_df.cell_type)
    per_cluster = {}
    for cluster in sorted(cluster_sizes):
        size = cluster_sizes[cluster]
        dists = {}
        for gene in counts_df.columns:
            # Genes without a precomputed PMF are silently skipped.
            if gene not in gene_pmf_dict:
                continue
            if size < n_fft:
                dists[gene] = gene_pmf_dict[gene][size]
            else:
                dists[gene] = gene_pmf_dict[gene][1] ** size / size
        per_cluster[cluster] = dists

    mean_pmfs = pd.DataFrame(per_cluster).T
    return dist_complex.combine_complex_distribution_df(
        mean_pmfs, complex_table, get_minimum_distribution_for_digit
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def rank_preprocess(adata):
    """Rank-transform every cell's nonzero counts in place on ``adata.X.data``.

    Each cell's nonzero values are replaced by quantile-bin ranks via
    ``digitize_transform``; zeros in the sparse matrix are untouched.
    Returns the same AnnData object.
    """
    np.random.seed(42)  # fixed seed so the randomized tie-breaking is reproducible
    assert issparse(adata.X), "Anndata.X should be a sparse matrix format."
    if adata.shape[1] < 5000:
        logger.warning("Do you use whole transcriptomes? Raw data w\o filtering genes should work better.")

    progress = tqdm(range(adata.shape[0]), desc="Ranking genes for cells", unit="cell",
                    bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} cells completed", leave=False)
    for row in progress:
        # CSR row slice: the nonzero entries of cell `row`.
        span = slice(adata.X.indptr[row], adata.X.indptr[row + 1])
        adata.X.data[span] = digitize_transform(adata.X.data[span])
    logger.success("Rank preprocess done.")
    return adata
|
|
119
|
+
|
|
120
|
+
def get_fastccc_input(adata, lrdb_file_path, convert_type = 'hgnc_symbol'):
    """Load the LRI database and align it with the AnnData gene space.

    Reads the gene/protein/complex/interaction tables from ``lrdb_file_path``,
    maps AnnData gene symbols to protein multidata ids, and filters the counts,
    complex table, and interactions down to mutually-supported entries.

    Parameters
    ----------
    adata : AnnData whose var_names are gene symbols of type ``convert_type``.
    lrdb_file_path : directory containing the LRI database CSV tables.
    convert_type : gene-name column used for matching (default 'hgnc_symbol').

    Returns
    -------
    (counts_df, complex_table, interactions): cells x proteins DataFrame,
    filtered complex composition table, and filtered interaction table.
    """
    logger.info("Loading LRIs database. hgnc_symbol as gene name is requested.")
    interactions = get_interactions(lrdb_file_path)
    ##### gene_table ########
    gene_table = pd.read_csv(os.path.join(lrdb_file_path, 'gene_table.csv'))
    protein_table = pd.read_csv(os.path.join(lrdb_file_path, 'protein_table.csv'))
    gene_table = gene_table.merge(protein_table, left_on='protein_id', right_on='id_protein')
    #########################


    ##### complex_table ######
    complex_composition = pd.read_csv(os.path.join(lrdb_file_path, 'complex_composition_table.csv'))
    complex_table = pd.read_csv(os.path.join(lrdb_file_path, 'complex_table.csv'))
    complex_table = complex_table.merge(complex_composition, left_on='complex_multidata_id', right_on='complex_multidata_id')

    # Keep only the two columns we need: 'complex_multidata_id', 'protein_multidata_id'
    complex_table = complex_table[['complex_multidata_id','protein_multidata_id']]
    '''
    complex_table(pandas.DataFrame):
    =======================================================
        | complex_multidata_id | protein_multidata_id
    -------------------------------------------------------
    0   |        1355          |        1134
    1   |        1356          |        1175
    2   |        1357          |        1167
    =======================================================
    '''
    ##########################

    ##### feature to id conversion ######
    # Genes not present in the database's standard list are dropped.
    tmp = gene_table[[convert_type, 'protein_multidata_id']]
    tmp = tmp.drop_duplicates()
    tmp.set_index('protein_multidata_id', inplace=True)

    select_columns = []
    columns_names = []
    for foo, boo in zip(tmp.index, tmp[convert_type]):
        if boo in adata.var_names:#counts.columns:
            select_columns.append(boo)
            columns_names.append(foo)

    # Rename gene-symbol columns to protein multidata ids; if several symbols
    # map to one id, average them.
    reduced_counts = adata[:, select_columns].to_df()
    reduced_counts.columns = columns_names
    reduced_counts = reduced_counts.T.groupby(reduced_counts.columns).mean().T
    # FutureWarning: DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.
    # reduced_counts = reduced_counts.groupby(reduced_counts.columns, axis=1).mean()
    ######################################

    ########## filter genes ############
    # Drop genes that are zero across all cells.
    reduced_counts = preproc_utils.filter_empty_genes(reduced_counts)
    ######################################

    ######################################################################
    #                        3.Other DF filtered                         #
    ######################################################################
    # An interaction may have only partA present but not partB.
    # If either part is missing, the other part need not join downstream
    # computation, so both sides are filtered together below.

    ##### delete item not involved interactions ####

    foo_dict = complex_table.groupby('complex_multidata_id').apply(lambda x: list(x['protein_multidata_id'].values), include_groups=False).to_dict()
    '''
    dictionary complex_id: [protein_id_1, pid2, pid3, ...]
    foo_dict = {
        1355: [1134],
        1356: [1175],
        xxxx: [AAAA, BBBB, CCCC],
    }
    '''

    def __content__(key):
        # A complex id expands to its subunit protein ids; a plain protein id
        # expands to itself.
        if key not in foo_dict:
            return [key]
        else:
            return foo_dict[key]

    def __exist__(key, df):
        # Current complex policy: ALL subunits must be present.
        # Tested to match the strategy CellPhoneDB uses.
        for item in __content__(key):
            if item not in df.columns:
                return False
        return True

    temp_list = []
    temp_dict = {}
    for item in reduced_counts.columns:
        temp_dict[item] = False

    # The commented-out print below was used to verify that this interaction
    # filtering is exactly consistent with the reference implementation.
    # print(interactions)
    select_index = []
    for partA, partB in zip(interactions.multidata_1_id, interactions.multidata_2_id):
        if __exist__(partA, reduced_counts) and __exist__(partB, reduced_counts):
            temp_list.extend([partA, partB])
            select_index.append(True)
        else:
            select_index.append(False)
    interactions_filtered = interactions[select_index]

    # Mark every protein column that participates in at least one surviving
    # interaction, then keep only those columns.
    for item in temp_list:
        for subitem in __content__(item):
            if subitem in temp_dict:
                temp_dict[subitem] = True
    select_index = [key for key in temp_dict if temp_dict[key]]
    reduced_counts = reduced_counts[select_index]

    counts_df = reduced_counts
    # Keep only complexes referenced by the surviving interactions.
    temp_list = set(temp_list)
    select_index = [True if item in temp_list else False for item in complex_table.complex_multidata_id]
    complex_table = complex_table[select_index]
    interactions = interactions_filtered
    logger.success("Requested data for fastccc is prepared.")
    return counts_df, complex_table, interactions
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def fastccc_for_reference(reference_name, save_path, counts_df, labels_df, complex_table, interactions, min_percentile = 0.1, ref_debug_mode=False, query_debug_mode=False, for_uploading=False):
    """Build a FastCCC reference panel and persist it under ``save_path``.

    Computes cluster means, expression percentages, and per-gene null
    distributions of the cluster mean (exact FFT convolutions for clusters
    smaller than ``n_fft`` cells, closed-form power otherwise), then pickles
    the resulting artifacts. In debug modes the full significance analysis is
    also run and its intermediate tables are written as TSV files.

    Parameters
    ----------
    reference_name : name of the reference (not used inside this body).
    save_path : output directory (must already exist).
    counts_df : cells x proteins rank-transformed counts.
    labels_df : DataFrame with a ``cell_type`` column, indexed by barcode.
    complex_table, interactions : filtered tables from get_fastccc_input().
    min_percentile : percent-expressed threshold for the LRI mask.
    ref_debug_mode, query_debug_mode : dump p-values / intermediates.
    for_uploading : if True, save only the compact per-gene summary
        (``basic_info_dict``) instead of the full precomputed reference.
    """
    logger.info("Running FastCCC.")
    mean_counts = score.calculate_cluster_mean(counts_df, labels_df)
    complex_func = score.calculate_complex_min_func
    mean_counts = score.combine_complex_distribution_df(mean_counts, complex_table, complex_func)
    percents = calculate_cluster_percents(counts_df, labels_df, complex_table)

    # Digitized PMF support: n_bins rank units at 0.01 resolution; the small
    # epsilon keeps the endpoint inclusive despite float rounding.
    n_bins = 50
    precision_digit = 0.01
    pmf_bins_digit = np.arange(0, n_bins+precision_digit - 1e-10, precision_digit)

    #######
    logger.info("Calculating null distributions.")
    # Exact n-fold convolutions are precomputed for sums of up to n_fft-1 cells.
    n_fft = 100
    gene_sum_pmf_dict = {}
    basic_info_dict = {}
    for gene in counts_df.columns:
        samples = counts_df[gene].values

        loc = np.mean(samples)
        scale = np.std(samples)
        basic_info_dict[gene] = {'loc':loc, 'scale':scale}

        # PMF of a single cell's expression; higher keys are sums of n cells.
        gene_sum_pmf_dict[gene] = {1: get_pmf_array_from_samples_for_digitized_bins(samples)}
        basic_info_dict[gene]['expr_dist'] = gene_sum_pmf_dict[gene][1]

        for item in range(2,n_fft):
            gene_sum_pmf_dict[gene][item] = fftconvolve(gene_sum_pmf_dict[gene][item-1], gene_sum_pmf_dict[gene][1])

    gene_pmf_dict = {}
    for gene in counts_df.columns:
        gene_pmf_dict[gene] = {}
        loc = basic_info_dict[gene]['loc']
        scale = basic_info_dict[gene]['scale']
        for item in range(1,n_fft):
            pmf = gene_sum_pmf_dict[gene][item]
            cdf = np.cumsum(pmf)
            # Resample the sum-of-`item`-cells distribution back onto the
            # digitized mean grid (divide support by `item` via index scaling).
            pmf_array = np.diff(cdf[np.int64(pmf_bins_digit * item)],prepend=0)
            if item == 1:
                gene_pmf_dict[gene][item] = Distribution_digit('other', pmf_array=pmf_array, loc=loc, scale=scale, is_align=True)
            else:
                gene_pmf_dict[gene][item] = Distribution_digit('other', pmf_array=pmf_array, is_align=True)

    if ref_debug_mode or query_debug_mode:
        # The next lines mirror calculate_L_R_and_IS_score() inline.
        p1_index = []
        p2_index = []
        all_index = []
        for i in itertools.product(sorted(mean_counts.index), sorted(mean_counts.index)):
            p1_index.append(i[0])
            p2_index.append(i[1])
            all_index.append('|'.join(i))
        p1 = mean_counts.loc[p1_index, interactions['multidata_1_id']]
        p2 = mean_counts.loc[p2_index, interactions['multidata_2_id']]
        p1.columns = interactions.index
        p2.columns = interactions.index
        p1.index = all_index
        p2.index = all_index
        interactions_strength = (p1 + p2)/2 * (p1 > 0) * (p2>0)
        L_perc, R_perc, percents_analysis = calculate_L_R_and_IS_percents(percents, interactions, threshold=min_percentile)

        meta_dict = Counter(labels_df.cell_type)
        ####### clusters_mean #######
        # Same construction as calculate_mean_pmfs(), but without the
        # missing-gene guard (every column is assumed to be in gene_pmf_dict).
        clusters_mean_dict = {}
        for celltype in sorted(meta_dict):
            clusters_mean_dict[celltype] = {}
            n_sum = meta_dict[celltype]
            if n_sum < n_fft:
                for gene in counts_df.columns:
                    clusters_mean_dict[celltype][gene] = gene_pmf_dict[gene][n_sum]
            else:
                for gene in counts_df.columns:
                    # NOTE(review): `** n_sum / n_sum` presumably means n-fold
                    # self-convolution then rescaling the sum to a mean via
                    # Distribution_digit operator overloads — confirm in distrib_digit.
                    clusters_mean_dict[celltype][gene] = gene_pmf_dict[gene][1] ** n_sum / n_sum
        mean_pmfs = pd.DataFrame(clusters_mean_dict).T
        complex_func = get_minimum_distribution_for_digit
        mean_pmfs = dist_complex.combine_complex_distribution_df(mean_pmfs, complex_table, complex_func)

        logger.info("Calculating sig. LRIs.")
        pvals = dist_lr.calculate_key_interactions_pvalue(
            mean_pmfs, interactions, interactions_strength, percents_analysis, method='Arithmetic'
        )

        if query_debug_mode:
            pvals.to_csv(f'{save_path}/debug_pvals.txt', sep='\t')
            return

        if ref_debug_mode:
            pvals.to_csv(f'{save_path}/ref_pvals.txt', sep='\t')
            percents_analysis.to_csv(f'{save_path}/ref_percents_analysis.txt', sep='\t')
            L_perc.to_csv(f'{save_path}/ref_percents_L.txt', sep='\t')
            R_perc.to_csv(f'{save_path}/ref_percents_R.txt', sep='\t')
            # interactions_strength.to_csv(f'{save_path}/ref_interactions_strength.csv')
            p1.to_csv(f'{save_path}/ref_interactions_strength_L.txt', sep='\t')
            p2.to_csv(f'{save_path}/ref_interactions_strength_R.txt', sep='\t')

    ####### save reference results #######
    logger.info("Saving reference.")
    if for_uploading:
        # Compact panel: only per-gene loc/scale/expression PMF; everything
        # else is recomputed on first use (see README 2025.02.01 note).
        with open(f'{save_path}/basic_info_dict.pkl', 'wb') as f:
            pickle.dump(basic_info_dict, f)
    else:
        with open(f'{save_path}/ref_gene_pmf_dict.pkl', 'wb') as f:
            pickle.dump(gene_pmf_dict, f)
        with open(f'{save_path}/ref_percents.pkl', 'wb') as f:
            pickle.dump(percents, f)
        with open(f'{save_path}/ref_mean_counts.pkl', 'wb') as f:
            pickle.dump(mean_counts, f)
        with open(f'{save_path}/complex_table.pkl', 'wb') as f:
            pickle.dump(complex_table, f)
        with open(f'{save_path}/interactions.pkl', 'wb') as f:
            pickle.dump(interactions, f)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def record_hk_genes(adata):
    """Average the expression of known housekeeping genes across all cells.

    Returns
    -------
    (mean_hk_rnk, present): the per-gene mean over cells, and the list of
    housekeeping genes actually found in ``adata.var_names`` (same order).
    """
    from .hk_genes import housekeeping_genes
    present = [gene for gene in adata.var_names if gene in housekeeping_genes]
    hk_view = adata[:, present]
    mean_hk_rnk = hk_view.X.mean(axis=0)
    return mean_hk_rnk, present
|
|
356
|
+
|
|
357
|
+
def record_adjustment_info(adata, save_path):
    """Write the housekeeping-gene reference profile to <save_path>/ref_hk.txt.

    The profile (one mean value per housekeeping gene present in ``adata``)
    is used later to adjust query data against this reference.
    """
    mean_hk_rnk, gene_index = record_hk_genes(adata)
    profile = np.array(mean_hk_rnk).flatten()
    ref_hk = pd.DataFrame(profile, index=gene_index, columns=['ref_hk'])
    ref_hk.to_csv(f'{save_path}/ref_hk.txt', sep='\t')
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# Module-level accumulator for reference-panel metadata. It is populated by
# build_reference_workflow() and serialized to config.toml by save_config().
reference_config = {}

from datetime import date
|
|
366
|
+
|
|
367
|
+
def dumps(toml_dict, table=""):
|
|
368
|
+
document = []
|
|
369
|
+
for key, value in toml_dict.items():
|
|
370
|
+
match value:
|
|
371
|
+
case dict():
|
|
372
|
+
table_key = f"{table}.{key}" if table else key
|
|
373
|
+
document.append(
|
|
374
|
+
f"\n[{table_key}]\n{_dumps_dict(value)}"
|
|
375
|
+
)
|
|
376
|
+
case _:
|
|
377
|
+
document.append(f"{key} = {_dumps_value(value)}")
|
|
378
|
+
return "\n".join(document)
|
|
379
|
+
|
|
380
|
+
def _dumps_dict(toml_dict):
|
|
381
|
+
document = []
|
|
382
|
+
for key, value in toml_dict.items():
|
|
383
|
+
key = f'"{key}"'
|
|
384
|
+
document.append(f"{key} = {_dumps_value(value)}")
|
|
385
|
+
return "\n".join(document)
|
|
386
|
+
|
|
387
|
+
def _dumps_value(value):
|
|
388
|
+
match value:
|
|
389
|
+
case bool():
|
|
390
|
+
return "true" if value else "false"
|
|
391
|
+
case float() | int():
|
|
392
|
+
return str(value)
|
|
393
|
+
case str():
|
|
394
|
+
return f'"{value}"'
|
|
395
|
+
case date():
|
|
396
|
+
return value.isoformat()
|
|
397
|
+
case list():
|
|
398
|
+
return f"[{', '.join(_dumps_value(v) for v in value)}]"
|
|
399
|
+
case _:
|
|
400
|
+
raise TypeError(
|
|
401
|
+
f"{type(value).__name__} {value!r} is not supported"
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
def save_config(save_path):
    """Serialize the module-level ``reference_config`` to <save_path>/config.toml."""
    logger.info("Saving reference config.")
    with open(f'{save_path}/config.toml', 'w') as f:
        f.write(dumps(reference_config))
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def build_reference_workflow(database_file_path, reference_counts_file_path, celltype_file_path, reference_name, save_path, meta_key=None, min_percentile = 0.1, debug_mode=False, for_uploading=False):
    """End-to-end construction of a named FastCCC CCC reference panel.

    Reads the reference AnnData, attaches cell-type labels (from ``meta_key``
    in ``adata.obs`` or from a TSV at ``celltype_file_path``), rank-transforms
    the counts, and builds + saves the reference and its config under
    ``save_path/reference_name``.

    Parameters
    ----------
    database_file_path : directory of the LRI database tables.
    reference_counts_file_path : path to the reference .h5ad file.
    celltype_file_path : TSV of cell-type labels; used only when meta_key is None.
    reference_name : name of the reference; also the output subdirectory name.
    save_path : parent output directory.
    meta_key : optional obs column holding cell types.
    min_percentile : percent-expressed threshold forwarded to the reference build.
    debug_mode : forwarded as ref_debug_mode to fastccc_for_reference().
    for_uploading : save only the compact uploadable panel.
    """
    logger.info(f"Start building CCC reference.")

    # Record run metadata in the module-level config (written out at the end).
    reference_config['reference_name'] = reference_name
    reference_config['min_percentile'] = min_percentile
    # Store only the database directory's basename, tolerating a trailing '/'.
    if database_file_path.endswith('/'):
        reference_config['LRI_database'] = database_file_path[:-1].split('/')[-1]
    else:
        reference_config['LRI_database'] = database_file_path.split('/')[-1]

    logger.info(f"Reference_name = {reference_config['reference_name']}")
    logger.info(f"min_percentile = {reference_config['min_percentile']}")
    logger.info(f"LRI database = {reference_config['LRI_database']}")

    save_path = os.path.join(save_path, reference_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        logger.success(f"Reference save dir {save_path} is created.")
    else:
        logger.warning(f"{save_path} already exists, all files will be overwritten")

    reference = sc.read_h5ad(reference_counts_file_path)
    # Drop near-empty cells before any statistics are computed.
    sc.pp.filter_cells(reference, min_genes=50)
    logger.info(f"Reading reference adata, {reference.shape[0]} cells x {reference.shape[1]} genes.")

    if meta_key is not None:
        # Cell types come from adata.obs; normalize to a one-column DataFrame.
        labels_df = pd.DataFrame(reference.obs[meta_key])
        labels_df.columns = ['cell_type']
        labels_df.index.name = 'barcode_sample'
    else:
        # Cell types come from an external TSV; every barcode must be covered.
        labels_df = pd.read_csv(celltype_file_path, sep='\t', index_col=0)
        for barcode in reference.obs_names:
            assert barcode in labels_df.index, "The index of query data doesn't match the index of labels"
        labels_df = labels_df.loc[reference.obs_names, :]

    ct_counter = Counter(labels_df['cell_type'])
    reference_config['celltype'] = ct_counter

    reference = rank_preprocess(reference)
    record_adjustment_info(reference, save_path)
    counts_df, complex_table, interactions = get_fastccc_input(reference, database_file_path)
    fastccc_for_reference(reference_name, save_path, counts_df, labels_df, complex_table, interactions, min_percentile, ref_debug_mode=debug_mode, for_uploading=for_uploading)
    save_config(save_path)
    logger.success(f"Reference '{reference_name}' is built.")
|
|
455
|
+
|