BAITS 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- baits-0.1/.gitignore +1 -0
- baits-0.1/.readthedocs.yaml +22 -0
- baits-0.1/BAITS/.ipynb_checkpoints/__init__-checkpoint.py +5 -0
- baits-0.1/BAITS/VDJ/__init__.py +3 -0
- baits-0.1/BAITS/VDJ/pl/__init__.py +1 -0
- baits-0.1/BAITS/VDJ/pl/basic_plot.py +97 -0
- baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/__init__-checkpoint.py +4 -0
- baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/bcr_cluster-checkpoint.py +69 -0
- baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/qc-checkpoint.py +44 -0
- baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/summarize_BCR-checkpoint.py +71 -0
- baits-0.1/BAITS/VDJ/tl/__init__.py +4 -0
- baits-0.1/BAITS/VDJ/tl/bcr_cluster.py +69 -0
- baits-0.1/BAITS/VDJ/tl/bcr_desc.py +72 -0
- baits-0.1/BAITS/VDJ/tl/qc.py +44 -0
- baits-0.1/BAITS/VDJ/tl/summarize_BCR.py +71 -0
- baits-0.1/BAITS/__init__.py +3 -0
- baits-0.1/BAITS/st/__init__.py +3 -0
- baits-0.1/BAITS/st/gr/.ipynb_checkpoints/__init__-checkpoint.py +4 -0
- baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_aggr-checkpoint.py +183 -0
- baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_build_spatial_adj-checkpoint.py +206 -0
- baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_get_adj-checkpoint.py +38 -0
- baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_utils-checkpoint.py +11 -0
- baits-0.1/BAITS/st/gr/__init__.py +4 -0
- baits-0.1/BAITS/st/gr/_aggr.py +183 -0
- baits-0.1/BAITS/st/gr/_build_spatial_adj.py +206 -0
- baits-0.1/BAITS/st/gr/_get_adj.py +38 -0
- baits-0.1/BAITS/st/gr/_utils.py +11 -0
- baits-0.1/BAITS/st/pl/.ipynb_checkpoints/__init__-checkpoint.py +3 -0
- baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_clustering_results-checkpoint.py +35 -0
- baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_enrichment_score-checkpoint.py +31 -0
- baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_kde_filter-checkpoint.py +39 -0
- baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_silhouette_score-checkpoint.py +41 -0
- baits-0.1/BAITS/st/pl/__init__.py +3 -0
- baits-0.1/BAITS/st/pl/_clustering_results.py +35 -0
- baits-0.1/BAITS/st/pl/_enrichment_score.py +31 -0
- baits-0.1/BAITS/st/pl/_kde_filter.py +39 -0
- baits-0.1/BAITS/st/pl/_silhouette_score.py +41 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_GMM-checkpoint.py +329 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/__init__-checkpoint.py +5 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_batch_process-checkpoint.py +90 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_cluster_Auto_k-checkpoint.py +279 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_dbscan_cluster-checkpoint.py +44 -0
- baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_kde_filter-checkpoint.py +107 -0
- baits-0.1/BAITS/st/tl/_GMM.py +329 -0
- baits-0.1/BAITS/st/tl/__init__.py +5 -0
- baits-0.1/BAITS/st/tl/_batch_process.py +90 -0
- baits-0.1/BAITS/st/tl/_cluster_Auto_k.py +279 -0
- baits-0.1/BAITS/st/tl/_dbscan_cluster.py +44 -0
- baits-0.1/BAITS/st/tl/_kde_filter.py +107 -0
- baits-0.1/LICENSE +19 -0
- baits-0.1/Makefile +20 -0
- baits-0.1/PKG-INFO +96 -0
- baits-0.1/README.md +33 -0
- baits-0.1/images/BAITS.png +0 -0
- baits-0.1/make.bat +35 -0
- baits-0.1/pyproject.toml +96 -0
baits-0.1/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
build/
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Read the Docs configuration file
|
|
2
|
+
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
|
3
|
+
|
|
4
|
+
# Required
|
|
5
|
+
version: 2
|
|
6
|
+
|
|
7
|
+
# Set the OS, Python version, and other tools you might need
|
|
8
|
+
build:
|
|
9
|
+
os: ubuntu-24.04
|
|
10
|
+
tools:
|
|
11
|
+
python: "3.10"
|
|
12
|
+
|
|
13
|
+
# Build documentation in the "docs/" directory with Sphinx
|
|
14
|
+
sphinx:
|
|
15
|
+
configuration: docs/conf.py
|
|
16
|
+
|
|
17
|
+
# Optionally, but recommended,
|
|
18
|
+
# declare the Python requirements required to build your documentation
|
|
19
|
+
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
|
20
|
+
python:
|
|
21
|
+
install:
|
|
22
|
+
- requirements: docs/requirements.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .basic_plot import _plot_bar, _boxplot, _scatter_plot, _plot_xcr
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
import matplotlib.ticker as mticker
|
|
5
|
+
import seaborn as sns
|
|
6
|
+
|
|
7
|
+
def _plot_bar(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel='Clone Richeness', ylog=False, ax=None, figsize=(4,3.5) ):
    """Draw a bar plot of *y* vs *x* with per-observation strip points overlaid.

    *groupby* is forwarded to seaborn as ``hue``; set *ylog* for a log-scaled
    y axis.  *ax* is accepted for API symmetry but unused: a new figure is
    always created.
    """
    plt.figure(figsize=figsize)
    # Forward the caller's palette (it was previously hard-coded to 'Set2').
    sns.barplot(y=y, x=x, data=df, hue=groupby, palette=palette, alpha=0.5)
    sns.stripplot(y=y, x=x, data=df, hue=groupby, dodge=True, jitter=0.2, palette=palette)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylog:
        plt.yscale('log')


def _boxplot(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel=None, log=False, ax=None ):
    """Draw a box plot of *y* vs *x*, optionally grouped by *groupby* (hue).

    When *ax* is given the axis labels are set on it; the pyplot-level labels
    are also set so the call works without an explicit axis.
    """
    sns.boxplot(data=df, x=x, y=y, hue=groupby, palette=palette, ax=ax)

    if ax:
        ax.set_xlabel(xlabel)  # set x-axis label
        ax.set_ylabel(ylabel)  # set y-axis label

    if log:
        plt.yscale('log')

    if groupby is not None:
        plt.legend(loc='upper right', bbox_to_anchor=(1, 1))

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=(6, 3) ):
    """QC view: clone counts per group (left) and per spatial location (right, log y)."""
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    _boxplot(df=clone_df, x=Cgene_key, y='clone_by_group', palette='Set2', xlabel=None, ylabel='clone_by_group', log=False, ax=axes[0])
    _boxplot(df=clone_spatial_df, x=group_by, y='clone_by_group_spatialLoc', groupby=Cgene_key, palette='Set2', xlabel=None, ylabel='clone_by_group_spatialLoc', log=True, ax=axes[1])
    plt.tight_layout()
    plt.show()


def _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=(6, 3) ):
    """QC view: UMI counts per group (left) and per spatial location (right), both log y."""
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    _boxplot(df=umis_df, x=Cgene_key, y='umis_by_group', palette='Set2', xlabel=None, ylabel='umis_by_group', log=True, ax=axes[0])
    _boxplot(df=umis_spatial_df, x=group_by, y='umis_by_group_spatialLoc', groupby=Cgene_key, palette='Set2', xlabel=None, ylabel='umis_by_group_spatialLoc', log=True, ax=axes[1])
    plt.tight_layout()
    plt.show()


def _scatter_plot(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel=None, x_log=False, y_log=False, ax=None ):
    """Draw a scatter plot of *y* vs *x*, optionally coloured by *groupby* (hue)."""
    sns.scatterplot(data=df, x=x, y=y, hue=groupby, palette=palette, edgecolor='black', ax=ax)

    if ax:
        ax.set_xlabel(xlabel)  # set x-axis label
        ax.set_ylabel(ylabel)  # set y-axis label

    if x_log:
        plt.xscale('log')
    if y_log:
        plt.yscale('log')
    if groupby is not None:
        plt.legend(loc='upper right', bbox_to_anchor=(1, 1))

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def _plot_xcr(xcr_df, clone_col, loc_x_key, loc_y_key ):
    """Scatter the spatial positions of XCR records, coloured by clone identity
    and sized by the record count at each (x, y) pixel.

    Assumes the location columns hold non-negative integer coordinates
    (they are used directly as matrix indices).
    """
    xcr_df = xcr_df.sort_values(clone_col)
    # Encode clone ids as integer category codes for colouring.
    xcr_df['clone_access'] = xcr_df[clone_col].astype('category').cat.codes
    clone = list(xcr_df['clone_access'])

    x_min = xcr_df[loc_x_key].min(); x_max = xcr_df[loc_x_key].max() + 1
    y_min = xcr_df[loc_y_key].min(); y_max = xcr_df[loc_y_key].max() + 1

    x = list(xcr_df[loc_x_key]); y = list(xcr_df[loc_y_key])
    xcr_mat = np.zeros((x_max + 1, y_max + 1))    # record count per (x, y)
    xcr_mat_c = np.zeros((x_max + 1, y_max + 1))  # last clone code seen per (x, y)
    for i in range(len(x)):
        xcr_mat[x[i], y[i]] += 1
        xcr_mat_c[x[i], y[i]] = clone[i]
    xcr_row, xcr_col = np.where(xcr_mat)
    # NOTE(review): np.percentile(..., 0.9) is the 0.9th percentile; if the
    # 90th percentile was intended this should be 90 — confirm with authors.
    xcr_size = xcr_mat[xcr_row, xcr_col] * 0.3 / np.percentile(xcr_mat[xcr_mat != 0].flatten(), 0.9)
    xcr_size = np.clip(xcr_size, 0, 1) * 3
    xcr_color = xcr_mat_c[xcr_row, xcr_col]

    fig, ax = plt.subplots(1, 1, figsize=((y_max - y_min) * 3 / 10000, (x_max - x_min) * 3 / 10000))
    ax.scatter(xcr_col, xcr_row, s=xcr_size, c=xcr_color, marker='o', cmap='coolwarm')
    ax.invert_yaxis()
    plt.show()
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from scipy.cluster.hierarchy import linkage, fcluster
|
|
4
|
+
from scipy.spatial.distance import squareform
|
|
5
|
+
from Levenshtein import distance as levenshtein_distance
|
|
6
|
+
from itertools import combinations
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
|
|
9
|
+
def _cluster_cdr3nt(cdr3_sequences, threshold=0.85):
|
|
10
|
+
# Calculate pairwise distances
|
|
11
|
+
n = len(cdr3_sequences)
|
|
12
|
+
dist_matrix = np.zeros((n, n))
|
|
13
|
+
|
|
14
|
+
for i, j in combinations(range(n), 2):
|
|
15
|
+
dist_matrix[i, j] = levenshtein_distance(cdr3_sequences[i], cdr3_sequences[j])
|
|
16
|
+
dist_matrix[j, i] = dist_matrix[i, j]
|
|
17
|
+
|
|
18
|
+
condensed_dist = squareform(dist_matrix)
|
|
19
|
+
Z = linkage(condensed_dist, method='average')
|
|
20
|
+
max_dist = max(len(seq) for seq in cdr3_sequences) * (1 - threshold)
|
|
21
|
+
clusters = fcluster(Z, max_dist, criterion='distance')
|
|
22
|
+
|
|
23
|
+
return {seq: cluster for seq, cluster in zip(cdr3_sequences, clusters)}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_bcr_family(bcr_data, cdr3nt_col, threshold=0.85):
|
|
27
|
+
|
|
28
|
+
tmp_dd_ss = bcr_data.drop_duplicates(subset=[cdr3nt_col])
|
|
29
|
+
|
|
30
|
+
if tmp_dd_ss.empty:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
comp_cdr3_nt = tmp_dd_ss[cdr3nt_col].tolist()
|
|
34
|
+
|
|
35
|
+
if len(comp_cdr3_nt)==1:
|
|
36
|
+
return {comp_cdr3_nt[0]: 0 }
|
|
37
|
+
|
|
38
|
+
family_groups = _cluster_cdr3nt(comp_cdr3_nt, threshold)
|
|
39
|
+
return family_groups
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def build_bcr_family(bcr_data, Vgene_col, Jgene_col, cdr3nt_col, Cgene_key, Cgene='IGH', threshold=0.85):
|
|
43
|
+
|
|
44
|
+
data = bcr_data[[Vgene_col, Jgene_col, cdr3nt_col, Cgene_key]].copy()
|
|
45
|
+
|
|
46
|
+
# Filter by c_gene
|
|
47
|
+
data = data[data['Cgene'].str.contains(Cgene)].copy()
|
|
48
|
+
|
|
49
|
+
data = data.drop_duplicates(subset=[Vgene_col, Jgene_col, cdr3nt_col])
|
|
50
|
+
data['CDR3_length'] = (data[cdr3nt_col].str.len()/3).astype(int)
|
|
51
|
+
data['cloneGroup'] = data[Vgene_col] + "_" + data[Jgene_col] + "_" + data[cdr3nt_col].astype(str)
|
|
52
|
+
data['family'] = ''
|
|
53
|
+
for group in set(data['cloneGroup']):
|
|
54
|
+
tmp_data = data[data['cloneGroup'] == group].copy()
|
|
55
|
+
result = _build_bcr_family(tmp_data, cdr3nt_col, threshold=0.85)
|
|
56
|
+
|
|
57
|
+
if not result:
|
|
58
|
+
continue
|
|
59
|
+
|
|
60
|
+
for seq, family_num_idx in result.items():
|
|
61
|
+
family_name = group + '_family_' + str(family_num_idx)
|
|
62
|
+
mask = (data['cloneGroup'] == group) & (data[cdr3nt_col]==(seq) )
|
|
63
|
+
data.loc[mask, 'family'] = family_name
|
|
64
|
+
|
|
65
|
+
data['family_id'] = [ Cgene + '_family_' +x for x in pd.Categorical(data['family']).codes.astype(str)]
|
|
66
|
+
bcr_data = pd.merge(bcr_data, data, on = [Vgene_col, Jgene_col, Cgene_key, cdr3nt_col], how='left')
|
|
67
|
+
|
|
68
|
+
return bcr_data
|
|
69
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import seaborn as sns
|
|
4
|
+
from pl.basic_plot import _qc_boxplot_clone, _qc_boxplot_umis
|
|
5
|
+
|
|
6
|
+
def calculate_qc_clones(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Count distinct clones per (group, C-gene) and per spatial location, and
    annotate *df* with those counts.

    Adds the columns ``clone_by_group`` and ``clone_by_group_spatialLoc``
    (mutates *df* in place and returns it).  When *plot* is true the two
    count distributions are shown as box plots of size *figsize*.
    """
    clone_df = (df[[group_by, Cgene_key, clone_key]]
                .drop_duplicates()
                .groupby([group_by, Cgene_key])
                .size()
                .reset_index(name='clone_by_group'))
    clone_spatial_df = (df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]]
                        .drop_duplicates()
                        .groupby([group_by, Cgene_key, loc_x_key, loc_y_key])
                        .size()
                        .reset_index(name='clone_by_group_spatialLoc'))

    if plot:
        # figsize is forwarded (previously hard-coded to (7, 3.5)).
        _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=figsize)
    cloneDict = clone_df.set_index([group_by, Cgene_key])['clone_by_group'].to_dict()
    clonesSpatialDict = clone_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['clone_by_group_spatialLoc'].to_dict()
    df['clone_by_group'] = df.apply(lambda row: cloneDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['clone_by_group_spatialLoc'] = df.apply(lambda row: clonesSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df


def calculate_qc_umis(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Count UMIs (rows) per (group, C-gene) and per spatial location, and
    annotate *df* with those counts.

    Adds the columns ``umis_by_group`` and ``umis_by_group_spatialLoc``
    (mutates *df* in place and returns it).  When *plot* is true the two
    count distributions are shown as box plots of size *figsize*.
    """
    umis_df = (df[[group_by, Cgene_key, clone_key]]
               .groupby([group_by, Cgene_key])
               .size()
               .reset_index(name='umis_by_group'))
    umis_spatial_df = (df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]]
                       .groupby([group_by, Cgene_key, loc_x_key, loc_y_key])
                       .size()
                       .reset_index(name='umis_by_group_spatialLoc'))

    if plot:
        # figsize is forwarded (previously the parameter was accepted but ignored).
        _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=figsize)
    umisDict = umis_df.set_index([group_by, Cgene_key])['umis_by_group'].to_dict()
    umisSpatialDict = umis_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['umis_by_group_spatialLoc'].to_dict()
    df['umis_by_group'] = df.apply(lambda row: umisDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['umis_by_group_spatialLoc'] = df.apply(lambda row: umisSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
def filter_clones(df, clone_key, min_clone=1):
    """Keep rows whose clone count in *clone_key* exceeds *min_clone*."""
    keep = df[clone_key] > min_clone
    return df[keep]


def filter_clones_spatial(df, clone_spatial_key, min_clone_spatial=1):
    """Keep rows whose spatial clone count exceeds *min_clone_spatial*."""
    keep = df[clone_spatial_key] > min_clone_spatial
    return df[keep]


def filter_umi(df, umi_key, min_umi=1):
    """Keep rows whose UMI count in *umi_key* exceeds *min_umi*."""
    keep = df[umi_key] > min_umi
    return df[keep]


def filter_umi_spatial(df, clone_umi_key, min_umi_spatial=1):
    """Keep rows whose spatial UMI count exceeds *min_umi_spatial*."""
    keep = df[clone_umi_key] > min_umi_spatial
    return df[keep]
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
7
|
+
from pl.basic_plot import _plot_bar
|
|
8
|
+
|
|
9
|
+
from .bcr_desc import compute_index
|
|
10
|
+
|
|
11
|
+
def stat_clone(df, groupby, Cgene_key, clone_key, plot=True, palette='Set2', xlabel=None, ylabel=None, ylog=False, figsize=(4,3.5) ):
    """Count distinct clones per (groupby, C-gene) pair and annotate *df*.

    Adds a ``clone_by_<groupby>`` column (mutates *df* in place and returns
    it).  When *plot* is true a bar plot of the counts is drawn.
    """
    y_name = 'clone_by_' + groupby
    clone_df = (df[[groupby, Cgene_key, clone_key]]
                .drop_duplicates()
                .groupby([groupby, Cgene_key])
                .size()
                .reset_index(name=y_name))

    if plot:
        # Forward the caller's styling options (previously all hard-coded).
        _plot_bar(clone_df, Cgene_key, y_name, groupby=Cgene_key, palette=palette,
                  xlabel=xlabel, ylabel=ylabel, ylog=ylog, figsize=figsize)

    cloneDict = clone_df.set_index([groupby, Cgene_key])[y_name].to_dict()
    df[y_name] = df.apply(lambda row: cloneDict.get((row[groupby], row[Cgene_key]), None), axis=1)

    return df


def aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key='UMI'):
    """Aggregate clone counts and frequencies within *groups*.

    With ``count_basis='location'`` each (clone, location) combination is
    counted once; with ``count_basis='UMI'`` every row counts.  Returns a
    DataFrame with ``count`` and ``freq`` columns per (*groups*, clone).

    Raises ValueError for an unknown *count_basis* (previously returned None).
    """
    if count_basis == 'location':
        cols = list(set([group_by, Cgene_key, clone_key, loc_x_key, loc_y_key] + groups))
        base = df[cols].drop_duplicates()
    elif count_basis == 'UMI':
        base = df
    else:
        raise ValueError(f"count_basis must be 'location' or 'UMI', got {count_basis!r}")

    counts = base[groups + [clone_key]].groupby(groups)[clone_key].value_counts().reset_index(name='count')
    freqs = base[groups + [clone_key]].groupby(groups)[clone_key].value_counts(normalize=True).reset_index(name='freq')
    return pd.merge(freqs, counts, on=groups + [clone_key])


def compute_grouped_index(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key=None, index='shannon_entropy'):
    """Compute a per-group clonal diversity index over clone frequencies.

    *index* names a function understood by ``compute_index`` (e.g.
    ``'shannon_entropy'`` or ``'renyi_entropy'``).  For ``renyi_entropy`` the
    result is melted to long format with an ``alpha`` column.
    """
    # Forward the caller's location keys (previously hard-coded to 'X'/'Y').
    _Index_compute = aggregate_clone_df(
        df, group_by, Cgene_key, clone_key, groups,
        count_basis=count_basis, loc_x_key=loc_x_key, loc_y_key=loc_y_key,
        Umi_key=Umi_key,
    ).copy()

    tmp_df = _Index_compute.groupby(groups)['freq'].apply(lambda x: compute_index(index, x))

    if index != 'renyi_entropy':
        tmp_df = tmp_df.reset_index(name=index).dropna(subset=[index])
    else:
        # renyi_entropy returns a one-row DataFrame per group keyed by alpha;
        # melt to long format and drop the synthetic row-index column.
        tmp_df = tmp_df.melt(ignore_index=False, var_name='alpha', value_name=index).reset_index()
        keep = list(range(0, len(groups))) + list(range(len(groups) + 1, len(tmp_df.columns)))
        tmp_df = tmp_df.iloc[:, keep]

    return tmp_df
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from scipy.cluster.hierarchy import linkage, fcluster
|
|
4
|
+
from scipy.spatial.distance import squareform
|
|
5
|
+
from Levenshtein import distance as levenshtein_distance
|
|
6
|
+
from itertools import combinations
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
|
|
9
|
+
def _cluster_cdr3nt(cdr3_sequences, threshold=0.85):
|
|
10
|
+
# Calculate pairwise distances
|
|
11
|
+
n = len(cdr3_sequences)
|
|
12
|
+
dist_matrix = np.zeros((n, n))
|
|
13
|
+
|
|
14
|
+
for i, j in combinations(range(n), 2):
|
|
15
|
+
dist_matrix[i, j] = levenshtein_distance(cdr3_sequences[i], cdr3_sequences[j])
|
|
16
|
+
dist_matrix[j, i] = dist_matrix[i, j]
|
|
17
|
+
|
|
18
|
+
condensed_dist = squareform(dist_matrix)
|
|
19
|
+
Z = linkage(condensed_dist, method='average')
|
|
20
|
+
max_dist = max(len(seq) for seq in cdr3_sequences) * (1 - threshold)
|
|
21
|
+
clusters = fcluster(Z, max_dist, criterion='distance')
|
|
22
|
+
|
|
23
|
+
return {seq: cluster for seq, cluster in zip(cdr3_sequences, clusters)}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_bcr_family(bcr_data, cdr3nt_col, threshold=0.85):
|
|
27
|
+
|
|
28
|
+
tmp_dd_ss = bcr_data.drop_duplicates(subset=[cdr3nt_col])
|
|
29
|
+
|
|
30
|
+
if tmp_dd_ss.empty:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
comp_cdr3_nt = tmp_dd_ss[cdr3nt_col].tolist()
|
|
34
|
+
|
|
35
|
+
if len(comp_cdr3_nt)==1:
|
|
36
|
+
return {comp_cdr3_nt[0]: 0 }
|
|
37
|
+
|
|
38
|
+
family_groups = _cluster_cdr3nt(comp_cdr3_nt, threshold)
|
|
39
|
+
return family_groups
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def build_bcr_family(bcr_data, Vgene_col, Jgene_col, cdr3nt_col, Cgene_key, Cgene='IGH', threshold=0.85):
|
|
43
|
+
|
|
44
|
+
data = bcr_data[[Vgene_col, Jgene_col, cdr3nt_col, Cgene_key]].copy()
|
|
45
|
+
|
|
46
|
+
# Filter by c_gene
|
|
47
|
+
data = data[data['Cgene'].str.contains(Cgene)].copy()
|
|
48
|
+
|
|
49
|
+
data = data.drop_duplicates(subset=[Vgene_col, Jgene_col, cdr3nt_col])
|
|
50
|
+
data['CDR3_length'] = (data[cdr3nt_col].str.len()/3).astype(int)
|
|
51
|
+
data['cloneGroup'] = data[Vgene_col] + "_" + data[Jgene_col] + "_" + data[cdr3nt_col].astype(str)
|
|
52
|
+
data['family'] = ''
|
|
53
|
+
for group in set(data['cloneGroup']):
|
|
54
|
+
tmp_data = data[data['cloneGroup'] == group].copy()
|
|
55
|
+
result = _build_bcr_family(tmp_data, cdr3nt_col, threshold=0.85)
|
|
56
|
+
|
|
57
|
+
if not result:
|
|
58
|
+
continue
|
|
59
|
+
|
|
60
|
+
for seq, family_num_idx in result.items():
|
|
61
|
+
family_name = group + '_family_' + str(family_num_idx)
|
|
62
|
+
mask = (data['cloneGroup'] == group) & (data[cdr3nt_col]==(seq) )
|
|
63
|
+
data.loc[mask, 'family'] = family_name
|
|
64
|
+
|
|
65
|
+
data['family_id'] = [ Cgene + '_family_' +x for x in pd.Categorical(data['family']).codes.astype(str)]
|
|
66
|
+
bcr_data = pd.merge(bcr_data, data, on = [Vgene_col, Jgene_col, Cgene_key, cdr3nt_col], how='left')
|
|
67
|
+
|
|
68
|
+
return bcr_data
|
|
69
|
+
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
def renyi_entropy(probabilities, alpha_values=range(10)):
    """Calculate the Rényi entropy of a distribution for several orders.

    Parameters:
        probabilities (numpy array or pandas Series): probability
            distribution (should sum to 1).
        alpha_values (iterable of int): orders of the Rényi entropy to
            evaluate; alpha == 1 uses the Shannon limit (natural log).

    Returns:
        pandas.DataFrame: one row, one column per alpha value.
    """
    # NOTE(review): the alpha == 1 branch uses the natural log while
    # shannon_entropy below uses log2, and zero probabilities are not
    # filtered here (log(0) -> nan) — confirm both are intended.
    results = {}
    for alpha in alpha_values:
        if alpha == 1:
            entropy = -np.sum(probabilities * np.log(probabilities))
        else:
            entropy = 1 / (1 - alpha) * np.log(np.sum(probabilities**alpha))
        results[alpha] = [entropy]
    return pd.DataFrame.from_dict(results)


def shannon_entropy(p):
    """Shannon entropy (base 2) of distribution *p*; zero entries are ignored."""
    p = p[p > 0]  # Only consider non-zero probabilities
    H = -np.sum(p * np.log2(p))
    return H


def CPK(count):
    """Clones per kilo-read: number of distinct clones per 1000 counted units."""
    return len(count) / sum(count) * 1000


def normalize_shannon_entropy(p):
    """Shannon entropy normalised by log2 of the number of non-zero entries (Pielou evenness)."""
    p = p[p > 0]  # Only consider non-zero probabilities
    H = -np.sum(p * np.log2(p)) / np.log2(len(p))
    return H


def gini_index(data):
    """Gini index of inequality of *data* (0 = perfectly even).

    Raises:
        ValueError: if *data* is empty.
    """
    if len(data) == 0:
        raise ValueError("Input data cannot be empty.")
    sorted_data = np.sort(data)
    n = len(data)
    cumulative_sum = np.cumsum(sorted_data)
    gini_index = (n + 1 - 2 * np.sum(cumulative_sum) / np.sum(sorted_data)) / n
    return gini_index


def Clonality(p):
    """Clonality = 1 - normalised Shannon entropy (0 = even, 1 = monoclonal)."""
    C = 1 - shannon_entropy(p) / np.log2(len(p))
    return C


def compute_index(function_name, p):
    """Dispatch to the diversity index function named *function_name*.

    Raises:
        ValueError: if *function_name* is not a known index.
    """
    functions = {
        'shannon_entropy': shannon_entropy,
        'normalize_shannon_entropy': normalize_shannon_entropy,
        'Clonality': Clonality,
        'renyi_entropy': renyi_entropy,
        'gini_index': gini_index,
        'CPK': CPK,
    }

    # (The original if/else had two identical branches; collapsed to one call.)
    try:
        func = functions[function_name]
    except KeyError:
        raise ValueError(f"Function '{function_name}' not recognized.") from None
    return func(p)
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import seaborn as sns
|
|
4
|
+
from pl.basic_plot import _qc_boxplot_clone, _qc_boxplot_umis
|
|
5
|
+
|
|
6
|
+
def calculate_qc_clones(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Count distinct clones per (group, C-gene) and per spatial location, and
    annotate *df* with those counts.

    Adds the columns ``clone_by_group`` and ``clone_by_group_spatialLoc``
    (mutates *df* in place and returns it).  When *plot* is true the two
    count distributions are shown as box plots of size *figsize*.
    """
    clone_df = (df[[group_by, Cgene_key, clone_key]]
                .drop_duplicates()
                .groupby([group_by, Cgene_key])
                .size()
                .reset_index(name='clone_by_group'))
    clone_spatial_df = (df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]]
                        .drop_duplicates()
                        .groupby([group_by, Cgene_key, loc_x_key, loc_y_key])
                        .size()
                        .reset_index(name='clone_by_group_spatialLoc'))

    if plot:
        # figsize is forwarded (previously hard-coded to (7, 3.5)).
        _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=figsize)
    cloneDict = clone_df.set_index([group_by, Cgene_key])['clone_by_group'].to_dict()
    clonesSpatialDict = clone_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['clone_by_group_spatialLoc'].to_dict()
    df['clone_by_group'] = df.apply(lambda row: cloneDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['clone_by_group_spatialLoc'] = df.apply(lambda row: clonesSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df


def calculate_qc_umis(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Count UMIs (rows) per (group, C-gene) and per spatial location, and
    annotate *df* with those counts.

    Adds the columns ``umis_by_group`` and ``umis_by_group_spatialLoc``
    (mutates *df* in place and returns it).  When *plot* is true the two
    count distributions are shown as box plots of size *figsize*.
    """
    umis_df = (df[[group_by, Cgene_key, clone_key]]
               .groupby([group_by, Cgene_key])
               .size()
               .reset_index(name='umis_by_group'))
    umis_spatial_df = (df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]]
                       .groupby([group_by, Cgene_key, loc_x_key, loc_y_key])
                       .size()
                       .reset_index(name='umis_by_group_spatialLoc'))

    if plot:
        # figsize is forwarded (previously the parameter was accepted but ignored).
        _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=figsize)
    umisDict = umis_df.set_index([group_by, Cgene_key])['umis_by_group'].to_dict()
    umisSpatialDict = umis_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['umis_by_group_spatialLoc'].to_dict()
    df['umis_by_group'] = df.apply(lambda row: umisDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['umis_by_group_spatialLoc'] = df.apply(lambda row: umisSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
def filter_clones(df, clone_key, min_clone=1):
    """Keep rows whose clone count in *clone_key* exceeds *min_clone*."""
    keep = df[clone_key] > min_clone
    return df[keep]


def filter_clones_spatial(df, clone_spatial_key, min_clone_spatial=1):
    """Keep rows whose spatial clone count exceeds *min_clone_spatial*."""
    keep = df[clone_spatial_key] > min_clone_spatial
    return df[keep]


def filter_umi(df, umi_key, min_umi=1):
    """Keep rows whose UMI count in *umi_key* exceeds *min_umi*."""
    keep = df[umi_key] > min_umi
    return df[keep]


def filter_umi_spatial(df, clone_umi_key, min_umi_spatial=1):
    """Keep rows whose spatial UMI count exceeds *min_umi_spatial*."""
    keep = df[clone_umi_key] > min_umi_spatial
    return df[keep]
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
7
|
+
from pl.basic_plot import _plot_bar
|
|
8
|
+
|
|
9
|
+
from .bcr_desc import compute_index
|
|
10
|
+
|
|
11
|
+
def stat_clone(df, groupby, Cgene_key, clone_key, plot=True, palette='Set2', xlabel=None, ylabel=None, ylog=False, figsize=(4,3.5) ):
    """Count distinct clones per (groupby, C-gene) pair and annotate *df*.

    Adds a ``clone_by_<groupby>`` column (mutates *df* in place and returns
    it).  When *plot* is true a bar plot of the counts is drawn.
    """
    y_name = 'clone_by_' + groupby
    clone_df = (df[[groupby, Cgene_key, clone_key]]
                .drop_duplicates()
                .groupby([groupby, Cgene_key])
                .size()
                .reset_index(name=y_name))

    if plot:
        # Forward the caller's styling options (previously all hard-coded).
        _plot_bar(clone_df, Cgene_key, y_name, groupby=Cgene_key, palette=palette,
                  xlabel=xlabel, ylabel=ylabel, ylog=ylog, figsize=figsize)

    cloneDict = clone_df.set_index([groupby, Cgene_key])[y_name].to_dict()
    df[y_name] = df.apply(lambda row: cloneDict.get((row[groupby], row[Cgene_key]), None), axis=1)

    return df


def aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key='UMI'):
    """Aggregate clone counts and frequencies within *groups*.

    With ``count_basis='location'`` each (clone, location) combination is
    counted once; with ``count_basis='UMI'`` every row counts.  Returns a
    DataFrame with ``count`` and ``freq`` columns per (*groups*, clone).

    Raises ValueError for an unknown *count_basis* (previously returned None).
    """
    if count_basis == 'location':
        cols = list(set([group_by, Cgene_key, clone_key, loc_x_key, loc_y_key] + groups))
        base = df[cols].drop_duplicates()
    elif count_basis == 'UMI':
        base = df
    else:
        raise ValueError(f"count_basis must be 'location' or 'UMI', got {count_basis!r}")

    counts = base[groups + [clone_key]].groupby(groups)[clone_key].value_counts().reset_index(name='count')
    freqs = base[groups + [clone_key]].groupby(groups)[clone_key].value_counts(normalize=True).reset_index(name='freq')
    return pd.merge(freqs, counts, on=groups + [clone_key])


def compute_grouped_index(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key=None, index='shannon_entropy'):
    """Compute a per-group clonal diversity index over clone frequencies.

    *index* names a function understood by ``compute_index`` (e.g.
    ``'shannon_entropy'`` or ``'renyi_entropy'``).  For ``renyi_entropy`` the
    result is melted to long format with an ``alpha`` column.
    """
    # Forward the caller's location keys (previously hard-coded to 'X'/'Y').
    _Index_compute = aggregate_clone_df(
        df, group_by, Cgene_key, clone_key, groups,
        count_basis=count_basis, loc_x_key=loc_x_key, loc_y_key=loc_y_key,
        Umi_key=Umi_key,
    ).copy()

    tmp_df = _Index_compute.groupby(groups)['freq'].apply(lambda x: compute_index(index, x))

    if index != 'renyi_entropy':
        tmp_df = tmp_df.reset_index(name=index).dropna(subset=[index])
    else:
        # renyi_entropy returns a one-row DataFrame per group keyed by alpha;
        # melt to long format and drop the synthetic row-index column.
        tmp_df = tmp_df.melt(ignore_index=False, var_name='alpha', value_name=index).reset_index()
        keep = list(range(0, len(groups))) + list(range(len(groups) + 1, len(tmp_df.columns)))
        tmp_df = tmp_df.iloc[:, keep]

    return tmp_df
|