BAITS 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. baits-0.1/.gitignore +1 -0
  2. baits-0.1/.readthedocs.yaml +22 -0
  3. baits-0.1/BAITS/.ipynb_checkpoints/__init__-checkpoint.py +5 -0
  4. baits-0.1/BAITS/VDJ/__init__.py +3 -0
  5. baits-0.1/BAITS/VDJ/pl/__init__.py +1 -0
  6. baits-0.1/BAITS/VDJ/pl/basic_plot.py +97 -0
  7. baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/__init__-checkpoint.py +4 -0
  8. baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/bcr_cluster-checkpoint.py +69 -0
  9. baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/qc-checkpoint.py +44 -0
  10. baits-0.1/BAITS/VDJ/tl/.ipynb_checkpoints/summarize_BCR-checkpoint.py +71 -0
  11. baits-0.1/BAITS/VDJ/tl/__init__.py +4 -0
  12. baits-0.1/BAITS/VDJ/tl/bcr_cluster.py +69 -0
  13. baits-0.1/BAITS/VDJ/tl/bcr_desc.py +72 -0
  14. baits-0.1/BAITS/VDJ/tl/qc.py +44 -0
  15. baits-0.1/BAITS/VDJ/tl/summarize_BCR.py +71 -0
  16. baits-0.1/BAITS/__init__.py +3 -0
  17. baits-0.1/BAITS/st/__init__.py +3 -0
  18. baits-0.1/BAITS/st/gr/.ipynb_checkpoints/__init__-checkpoint.py +4 -0
  19. baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_aggr-checkpoint.py +183 -0
  20. baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_build_spatial_adj-checkpoint.py +206 -0
  21. baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_get_adj-checkpoint.py +38 -0
  22. baits-0.1/BAITS/st/gr/.ipynb_checkpoints/_utils-checkpoint.py +11 -0
  23. baits-0.1/BAITS/st/gr/__init__.py +4 -0
  24. baits-0.1/BAITS/st/gr/_aggr.py +183 -0
  25. baits-0.1/BAITS/st/gr/_build_spatial_adj.py +206 -0
  26. baits-0.1/BAITS/st/gr/_get_adj.py +38 -0
  27. baits-0.1/BAITS/st/gr/_utils.py +11 -0
  28. baits-0.1/BAITS/st/pl/.ipynb_checkpoints/__init__-checkpoint.py +3 -0
  29. baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_clustering_results-checkpoint.py +35 -0
  30. baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_enrichment_score-checkpoint.py +31 -0
  31. baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_kde_filter-checkpoint.py +39 -0
  32. baits-0.1/BAITS/st/pl/.ipynb_checkpoints/_silhouette_score-checkpoint.py +41 -0
  33. baits-0.1/BAITS/st/pl/__init__.py +3 -0
  34. baits-0.1/BAITS/st/pl/_clustering_results.py +35 -0
  35. baits-0.1/BAITS/st/pl/_enrichment_score.py +31 -0
  36. baits-0.1/BAITS/st/pl/_kde_filter.py +39 -0
  37. baits-0.1/BAITS/st/pl/_silhouette_score.py +41 -0
  38. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_GMM-checkpoint.py +329 -0
  39. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/__init__-checkpoint.py +5 -0
  40. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_batch_process-checkpoint.py +90 -0
  41. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_cluster_Auto_k-checkpoint.py +279 -0
  42. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_dbscan_cluster-checkpoint.py +44 -0
  43. baits-0.1/BAITS/st/tl/.ipynb_checkpoints/_kde_filter-checkpoint.py +107 -0
  44. baits-0.1/BAITS/st/tl/_GMM.py +329 -0
  45. baits-0.1/BAITS/st/tl/__init__.py +5 -0
  46. baits-0.1/BAITS/st/tl/_batch_process.py +90 -0
  47. baits-0.1/BAITS/st/tl/_cluster_Auto_k.py +279 -0
  48. baits-0.1/BAITS/st/tl/_dbscan_cluster.py +44 -0
  49. baits-0.1/BAITS/st/tl/_kde_filter.py +107 -0
  50. baits-0.1/LICENSE +19 -0
  51. baits-0.1/Makefile +20 -0
  52. baits-0.1/PKG-INFO +96 -0
  53. baits-0.1/README.md +33 -0
  54. baits-0.1/images/BAITS.png +0 -0
  55. baits-0.1/make.bat +35 -0
  56. baits-0.1/pyproject.toml +96 -0
baits-0.1/.gitignore ADDED
@@ -0,0 +1 @@
1
+ build/
@@ -0,0 +1,22 @@
1
+ # Read the Docs configuration file
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the OS, Python version, and other tools you might need
8
+ build:
9
+ os: ubuntu-24.04
10
+ tools:
11
+ python: "3.10"
12
+
13
+ # Build documentation in the "docs/" directory with Sphinx
14
+ sphinx:
15
+ configuration: docs/conf.py
16
+
17
+ # Optionally, but recommended,
18
+ # declare the Python requirements required to build your documentation
19
+ # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
20
+ python:
21
+ install:
22
+ - requirements: docs/requirements.txt
@@ -0,0 +1,5 @@
1
# BUG FIX: `version` was used below without being imported.
from importlib.metadata import version

from . import gr, pl, tl

__all__ = ["gr", "pl", "tl"]

# NOTE(review): the distribution name 'B-HIT' differs from the package name
# 'BAITS'/'baits-0.1' — confirm which name this project is published under.
__version__ = version("B-HIT")
@@ -0,0 +1,3 @@
1
+ from . import tl, pl
2
+
3
+ __all__ = ["tl", "pl"]
@@ -0,0 +1 @@
1
+ from .basic_plot import _plot_bar, _boxplot, _scatter_plot, _plot_xcr
@@ -0,0 +1,97 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import matplotlib.ticker as mticker
5
+ import seaborn as sns
6
+
7
def _plot_bar(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel='Clone Richeness', ylog=False, ax=None, figsize=(4,3.5) ):
    """Bar plot of *y* vs *x* with the individual observations overlaid as points.

    Parameters
    ----------
    df : DataFrame containing the columns named by *x*, *y* and *groupby*.
    groupby : optional column used as seaborn ``hue``.
    palette : seaborn palette applied to both layers.
    ylog : if True, switch the y axis to a log scale.
    ax : accepted for API symmetry; a new figure is always created here.
    """
    plt.figure(figsize=figsize)
    # BUG FIX: the `palette` argument was previously ignored — both layers
    # hard-coded 'Set2', so callers could not change the colour scheme.
    sns.barplot(y=y, x=x, data=df, hue=groupby, palette=palette, alpha=0.5)
    sns.stripplot(y=y, x=x, data=df, hue=groupby, dodge=True, jitter=0.2, palette=palette)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylog:
        plt.yscale('log')
17
def _boxplot(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel=None, log=False, ax=None ):
    """Box plot of *y* against *x*, optionally grouped by *groupby* (hue)."""
    sns.boxplot(data=df, x=x, y=y, hue=groupby, palette=palette, ax=ax)

    if ax:
        # When target axes were supplied, label them directly.
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

    if log:
        plt.yscale('log')

    if groupby is not None:
        plt.legend(loc='upper right', bbox_to_anchor=(1, 1))

    # Also label through the pyplot state machine (current axes).
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
def _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=(6, 3) ):
    """Two side-by-side QC box plots: clone counts per group and per spatial location."""
    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=figsize)
    _boxplot(df=clone_df, x=Cgene_key, y='clone_by_group', palette='Set2',
             xlabel=None, ylabel='clone_by_group', log=False, ax=ax_left)
    _boxplot(df=clone_spatial_df, x=group_by, y='clone_by_group_spatialLoc', groupby=Cgene_key,
             palette='Set2', xlabel=None, ylabel='clone_by_group_spatialLoc', log=True, ax=ax_right)
    plt.tight_layout()
    plt.show()
def _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=(6, 3) ):
    """Two side-by-side QC box plots: UMI counts per group and per spatial location."""
    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=figsize)
    _boxplot(df=umis_df, x=Cgene_key, y='umis_by_group', palette='Set2',
             xlabel=None, ylabel='umis_by_group', log=True, ax=ax_left)
    _boxplot(df=umis_spatial_df, x=group_by, y='umis_by_group_spatialLoc', groupby=Cgene_key,
             palette='Set2', xlabel=None, ylabel='umis_by_group_spatialLoc', log=True, ax=ax_right)
    plt.tight_layout()
    plt.show()
def _scatter_plot(df, x, y, groupby=None, palette='Set2', xlabel=None, ylabel=None, x_log=False, y_log=False, ax=None ):
    """Scatter plot of *y* against *x*, optionally coloured by *groupby*."""
    sns.scatterplot(data=df, x=x, y=y, hue=groupby, palette=palette, edgecolor='black', ax=ax)

    if ax:
        # When target axes were supplied, label them directly.
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

    if x_log:
        plt.xscale('log')
    if y_log:
        plt.yscale('log')
    if groupby is not None:
        plt.legend(loc='upper right', bbox_to_anchor=(1, 1))

    # Also label through the pyplot state machine (current axes).
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
def _plot_xcr(xcr_df, clone_col, loc_x_key, loc_y_key ):
    """Scatter the spatial distribution of clones.

    Each occupied (x, y) location is drawn as one marker whose size scales
    with the number of records at that location and whose colour encodes the
    clone identity (categorical codes of *clone_col*).
    """
    xcr_df = xcr_df.sort_values(clone_col)
    # Integer code per clone value; used as the scatter colour.
    xcr_df['clone_access'] = xcr_df[clone_col].astype('category').cat.codes
    clone = list(xcr_df['clone_access'])

    # NOTE(review): assumes the location columns hold non-negative integers
    # usable as array indices — confirm against callers.
    x_min = xcr_df[loc_x_key].min(); x_max = xcr_df[loc_x_key].max()+1
    y_min = xcr_df[loc_y_key].min(); y_max = xcr_df[loc_y_key].max()+1

    x = list(xcr_df[loc_x_key]); y = list(xcr_df[loc_y_key])
    # Dense grids: per-location record count and (last-seen) clone code.
    xcr_mat = np.zeros(( x_max+1, y_max+1 ))
    xcr_mat_c = np.zeros(( x_max+1, y_max+1 ))
    for i in range(len(x)):
        xcr_mat[x[i],y[i]]+=1
        # If several clones share a location, the last row wins the colour.
        xcr_mat_c[x[i],y[i]] = clone[i]
    xcr_row, xcr_col = np.where(xcr_mat )
    # Marker size normalised by a percentile of the non-zero counts.
    # NOTE(review): np.percentile takes a percentage in [0, 100], so 0.9 is
    # the 0.9th percentile — confirm that 90 was not intended here.
    xcr_size = xcr_mat[xcr_row, xcr_col] * 0.3 / np.percentile(xcr_mat[xcr_mat!=0].flatten(),0.9)
    xcr_size = np.clip(xcr_size,0,1) * 3
    xcr_color = xcr_mat_c[xcr_row, xcr_col]

    # Figure dimensions are proportional to the spatial extent.
    fig, ax = plt.subplots(1,1, figsize=((y_max-y_min)*3/10000, (x_max-x_min)*3/10000 ))
    ax.scatter(xcr_col, xcr_row, s=xcr_size, c=xcr_color, marker='o',cmap='coolwarm')
    # Flip the y axis so the plot matches image-style coordinates.
    ax.invert_yaxis()
    plt.show()
@@ -0,0 +1,4 @@
1
+ from .summarize_BCR import stat_clone, compute_grouped_index, aggregate_clone_df
2
+ from .bcr_cluster import build_bcr_family
3
+ from .qc import calculate_qc_clones, calculate_qc_umis, filter_clones, filter_clones_spatial, filter_umi, filter_umi_spatial
4
+
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy.cluster.hierarchy import linkage, fcluster
4
+ from scipy.spatial.distance import squareform
5
+ from Levenshtein import distance as levenshtein_distance
6
+ from itertools import combinations
7
+ from collections import defaultdict
8
+
9
def _cluster_cdr3nt(cdr3_sequences, threshold=0.85):
    """Hierarchically cluster CDR3 nucleotide sequences by Levenshtein distance.

    Sequences whose average-linkage distance is at most
    ``max_sequence_length * (1 - threshold)`` fall into the same cluster.

    Returns a dict mapping each sequence to its integer cluster label.
    """
    n = len(cdr3_sequences)
    # ROBUSTNESS FIX: scipy's linkage() requires at least two observations;
    # handle the degenerate sizes explicitly instead of crashing.
    if n == 0:
        return {}
    if n == 1:
        return {cdr3_sequences[0]: 1}

    # Symmetric pairwise Levenshtein distance matrix.
    dist_matrix = np.zeros((n, n))
    for i, j in combinations(range(n), 2):
        dist_matrix[i, j] = levenshtein_distance(cdr3_sequences[i], cdr3_sequences[j])
        dist_matrix[j, i] = dist_matrix[i, j]

    condensed_dist = squareform(dist_matrix)
    Z = linkage(condensed_dist, method='average')
    # Distance cut-off scales with the longest sequence in the set.
    max_dist = max(len(seq) for seq in cdr3_sequences) * (1 - threshold)
    clusters = fcluster(Z, max_dist, criterion='distance')

    return {seq: cluster for seq, cluster in zip(cdr3_sequences, clusters)}
+
26
def _build_bcr_family(bcr_data, cdr3nt_col, threshold=0.85):
    """Cluster the unique CDR3nt sequences of one clone group into families."""
    unique_rows = bcr_data.drop_duplicates(subset=[cdr3nt_col])

    if unique_rows.empty:
        return None

    sequences = unique_rows[cdr3nt_col].tolist()

    # A single sequence is its own family; no clustering needed.
    if len(sequences) == 1:
        return {sequences[0]: 0}

    return _cluster_cdr3nt(sequences, threshold)
+
42
def build_bcr_family(bcr_data, Vgene_col, Jgene_col, cdr3nt_col, Cgene_key, Cgene='IGH', threshold=0.85):
    """Assign clonal-family labels to BCR records and merge them back.

    Unique (V, J, CDR3nt) combinations of the requested constant-gene class
    are clustered by CDR3nt similarity; the resulting 'family'/'family_id'
    columns are left-joined onto *bcr_data*.

    Bug fixes: the class filter previously indexed a literal column 'Cgene'
    instead of *Cgene_key*, and *threshold* was never forwarded.
    """
    data = bcr_data[[Vgene_col, Jgene_col, cdr3nt_col, Cgene_key]].copy()

    # Keep only chains of the requested constant-gene class.
    # BUG FIX: filter on the caller-supplied column, not a hard-coded 'Cgene'.
    data = data[data[Cgene_key].str.contains(Cgene)].copy()

    data = data.drop_duplicates(subset=[Vgene_col, Jgene_col, cdr3nt_col])
    data['CDR3_length'] = (data[cdr3nt_col].str.len()/3).astype(int)
    # NOTE(review): cloneGroup includes the CDR3 sequence itself, so each group
    # contains exactly one unique sequence — confirm whether CDR3_length was
    # intended as the third key instead.
    data['cloneGroup'] = data[Vgene_col] + "_" + data[Jgene_col] + "_" + data[cdr3nt_col].astype(str)
    data['family'] = ''
    for group in set(data['cloneGroup']):
        tmp_data = data[data['cloneGroup'] == group].copy()
        # BUG FIX: forward the user-supplied threshold (was hard-coded 0.85).
        result = _build_bcr_family(tmp_data, cdr3nt_col, threshold=threshold)

        if not result:
            continue

        for seq, family_num_idx in result.items():
            family_name = group + '_family_' + str(family_num_idx)
            mask = (data['cloneGroup'] == group) & (data[cdr3nt_col] == seq)
            data.loc[mask, 'family'] = family_name

    data['family_id'] = [Cgene + '_family_' + x for x in pd.Categorical(data['family']).codes.astype(str)]
    bcr_data = pd.merge(bcr_data, data, on=[Vgene_col, Jgene_col, Cgene_key, cdr3nt_col], how='left')

    return bcr_data
+
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ from pl.basic_plot import _qc_boxplot_clone, _qc_boxplot_umis
5
+
6
def calculate_qc_clones(df, group_by, Cgene_key, clone_key,loc_x_key='X', loc_y_key='Y', plot=True):
    """Annotate *df* with unique-clone counts per group and per spatial location."""
    group_cols = [group_by, Cgene_key]
    spatial_cols = [group_by, Cgene_key, loc_x_key, loc_y_key]

    # Unique clones per (group, C gene) and per (group, C gene, x, y).
    clone_df = df[group_cols + [clone_key]].drop_duplicates().groupby(group_cols).size().reset_index(name='clone_by_group')
    clone_spatial_df = df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]].drop_duplicates().groupby(spatial_cols).size().reset_index(name='clone_by_group_spatialLoc')

    if plot:
        _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=(7, 3.5))

    group_counts = clone_df.set_index(group_cols)['clone_by_group'].to_dict()
    spatial_counts = clone_spatial_df.set_index(spatial_cols)['clone_by_group_spatialLoc'].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df['clone_by_group'] = df.apply(lambda row: group_counts.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['clone_by_group_spatialLoc'] = df.apply(lambda row: spatial_counts.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
18
+
19
def calculate_qc_umis(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Annotate *df* with UMI (row) counts per group and per spatial location.

    Unlike calculate_qc_clones, rows are counted without de-duplication,
    so repeated records of the same clone each contribute.
    """
    umis_df = df[[group_by, Cgene_key, clone_key ]].groupby([group_by, Cgene_key ]).size().reset_index(name='umis_by_group')
    umis_spatial_df = df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]].groupby([group_by, Cgene_key, loc_x_key, loc_y_key]).size().reset_index(name='umis_by_group_spatialLoc')

    if plot:
        # BUG FIX: honour the caller's figsize (was hard-coded to (7, 3.5)).
        _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=figsize)
    umisDict = umis_df.set_index([group_by, Cgene_key])['umis_by_group'].to_dict()
    umisSpatialDict = umis_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['umis_by_group_spatialLoc'].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df['umis_by_group'] = df.apply(lambda row: umisDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['umis_by_group_spatialLoc'] = df.apply(lambda row: umisSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
31
+
32
def filter_clones(df, clone_key, min_clone=1):
    """Keep rows whose clone count strictly exceeds *min_clone*."""
    keep = df[clone_key] > min_clone
    return df[keep]
34
+
35
def filter_clones_spatial(df, clone_spatial_key, min_clone_spatial=1):
    """Keep rows whose per-location clone count strictly exceeds *min_clone_spatial*."""
    keep = df[clone_spatial_key] > min_clone_spatial
    return df[keep]
37
+
38
def filter_umi(df, umi_key, min_umi=1):
    """Keep rows whose UMI count strictly exceeds *min_umi*."""
    keep = df[umi_key] > min_umi
    return df[keep]
40
+
41
def filter_umi_spatial(df, clone_umi_key, min_umi_spatial=1):
    """Keep rows whose per-location UMI count strictly exceeds *min_umi_spatial*."""
    keep = df[clone_umi_key] > min_umi_spatial
    return df[keep]
43
+
44
+
@@ -0,0 +1,71 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ import sys
5
+ from pathlib import Path
6
+ sys.path.append(str(Path(__file__).parent.parent))
7
+ from pl.basic_plot import _plot_bar
8
+
9
+ from .bcr_desc import compute_index
10
+
11
def stat_clone(df, groupby, Cgene_key, clone_key, plot=True, palette='Set2', xlabel=None, ylabel=None, ylog=False, figsize=(4,3.5) ):
    """Count unique clones per (groupby, C gene) pair and annotate *df*.

    Adds a 'clone_by_<groupby>' column to *df* and optionally draws a bar
    plot of the per-group counts.
    """
    y_name = 'clone_by_' + groupby
    clone_df = df[[groupby, Cgene_key, clone_key]].drop_duplicates().groupby([groupby, Cgene_key]).size().reset_index(name=y_name)

    if plot:
        # BUG FIX: forward the plotting options; palette/xlabel/ylabel/ylog/
        # figsize were previously hard-coded, silently ignoring the arguments.
        _plot_bar(clone_df, Cgene_key, y_name, groupby=Cgene_key, palette=palette,
                  xlabel=xlabel, ylabel=ylabel, ylog=ylog, figsize=figsize)

    cloneDict = clone_df.set_index([groupby, Cgene_key])[y_name].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df[y_name] = df.apply(lambda row: cloneDict.get((row[groupby], row[Cgene_key]), None), axis=1)

    return df
+
24
def aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key='UMI'):
    """Aggregate clone counts and frequencies within *groups*.

    count_basis='location': each unique (group, C gene, clone, x, y)
    combination counts once; count_basis='UMI': every row counts.

    Returns a DataFrame with the *groups* columns, *clone_key*, 'freq' and
    'count'. Raises ValueError for an unknown *count_basis* (previously the
    function silently returned None).
    """
    if count_basis == 'location':
        # De-duplicate so each spatial occurrence counts once.
        lst = list(set([group_by, Cgene_key, clone_key, loc_x_key, loc_y_key] + groups))
        source = df[lst].drop_duplicates()
    elif count_basis == 'UMI':
        source = df
    else:
        raise ValueError(f"Unknown count_basis: {count_basis!r} (expected 'location' or 'UMI')")

    # Absolute and normalised clone counts per group (previously duplicated
    # verbatim in both branches).
    counts = source[groups + [clone_key]].groupby(groups)[clone_key].value_counts().reset_index(name='count')
    freqs = source[groups + [clone_key]].groupby(groups)[clone_key].value_counts(normalize=True).reset_index(name='freq')
    return pd.merge(freqs, counts, on=groups + [clone_key])
+
40
def compute_grouped_index(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key=None, index='shannon_entropy'):
    """Compute a diversity index (e.g. Shannon entropy) per group.

    Bug fixes: *loc_x_key*/*loc_y_key* are now forwarded (they were
    hard-coded to 'X'/'Y'), and an unknown *count_basis* raises ValueError
    instead of failing later with UnboundLocalError.
    """
    if count_basis == 'location':
        _Index_compute = aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups,
                                            count_basis=count_basis,
                                            loc_x_key=loc_x_key, loc_y_key=loc_y_key).copy()
    elif count_basis == 'UMI':
        _Index_compute = aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups,
                                            count_basis=count_basis, Umi_key=Umi_key).copy()
    else:
        raise ValueError(f"Unknown count_basis: {count_basis!r} (expected 'location' or 'UMI')")

    tmp_df = _Index_compute.groupby(groups)['freq'].apply(lambda x: compute_index(index, x))

    if index != 'renyi_entropy':
        tmp_df = tmp_df.reset_index(name=index).dropna(subset=[index])
    else:
        # renyi_entropy returns a one-row DataFrame per group (one column per
        # alpha); melt to long format and drop the spurious level column.
        tmp_df = tmp_df.melt(ignore_index=False, var_name='alpha', value_name=index).reset_index()
        cols = list(range(0, len(groups))) + list(range(len(groups) + 1, len(tmp_df.columns)))
        tmp_df = tmp_df.iloc[:, cols]

    return tmp_df
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
@@ -0,0 +1,4 @@
1
+ from .summarize_BCR import stat_clone, compute_grouped_index, aggregate_clone_df
2
+ from .bcr_cluster import build_bcr_family
3
+ from .qc import calculate_qc_clones, calculate_qc_umis, filter_clones, filter_clones_spatial, filter_umi, filter_umi_spatial
4
+
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy.cluster.hierarchy import linkage, fcluster
4
+ from scipy.spatial.distance import squareform
5
+ from Levenshtein import distance as levenshtein_distance
6
+ from itertools import combinations
7
+ from collections import defaultdict
8
+
9
def _cluster_cdr3nt(cdr3_sequences, threshold=0.85):
    """Hierarchically cluster CDR3 nucleotide sequences by Levenshtein distance.

    Sequences whose average-linkage distance is at most
    ``max_sequence_length * (1 - threshold)`` fall into the same cluster.

    Returns a dict mapping each sequence to its integer cluster label.
    """
    n = len(cdr3_sequences)
    # ROBUSTNESS FIX: scipy's linkage() requires at least two observations;
    # handle the degenerate sizes explicitly instead of crashing.
    if n == 0:
        return {}
    if n == 1:
        return {cdr3_sequences[0]: 1}

    # Symmetric pairwise Levenshtein distance matrix.
    dist_matrix = np.zeros((n, n))
    for i, j in combinations(range(n), 2):
        dist_matrix[i, j] = levenshtein_distance(cdr3_sequences[i], cdr3_sequences[j])
        dist_matrix[j, i] = dist_matrix[i, j]

    condensed_dist = squareform(dist_matrix)
    Z = linkage(condensed_dist, method='average')
    # Distance cut-off scales with the longest sequence in the set.
    max_dist = max(len(seq) for seq in cdr3_sequences) * (1 - threshold)
    clusters = fcluster(Z, max_dist, criterion='distance')

    return {seq: cluster for seq, cluster in zip(cdr3_sequences, clusters)}
+
26
def _build_bcr_family(bcr_data, cdr3nt_col, threshold=0.85):
    """Cluster the unique CDR3nt sequences of one clone group into families."""
    unique_rows = bcr_data.drop_duplicates(subset=[cdr3nt_col])

    if unique_rows.empty:
        return None

    sequences = unique_rows[cdr3nt_col].tolist()

    # A single sequence is its own family; no clustering needed.
    if len(sequences) == 1:
        return {sequences[0]: 0}

    return _cluster_cdr3nt(sequences, threshold)
+
42
def build_bcr_family(bcr_data, Vgene_col, Jgene_col, cdr3nt_col, Cgene_key, Cgene='IGH', threshold=0.85):
    """Assign clonal-family labels to BCR records and merge them back.

    Unique (V, J, CDR3nt) combinations of the requested constant-gene class
    are clustered by CDR3nt similarity; the resulting 'family'/'family_id'
    columns are left-joined onto *bcr_data*.

    Bug fixes: the class filter previously indexed a literal column 'Cgene'
    instead of *Cgene_key*, and *threshold* was never forwarded.
    """
    data = bcr_data[[Vgene_col, Jgene_col, cdr3nt_col, Cgene_key]].copy()

    # Keep only chains of the requested constant-gene class.
    # BUG FIX: filter on the caller-supplied column, not a hard-coded 'Cgene'.
    data = data[data[Cgene_key].str.contains(Cgene)].copy()

    data = data.drop_duplicates(subset=[Vgene_col, Jgene_col, cdr3nt_col])
    data['CDR3_length'] = (data[cdr3nt_col].str.len()/3).astype(int)
    # NOTE(review): cloneGroup includes the CDR3 sequence itself, so each group
    # contains exactly one unique sequence — confirm whether CDR3_length was
    # intended as the third key instead.
    data['cloneGroup'] = data[Vgene_col] + "_" + data[Jgene_col] + "_" + data[cdr3nt_col].astype(str)
    data['family'] = ''
    for group in set(data['cloneGroup']):
        tmp_data = data[data['cloneGroup'] == group].copy()
        # BUG FIX: forward the user-supplied threshold (was hard-coded 0.85).
        result = _build_bcr_family(tmp_data, cdr3nt_col, threshold=threshold)

        if not result:
            continue

        for seq, family_num_idx in result.items():
            family_name = group + '_family_' + str(family_num_idx)
            mask = (data['cloneGroup'] == group) & (data[cdr3nt_col] == seq)
            data.loc[mask, 'family'] = family_name

    data['family_id'] = [Cgene + '_family_' + x for x in pd.Categorical(data['family']).codes.astype(str)]
    bcr_data = pd.merge(bcr_data, data, on=[Vgene_col, Jgene_col, Cgene_key, cdr3nt_col], how='left')

    return bcr_data
+
@@ -0,0 +1,72 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
def renyi_entropy(probabilities, alpha_values=range(10)):
    """
    Calculate the Rényi entropy of a probability distribution for several orders.

    Parameters:
    probabilities (list or numpy array): Probability distribution (should sum to 1).
    alpha_values (iterable): Orders of the Rényi entropy; alpha == 1 is the
        Shannon limit.

    Returns:
    pandas.DataFrame: one column per alpha, one row with the entropies.
    """
    probabilities = np.asarray(probabilities)
    results = {}
    for alpha in alpha_values:
        if alpha == 1:
            # Shannon limit; restrict to positive entries so log(0) cannot
            # produce NaN (BUG FIX: zero probabilities previously poisoned
            # the sum, unlike shannon_entropy which filters them).
            p = probabilities[probabilities > 0]
            entropy = -np.sum(p * np.log(p))
        else:
            entropy = 1 / (1 - alpha) * np.log(np.sum(probabilities**alpha))
        results[alpha] = [entropy]
    return pd.DataFrame.from_dict(results)
+
24
+
25
def shannon_entropy(p):
    """Shannon entropy (base 2) of probability vector *p*; zero entries are dropped."""
    nonzero = p[p > 0]
    return -np.sum(nonzero * np.log2(nonzero))
+
30
+
31
def CPK(count):
    """Clones per kilo-count: number of distinct clones per 1000 observations."""
    ratio = len(count) / sum(count)
    return ratio * 1000
+
34
+
35
def normalize_shannon_entropy(p):
    """Shannon entropy normalised by log2 of the number of non-zero categories (Pielou evenness)."""
    nonzero = p[p > 0]
    return -np.sum(nonzero * np.log2(nonzero)) / np.log2(len(nonzero))
+
40
def gini_index(data):
    """Gini coefficient of *data* (0 = perfectly even distribution).

    Raises ValueError on empty input.
    """
    if len(data) == 0:
        raise ValueError("Input data cannot be empty.")
    ordered = np.sort(data)
    n = len(data)
    running_total = np.cumsum(ordered)
    return (n + 1 - 2 * np.sum(running_total) / np.sum(ordered)) / n
+
49
def Clonality(p):
    """Clonality: one minus the normalised Shannon entropy (higher = more clonal)."""
    evenness = shannon_entropy(p) / np.log2(len(p))
    return 1 - evenness
+
53
def compute_index(function_name, p):
    """Dispatch a diversity-index computation by name.

    Parameters:
    function_name (str): one of 'shannon_entropy', 'normalize_shannon_entropy',
        'Clonality', 'renyi_entropy', 'gini_index', 'CPK'.
    p: probability/count vector forwarded unchanged to the selected function.

    Raises:
    ValueError: if *function_name* is not recognised.
    """
    functions = {
        'shannon_entropy': shannon_entropy,
        'normalize_shannon_entropy': normalize_shannon_entropy,
        'Clonality': Clonality,
        'renyi_entropy': renyi_entropy,
        'gini_index': gini_index,
        'CPK': CPK,
    }

    try:
        fn = functions[function_name]
    except KeyError:
        raise ValueError(f"Function '{function_name}' not recognized.") from None
    # The former special case for 'renyi_entropy' was byte-identical to the
    # generic call, so a single dispatch suffices.
    return fn(p)
+
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ from pl.basic_plot import _qc_boxplot_clone, _qc_boxplot_umis
5
+
6
def calculate_qc_clones(df, group_by, Cgene_key, clone_key,loc_x_key='X', loc_y_key='Y', plot=True):
    """Annotate *df* with unique-clone counts per group and per spatial location."""
    group_cols = [group_by, Cgene_key]
    spatial_cols = [group_by, Cgene_key, loc_x_key, loc_y_key]

    # Unique clones per (group, C gene) and per (group, C gene, x, y).
    clone_df = df[group_cols + [clone_key]].drop_duplicates().groupby(group_cols).size().reset_index(name='clone_by_group')
    clone_spatial_df = df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]].drop_duplicates().groupby(spatial_cols).size().reset_index(name='clone_by_group_spatialLoc')

    if plot:
        _qc_boxplot_clone(clone_df, clone_spatial_df, Cgene_key, group_by, figsize=(7, 3.5))

    group_counts = clone_df.set_index(group_cols)['clone_by_group'].to_dict()
    spatial_counts = clone_spatial_df.set_index(spatial_cols)['clone_by_group_spatialLoc'].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df['clone_by_group'] = df.apply(lambda row: group_counts.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['clone_by_group_spatialLoc'] = df.apply(lambda row: spatial_counts.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
+
19
def calculate_qc_umis(df, group_by, Cgene_key, clone_key, loc_x_key='X', loc_y_key='Y', plot=True, figsize=(7, 3.5)):
    """Annotate *df* with UMI (row) counts per group and per spatial location.

    Unlike calculate_qc_clones, rows are counted without de-duplication,
    so repeated records of the same clone each contribute.
    """
    umis_df = df[[group_by, Cgene_key, clone_key ]].groupby([group_by, Cgene_key ]).size().reset_index(name='umis_by_group')
    umis_spatial_df = df[[group_by, Cgene_key, clone_key, loc_x_key, loc_y_key]].groupby([group_by, Cgene_key, loc_x_key, loc_y_key]).size().reset_index(name='umis_by_group_spatialLoc')

    if plot:
        # BUG FIX: honour the caller's figsize (was hard-coded to (7, 3.5)).
        _qc_boxplot_umis(umis_df, umis_spatial_df, Cgene_key, group_by, figsize=figsize)
    umisDict = umis_df.set_index([group_by, Cgene_key])['umis_by_group'].to_dict()
    umisSpatialDict = umis_spatial_df.set_index([group_by, Cgene_key, loc_x_key, loc_y_key])['umis_by_group_spatialLoc'].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df['umis_by_group'] = df.apply(lambda row: umisDict.get((row[group_by], row[Cgene_key]), None), axis=1)
    df['umis_by_group_spatialLoc'] = df.apply(lambda row: umisSpatialDict.get((row[group_by], row[Cgene_key], row[loc_x_key], row[loc_y_key]), None), axis=1)
    return df
+
32
def filter_clones(df, clone_key, min_clone=1):
    """Keep rows whose clone count strictly exceeds *min_clone*."""
    keep = df[clone_key] > min_clone
    return df[keep]
34
+
35
def filter_clones_spatial(df, clone_spatial_key, min_clone_spatial=1):
    """Keep rows whose per-location clone count strictly exceeds *min_clone_spatial*."""
    keep = df[clone_spatial_key] > min_clone_spatial
    return df[keep]
37
+
38
def filter_umi(df, umi_key, min_umi=1):
    """Keep rows whose UMI count strictly exceeds *min_umi*."""
    keep = df[umi_key] > min_umi
    return df[keep]
40
+
41
def filter_umi_spatial(df, clone_umi_key, min_umi_spatial=1):
    """Keep rows whose per-location UMI count strictly exceeds *min_umi_spatial*."""
    keep = df[clone_umi_key] > min_umi_spatial
    return df[keep]
43
+
44
+
@@ -0,0 +1,71 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ import sys
5
+ from pathlib import Path
6
+ sys.path.append(str(Path(__file__).parent.parent))
7
+ from pl.basic_plot import _plot_bar
8
+
9
+ from .bcr_desc import compute_index
10
+
11
def stat_clone(df, groupby, Cgene_key, clone_key, plot=True, palette='Set2', xlabel=None, ylabel=None, ylog=False, figsize=(4,3.5) ):
    """Count unique clones per (groupby, C gene) pair and annotate *df*.

    Adds a 'clone_by_<groupby>' column to *df* and optionally draws a bar
    plot of the per-group counts.
    """
    y_name = 'clone_by_' + groupby
    clone_df = df[[groupby, Cgene_key, clone_key]].drop_duplicates().groupby([groupby, Cgene_key]).size().reset_index(name=y_name)

    if plot:
        # BUG FIX: forward the plotting options; palette/xlabel/ylabel/ylog/
        # figsize were previously hard-coded, silently ignoring the arguments.
        _plot_bar(clone_df, Cgene_key, y_name, groupby=Cgene_key, palette=palette,
                  xlabel=xlabel, ylabel=ylabel, ylog=ylog, figsize=figsize)

    cloneDict = clone_df.set_index([groupby, Cgene_key])[y_name].to_dict()
    # Map the aggregated counts back onto every row of the input frame.
    df[y_name] = df.apply(lambda row: cloneDict.get((row[groupby], row[Cgene_key]), None), axis=1)

    return df
+
24
def aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key='UMI'):
    """Aggregate clone counts and frequencies within *groups*.

    count_basis='location': each unique (group, C gene, clone, x, y)
    combination counts once; count_basis='UMI': every row counts.

    Returns a DataFrame with the *groups* columns, *clone_key*, 'freq' and
    'count'. Raises ValueError for an unknown *count_basis* (previously the
    function silently returned None).
    """
    if count_basis == 'location':
        # De-duplicate so each spatial occurrence counts once.
        lst = list(set([group_by, Cgene_key, clone_key, loc_x_key, loc_y_key] + groups))
        source = df[lst].drop_duplicates()
    elif count_basis == 'UMI':
        source = df
    else:
        raise ValueError(f"Unknown count_basis: {count_basis!r} (expected 'location' or 'UMI')")

    # Absolute and normalised clone counts per group (previously duplicated
    # verbatim in both branches).
    counts = source[groups + [clone_key]].groupby(groups)[clone_key].value_counts().reset_index(name='count')
    freqs = source[groups + [clone_key]].groupby(groups)[clone_key].value_counts(normalize=True).reset_index(name='freq')
    return pd.merge(freqs, counts, on=groups + [clone_key])
39
+
40
def compute_grouped_index(df, group_by, Cgene_key, clone_key, groups, count_basis='location', loc_x_key='X', loc_y_key='Y', Umi_key=None, index='shannon_entropy'):
    """Compute a diversity index (e.g. Shannon entropy) per group.

    Bug fixes: *loc_x_key*/*loc_y_key* are now forwarded (they were
    hard-coded to 'X'/'Y'), and an unknown *count_basis* raises ValueError
    instead of failing later with UnboundLocalError.
    """
    if count_basis == 'location':
        _Index_compute = aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups,
                                            count_basis=count_basis,
                                            loc_x_key=loc_x_key, loc_y_key=loc_y_key).copy()
    elif count_basis == 'UMI':
        _Index_compute = aggregate_clone_df(df, group_by, Cgene_key, clone_key, groups,
                                            count_basis=count_basis, Umi_key=Umi_key).copy()
    else:
        raise ValueError(f"Unknown count_basis: {count_basis!r} (expected 'location' or 'UMI')")

    tmp_df = _Index_compute.groupby(groups)['freq'].apply(lambda x: compute_index(index, x))

    if index != 'renyi_entropy':
        tmp_df = tmp_df.reset_index(name=index).dropna(subset=[index])
    else:
        # renyi_entropy returns a one-row DataFrame per group (one column per
        # alpha); melt to long format and drop the spurious level column.
        tmp_df = tmp_df.melt(ignore_index=False, var_name='alpha', value_name=index).reset_index()
        cols = list(range(0, len(groups))) + list(range(len(groups) + 1, len(tmp_df.columns)))
        tmp_df = tmp_df.iloc[:, cols]

    return tmp_df
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
@@ -0,0 +1,3 @@
1
+ from . import st, VDJ
2
+
3
+ __all__ = ["st", "VDJ"]
@@ -0,0 +1,3 @@
1
+ from . import tl, pl, gr
2
+
3
+ __all__ = ["tl", "pl", "gr"]
@@ -0,0 +1,4 @@
1
+ from ._aggr import aggregate_neighbors
2
+ from ._get_adj import get_adj
3
+ from ._build_spatial_adj import spatial_neighbors
4
+ from ._utils import _save_data