oscb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oscb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.2
2
+ Name: oscb
3
+ Version: 0.1.0
4
+ Summary: OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
5
+ Home-page: https://github.com/cirisjl/Machine-learning-development-environment-for-single-cell-sequencing-data-analyses
6
+ Author: Lei Jiang
7
+ Author-email: leijiang@missouri.edu
8
+ License: MIT
9
+ Keywords: single-cell,benchmarks
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.6, <3.13
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: leidenalg>=0.8.10
16
+ Requires-Dist: matplotlib>=3.5.1
17
+ Requires-Dist: networkx>=2.6.3
18
+ Requires-Dist: numpy>=1.26.4
19
+ Requires-Dist: pandas>=1.3.5
20
+ Requires-Dist: python_igraph>=0.9.9
21
+ Requires-Dist: python_louvain>=0.16
22
+ Requires-Dist: scanpy
23
+ Requires-Dist: muon
24
+ Requires-Dist: mudata
25
+ Requires-Dist: tqdm
26
+ Requires-Dist: requests
27
+ Requires-Dist: scib
28
+ Requires-Dist: zss
29
+ Requires-Dist: grakel
30
+ Requires-Dist: scikit_learn>=1.0.2
31
+ Requires-Dist: scipy>=1.7.3
32
+ Requires-Dist: umap_learn>=0.5.2
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: home-page
39
+ Dynamic: keywords
40
+ Dynamic: license
41
+ Dynamic: requires-dist
42
+ Dynamic: requires-python
43
+ Dynamic: summary
44
+
45
+ # Overview
46
+
47
+ --------------------------------------------------------------------------------
48
+
49
+
50
+ Machine learning (ML) is transforming single-cell sequencing data analysis; however, the barriers of technology complexity and biology knowledge remain challenging for the involvement of the ML community in single-cell data analysis. We present an ML development environment for single-cell sequencing data analyses with a diverse set of AI-Ready benchmark datasets. A cloud-based platform is built to dynamically scale workflows for collecting, processing, and managing various single-cell sequencing data to make them ML-ready. In addition, benchmarks for each problem formulation and a code-level and web-interface IDE for single-cell analysis method development are provided.
51
+
52
+
53
+ ![Workflow](https://oscb.missouri.edu/assets/30e6000a-5e6f-440f-bec5-5c7ceb256c55)
54
+
55
+ OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
56
+
57
+
58
+ **Workflows** are developed for collecting, processing, and managing diverse single-cell sequencing data to make them ML-ready and build benchmarks.
59
+
60
+ **IDE** is provided for supporting partial method development.
61
+
62
+ **Assessment utilities** are provided for evaluating results and report generation.
63
+
64
+ This **end-to-end pipeline** transforms the traditional “static” machine learning into **continuous learning** on extensive new data.
65
+
66
+
67
+ By **in-depth fusing models with data**, this platform could ultimately help many single-cell sequencing researchers substantially.
68
+
69
+
70
+ ![Tools](https://oscb.missouri.edu/assets/c18ffb2a-814f-452c-921b-e399b99c41b4)
71
+
72
+
73
+ OSCB is an ongoing effort, and we are planning to increase our coverage in the future.
oscb-0.1.0/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Overview
2
+
3
+ --------------------------------------------------------------------------------
4
+
5
+
6
+ Machine learning (ML) is transforming single-cell sequencing data analysis; however, the barriers of technology complexity and biology knowledge remain challenging for the involvement of the ML community in single-cell data analysis. We present an ML development environment for single-cell sequencing data analyses with a diverse set of AI-Ready benchmark datasets. A cloud-based platform is built to dynamically scale workflows for collecting, processing, and managing various single-cell sequencing data to make them ML-ready. In addition, benchmarks for each problem formulation and a code-level and web-interface IDE for single-cell analysis method development are provided.
7
+
8
+
9
+ ![Workflow](https://oscb.missouri.edu/assets/30e6000a-5e6f-440f-bec5-5c7ceb256c55)
10
+
11
+ OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
12
+
13
+
14
+ **Workflows** are developed for collecting, processing, and managing diverse single-cell sequencing data to make them ML-ready and build benchmarks.
15
+
16
+ **IDE** is provided for supporting partial method development.
17
+
18
+ **Assessment utilities** are provided for evaluating results and report generation.
19
+
20
+ This **end-to-end pipeline** transforms the traditional “static” machine learning into **continuous learning** on extensive new data.
21
+
22
+
23
+ By **in-depth fusing models with data**, this platform could ultimately help many single-cell sequencing researchers substantially.
24
+
25
+
26
+ ![Tools](https://oscb.missouri.edu/assets/c18ffb2a-814f-452c-921b-e399b99c41b4)
27
+
28
+
29
+ OSCB is an ongoing effort, and we are planning to increase our coverage in the future.
@@ -0,0 +1,14 @@
1
from __future__ import absolute_import

# Public package surface: per-task evaluation modules plus data/evaluator
# utilities. (`annotation` was previously listed twice in the import below.)
from .evaluation import annotation, ccc, clustering, imputation, integration, multimodal, trajectory
from .data import *
from .evaluator import *
from .utilization import *
from .utils import *
@@ -0,0 +1,135 @@
1
import hashlib
import os
import re
from pathlib import Path
from urllib.parse import unquote

import anndata as ad
import muon as mu
import mudata as md
import numpy as np
import pandas as pd
import requests
import scanpy as sc
from muon import MuData
from tqdm import tqdm

from .utils import *
15
+
16
+
17
+
18
class FileDownloader:
    """Stream files over HTTP(S) with a progress bar, optional SHA-256
    verification, and cleanup of partially written files on failure."""

    def __init__(self, chunk_size=8192):
        # Bytes written per iteration while streaming the response body.
        self.chunk_size = chunk_size
        self.session = requests.Session()

    def get_filename_from_response(self, headers):
        """Extract a filename from the Content-Disposition header.

        Handles both plain ``filename=`` and RFC 5987 ``filename*=utf-8''...``
        forms. Returns None when the header is absent or carries no filename.
        """
        if "content-disposition" not in headers:
            return None
        cd = headers["content-disposition"]
        match = re.search(r"filename\*?=['\"]?(.*?)['\"]?(?:;|$)", cd)
        if not match:
            return None
        filename = match.group(1)
        # Handle potential encoding if filename* is used
        if filename.startswith("utf-8''"):
            filename = filename.split("''", 1)[1]
        # requests.utils.unquote is a re-export of urllib.parse.unquote.
        return unquote(filename)

    def get_file_size(self, response):
        """Best-effort total size in bytes (0 when the server omits it)."""
        return int(response.headers.get('content-length', 0))

    def get_file_hash(self, file_path):
        """SHA-256 hex digest of a file, read in 4 KiB chunks."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def download(self, url, data_dict, data_folder='downloads/', verify_hash=None):
        """POST ``data_dict`` as JSON to ``url`` and stream the reply to disk.

        :param verify_hash: expected SHA-256 hex digest; mismatch aborts.
        :return: local :class:`Path` on success, None on failure (a partial
            file is removed).
        """
        # Pre-bind so the except-path cleanup never hits a NameError when
        # the request itself fails before these are assigned.
        progress = None
        local_file_path = None
        try:
            response = self.session.post(url, json=data_dict, stream=True)
            response.raise_for_status()  # Raise for bad status codes (4xx or 5xx)
            total_size = self.get_file_size(response)
            file_name = self.get_filename_from_response(response.headers)
            if file_name is None:
                # Fall back to the last URL segment so os.path.join below
                # never receives None.
                file_name = url.rstrip('/').split('/')[-1] or 'download.bin'
            local_file_path = Path(os.path.join(data_folder, file_name))
            # Make dir
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            # Progress bar
            progress = tqdm(total=total_size,
                            unit='B',
                            unit_scale=True,
                            desc=local_file_path.name)

            with local_file_path.open('wb') as f:
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
            progress.close()

            # File validation
            if verify_hash:
                downloaded_hash = self.get_file_hash(local_file_path)
                if downloaded_hash != verify_hash:
                    raise ValueError("File hash verification failed.")

            print(f"File downloaded successfully to: {local_file_path}")
            return local_file_path

        except Exception as e:
            if progress is not None:
                progress.close()
            print(f"Download failed: {str(e)}")
            if local_file_path is not None and local_file_path.exists():
                local_file_path.unlink()
            return None

    def download_multiple(self, url_list, data_folder):
        """Download each URL in turn; returns a list of per-URL result dicts.

        The original passed the target path where ``download`` expects the
        JSON payload; an empty payload is sent instead and ``download``
        chooses the filename.
        """
        results = []
        for url in url_list:
            local_file_path = self.download(url, {}, data_folder=data_folder)
            results.append({
                'url': url,
                'success': local_file_path is not None,
                'local_file_path': str(local_file_path) if local_file_path else None
            })
        return results
104
+
105
+
106
+
107
def DataLoader(benchmarks_id, data_folder='downloads/', server_endpoint=server_endpoint+'download'):
    """Download the dataset behind ``benchmarks_id`` and load it into memory.

    :param benchmarks_id: benchmark identifier resolved via ``get_dataset_id``.
    :param data_folder: directory where the file is saved.
    :param server_endpoint: download endpoint (defaults to the package-wide
        ``server_endpoint`` from ``.utils`` plus 'download').
    :return: a MuData for ``.h5mu`` files, an AnnData for ``.h5ad`` files,
        or None when the download failed.
    """
    dataset_id, task = get_dataset_id(benchmarks_id)
    if task is not None:
        print(f"Downloading dataset for {task} Benchmarks.")
    else:
        print("Downloading dataset.")
    data_dict = {
        "dataset_id": dataset_id
    }

    downloader = FileDownloader()
    # Honor the caller's data_folder (was hard-coded to "downloads").
    adata_path = downloader.download(server_endpoint, data_dict, data_folder=data_folder)

    # download() returns None on failure; os.path.isfile(None) would raise.
    if adata_path is not None and os.path.isfile(adata_path):
        if str(adata_path).endswith(".h5mu"):
            # Was `muon.read_h5mu`, a NameError: the module is imported as `mu`.
            return mu.read_h5mu(adata_path)
        return sc.read_h5ad(adata_path)
    return None
129
+
130
+
131
def split_data(adata):
    """Split an AnnData-like object into train/test subsets.

    Rows whose ``obs.split_idx`` contains 'train' go to the first returned
    object, rows containing 'test' to the second; both are copies.
    """
    in_train = adata.obs.split_idx.str.contains('train')
    in_test = adata.obs.split_idx.str.contains('test')
    return adata[in_train, :].copy(), adata[in_test, :].copy()
@@ -0,0 +1,10 @@
1
from __future__ import absolute_import

# Re-export all per-task metric helpers at the subpackage level.
# (`.annotation` was previously star-imported twice.)
from .annotation import *
from .ccc import *
from .clustering import *
from .imputation import *
from .integration import *
from .multimodal import *
from .trajectory import *
@@ -0,0 +1,39 @@
1
+ import numpy as np
2
+ import sklearn.preprocessing
3
+ from sklearn.metrics import f1_score, accuracy_score
4
+
5
+
6
def annotation_metrics(labels, labels_pred):
    """Compute accuracy and macro/micro/weighted F1 for cell-type annotation.

    Parameters
    ----------
    labels : pandas.Series
        Ground-truth cell-type labels; NaNs become an 'Unknown' class.
    labels_pred : pandas.Series
        Predicted labels, handled the same way.

    Returns
    -------
    (accuracy, f1_macro, f1_micro, f1_weighted), each rounded to 4 decimals.
    """
    print("Encode labels", flush=True)
    labels = labels.astype('category')
    labels_pred = labels_pred.astype('category')
    # Missing labels become their own 'Unknown' category. (Fixes the
    # 'Unkown' misspelling — the name is only used for encoding, so the
    # returned scores are unchanged — and guards against the category
    # already existing, which made add_categories raise.)
    if labels.isna().any():
        if 'Unknown' not in labels.cat.categories:
            labels = labels.cat.add_categories(['Unknown'])
        labels = labels.fillna('Unknown')
    if labels_pred.isna().any():
        if 'Unknown' not in labels_pred.cat.categories:
            labels_pred = labels_pred.cat.add_categories(['Unknown'])
        labels_pred = labels_pred.fillna('Unknown')
    # Fit one encoder on the union of both label sets so codes agree.
    cats = list(labels.dtype.categories) + list(labels_pred.dtype.categories)
    encoder = sklearn.preprocessing.LabelEncoder().fit(cats)
    labels = encoder.transform(labels)
    labels_pred = encoder.transform(labels_pred)

    print("Compute prediction accuracy", flush=True)
    accuracy = float('{:.4f}'.format(accuracy_score(labels, labels_pred)))

    print("Compute F1 score", flush=True)
    f1_macro = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='macro'
    )))
    f1_micro = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='micro'
    )))
    f1_weighted = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='weighted'
    )))

    return accuracy, f1_macro, f1_micro, f1_weighted
@@ -0,0 +1,74 @@
1
+ from typing import Union
2
+
3
+ import anndata
4
+ import collections
5
+ import numpy as np
6
+ from sklearn.metrics import auc
7
+ from sklearn.metrics import precision_recall_curve
8
+
9
+ # Cell Cell Communication
10
def ccc_metrics(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="score", top_prop=0.05):
    """Evaluate cell-cell-communication predictions against a ground truth.

    Computes (1) the area under the precision-recall curve of the prediction
    scores vs. the binary ``response`` column of the merged truth table, and
    (2) an odds ratio of true interactions inside vs. outside the top
    ``top_prop`` fraction of ranked predictions, squashed into [0, 1] by
    ``_sigmoid_transform``.

    :param adata: AnnData whose ``.uns`` holds the ``ccc_pred``/``ccc_target``
        DataFrames and a ``merge_keys`` entry (see ``join_truth_and_pred``).
    :param score: column holding prediction scores (higher = stronger).
    :param top_prop: fraction of the target table treated as top hits.
    :return: (pr_auc, odds_ratio), each rounded to 4 decimal places.
    """
    # Precision-recall AUC
    gt = join_truth_and_pred(adata, ccc_pred, ccc_target, score)
    precision, recall, _ = precision_recall_curve(
        gt["response"], gt[score], pos_label=1
    )

    auc_score = auc(recall, precision)

    # Odds Ratio
    gt = gt.sort_values(score, ascending=False)
    # NOTE(review): top_n is sized from the *target* table while the ranks
    # below index the merged table — assumes comparable lengths; confirm.
    top_n = int(adata.uns[ccc_target].shape[0] * top_prop)

    # assign the top rank interactions to 1
    a = np.zeros(len(gt[score]))
    a[0:top_n] = 1
    gt.loc[:, ["top_n"]] = a

    # 2x2 confusion counts of (in top-n) x (true response)
    top = gt[gt["top_n"] == 1]
    tp = np.sum(top.response == 1)
    fp = np.sum(top.response == 0)

    bot = gt[gt["top_n"] == 0]
    fn = np.sum(bot.response == 1)
    tn = np.sum(bot.response == 0)

    numerator = tp * tn
    denominator = fp * fn
    if denominator == 0:
        if numerator == 0:
            # undefined
            oddsratio_score = np.nan
        else:
            # perfect score
            oddsratio_score = np.inf
    else:
        oddsratio_score = numerator / denominator
    # Map the raw odds ratio (0..inf) onto [0, 1]; inf maps to 1.0, nan stays nan.
    oddsratio_score = _sigmoid_transform(oddsratio_score)

    return float('{:.4f}'.format(auc_score)), float('{:.4f}'.format(oddsratio_score))
50
+
51
+
52
+ # Join predictions to target
53
def join_truth_and_pred(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="lrscore"):
    """Left-join predictions onto the ground-truth interaction table.

    Interactions with no matching prediction get response 0 and a score just
    below the smallest observed score, so they rank last.
    """
    keys = list(adata.uns["merge_keys"])
    truth = adata.uns[ccc_target]
    preds = adata.uns[ccc_pred]
    gt = truth.merge(preds, on=keys, how="left")

    no_response = gt["response"].isna()
    gt.loc[no_response, "response"] = 0

    no_score = gt[score].isna()
    gt.loc[no_score, score] = np.nanmin(gt[score]) - np.finfo(float).eps

    return gt
61
+
62
+
63
+ def _sigmoid_transform(x):
64
+ return 1 - 1 / (1 + x / 2)
65
+
66
+
67
def aggregate_method_scores(adata, how, ccc_pred="LIANA", score="score"):
    """Collapse duplicate interaction rows using the aggregation ``how``.

    Groups the prediction table by the dataset's merge keys, aggregates the
    ``score`` column (e.g. how="mean"), and returns a flat DataFrame whose
    aggregated column is named "score".
    """
    keys = list(adata.uns["merge_keys"])
    grouped = adata.uns[ccc_pred].groupby(keys)
    aggregated = grouped.agg(score=(score, how))
    return aggregated.reset_index()
@@ -0,0 +1,20 @@
1
+ from sklearn.metrics import adjusted_rand_score as ARI
2
+ from sklearn.metrics import normalized_mutual_info_score as NMI
3
+ from sklearn.metrics import silhouette_score
4
+ from sklearn.metrics import fowlkes_mallows_score as FM
5
+
6
def clustering_metrics(labels, labels_pred, embedding):
    """Evaluate a clustering against ground-truth labels.

    Silhouette is computed on ``embedding`` with the ground-truth labels;
    NMI, ARI and Fowlkes-Mallows compare ``labels`` with ``labels_pred``.
    Returns (silhouette, nmi, ari, fowlkes_mallows), each rounded to
    4 decimal places.
    """
    raw = (
        silhouette_score(embedding, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
        FM(labels, labels_pred),
    )
    # Round to 4 decimals for compact reporting.
    asw_score, nmi_score, ari_score, fm_score = (
        float('{:.4f}'.format(value)) for value in raw
    )

    print(
        "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nFowlkes Mallows: %.4f"
        % (asw_score, nmi_score, ari_score, fm_score)
    )
    return asw_score, nmi_score, ari_score, fm_score
@@ -0,0 +1,161 @@
1
+ from scipy import sparse
2
+ import importlib
3
+ import numbers
4
+ import anndata
5
+ import scanpy as sc
6
+ import sklearn.metrics
7
+ import numpy as np
8
+ import pandas as pd
9
+ import re
10
+ import warnings
11
+
12
+
13
+ # test_data = adata.obsm["test"]
14
+ # denoised_data = adata.obsm["denoised"]
15
+ # train_data = adata.obsm["train"]
16
def imputation_metrics(adata, denoised_layer, train='train', test='test'):
    """Score a denoised/imputed expression matrix against held-out counts.

    :param adata: AnnData holding raw train/test count matrices in ``.obsm``
        and the denoised matrix in ``.layers[denoised_layer]``.
    :return: (mse, poisson_nll), each rounded to 4 decimals. MSE is taken on
        library-size-normalized, log1p-transformed values; the Poisson loss
        on counts rescaled to the test library size.
    """
    # Mean-squared error
    test_adata = anndata.AnnData(X=adata.obsm[test], obs=adata.obs, var=adata.var)
    denoised_adata = anndata.AnnData(
        X=adata.layers[denoised_layer], obs=adata.obs, var=adata.var
    )

    # scaling and transformation
    target_sum = 10000

    sc.pp.normalize_total(test_adata, target_sum)
    sc.pp.log1p(test_adata)

    sc.pp.normalize_total(denoised_adata, target_sum)
    sc.pp.log1p(denoised_adata)

    mse = sklearn.metrics.mean_squared_error(
        toarray(test_adata.X), toarray(denoised_adata.X)
    )

    # Poisson loss
    test_data = adata.obsm[test]
    denoised_data = adata.layers[denoised_layer]

    # Rescale the denoised matrix from the train to the test library size.
    initial_sum = adata.obsm[train].sum()
    target_sum = test_data.sum()
    denoised_data = denoised_data * target_sum / initial_sum

    # BUG FIX: poisson_nll_loss(y_pred, y_true) expects the *prediction*
    # first; the original passed (test, denoised) — arguments swapped —
    # computing test - denoised*log(test) instead of the Poisson NLL
    # denoised - test*log(denoised).
    poisson = poisson_nll_loss(toarray(denoised_data), toarray(test_data))

    return float('{:.4f}'.format(mse)), float('{:.4f}'.format(poisson))
48
+
49
+
50
def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """Mean Poisson negative log-likelihood of predictions ``y_pred``
    against observed counts ``y_true`` (up to a constant; a 1e-6 epsilon
    guards the log at zero)."""
    eps = 1e-6
    per_element = y_pred - y_true * np.log(y_pred + eps)
    return per_element.mean()
52
+
53
+
54
def toarray(x):
    """Convert an array-like to a np.ndarray.

    Parameters
    ----------
    x : array-like
        Array-like to be converted. Lists are converted element-wise
        (recursively).

    Returns
    -------
    x : np.ndarray

    Raises
    ------
    TypeError
        If ``x`` is not an accepted array-like type.
    """
    if is_SparseDataFrame(x):
        x = x.to_coo().toarray()
    elif is_SparseSeries(x):
        x = x.to_dense().to_numpy()
    elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)):
        x = x.to_numpy()
    elif isinstance(x, sparse.spmatrix):
        x = x.toarray()
    elif isinstance(x, np.matrix):
        x = x.A
    elif isinstance(x, list):
        x_out = []
        for xi in x:
            try:
                xi = toarray(xi)
            except TypeError:
                # recursed too far
                pass
            x_out.append(xi)
        # BUG FIX: the original called _check_numpy_dtype(), which is not
        # defined anywhere in this module, so every list input raised
        # NameError. Let NumPy infer the dtype and fall back to object
        # arrays for ragged/mixed content.
        try:
            x = np.array(x_out)
        except ValueError:
            x = np.array(x_out, dtype=object)
    elif isinstance(x, (np.ndarray, numbers.Number)):
        pass
    else:
        raise TypeError("Expected array-like. Got {}".format(type(x)))
    return x
91
+
92
+
93
def is_SparseSeries(X):
    """True when X is a legacy ``pandas.SparseSeries`` (removed in pandas 1.0);
    False on modern pandas where the class no longer exists."""
    with warnings.catch_warnings():
        # Older pandas warns on merely touching the deprecated class.
        warnings.filterwarnings(
            "ignore",
            "The SparseSeries class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            legacy_cls = pd.SparseSeries
        except AttributeError:
            return False
        return isinstance(X, legacy_cls)
105
+
106
+
107
def is_SparseDataFrame(X):
    """True when X is a legacy ``pandas.SparseDataFrame`` (removed in
    pandas 1.0); False on modern pandas where the class no longer exists."""
    with warnings.catch_warnings():
        # Older pandas warns on merely touching the deprecated class.
        warnings.filterwarnings(
            "ignore",
            "The SparseDataFrame class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            legacy_cls = pd.SparseDataFrame
        except AttributeError:
            return False
        return isinstance(X, legacy_cls)
119
+
120
+
121
def is_sparse_dataframe(x):
    """True for a modern DataFrame whose columns are all sparse-backed
    (the ``.sparse`` accessor is only available in that case)."""
    if not isinstance(x, pd.DataFrame) or is_SparseDataFrame(x):
        return False
    try:
        x.sparse
    except AttributeError:
        return False
    return True
129
+
130
+
131
def is_sparse_series(x):
    """True for a modern Series backed by a sparse array
    (the ``.sparse`` accessor is only available in that case)."""
    if not isinstance(x, pd.Series) or is_SparseSeries(x):
        return False
    try:
        x.sparse
    except AttributeError:
        return False
    return True
139
+
140
+
141
def dataframe_to_sparse(x, fill_value=0.0):
    """Return a copy of DataFrame ``x`` with sparse-backed columns.

    Round-trips the values through a SciPy COO matrix, preserving the
    index and column labels, then sets the sparse fill value.
    """
    coo = sparse.coo_matrix(x.values)
    out = pd.DataFrame.sparse.from_spmatrix(coo, index=x.index, columns=x.columns)
    out.sparse.fill_value = fill_value
    return out
147
+
148
+
149
def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0):
    """Build a sparse-backed DataFrame from a SciPy matrix or array-like.

    ``columns``/``index`` override the labels when given; non-sparse inputs
    go through ``dataframe_to_sparse`` with ``default_fill_value``.
    """
    if sparse.issparse(X):
        out = pd.DataFrame.sparse.from_spmatrix(X)
        out.sparse.fill_value = default_fill_value
    else:
        # Keep a modern DataFrame as-is; coerce anything else (including a
        # legacy SparseDataFrame) through the pd.DataFrame constructor.
        keep = isinstance(X, pd.DataFrame) and not is_SparseDataFrame(X)
        frame = X if keep else pd.DataFrame(X)
        out = dataframe_to_sparse(frame, fill_value=default_fill_value)
    if columns is not None:
        out.columns = columns
    if index is not None:
        out.index = index
    return out
@@ -0,0 +1,60 @@
1
+ from scib.metrics import metrics
2
+
3
+ # https://github.com/theislab/scib/blob/main/scib/metrics/metrics.py
4
+ # https://scib.readthedocs.io/en/latest/api/scib.metrics.metrics_all.html
5
def integration_metrics(adata, adata_int, batch_key='batch', label_key='cell_type', species="mouse", **kwargs):
    """All metrics

    :Biological conservation:
        + HVG overlap :func:`~scib.metrics.hvg_overlap`
        + Cell type ASW :func:`~scib.metrics.silhouette`
        + Isolated label ASW :func:`~scib.metrics.isolated_labels`
        + Isolated label F1 :func:`~scib.metrics.isolated_labels`
        + NMI cluster/label :func:`~scib.metrics.nmi`
        + ARI cluster/label :func:`~scib.metrics.ari`
        + Cell cycle conservation :func:`~scib.metrics.cell_cycle`
        + cLISI (cell type Local Inverse Simpson's Index) :func:`~scib.metrics.clisi_graph`
        + Trajectory conservation :func:`~scib.metrics.trajectory_conservation`

    :Batch correction:
        + Graph connectivity :func:`~scib.metrics.graph_connectivity`
        + Batch ASW :func:`~scib.metrics.silhouette_batch`
        + Principal component regression :func:`~scib.metrics.pcr_comparison`
        + kBET (k-nearest neighbour batch effect test) :func:`~scib.metrics.kBET`
        + iLISI (integration Local Inverse Simpson's Index) :func:`~scib.metrics.ilisi_graph`

    :param adata: unintegrated, preprocessed anndata object
    :param adata_int: integrated anndata object
    :param batch_key: name of batch column in adata.obs and adata_int.obs
    :param label_key: name of biological label (cell type) column in adata.obs and adata_int.obs
    :param kwargs:
        Overrides passed on to :func:`~scib.metrics.metrics` (previously
        documented but not accepted by the signature), e.g.:

        + ``embed``
        + ``cluster_key``
        + ``cluster_nmi``
        + ``nmi_method``
        + ``nmi_dir``
        + ``si_metric``
        + ``organism``
        + ``n_isolated``
        + ``subsample``
        + ``type_``
    :return: dict of metric values rounded to 4 decimals, plus an averaged
        'Biological Conservation' entry (NaN metrics count as 0).
    """
    params = dict(
        cluster_nmi=None, ari_=True, nmi_=True, nmi_method='arithmetic',
        nmi_dir=None, silhouette_=True, si_metric='euclidean', pcr_=True,
        cell_cycle_=True, organism=species, hvg_score_=True,
        isolated_labels_=True, isolated_labels_f1_=True,
        isolated_labels_asw_=True,
        # NOTE(review): scib documents n_isolated as an int/None threshold;
        # True relies on bool-as-int — confirm against scib's API.
        n_isolated=True,
        graph_conn_=True, trajectory_=False, kBET_=True,
    )
    # Caller-supplied kwargs override the defaults above.
    params.update(kwargs)
    metrics_all = metrics(adata, adata_int, batch_key=batch_key, label_key=label_key, **params)

    biological_conservation_metrics = ['NMI_cluster/label', 'ARI_cluster/label', 'ASW_label', 'cell_cycle_conservation','isolated_label_F1', 'isolated_label_silhouette', 'hvg_overlap']
    # NaNs (metrics that could not be computed) count as 0 in the average.
    metrics_dict = metrics_all.fillna(0).to_dict()[0]

    for key, value in metrics_dict.items():
        metrics_dict[key] = float('{:.4f}'.format(value))

    bc_total = sum(metrics_dict[key] for key in biological_conservation_metrics)
    biological_conservation_score = float('{:.4f}'.format(bc_total/len(biological_conservation_metrics)))

    metrics_dict['Biological Conservation'] = biological_conservation_score

    return metrics_dict
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ import subprocess
5
+ import scanpy as sc
6
+ import scipy.io
7
+ import scib
8
+ import muon as mu
9
+ from muon import MuData
10
+
11
+
12
def multimodal_metrics(mdata, embed, mod1='rna', batch='group', label_key='cell_type'):
    """Evaluate a joint embedding of multimodal (MuData) data with scib.

    Wraps the shared embedding ``mdata.obsm[embed]`` in a temporary AnnData,
    drops cells with missing batch or cell-type labels, and runs a subset of
    scib integration metrics (ARI, NMI, ASW, graph connectivity, isolated
    label ASW) using the ``mod1``-prefixed obs columns as batch/label keys.

    :param mdata: MuData with obs columns named f"{mod1}:{batch}" and
        f"{mod1}:{label_key}" and a precomputed obsp["connectivities"].
    :param embed: key of the joint embedding in ``mdata.obsm``.
    :return: dict of metric values rounded to 4 decimals, plus an averaged
        'Biological Conservation' entry.
    """
    # Build an AnnData around the embedding so scib can consume it.
    scib_anndata = sc.AnnData(mdata.obsm[embed]).copy()
    scib_anndata.obs = mdata.obs.copy()
    scib_anndata.obsp["connectivities"] = mdata.obsp["connectivities"].copy()
    scib_anndata.obsm[embed] = mdata.obsm[embed].copy()
    scib_anndata = scib_anndata[~scib_anndata.obs[f"{mod1}:{batch}"].isna()] # Remove NaN in batch
    scib_anndata = scib_anndata[~scib_anndata.obs[f"{mod1}:{label_key}"].isna()] # Remove NaN in cell type label
    scib_anndata.obs[f"{mod1}:{batch}"] = scib_anndata.obs[f"{mod1}:{batch}"].astype("category")
    scib_anndata.obs[f"{mod1}:{label_key}"] = scib_anndata.obs[f"{mod1}:{label_key}"].astype("category")

    # The same object is passed as both the "unintegrated" and "integrated"
    # argument; only embedding-based metrics are requested below.
    metrics = scib.metrics.metrics(
        scib_anndata,
        scib_anndata,
        batch_key=f"{mod1}:{batch}",
        label_key=f"{mod1}:{label_key}",
        embed=embed,
        ari_=True,
        nmi_=True,
        silhouette_=True,
        graph_conn_=True,
        isolated_labels_asw_=True,
    )

    # NOTE(review): cell_cycle_conservation, isolated_label_F1 and
    # hvg_overlap are never computed above; after fillna(0) they contribute
    # zeros to the average below — confirm this weighting is intended.
    biological_conservation_metrics = ['NMI_cluster/label', 'ARI_cluster/label', 'ASW_label', 'cell_cycle_conservation','isolated_label_F1', 'isolated_label_silhouette', 'hvg_overlap']
    metrics = metrics.fillna(0).to_dict()[0]

    # Round every metric to 4 decimal places.
    for key, value in metrics.items():
        metrics[key] = float('{:.4f}'.format(value))

    bc_total = 0
    for key in biological_conservation_metrics:
        bc_total += metrics[key]
    biological_conservation_score = float('{:.4f}'.format(bc_total/len(biological_conservation_metrics)))

    metrics['Biological Conservation'] = biological_conservation_score
    # Drop the reference to the temporary AnnData before returning.
    scib_anndata = None

    return metrics