oscb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oscb/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from __future__ import absolute_import
2
+
3
+ from .evaluation import annotation, ccc, clustering, imputation, integration, multimodal, trajectory, annotation
4
+ from .data import *
5
+ from .evaluator import *
6
+ from .utilization import *
7
+ from .utils import *
8
+ # from .evaluation import ccc
9
+ # from .evaluation import clustering
10
+ # from .evaluation import imputation
11
+ # from .evaluation import integration
12
+ # from .evaluation import multimodal
13
+ # from .evaluation import trajectory
14
+ # from .evaluation import annotation
oscb/data.py ADDED
@@ -0,0 +1,135 @@
1
+ import requests
2
+ from tqdm import tqdm
3
+ import os
4
+ from pathlib import Path
5
+ import hashlib
6
+ import re
7
+ from muon import MuData
8
+ import muon as mu
9
+ import numpy as np
10
+ import pandas as pd
11
+ import scanpy as sc
12
+ import anndata as ad
13
+ import mudata as md
14
+ from .utils import *
15
+
16
+
17
+
18
class FileDownloader:
    """Stream files from an HTTP endpoint to disk with a progress bar.

    One ``requests.Session`` is created per instance and reused for every
    request issued through it.
    """

    def __init__(self, chunk_size=8192):
        # Number of bytes requested per iteration while streaming a response.
        self.chunk_size = chunk_size
        self.session = requests.Session()

    def get_filename_from_response(self, headers):
        """Return the filename announced in ``Content-Disposition``.

        Parameters
        ----------
        headers : mapping
            Response headers; only the ``content-disposition`` key is read.

        Returns
        -------
        str or None
            The extracted filename, or ``None`` when the header is absent or
            carries no ``filename`` parameter.
        """
        if "content-disposition" not in headers:
            return None
        cd = headers["content-disposition"]
        match = re.search(r"filename\*?=['\"]?(.*?)['\"]?(?:;|$)", cd)
        if not match:
            return None
        filename = match.group(1)
        # ``filename*=`` values carry a charset prefix and are percent-encoded.
        # The charset token is commonly sent upper-case, so compare lower-cased.
        if filename.lower().startswith("utf-8''"):
            filename = filename.split("''", 1)[1]
            filename = requests.utils.unquote(filename)
        return filename

    def get_file_size(self, response):
        """Return the ``content-length`` of *response* as an int (0 if absent)."""
        return int(response.headers.get('content-length', 0))

    def get_file_hash(self, file_path):
        """Return the hexadecimal SHA-256 digest of the file at *file_path*."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def download(self, url, data_dict, data_folder='downloads/', verify_hash=None):
        """POST *data_dict* as JSON to *url* and stream the reply to disk.

        Parameters
        ----------
        url : str
            Endpoint to POST to.
        data_dict : dict
            Payload sent as the JSON request body.
        data_folder : str
            Directory that receives the file; created when missing.
        verify_hash : str, optional
            Expected SHA-256 hex digest of the downloaded file.

        Returns
        -------
        pathlib.Path or None
            Path of the written file, or ``None`` when anything failed. On
            failure the error is printed and a partially written file is
            removed.
        """
        # Both names are read in the ``except`` branch, so they must exist even
        # when the request itself fails before they would be assigned.
        progress = None
        local_file_path = None
        try:
            response = self.session.post(url, json=data_dict, stream=True)
            response.raise_for_status()  # Raise for 4xx / 5xx status codes
            total_size = self.get_file_size(response)
            file_name = self.get_filename_from_response(response.headers)
            if not file_name:
                raise ValueError("Server response does not include a filename.")
            # The name comes from the server: keep only its final component so
            # it cannot point outside ``data_folder``.
            local_file_path = Path(data_folder) / Path(file_name).name
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            progress = tqdm(total=total_size,
                            unit='B',
                            unit_scale=True,
                            desc=local_file_path.name)

            with local_file_path.open('wb') as f:
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
            progress.close()

            # File validation
            if verify_hash:
                downloaded_hash = self.get_file_hash(local_file_path)
                if downloaded_hash != verify_hash:
                    raise ValueError("File hash verification failed.")

            print(f"File downloaded successfully to: {local_file_path}")

            return local_file_path

        except Exception as e:
            if progress is not None:
                progress.close()
            print(f"Download failed: {str(e)}")
            if local_file_path is not None and local_file_path.exists():
                local_file_path.unlink()
            return None

    def download_multiple(self, url_list, data_folder):
        """Download every URL in *url_list* into *data_folder*.

        Returns
        -------
        list of dict
            One entry per URL with the keys ``url``, ``success`` (bool) and
            ``local_file_path`` (the written path, or the path derived from
            the URL's last segment when the download failed).
        """
        results = []
        for url in url_list:
            # NOTE(review): ``download`` always POSTs a JSON body; an empty
            # payload is assumed to be acceptable for plain URLs - confirm.
            downloaded = self.download(url, {}, data_folder=data_folder)
            if downloaded is not None:
                local_file_path = downloaded
            else:
                local_file_path = Path(data_folder) / url.split('/')[-1]
            results.append({
                'url': url,
                'success': downloaded is not None,
                'local_file_path': str(local_file_path)
            })
        return results
104
+
105
+
106
+
107
def DataLoader(benchmarks_id, data_folder='downloads/', server_endpoint=server_endpoint+'download'):
    """Download the dataset behind *benchmarks_id* and load it into memory.

    Parameters
    ----------
    benchmarks_id : str
        Identifier resolved through ``get_dataset_id`` into a dataset id and
        an optional task name.
    data_folder : str
        Directory the file is downloaded into.
    server_endpoint : str
        URL of the download endpoint.

    Returns
    -------
    object or None
        The result of ``mu.read_h5mu`` for ``.h5mu`` files, the result of
        ``sc.read_h5ad`` for any other file, or ``None`` when the download
        failed or produced no file.
    """
    dataset_id, task = get_dataset_id(benchmarks_id)
    if task is not None:
        print(f"Downloading dataset for {task} Benchmarks.")
    else:
        print("Downloading dataset.")

    downloader = FileDownloader()
    adata_path = downloader.download(
        server_endpoint,
        {"dataset_id": dataset_id},
        data_folder=data_folder,
    )

    # ``download`` returns None on failure; os.path.isfile(None) would raise.
    if adata_path is None or not os.path.isfile(adata_path):
        return None

    if str(adata_path).endswith(".h5mu"):
        # The module is imported as ``mu``; the bare name ``muon`` is unbound.
        return mu.read_h5mu(adata_path)
    return sc.read_h5ad(adata_path)
129
+
130
+
131
def split_data(adata):
    """Split *adata* into train and test subsets.

    Rows are selected by substring match on ``adata.obs.split_idx``: rows whose
    value contains ``'train'`` form the first result and rows whose value
    contains ``'test'`` form the second. Both results are copies.
    """
    is_train = adata.obs.split_idx.str.contains('train')
    train_adata = adata[is_train, :].copy()

    is_test = adata.obs.split_idx.str.contains('test')
    test_adata = adata[is_test, :].copy()

    return train_adata, test_adata
@@ -0,0 +1,10 @@
1
+ from __future__ import absolute_import
2
+
3
+ from .annotation import *
4
+ from .ccc import *
5
+ from .clustering import *
6
+ from .imputation import *
7
+ from .integration import *
8
+ from .multimodal import *
9
+ from .trajectory import *
10
+ from .annotation import *
@@ -0,0 +1,39 @@
1
+ import numpy as np
2
+ import sklearn.preprocessing
3
+ from sklearn.metrics import f1_score, accuracy_score
4
+
5
+
6
def annotation_metrics(labels, labels_pred):
    """Score predicted labels against reference labels.

    Both inputs are cast to categorical; missing values are replaced by the
    extra category ``'Unkown'``. The labels are then integer-encoded with one
    shared ``LabelEncoder`` fitted on the categories of both inputs.

    Returns
    -------
    tuple of float
        ``(accuracy, f1_macro, f1_micro, f1_weighted)``, each rounded to four
        decimals.
    """
    print("Encode labels", flush=True)
    truth = labels.astype('category')
    predicted = labels_pred.astype('category')
    if truth.isna().any():
        truth = truth.cat.add_categories(['Unkown'])
        truth = truth.fillna('Unkown')
    if predicted.isna().any():
        predicted = predicted.cat.add_categories(['Unkown'])
        predicted = predicted.fillna('Unkown')

    # One encoder for both sides so equal labels map to equal codes.
    vocabulary = [*truth.dtype.categories, *predicted.dtype.categories]
    encoder = sklearn.preprocessing.LabelEncoder().fit(vocabulary)
    y_true = encoder.transform(truth)
    y_pred = encoder.transform(predicted)

    print("Compute prediction accuracy", flush=True)
    accuracy = float('{:.4f}'.format(accuracy_score(y_true, y_pred)))

    print("Compute F1 score", flush=True)
    f1_macro, f1_micro, f1_weighted = (
        float('{:.4f}'.format(f1_score(y_true, y_pred, average=mode)))
        for mode in ('macro', 'micro', 'weighted')
    )

    return accuracy, f1_macro, f1_micro, f1_weighted
oscb/evaluation/ccc.py ADDED
@@ -0,0 +1,74 @@
1
+ from typing import Union
2
+
3
+ import anndata
4
+ import collections
5
+ import numpy as np
6
+ from sklearn.metrics import auc
7
+ from sklearn.metrics import precision_recall_curve
8
+
9
+ # Cell Cell Communication
10
def ccc_metrics(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="score", top_prop=0.05):
    """Compute precision-recall AUC and a transformed odds ratio.

    Predictions in ``adata.uns[ccc_pred]`` are joined onto the targets in
    ``adata.uns[ccc_target]`` by ``join_truth_and_pred``. The odds ratio
    compares the ``top_prop`` highest-scoring rows (relative to the number of
    target rows) against the remaining rows and is passed through
    ``_sigmoid_transform``.

    Returns
    -------
    tuple of float
        ``(auc_score, oddsratio_score)``, each rounded to four decimals.
    """
    # Precision-recall AUC
    joined = join_truth_and_pred(adata, ccc_pred, ccc_target, score)
    precision, recall, _ = precision_recall_curve(
        joined["response"], joined[score], pos_label=1
    )
    auc_score = auc(recall, precision)

    # Odds ratio: flag the highest-ranked interactions with 1, the rest with 0.
    ranked = joined.sort_values(score, ascending=False)
    n_top = int(adata.uns[ccc_target].shape[0] * top_prop)
    top_flag = np.zeros(len(ranked[score]))
    top_flag[0:n_top] = 1
    ranked.loc[:, ["top_n"]] = top_flag

    in_top = ranked[ranked["top_n"] == 1]
    in_rest = ranked[ranked["top_n"] == 0]
    tp = np.sum(in_top.response == 1)
    fp = np.sum(in_top.response == 0)
    fn = np.sum(in_rest.response == 1)
    tn = np.sum(in_rest.response == 0)

    numerator = tp * tn
    denominator = fp * fn
    if denominator != 0:
        oddsratio_score = numerator / denominator
    elif numerator == 0:
        # 0 / 0: undefined
        oddsratio_score = np.nan
    else:
        # no false positives or no false negatives: perfect score
        oddsratio_score = np.inf
    oddsratio_score = _sigmoid_transform(oddsratio_score)

    return float('{:.4f}'.format(auc_score)), float('{:.4f}'.format(oddsratio_score))
50
+
51
+
52
+ # Join predictions to target
53
def join_truth_and_pred(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="lrscore"):
    """Left-join the predictions onto the target table.

    The join uses the columns listed in ``adata.uns["merge_keys"]``. Missing
    ``response`` values become 0. Missing *score* values become the smallest
    observed score minus machine epsilon, so they rank below every real score.
    """
    keys = list(adata.uns["merge_keys"])
    truth = adata.uns[ccc_target]
    predictions = adata.uns[ccc_pred]
    joined = truth.merge(predictions, on=keys, how="left")

    missing_response = joined["response"].isna()
    joined.loc[missing_response, "response"] = 0

    floor = np.nanmin(joined[score]) - np.finfo(float).eps
    missing_score = joined[score].isna()
    joined.loc[missing_score, score] = floor

    return joined
61
+
62
+
63
+ def _sigmoid_transform(x):
64
+ return 1 - 1 / (1 + x / 2)
65
+
66
+
67
def aggregate_method_scores(adata, how, ccc_pred="LIANA", score="score"):
    """Aggregate the *score* column of ``adata.uns[ccc_pred]`` per merge key.

    Rows are grouped by the columns in ``adata.uns["merge_keys"]`` and reduced
    with the aggregation *how*. The result has the key columns plus a single
    column named ``score``.
    """
    keys = list(adata.uns["merge_keys"])
    grouped = adata.uns[ccc_pred].groupby(keys)
    summary = grouped.agg(score=(score, how))
    return summary.reset_index()
@@ -0,0 +1,20 @@
1
+ from sklearn.metrics import adjusted_rand_score as ARI
2
+ from sklearn.metrics import normalized_mutual_info_score as NMI
3
+ from sklearn.metrics import silhouette_score
4
+ from sklearn.metrics import fowlkes_mallows_score as FM
5
+
6
def clustering_metrics(labels, labels_pred, embedding):
    """Compute four clustering scores, print them and return them.

    The silhouette score is computed from *embedding* and the reference
    *labels*; NMI, ARI and Fowlkes-Mallows compare *labels* with
    *labels_pred*.

    Returns
    -------
    tuple of float
        ``(asw_score, nmi_score, ari_score, fm_score)``, each rounded to four
        decimals.
    """
    raw_scores = (
        silhouette_score(embedding, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
        FM(labels, labels_pred),
    )
    asw_score, nmi_score, ari_score, fm_score = (
        float('{:.4f}'.format(value)) for value in raw_scores
    )

    print(
        "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nFowlkes Mallows: %.4f"
        % (asw_score, nmi_score, ari_score, fm_score)
    )
    return asw_score, nmi_score, ari_score, fm_score
@@ -0,0 +1,161 @@
1
+ from scipy import sparse
2
+ import importlib
3
+ import numbers
4
+ import anndata
5
+ import scanpy as sc
6
+ import sklearn.metrics
7
+ import numpy as np
8
+ import pandas as pd
9
+ import re
10
+ import warnings
11
+
12
+
13
+ # test_data = adata.obsm["test"]
14
+ # denoised_data = adata.obsm["denoised"]
15
+ # train_data = adata.obsm["train"]
16
def imputation_metrics(adata, denoised_layer, train='train', test='test'):
    """Compare a denoised layer with held-out counts.

    Parameters
    ----------
    adata : AnnData
        Reads ``adata.obsm[train]``, ``adata.obsm[test]`` and
        ``adata.layers[denoised_layer]``.
    denoised_layer : str
        Name of the layer holding the denoised matrix.
    train, test : str
        Keys of the training and held-out matrices in ``adata.obsm``.

    Returns
    -------
    tuple of float
        ``(mse, poisson)``, each rounded to four decimals.
    """
    # Mean-squared error on total-count normalised, log1p-transformed data.
    obs, var = adata.obs, adata.var
    held_out = anndata.AnnData(X=adata.obsm[test], obs=obs, var=var)
    denoised = anndata.AnnData(X=adata.layers[denoised_layer], obs=obs, var=var)

    counts_per_cell = 10000
    for view in (held_out, denoised):
        sc.pp.normalize_total(view, counts_per_cell)
        sc.pp.log1p(view)

    mse = sklearn.metrics.mean_squared_error(
        toarray(held_out.X), toarray(denoised.X)
    )

    # Poisson loss on the matrices as read from ``adata`` at this point.
    test_data = adata.obsm[test]
    denoised_data = adata.layers[denoised_layer]

    # Rescale the denoised matrix so its total matches the held-out total.
    train_total = adata.obsm[train].sum()
    test_total = test_data.sum()
    denoised_data = denoised_data * test_total / train_total

    # NOTE(review): the held-out matrix is passed in the ``y_pred`` slot and
    # the denoised matrix in the ``y_true`` slot - confirm this is intended.
    poisson = poisson_nll_loss(toarray(test_data), toarray(denoised_data))

    return float('{:.4f}'.format(mse)), float('{:.4f}'.format(poisson))
48
+
49
+
50
def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """Return the mean of ``y_pred - y_true * log(y_pred + 1e-6)``."""
    log_rate = np.log(y_pred + 1e-6)
    pointwise = y_pred - y_true * log_rate
    return pointwise.mean()
52
+
53
+
54
def _check_numpy_dtype(x):
    """Pick the ``dtype`` argument for ``np.array(x)`` on a list *x*.

    Returns ``None`` (let NumPy infer) when every element has the same length
    or no element has a length at all, and ``object`` when the elements are
    ragged or mix sized and unsized values.
    """
    try:
        lengths = {len(xi) for xi in x}
    except TypeError:
        # At least one element has no usable len().
        if any(hasattr(xi, "__len__") for xi in x):
            return object
        return None
    return None if len(lengths) <= 1 else object


def toarray(x):
    """Convert an array-like to a np.ndarray.

    Parameters
    ----------
    x : array-like
        pandas object, SciPy sparse container, ``np.matrix``, list,
        ``np.ndarray`` or number.

    Returns
    -------
    x : np.ndarray
        Dense array. ``np.ndarray`` and plain numbers are returned unchanged.

    Raises
    ------
    TypeError
        If *x* is none of the supported types.
    """
    if is_SparseDataFrame(x):
        x = x.to_coo().toarray()
    elif is_SparseSeries(x):
        x = x.to_dense().to_numpy()
    elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)):
        x = x.to_numpy()
    elif sparse.issparse(x):
        # issparse also recognises the newer SciPy sparse array classes.
        x = x.toarray()
    elif isinstance(x, np.matrix):
        x = np.asarray(x)
    elif isinstance(x, list):
        x_out = []
        for xi in x:
            try:
                xi = toarray(xi)
            except TypeError:
                # element is not array-like; keep it as it is
                pass
            x_out.append(xi)
        # convert x_out from list to array
        x = np.array(x_out, dtype=_check_numpy_dtype(x_out))
    elif isinstance(x, (np.ndarray, numbers.Number)):
        pass
    else:
        raise TypeError("Expected array-like. Got {}".format(type(x)))
    return x
91
+
92
+
93
def is_SparseSeries(X):
    """Return True if *X* is an instance of the legacy ``pd.SparseSeries``.

    Returns False when the installed pandas has no ``SparseSeries`` attribute.
    The matching pandas ``FutureWarning`` is silenced during the check.
    """
    legacy_message = (
        "The SparseSeries class is removed from pandas. Accessing it from the "
        "top-level namespace will also be removed in the next version"
    )
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", legacy_message, FutureWarning)
        try:
            result = isinstance(X, pd.SparseSeries)
        except AttributeError:
            result = False
    return result
105
+
106
+
107
def is_SparseDataFrame(X):
    """Return True if *X* is an instance of the legacy ``pd.SparseDataFrame``.

    Returns False when the installed pandas has no ``SparseDataFrame``
    attribute. The matching pandas ``FutureWarning`` is silenced during the
    check.
    """
    legacy_message = (
        "The SparseDataFrame class is removed from pandas. Accessing it from the "
        "top-level namespace will also be removed in the next version"
    )
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", legacy_message, FutureWarning)
        try:
            result = isinstance(X, pd.SparseDataFrame)
        except AttributeError:
            result = False
    return result
119
+
120
+
121
def is_sparse_dataframe(x):
    """Return True if *x* is a ``pd.DataFrame`` exposing the ``.sparse`` accessor.

    Legacy ``SparseDataFrame`` instances are reported as False.
    """
    if not isinstance(x, pd.DataFrame) or is_SparseDataFrame(x):
        return False
    try:
        # The accessor raises AttributeError when it does not apply to x.
        x.sparse
    except AttributeError:
        return False
    return True
129
+
130
+
131
def is_sparse_series(x):
    """Return True if *x* is a ``pd.Series`` exposing the ``.sparse`` accessor.

    Legacy ``SparseSeries`` instances are reported as False.
    """
    if not isinstance(x, pd.Series) or is_SparseSeries(x):
        return False
    try:
        # The accessor raises AttributeError when it does not apply to x.
        x.sparse
    except AttributeError:
        return False
    return True
139
+
140
+
141
def dataframe_to_sparse(x, fill_value=0.0):
    """Return a sparse-backed copy of DataFrame *x*.

    The values are routed through a SciPy COO matrix; the index and columns of
    *x* are kept. *fill_value* is assigned to the ``fill_value`` attribute of
    the result's ``.sparse`` accessor.
    """
    coo = sparse.coo_matrix(x.values)
    sparse_frame = pd.DataFrame.sparse.from_spmatrix(
        coo, index=x.index, columns=x.columns
    )
    sparse_frame.sparse.fill_value = fill_value
    return sparse_frame
147
+
148
+
149
def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0):
    """Build a sparse-backed ``pd.DataFrame`` from *X*.

    SciPy sparse input is converted directly. Any other input is wrapped in a
    ``pd.DataFrame`` when it is not one already and then converted with
    ``dataframe_to_sparse``. *columns* and *index*, when given, replace the
    labels of the result.
    """
    if sparse.issparse(X):
        frame = pd.DataFrame.sparse.from_spmatrix(X)
        frame.sparse.fill_value = default_fill_value
    else:
        frame = X
        if is_SparseDataFrame(frame) or not isinstance(frame, pd.DataFrame):
            frame = pd.DataFrame(frame)
        frame = dataframe_to_sparse(frame, fill_value=default_fill_value)

    if columns is not None:
        frame.columns = columns
    if index is not None:
        frame.index = index
    return frame
@@ -0,0 +1,60 @@
1
+ from scib.metrics import metrics
2
+
3
+ # https://github.com/theislab/scib/blob/main/scib/metrics/metrics.py
4
+ # https://scib.readthedocs.io/en/latest/api/scib.metrics.metrics_all.html
5
def integration_metrics(adata, adata_int, batch_key='batch', label_key='cell_type', species="mouse"):
    """Run the scib metric suite on an integration result.

    See https://scib.readthedocs.io/en/latest/api/scib.metrics.metrics_all.html
    for the meaning of the individual metrics.

    :param adata: unintegrated, preprocessed anndata object
    :param adata_int: integrated anndata object
    :param batch_key: name of the batch column in ``obs`` of both objects
    :param label_key: name of the cell-type column in ``obs`` of both objects
    :param species: forwarded to scib as ``organism``
    :return: dict mapping metric name to a value rounded to four decimals.
        Missing metrics are reported as 0. The extra key
        ``'Biological Conservation'`` holds the mean of the seven
        biological-conservation metrics listed in the body.
    """
    metrics_all = metrics(
        adata,
        adata_int,
        batch_key=batch_key,
        label_key=label_key,
        cluster_nmi=None,
        ari_=True,
        nmi_=True,
        nmi_method='arithmetic',
        nmi_dir=None,
        silhouette_=True,
        si_metric='euclidean',
        pcr_=True,
        cell_cycle_=True,
        organism=species,
        hvg_score_=True,
        isolated_labels_=True,
        isolated_labels_f1_=True,
        isolated_labels_asw_=True,
        n_isolated=True,
        graph_conn_=True,
        trajectory_=False,
        kBET_=True,
    )
    biological_conservation_metrics = [
        'NMI_cluster/label',
        'ARI_cluster/label',
        'ASW_label',
        'cell_cycle_conservation',
        'isolated_label_F1',
        'isolated_label_silhouette',
        'hvg_overlap',
    ]

    # presumably scib returns a one-column frame labelled 0 - verify
    raw_values = metrics_all.fillna(0).to_dict()[0]
    metrics_dict = {
        name: float('{:.4f}'.format(value)) for name, value in raw_values.items()
    }

    bc_total = sum(metrics_dict[name] for name in biological_conservation_metrics)
    bc_mean = bc_total / len(biological_conservation_metrics)
    metrics_dict['Biological Conservation'] = float('{:.4f}'.format(bc_mean))

    return metrics_dict
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ import subprocess
5
+ import scanpy as sc
6
+ import scipy.io
7
+ import scib
8
+ import muon as mu
9
+ from muon import MuData
10
+
11
+
12
def multimodal_metrics(mdata, embed, mod1='rna', batch='group', label_key='cell_type'):
    """Score a joint embedding of multimodal data with scib.

    Parameters
    ----------
    mdata : MuData-like
        Reads ``mdata.obsm[embed]``, ``mdata.obs`` and
        ``mdata.obsp["connectivities"]``.
    embed : str
        Key of the embedding in ``mdata.obsm``.
    mod1 : str
        Modality prefix; batch and label columns are read from
        ``obs[f"{mod1}:{batch}"]`` and ``obs[f"{mod1}:{label_key}"]``.
    batch, label_key : str
        Column names without the modality prefix.

    Returns
    -------
    dict
        Metric name to value rounded to four decimals (missing metrics are 0),
        plus ``'Biological Conservation'``, the mean of the seven metrics
        listed in the body.
    """
    batch_col = f"{mod1}:{batch}"
    label_col = f"{mod1}:{label_key}"

    embedded = sc.AnnData(mdata.obsm[embed]).copy()
    embedded.obs = mdata.obs.copy()
    embedded.obsp["connectivities"] = mdata.obsp["connectivities"].copy()
    embedded.obsm[embed] = mdata.obsm[embed].copy()
    embedded = embedded[~embedded.obs[batch_col].isna()]  # Remove NaN in batch
    embedded = embedded[~embedded.obs[label_col].isna()]  # Remove NaN in cell type label
    embedded.obs[batch_col] = embedded.obs[batch_col].astype("category")
    embedded.obs[label_col] = embedded.obs[label_col].astype("category")

    scores = scib.metrics.metrics(
        embedded,
        embedded,
        batch_key=batch_col,
        label_key=label_col,
        embed=embed,
        ari_=True,
        nmi_=True,
        silhouette_=True,
        graph_conn_=True,
        isolated_labels_asw_=True,
    )

    biological_conservation_metrics = [
        'NMI_cluster/label',
        'ARI_cluster/label',
        'ASW_label',
        'cell_cycle_conservation',
        'isolated_label_F1',
        'isolated_label_silhouette',
        'hvg_overlap',
    ]
    # Metrics that were not requested above come back missing and count as 0.
    raw_values = scores.fillna(0).to_dict()[0]
    rounded = {
        name: float('{:.4f}'.format(value)) for name, value in raw_values.items()
    }

    bc_total = sum(rounded[name] for name in biological_conservation_metrics)
    bc_mean = bc_total / len(biological_conservation_metrics)
    rounded['Biological Conservation'] = float('{:.4f}'.format(bc_mean))

    return rounded
@@ -0,0 +1,196 @@
1
+ import random
2
+ import numpy as np
3
+ import networkx as nx
4
+ import zss
5
+ from math import inf
6
+ from grakel import GraphKernel, Graph
7
+
8
+
9
def trajectory_metrics(traj, bm_traj, root_node):
    """Compare a predicted trajectory with a benchmark trajectory.

    Parameters
    ----------
    traj, bm_traj : DataFrame
        Edge tables with ``from`` and ``to`` columns (see ``traj_to_dict``).
    root_node : hashable
        Label of the node both trees are rooted at.

    Returns
    -------
    tuple of float
        ``(ged_score, gks_score, jsc_score, ted_score, mean)``, each rounded
        to four decimals; ``mean`` is the average of the four scores.
    """
    pred_parent = traj_to_dict(traj)
    ref_parent = traj_to_dict(bm_traj)

    # (parent, child) edges, ordered by parent and then by child.
    pred_edges = sorted(
        [(parent, child) for child, parent in pred_parent.items()],
        key=lambda edge: (edge[0], edge[1]),
    )
    ref_edges = sorted(
        [(parent, child) for child, parent in ref_parent.items()],
        key=lambda edge: (edge[0], edge[1]),
    )

    # Graph edit distance, scaled by 4 * (number of benchmark edges) + 2.
    ref_graph = nx.DiGraph()
    ref_graph.add_edges_from(ref_edges)
    pred_graph = nx.DiGraph()
    pred_graph.add_edges_from(pred_edges)
    distance = graph_edit_distance(ref_graph, pred_graph)
    max_distance = 4 * len(ref_parent) + 2
    ged_score = (max_distance - distance) / max_distance

    # Jaccard similarity over child -> parent assignments.
    shared = sum(
        1
        for child, parent in ref_parent.items()
        if child in pred_parent and pred_parent[child] == parent
    )
    union_size = len(ref_parent) + len(pred_parent) - shared
    # NOTE(review): union_size is 0 when both trajectories are empty.
    jsc_score = shared / union_size

    # Graph kernel score. The call reorders both edge lists in place; the
    # trees below are built from the reordered lists.
    gks_score = graph_kernel_score(ref_edges, pred_edges, root_node=root_node)

    # Tree edit distance between the rooted trees.
    bm_tree = build_tree_from_edges(ref_edges, root_node)
    tree = build_tree_from_edges(pred_edges, root_node)
    tree_distance = zss.distance(
        tree, bm_tree,
        get_children=lambda node: node.children,
        insert_cost=insert_cost,
        remove_cost=remove_cost,
        update_cost=update_cost
    )
    # NOTE(review): this is 0 for a benchmark with a single edge.
    tree_max_distance = 2 * len(ref_parent) - 2
    ted_score = (tree_max_distance - tree_distance) / tree_max_distance

    mean = (ged_score + gks_score + jsc_score + ted_score) / 4

    return tuple(
        float('{:.4f}'.format(value))
        for value in (ged_score, gks_score, jsc_score, ted_score, mean)
    )
77
+
78
+
79
def traj_to_dict(df):
    """Return a ``{to: from}`` mapping built from the rows of *df*.

    When a ``to`` value occurs more than once, the last row wins.
    """
    return {row['to']: row['from'] for _, row in df.iterrows()}
85
+
86
+
87
+
88
class TreeNode:
    """Node of a rooted tree: a label plus an ordered list of child nodes."""

    def __init__(self, label):
        # Children are appended by the code that builds the tree.
        self.children = []
        self.label = label
92
+
93
+
94
def build_tree_from_edges(edges, root_node):
    """Build ``TreeNode`` objects from ``(parent, child)`` pairs.

    Children are attached in the order the edges are given. Returns the node
    labelled *root_node*; raises ``KeyError`` when no edge mentions it.
    """
    registry = {}
    for parent, child in edges:
        for name in (parent, child):
            if name not in registry:
                registry[name] = TreeNode(name)
        registry[parent].children.append(registry[child])
    return registry[root_node]
103
+
104
+
105
def insert_cost(node):
    """Return the cost of inserting *node*: always 1."""
    unit_cost = 1
    return unit_cost
107
+
108
+
109
def remove_cost(node):
    """Return the cost of removing *node*: always 1."""
    unit_cost = 1
    return unit_cost
111
+
112
+
113
def update_cost(node1, node2):
    """Return 0 when both nodes carry the same label, otherwise infinity."""
    if node1.label == node2.label:
        return 0
    return inf
115
+
116
+
117
def graph_edit_distance(G1, G2):
    """Count the nodes and edges that occur in exactly one of the two graphs.

    Every node or edge present in only one graph costs 1 (a deletion from
    *G1* or an insertion from *G2*); matched items cost nothing.
    """
    node_cost = len(set(G1.nodes) ^ set(G2.nodes))
    edge_cost = len(set(G1.edges) ^ set(G2.edges))
    return node_cost + edge_cost
147
+
148
+
149
def graph_kernel_score(edges1, edges2, root_node):
    """Return the normalised shortest-path kernel value of two edge lists.

    Both lists are first passed through ``find_root_node``, which reorders
    them in place. Each list is then loaded into an undirected NetworkX graph
    and converted with ``nx_to_grakel``.
    """
    grakel_graphs = []
    for edge_list in (edges1, edges2):
        rooted = find_root_node(edge_list, root_node)
        undirected = nx.Graph()
        undirected.add_edges_from(rooted)
        grakel_graphs.append(nx_to_grakel(undirected))

    # Shortest-path kernel, normalised by GraKeL.
    kernel = GraphKernel(kernel={"name": "shortest_path"}, normalize=True)
    kernel_matrix = kernel.fit_transform(grakel_graphs)

    # The off-diagonal entry is the similarity between the two graphs.
    return kernel_matrix[0, 1]
172
+
173
+ # Convert NetworkX graphs to Grakel format
174
def nx_to_grakel(G):
    """Convert graph *G* into an ``(edges, node_labels)`` pair.

    Nodes are numbered in iteration order. The first item lists the edges as
    pairs of those numbers; the second maps each number back to the original
    node.
    """
    index_of = {}
    for position, node in enumerate(list(G.nodes)):
        index_of[node] = position

    edge_pairs = []
    for edge in list(G.edges):
        edge_pairs.append((index_of[edge[0]], index_of[edge[1]]))

    labels = {position: node for node, position in index_of.items()}
    return (edge_pairs, labels)
180
+
181
+
182
def find_root_node(edges1, root_node):
    """Move an edge that starts at *root_node* to the front of *edges1*.

    The list is modified in place and also returned. When the first edge
    already starts at *root_node*, nothing changes. Otherwise every later edge
    starting at *root_node* is swapped with the current first element in turn,
    so the last such edge ends up in front.

    NOTE(review): the scan does not stop at the first match. The resulting
    order is consumed by callers, so it is kept as it is.
    """
    if edges1[0][0] != root_node:
        for position in range(1, len(edges1)):
            if edges1[position][0] == root_node:
                edges1[0], edges1[position] = edges1[position], edges1[0]

    return edges1
oscb/evaluator.py ADDED
@@ -0,0 +1,219 @@
1
+ from .evaluation.annotation import *
2
+ from .evaluation.ccc import *
3
+ from .evaluation.clustering import *
4
+ from .evaluation.imputation import *
5
+ from .evaluation.integration import *
6
+ from .evaluation.multimodal import *
7
+ from .evaluation.trajectory import *
8
+ from .evaluation.annotation import *
9
+ from datetime import datetime
10
+ from .utils import *
11
+ import requests
12
+ import json
13
+
14
+
15
+ def eval(adata, adata_int=None, benchmarks_id=None, task=None, cluster_key=None, label_key=None, label_pred_key=None, embedding_key=None, ccc_pred="ccc_pred", ccc_target="ccc_target", score="score", denoised_layer=None, train='train', test='test', mod1_key='rna', mod2_key='atac', traj_key=None, bm_traj_key=None, root_node=None, species=None, server_endpoint=server_endpoint+'benchmarks/', method="Your method"):
16
+ if adata is None:
17
+ raise ValueError("adata is required.")
18
+
19
+ benchmarks = None
20
+ current_date_and_time = datetime.now()
21
+ benchmarks_data = None
22
+
23
+ if benchmarks_id is not None:
24
+ dataset_id, task = get_dataset_id(benchmarks_id)
25
+ url = server_endpoint + benchmarks_id
26
+ response = requests.get(url)
27
+ if response.status_code == 200:
28
+ try:
29
+ benchmarks = response.json()
30
+ benchmarks_data = benchmarks['benchmarks_plot']['data']
31
+ match task:
32
+ case "Clustering" | "CL":
33
+ label_key = benchmarks['label']
34
+
35
+ case "Imputation" | "IM":
36
+ species = benchmarks['species']
37
+
38
+ case "Batch Integration" | "BI":
39
+ label_key = benchmarks['label']
40
+ batch_key = benchmarks['batch_key']
41
+ species = benchmarks['species']
42
+
43
+ case "Trajectory" | "TJ":
44
+ label_key = benchmarks['label']
45
+ root_node = benchmarks['origin_group']
46
+ bm_traj_key = benchmarks['bm_traj']
47
+
48
+ case "Cell-Cell Communication" | "CCC":
49
+ label_key = benchmarks['label']
50
+ ccc_target = benchmarks['ccc_target']
51
+ species = benchmarks['species']
52
+
53
+ case "Multimodal Data Integration" | "MI":
54
+ mod1_key = benchmarks['mod1']
55
+ mod2_key = benchmarks['mod2']
56
+ label_key = benchmarks['label']
57
+ batch_key = benchmarks['batch_key']
58
+
59
+ case "Cell Type Annotation" | "CT":
60
+ label_key = benchmarks['label']
61
+ # species = benchmarks['species']
62
+
63
+ except Exception as e:
64
+ print(f"Failed to get Benchmarks: {str(e)}")
65
+ else:
66
+ print(f"Failed to get Benchmarks: {benchmarks_id}.")
67
+
68
+ if task is not None:
69
+ task_info = {
70
+ "benchmarksId": benchmarks_id,
71
+ "datasetId": dataset_id,
72
+ "task_type": task,
73
+ "tool": method,
74
+ "created_on": current_date_and_time
75
+ }
76
+ match task:
77
+ case "Clustering" | "CL":
78
+ if cluster_key is not None and label_key is not None and embedding_key is not None:
79
+ asw_score, nmi_score, ari_score, fm_score = clustering_metrics(adata.obs[label_key], adata.obs[cluster_key], adata.obsm[embedding_key])
80
+ results = {
81
+ "benchmarksId": benchmarks_id,
82
+ "datasetId": dataset_id,
83
+ "task_type": task,
84
+ "tool": method,
85
+ "Silhouette": asw_score,
86
+ "NMI": nmi_score,
87
+ "ARI": ari_score,
88
+ "Fowlkes Mallows": fm_score,
89
+ "created_on": current_date_and_time
90
+ }
91
+ if benchmarks_data is not None:
92
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
93
+ plot_bars(task, labels, y_labels, data)
94
+
95
+ return results
96
+ else:
97
+ raise ValueError(f"cluster_key, label_key and embedding_key are required for {task}.")
98
+
99
+ case "Imputation" | "IM":
100
+ if denoised_layer is not None:
101
+ mse, possion = imputation_metrics(adata, denoised_layer=denoised_layer)
102
+ results = {
103
+ "benchmarksId": benchmarks_id,
104
+ "datasetId": dataset_id,
105
+ "task_type": task,
106
+ "tool": method,
107
+ "MSE": mse,
108
+ "Possion": possion,
109
+ "created_on": current_date_and_time
110
+ }
111
+ if benchmarks_data is not None:
112
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
113
+ plot_bars(task, labels, y_labels, data)
114
+ return results
115
+ else:
116
+ raise ValueError(f"denoised_layer is required for {task}.")
117
+
118
+ case "Batch Integration" | "BI":
119
+ if adata_int is not None and label_key is not None and batch_key is not None:
120
+ metrics_dict = integration_metrics(adata, adata_int, batch_key=batch_key, label_key=label_key, species=species)
121
+ results = {**task_info, **metrics_dict}
122
+ if benchmarks_data is not None:
123
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
124
+ plot_bars(task, labels, y_labels, data)
125
+ return results
126
+ else:
127
+ raise ValueError(f"adata_int, label_key and batch_key are required for {task}.")
128
+
129
+ case "Trajectory" | "TJ":
130
+ if traj_key is not None and bm_traj_key is not None and root_node is not None:
131
+ ged_score, gks_score, jsc_score, ted_score, mean = trajectory_metrics(adata.uns[traj_key], adata.uns[bm_traj_key], adata.uns[root_node])
132
+ results = {
133
+ "benchmarksId": benchmarks_id,
134
+ "datasetId": dataset_id,
135
+ "task_type": task,
136
+ "tool": method,
137
+ "Graph Edit Distance": ged_score,
138
+ "Graph Kernel Score": gks_score,
139
+ "Jaccard Similarity Coefficient": jsc_score,
140
+ "Tree Edit Distance": ted_score,
141
+ "Mean": mean,
142
+ "created_on": current_date_and_time
143
+ }
144
+ if benchmarks_data is not None:
145
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
146
+ plot_bars(task, labels, y_labels, data)
147
+ return results
148
+ else:
149
+ raise ValueError(f"adata_int, label_key and batch_key are required for {task}.")
150
+
151
+ case "Cell-Cell Communication" | "CCC":
152
+ if ccc_pred is not None and ccc_target is not None and score is not None:
153
+ auc_score, oddsratio_score = ccc_metrics(adata, ccc_pred=ccc_pred, ccc_target=ccc_target, score='score')
154
+ results = {
155
+ "benchmarksId": benchmarks_id,
156
+ "datasetId": dataset_id,
157
+ "task_type": task,
158
+ "tool": method,
159
+ "Precision-recall AUC": auc_score,
160
+ "Odds Ratio": oddsratio_score,
161
+ "created_on": current_date_and_time
162
+ }
163
+ if benchmarks_data is not None:
164
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
165
+ plot_bars(task, labels, y_labels, data)
166
+ return results
167
+ else:
168
+ raise ValueError(f"ccc_pred, ccc_target and score are required for {task}.")
169
+
170
+ case "Multimodal Data Integration" | "MI":
171
+ if embedding_key is not None and mod1_key is not None and batch_key is not None and label_key is not None:
172
+ metrics_dict = multimodal_metrics(mdata, embed=embedding_key, mod1=mod1_key, batch=batch_key, label_key=label_key)
173
+ results = {**task_info, **metrics_dict}
174
+ if benchmarks_data is not None:
175
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
176
+ plot_bars(task, labels, y_labels, data)
177
+ return results
178
+ else:
179
+ raise ValueError(f"embedding_key, mod1_key, label_key and batch_key are required for {task}.")
180
+
181
+ case "Cell Type Annotation" | "CT":
182
+ if label_pred_key is not None and label_key is not None:
183
+ accuracy, f1_macro, f1_micro, f1_weighted = annotation_metrics(adata.obs[label_key], adata.obs[label_pred_key])
184
+ results = {
185
+ "benchmarksId": benchmarks_id,
186
+ "datasetId": dataset_id,
187
+ "task_type": task,
188
+ "tool": method,
189
+ "Accuracy": accuracy,
190
+ "F1_macro": f1_macro,
191
+ "F1_micro": f1_micro,
192
+ "F1_weighted": f1_weighted,
193
+ "created_on": current_date_and_time
194
+ }
195
+ if benchmarks_data is not None:
196
+ labels, y_labels, data = get_bar_plot_data(benchmarks_data, user_results=results)
197
+ plot_bars(task, labels, y_labels, data)
198
+ return results
199
+ else:
200
+ raise ValueError(f"label_pred_key, and label_key are required for {task}.")
201
+
202
+ case _: # Default case, equivalent to 'default' in other languages
203
+ raise ValueError(f"{task} is not supported. Please input the task name from the following list [Clustering, Imputation, Batch Integration, Trajectory, Cell-Cell Communication, Multimodal Data Integration, Cell Type Annotation].")
204
+ else:
205
+ raise ValueError("benchmarks_id or task is required.")
206
+
207
+
208
def write_json(data, file_path="./output.json"):
    """Write *data* to *file_path* as JSON indented by four spaces.

    ``datetime`` values are serialised through ``serialize_datetime``. A
    confirmation line is printed once the file has been written.
    """
    with open(file_path, 'w') as handle:
        json.dump(data, handle, default=serialize_datetime, indent=4)

    print(f"Dictionary successfully written to {file_path}")
214
+
215
+
216
def serialize_datetime(obj):
    """Return the ISO-8601 string of a ``datetime``.

    Raises ``TypeError`` for any other type, which is what ``json.dump``
    expects from a ``default`` hook.
    """
    if not isinstance(obj, datetime):
        raise TypeError("Type not serializable")
    return obj.isoformat()
oscb/utilization.py ADDED
@@ -0,0 +1,133 @@
1
+
2
+ # pip install nvidia-ml-py
3
+ import os
4
+ import time
5
+ import psutil
6
+ # import GPUtil
7
+ from threading import Thread
8
+ from pynvml import *
9
+ from .utils import *
10
+
11
+
12
+
13
class Monitor(Thread):
    """Background thread that samples CPU, RAM and NVIDIA GPU utilization.

    The thread starts itself on construction and records one sample, then
    sleeps ``delay`` seconds, until :meth:`stop` is called. Hosts without a
    usable NVIDIA driver or GPU are supported: their GPU readings are
    recorded as ``0`` instead of crashing the sampling thread.

    Attributes:
        stopped: Flag polled by the sampling loop; ``True`` ends the loop.
        delay: Seconds to sleep between two samples.
        time_points: ``time.time()`` stamp of every sample.
        cpu_usage: ``psutil.cpu_percent()`` reading of every sample.
        mem_usage: ``psutil.virtual_memory().percent`` of every sample.
        gpu_usage: Utilization of GPU 0 per sample (``0`` without a GPU).
        gpu_mem_usage: Memory utilization in percent of GPU 0 per sample
            (``0`` without a GPU).
    """

    def __init__(self, delay=1):
        """Initialise the sample buffers and start sampling immediately.

        Args:
            delay: Seconds to sleep between two samples.
        """
        super(Monitor, self).__init__()
        self.stopped = False
        self.delay = delay  # Seconds to sleep between two samples
        self.time_points = []
        self.cpu_usage = []
        self.mem_usage = []
        self.gpu_usage = []
        self.gpu_mem_usage = []
        self.start()

    def run(self):
        """Sampling loop executed in the background thread."""
        while not self.stopped:
            self.time_points.append(time.time())
            self.cpu_usage.append(psutil.cpu_percent())
            self.mem_usage.append(psutil.virtual_memory().percent)
            # One NVML query per sample; yields zeros when no GPU is present
            # so every series keeps the same length as ``time_points``.
            gpu_util, gpu_mem_util = self._sample_first_gpu()
            self.gpu_usage.append(gpu_util)
            self.gpu_mem_usage.append(gpu_mem_util)
            time.sleep(self.delay)

    def _sample_first_gpu(self):
        """Return ``(gpu_utilization, memory_utilization)`` of GPU 0.

        Returns ``(0, 0)`` when :meth:`get_nvidia_info` reports no GPU.
        """
        gpus = self.get_nvidia_info()['gpus']
        if not gpus:
            return 0, 0
        first = gpus[0]
        return first['gpu_utilization'], first['memory_utilization']

    def stop(self):
        """Stop sampling, plot the collected series and return them.

        Returns:
            dict with the keys ``"sys_info"``, ``"CPU"``, ``"Memory"``,
            ``"GPU"``, ``"GPU Memory"`` and ``"time_points"``.
        """
        self.stopped = True
        # Wait for the sampler to finish its current iteration so that the
        # series are complete and no longer modified while they are plotted.
        try:
            self.join()
        except RuntimeError:
            # Raised when joining the current thread or a thread that was
            # never started; there is nothing to wait for in either case.
            pass

        sys_info = self.get_sys_info()
        results = {
            "sys_info": sys_info,
            "CPU": self.cpu_usage,
            "Memory": self.mem_usage,
            "GPU": self.gpu_usage,
            "GPU Memory": self.gpu_mem_usage,
            'time_points': self.time_points
        }
        plot_lines(results)

        return results

    def get_sys_info(self) -> dict:
        """Describe the host hardware.

        Returns:
            dict with ``"CPU"`` and ``"RAM"`` descriptions and, only when at
            least one GPU was detected, ``"GPU"`` with one
            ``"<model> @ <total> GB"`` string per GPU.
        """
        sys_info: dict = {}
        cpu, ram = self.get_cpu_mem_info()
        gpus = self.get_nvidia_info()['gpus']
        sys_info['CPU'] = cpu
        sys_info['RAM'] = ram
        if gpus:
            sys_info['GPU'] = [
                f"{gpu['gpu_model']} @ {gpu['total']} GB" for gpu in gpus
            ]

        return sys_info

    def get_cpu_mem_info(self):
        """Return ``(cpu, ram)`` description strings for the host.

        ``cpu`` reads ``"<model> <n>-core @ <freq> GHz"``; the frequency part
        is left out when psutil cannot determine it. ``ram`` reads
        ``"<total> GB"``.
        """
        import platform

        n_cores = psutil.cpu_count(logical=False)
        cpu_model = platform.processor()
        cpu = f"{cpu_model} {n_cores}-core"

        # psutil.cpu_freq() returns None where the frequency is unavailable.
        freq_info = psutil.cpu_freq()
        if freq_info is not None:
            freq = round(freq_info.current / 1000, 2)  # MHz -> GHz
            cpu = f"{cpu} @ {freq} GHz"

        mem_total = round(psutil.virtual_memory().total / 1024 / 1024 / 1024, 2)  # GB
        ram = f"{mem_total} GB"

        return cpu, ram

    def get_nvidia_info(self):
        """Query NVML for the driver version and per-GPU statistics.

        Returns:
            dict with ``"state"`` (``False`` when the query failed),
            ``"nvidia_version"``, ``"nvidia_count"`` and ``"gpus"``, a list
            holding one dict per GPU (model, total/free/used memory in GB,
            GPU and memory utilization, temperature and power state). The
            list is empty when no GPU could be queried.
        """
        nvidia_dict = {
            "state": True,
            "nvidia_version": "",
            "nvidia_count": 0,
            "gpus": []
        }
        try:
            nvmlInit()
            nvidia_dict["nvidia_version"] = nvmlSystemGetDriverVersion()
            nvidia_dict["nvidia_count"] = nvmlDeviceGetCount()
            for i in range(nvidia_dict["nvidia_count"]):
                handle = nvmlDeviceGetHandleByIndex(i)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                utilization = nvmlDeviceGetUtilizationRates(handle)
                gpu = {
                    "gpu_model": nvmlDeviceGetName(handle),
                    "total": round(memory_info.total / 1024 / 1024 / 1024, 2),  # GB
                    "free": round(memory_info.free / 1024 / 1024 / 1024, 2),  # GB
                    "used": round(memory_info.used / 1024 / 1024 / 1024, 2),  # GB
                    "gpu_utilization": utilization.gpu,
                    "memory_utilization": round(memory_info.used * 100 / memory_info.total, 2),
                    # Sensor type 0 is NVML_TEMPERATURE_GPU.
                    "temperature": f"{nvmlDeviceGetTemperature(handle, 0)}℃",
                    "powerStatus": nvmlDeviceGetPowerState(handle)
                }
                nvidia_dict['gpus'].append(gpu)
        except Exception:
            # Best effort: a missing driver, missing GPU or any NVML failure
            # is reported through "state" instead of being raised.
            nvidia_dict["state"] = False
        finally:
            try:
                nvmlShutdown()
            except Exception:
                # Shutdown fails when initialisation never succeeded.
                pass
        return nvidia_dict

    def gpu_mem_percent(self):
        """Return the used/total memory ratio of GPU 0.

        The value is a fraction of the total, not a percentage; ``0.0`` is
        returned when no GPU is available or its total memory is reported
        as zero.
        """
        mem_rate = 0.0
        info = self.get_nvidia_info()
        if info['gpus']:
            used = info['gpus'][0]['used']
            tot = info['gpus'][0]['total']
            if tot:
                mem_rate = used / tot

        return mem_rate
oscb/utils.py ADDED
@@ -0,0 +1,107 @@
1
+ import matplotlib
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+
5
+
6
+ server_endpoint = "http://c4130-110233.wisc.cloudlab.us:5005/api/"
7
+
8
+
9
def get_dataset_id(benchmarks_id):
    """Split a benchmarks id into its dataset id and task name.

    A benchmarks id is a task prefix (for example ``"CL-"``) followed by the
    dataset id.

    Args:
        benchmarks_id: Identifier such as ``"CL-<dataset id>"``.

    Returns:
        Tuple ``(dataset_id, task)``; ``(None, None)`` when the prefix is not
        one of the known task prefixes.
    """
    # Ordered (prefix, task) pairs; the first matching prefix wins.
    known_prefixes = (
        ("CL-", "Clustering"),
        ("IM-", "Imputation"),
        ("BI-", "Batch Integration"),
        ("TJ-", "Trajectory"),
        ("CCC-", "Cell-Cell Communication"),
        ("MI-", "Multimodal Data Integration"),
        ("CT-", "Cell Type Annotation"),
    )
    for prefix, task_name in known_prefixes:
        if benchmarks_id.startswith(prefix):
            return benchmarks_id[len(prefix):], task_name

    return None, None
41
+
42
+
43
def get_bar_plot_data(benchmark_data, user_results=None):
    """Collect the inputs of a grouped bar plot from benchmark entries.

    Args:
        benchmark_data: Sequence of dicts with the keys ``'x'`` (metric
            names), ``'y'`` (scores) and ``'name'`` (series name). The
            metric names are taken from the first entry.
        user_results: Optional dict holding ``'tool'`` and one score per
            metric name; when given it is appended as the last series.

    Returns:
        Tuple ``(labels, y_labels, data)``: the metric names, the series
        names and one list of scores per series.
    """
    labels = benchmark_data[0]['x']

    # Benchmark series, in the order they were supplied.
    data = [entry['y'] for entry in benchmark_data]
    y_labels = [entry['name'] for entry in benchmark_data]

    # Optional user series, aligned to the metric names.
    if user_results is not None:
        y_labels.append(user_results['tool'])
        data.append([user_results[metric] for metric in labels])

    return labels, y_labels, data
62
+
63
+
64
def plot_bars(task, labels, y_labels, data, tick_step=1, group_gap=0.2, bar_gap=0):
    """Draw and show a grouped bar chart of benchmark scores.

    Args:
        task: Task name used in the plot title.
        labels: Metric names, one group of bars per metric.
        y_labels: Legend entry of every series in ``data``.
        data: One sequence of scores per series.
        tick_step: Distance between two neighbouring groups.
        group_gap: Empty space left between two neighbouring groups.
        bar_gap: Empty space left between bars of the same group.
    """
    group_origin = np.arange(len(labels)) * tick_step
    n_series = len(data)
    group_width = tick_step - group_gap
    slot = group_width / n_series  # horizontal room of one bar incl. gap
    bar_width = slot - bar_gap

    series_index = 0
    for scores in data:
        plt.bar(group_origin + series_index*slot, scores, bar_width, label=y_labels[series_index])
        series_index += 1

    plt.ylabel('Scores')
    plt.title(f'Benchmarks for {task}')
    # Place each tick in the middle of its group of bars.
    plt.xticks(group_origin + (group_width - slot) / 2, labels)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()
78
+
79
+
80
def plot_lines(results):
    """Plot and show the utilization series of a monitoring run over time.

    Args:
        results: dict with ``'time_points'`` (one numeric timestamp in
            seconds per sample) and the series ``"CPU"``, ``"Memory"``,
            ``"GPU"`` and ``"GPU Memory"``.

    Series that sum to zero, or whose length differs from the number of time
    points, are not drawn.
    """
    time_points = results['time_points']
    # Seconds elapsed since the first sample, so that the x-axis really is in
    # seconds (as its label says) whatever the sampling interval was.
    x = [t - time_points[0] for t in time_points]

    data = []
    y_labels = []
    for label in ("CPU", "Memory", "GPU", "GPU Memory"):
        if sum(results[label]) != 0:
            data.append(results[label])
            y_labels.append(label)

    for series, name in zip(data, y_labels):
        if len(x) != len(series):
            continue
        if 'GPU' in name:
            # GPU series are dashed to tell them apart from the host series.
            plt.plot(x, series, label=name, marker='o', linestyle='--')
        else:
            plt.plot(x, series, label=name, marker='o')

    # Adding labels, title, and legend for clarity
    plt.xlabel('Time Points (s)')
    plt.ylabel('Utilization (%)')
    plt.title('Computing Assessments')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    # Displaying the plot
    plt.show()
107
+
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.2
2
+ Name: oscb
3
+ Version: 0.1.0
4
+ Summary: OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
5
+ Home-page: https://github.com/cirisjl/Machine-learning-development-environment-for-single-cell-sequencing-data-analyses
6
+ Author: Lei Jiang
7
+ Author-email: leijiang@missouri.edu
8
+ License: MIT
9
+ Keywords: single-cell,benchmarks
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.6, <=3.12
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: leidenalg>=0.8.10
16
+ Requires-Dist: matplotlib>=3.5.1
17
+ Requires-Dist: networkx>=2.6.3
18
+ Requires-Dist: numpy>=1.26.4
19
+ Requires-Dist: pandas>=1.3.5
20
+ Requires-Dist: python_igraph>=0.9.9
21
+ Requires-Dist: python_louvain>=0.16
22
+ Requires-Dist: scanpy
23
+ Requires-Dist: muon
24
+ Requires-Dist: mudata
25
+ Requires-Dist: tqdm
26
+ Requires-Dist: requests
27
+ Requires-Dist: scib
28
+ Requires-Dist: zss
29
+ Requires-Dist: grakel
30
+ Requires-Dist: scikit_learn>=1.0.2
31
+ Requires-Dist: scipy>=1.7.3
32
+ Requires-Dist: umap_learn>=0.5.2
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: home-page
39
+ Dynamic: keywords
40
+ Dynamic: license
41
+ Dynamic: requires-dist
42
+ Dynamic: requires-python
43
+ Dynamic: summary
44
+
45
+ # Overview
46
+
47
+ --------------------------------------------------------------------------------
48
+
49
+
50
+ Machine learning (ML) is transforming single-cell sequencing data analysis; however, technological complexity and the required biological knowledge remain barriers to the ML community's involvement in single-cell data analysis. We present an ML development environment for single-cell sequencing data analyses with a diverse set of AI-ready benchmark datasets. A cloud-based platform is built to dynamically scale workflows for collecting, processing, and managing various single-cell sequencing data to make them ML-ready. In addition, we provide benchmarks for each problem formulation, as well as a code-level and web-interface IDE for single-cell analysis method development.
51
+
52
+
53
+ ![Workflow](https://oscb.missouri.edu/assets/30e6000a-5e6f-440f-bec5-5c7ceb256c55)
54
+
55
+ OSCB aims to provide automated end-to-end ML pipelines for single-cell analyses that simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
56
+
57
+
58
+ **Workflows** are developed for collecting, processing, and managing diverse single-cell sequencing data to make them ML-ready and build benchmarks.
59
+
60
+ **IDE** is provided for supporting partial method development.
61
+
62
+ **Assessment utilities** are provided for evaluating results and report generation.
63
+
64
+ This **end-to-end pipeline** transforms traditional “static” machine learning into **continuous learning** on extensive new data.
65
+
66
+
67
+ By **fusing models with data in depth**, this platform could ultimately be of substantial help to many single-cell sequencing researchers.
68
+
69
+
70
+ ![Tools](https://oscb.missouri.edu/assets/c18ffb2a-814f-452c-921b-e399b99c41b4)
71
+
72
+
73
+ OSCB is an ongoing effort, and we are planning to increase our coverage in the future.
@@ -0,0 +1,17 @@
1
+ oscb/__init__.py,sha256=4-lyUl644DHa1-kCvev0FnHX5YX7WH-xQ8-aMsEgy_8,498
2
+ oscb/data.py,sha256=AxV-umgeh_MdpOsR99V9Y7SXvEL6IqHby1RsNcp4kc0,4676
3
+ oscb/evaluator.py,sha256=KoU4-l0EFsd4fUrL871MWJ9-cPVjmdgpRg5PApHKs3s,11029
4
+ oscb/utilization.py,sha256=yOiUMnYw1PRnywCo_4Eeyl25Er6GJYZrvaSP4evQZ_4,4513
5
+ oscb/utils.py,sha256=Ezm6cWWfldSO3lDNMOBzDFXWVC2pTXNxTzgMxA8k43g,3361
6
+ oscb/evaluation/__init__.py,sha256=yV92jToZZNb4sJE2GsmXm6YCk1VQZv7nbrivrI8b-LQ,241
7
+ oscb/evaluation/annotation.py,sha256=DY1-GasL5PPHLhsC6lIxE5kkVFQqshkt5yxWgxhAk9Q,1394
8
+ oscb/evaluation/ccc.py,sha256=jKo7A1Vat1XIKYoZZlUDUrGBgTfLJmSl5JCjT1hWFNU,2151
9
+ oscb/evaluation/clustering.py,sha256=xpYvumLnQjc147G3zyzqgEQNeUT9g45d65H6WVTTSNI,864
10
+ oscb/evaluation/imputation.py,sha256=5eSWB2zkJLuqqg3mUET1UxdwHP2ytQsVV6H20HO5eyM,4499
11
+ oscb/evaluation/integration.py,sha256=dD1uX5n11qcKJjAb_MRAZq5owszxoWgwfwa8EElKI1U,3106
12
+ oscb/evaluation/multimodal.py,sha256=vFWVZI04ghaBvmyc6KfNpFwlDIuIfVkOYZLHd_geU2s,1851
13
+ oscb/evaluation/trajectory.py,sha256=qyvsgFAzRs4ydQkkYU80OLIZOOhL-4jggid4maajaik,5231
14
+ oscb-0.1.0.dist-info/METADATA,sha256=Yt6_SVTENmQeu2Ab6qHclON53eLtzYATFGNqDmT5l8g,3259
15
+ oscb-0.1.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
16
+ oscb-0.1.0.dist-info/top_level.txt,sha256=-kscy76s5yJOs8EyWQof-Ico6tACMlsgQ7tHMT4sd2Q,5
17
+ oscb-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ oscb