oscb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscb-0.1.0/PKG-INFO +73 -0
- oscb-0.1.0/README.md +29 -0
- oscb-0.1.0/oscb/__init__.py +14 -0
- oscb-0.1.0/oscb/data.py +135 -0
- oscb-0.1.0/oscb/evaluation/__init__.py +10 -0
- oscb-0.1.0/oscb/evaluation/annotation.py +39 -0
- oscb-0.1.0/oscb/evaluation/ccc.py +74 -0
- oscb-0.1.0/oscb/evaluation/clustering.py +20 -0
- oscb-0.1.0/oscb/evaluation/imputation.py +161 -0
- oscb-0.1.0/oscb/evaluation/integration.py +60 -0
- oscb-0.1.0/oscb/evaluation/multimodal.py +49 -0
- oscb-0.1.0/oscb/evaluation/trajectory.py +196 -0
- oscb-0.1.0/oscb/evaluator.py +219 -0
- oscb-0.1.0/oscb/utilization.py +133 -0
- oscb-0.1.0/oscb/utils.py +107 -0
- oscb-0.1.0/oscb.egg-info/PKG-INFO +73 -0
- oscb-0.1.0/oscb.egg-info/SOURCES.txt +20 -0
- oscb-0.1.0/oscb.egg-info/dependency_links.txt +1 -0
- oscb-0.1.0/oscb.egg-info/requires.txt +18 -0
- oscb-0.1.0/oscb.egg-info/top_level.txt +1 -0
- oscb-0.1.0/setup.cfg +4 -0
- oscb-0.1.0/setup.py +28 -0
oscb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: oscb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
|
|
5
|
+
Home-page: https://github.com/cirisjl/Machine-learning-development-environment-for-single-cell-sequencing-data-analyses
|
|
6
|
+
Author: Lei Jiang
|
|
7
|
+
Author-email: leijiang@missouri.edu
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: single-cell,benchmarks
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.6, <=3.12
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: leidenalg>=0.8.10
|
|
16
|
+
Requires-Dist: matplotlib>=3.5.1
|
|
17
|
+
Requires-Dist: networkx>=2.6.3
|
|
18
|
+
Requires-Dist: numpy>=1.26.4
|
|
19
|
+
Requires-Dist: pandas>=1.3.5
|
|
20
|
+
Requires-Dist: python_igraph>=0.9.9
|
|
21
|
+
Requires-Dist: python_louvain>=0.16
|
|
22
|
+
Requires-Dist: scanpy
|
|
23
|
+
Requires-Dist: muon
|
|
24
|
+
Requires-Dist: mudata
|
|
25
|
+
Requires-Dist: tqdm
|
|
26
|
+
Requires-Dist: requests
|
|
27
|
+
Requires-Dist: scib
|
|
28
|
+
Requires-Dist: zss
|
|
29
|
+
Requires-Dist: grakel
|
|
30
|
+
Requires-Dist: scikit_learn>=1.0.2
|
|
31
|
+
Requires-Dist: scipy>=1.7.3
|
|
32
|
+
Requires-Dist: umap_learn>=0.5.2
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: home-page
|
|
39
|
+
Dynamic: keywords
|
|
40
|
+
Dynamic: license
|
|
41
|
+
Dynamic: requires-dist
|
|
42
|
+
Dynamic: requires-python
|
|
43
|
+
Dynamic: summary
|
|
44
|
+
|
|
45
|
+
# Overview
|
|
46
|
+
|
|
47
|
+
--------------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
Machine learning (ML) is transforming single-cell sequencing data analysis; however, the barriers of technology complexity and biology knowledge remain challenging for the involvement of the ML community in single-cell data analysis. We present an ML development environment for single-cell sequencing data analyses with a diverse set of AI-Ready benchmark datasets. A cloud-based platform is built to dynamically scale workflows for collecting, processing, and managing various single-cell sequencing data to make them ML-ready. In addition, benchmarks for each problem formulation and a code-level and web-interface IDE for single-cell analysis method development are provided.
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+

|
|
54
|
+
|
|
55
|
+
OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
**Workflows** are developed for collecting, processing, and managing diverse single-cell sequencing data to make them ML-ready and build benchmarks.
|
|
59
|
+
|
|
60
|
+
**IDE** is provided for supporting partial method development.
|
|
61
|
+
|
|
62
|
+
**Assessment utilities** are provided for evaluating results and report generation.
|
|
63
|
+
|
|
64
|
+
This **end-to-end pipeline** transforms the traditional “static” machine learning into **continuous learning** on extensive new data.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
By **deeply fusing models with data**, this platform could ultimately help many single-cell sequencing researchers substantially.
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+

|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
OSCB is an ongoing effort, and we are planning to increase our coverage in the future.
|
oscb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Overview
|
|
2
|
+
|
|
3
|
+
--------------------------------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
Machine learning (ML) is transforming single-cell sequencing data analysis; however, the barriers of technology complexity and biology knowledge remain challenging for the involvement of the ML community in single-cell data analysis. We present an ML development environment for single-cell sequencing data analyses with a diverse set of AI-Ready benchmark datasets. A cloud-based platform is built to dynamically scale workflows for collecting, processing, and managing various single-cell sequencing data to make them ML-ready. In addition, benchmarks for each problem formulation and a code-level and web-interface IDE for single-cell analysis method development are provided.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+

|
|
10
|
+
|
|
11
|
+
OSCB aims to provide automated end-to-end single-cell analyses ML pipelines to simplify and standardize the process of single-cell data formatting, quality control, loading, model development, and model evaluation.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
**Workflows** are developed for collecting, processing, and managing diverse single-cell sequencing data to make them ML-ready and build benchmarks.
|
|
15
|
+
|
|
16
|
+
**IDE** is provided for supporting partial method development.
|
|
17
|
+
|
|
18
|
+
**Assessment utilities** are provided for evaluating results and report generation.
|
|
19
|
+
|
|
20
|
+
This **end-to-end pipeline** transforms the traditional “static” machine learning into **continuous learning** on extensive new data.
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
By **deeply fusing models with data**, this platform could ultimately help many single-cell sequencing researchers substantially.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+

|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
OSCB is an ongoing effort, and we are planning to increase our coverage in the future.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from __future__ import absolute_import

# Re-export the evaluation submodules and the package-level helpers.
# BUG FIX: `annotation` was listed twice in the original import; the
# duplicate (and the dead commented-out imports) are removed.
from .evaluation import (
    annotation,
    ccc,
    clustering,
    imputation,
    integration,
    multimodal,
    trajectory,
)
from .data import *
from .evaluator import *
from .utilization import *
from .utils import *
|
oscb-0.1.0/oscb/data.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import hashlib
|
|
6
|
+
import re
|
|
7
|
+
from muon import MuData
|
|
8
|
+
import muon as mu
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import scanpy as sc
|
|
12
|
+
import anndata as ad
|
|
13
|
+
import mudata as md
|
|
14
|
+
from .utils import *
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileDownloader:
    """Streams files over HTTP POST with a progress bar and optional SHA-256 check."""

    def __init__(self, chunk_size=8192):
        # Size in bytes of each streamed chunk written to disk.
        self.chunk_size = chunk_size
        # One shared session gives connection pooling across downloads.
        self.session = requests.Session()

    def get_filename_from_response(self, headers):
        """
        Extracts filename from Content-Disposition header or URL.

        Returns the (decoded) filename, or None when the header is absent or
        carries no filename.
        """
        # BUG FIX: removed leftover debug `print(headers)`.
        if "content-disposition" in headers:
            cd = headers["content-disposition"]
            match = re.search(r"filename\*?=['\"]?(.*?)['\"]?(?:;|$)", cd)
            if match:
                filename = match.group(1)
                # RFC 5987 `filename*` values are percent-encoded and
                # prefixed with the charset, e.g. utf-8''na%20me.h5ad
                if filename.startswith("utf-8''"):
                    filename = filename.split("''", 1)[1]
                    filename = requests.utils.unquote(filename)
                return filename
        return None

    def get_file_size(self, response):
        """Return the Content-Length of *response* in bytes (0 when absent)."""
        return int(response.headers.get('content-length', 0))

    def get_file_hash(self, file_path):
        """Compute the SHA-256 hex digest of the file at *file_path*."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            # 4 KiB blocks keep memory flat even for very large files.
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def download(self, url, data_dict, data_folder='downloads/', verify_hash=None):
        """POST *data_dict* as JSON to *url* and stream the response to disk.

        Parameters
        ----------
        url : str
            Server endpoint expecting a JSON body.
        data_dict : dict
            Payload sent as the request's JSON body.
        data_folder : str
            Directory the file is saved under (created if missing).
        verify_hash : str, optional
            Expected SHA-256 hex digest; mismatch aborts the download.

        Returns
        -------
        pathlib.Path or None
            Path of the saved file, or None on any failure.
        """
        # Pre-declare so the except-branch can safely inspect them even when
        # the failure happens before they are created (the original code
        # raised UnboundLocalError in that case).
        progress = None
        local_file_path = None
        try:
            response = self.session.post(url, json=data_dict, stream=True)
            response.raise_for_status()  # Raise for 4xx/5xx status codes
            total_size = self.get_file_size(response)
            file_name = self.get_filename_from_response(response.headers)
            if file_name is None:
                # BUG FIX: fall back to the last URL segment instead of
                # passing None to the path join.
                file_name = url.rstrip('/').split('/')[-1] or 'download'
            local_file_path = Path(data_folder) / file_name
            # Make dir
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            # Progress bar
            progress = tqdm(total=total_size,
                            unit='B',
                            unit_scale=True,
                            desc=local_file_path.name)

            with local_file_path.open('wb') as f:
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
                        progress.update(len(chunk))
            progress.close()

            # File validation
            if verify_hash:
                downloaded_hash = self.get_file_hash(local_file_path)
                if downloaded_hash != verify_hash:
                    raise ValueError("File hash verification failed.")

            print(f"File downloaded successfully to: {local_file_path}")

            return local_file_path

        except Exception as e:
            if progress is not None:
                progress.close()
            print(f"Download failed: {str(e)}")
            # Remove a partially written file so callers never see a
            # truncated download.
            if local_file_path is not None and local_file_path.exists():
                local_file_path.unlink()
            return None

    def download_multiple(self, url_list, data_folder):
        """Download every URL in *url_list* into *data_folder*.

        Returns a list of dicts with keys 'url', 'success' and
        'local_file_path'.
        """
        results = []
        for url in url_list:
            # Path the file is expected at, used for reporting on failure.
            expected_path = Path(data_folder) / url.split('/')[-1]
            # BUG FIX: the original passed the local path as the JSON
            # payload (`self.download(url, local_file_path)`); send an empty
            # payload and the destination folder instead.
            result_path = self.download(url, {}, data_folder=data_folder)
            results.append({
                'url': url,
                'success': result_path is not None,
                'local_file_path': str(result_path if result_path is not None else expected_path)
            })
        return results
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def DataLoader(benchmarks_id, data_folder='downloads/', server_endpoint=server_endpoint+'download'):
    """Download the dataset behind *benchmarks_id* and load it into memory.

    Parameters
    ----------
    benchmarks_id : str
        Benchmark identifier; resolved to a dataset id via get_dataset_id().
    data_folder : str
        Directory the file is downloaded into.
    server_endpoint : str
        Download endpoint (defaults to the package-wide server + 'download').

    Returns
    -------
    MuData, AnnData, or None
        MuData for .h5mu files, AnnData for .h5ad, None if the download failed.
    """
    dataset_id, task = get_dataset_id(benchmarks_id)
    if task is not None:
        print(f"Downloading dataset for {task} Benchmarks.")
    else:
        print("Downloading dataset.")
    data_dict = {
        "dataset_id": dataset_id
    }

    downloader = FileDownloader()
    # BUG FIX: honour the caller's data_folder instead of the hard-coded
    # "downloads" string.
    adata_path = downloader.download(server_endpoint, data_dict, data_folder=data_folder)

    # download() returns None on failure; guard before touching the filesystem
    # (os.path.isfile(None) raises TypeError).
    if adata_path is not None and os.path.isfile(adata_path):
        if str(adata_path).endswith(".h5mu"):
            # BUG FIX: muon is imported as `mu` in this module; the original
            # `muon.read_h5mu` raised NameError.
            mdata = mu.read_h5mu(adata_path)
            return mdata
        else:
            adata = sc.read_h5ad(adata_path)
            return adata
    else:
        return None
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def split_data(adata):
    """Partition *adata* into train and test subsets via obs['split_idx'].

    Rows whose split_idx contains 'train' go to the first result, rows
    containing 'test' to the second; both are independent copies.
    """
    split_labels = adata.obs.split_idx.str
    train_subset = adata[split_labels.contains('train'), :].copy()
    test_subset = adata[split_labels.contains('test'), :].copy()
    return train_subset, test_subset
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import sklearn.preprocessing
|
|
3
|
+
from sklearn.metrics import f1_score, accuracy_score
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def annotation_metrics(labels, labels_pred):
    """Score a cell-type annotation against ground truth.

    Parameters
    ----------
    labels, labels_pred : pandas.Series
        Ground-truth and predicted labels. Missing values are mapped to an
        explicit 'Unknown' category before encoding.

    Returns
    -------
    tuple of float
        (accuracy, macro F1, micro F1, weighted F1), each rounded to 4
        decimal places.
    """
    print("Encode labels", flush=True)
    labels = labels.astype('category')
    labels_pred = labels_pred.astype('category')
    # Map NaNs to an explicit category so the label encoder can handle them.
    # (BUG FIX: the placeholder was misspelled 'Unkown'.)
    if labels.isna().any():
        labels = labels.cat.add_categories(['Unknown'])
        labels = labels.fillna('Unknown')
    if labels_pred.isna().any():
        labels_pred = labels_pred.cat.add_categories(['Unknown'])
        labels_pred = labels_pred.fillna('Unknown')
    # Fit on the union of both category sets so predicted labels unseen in
    # the ground truth do not crash transform().
    cats = list(labels.dtype.categories) + list(labels_pred.dtype.categories)
    encoder = sklearn.preprocessing.LabelEncoder().fit(cats)
    labels = encoder.transform(labels)
    labels_pred = encoder.transform(labels_pred)

    print("Compute prediction accuracy", flush=True)
    accuracy = accuracy_score(labels, labels_pred)
    accuracy = float('{:.4f}'.format(accuracy))

    print("Compute F1 score", flush=True)
    f1_macro = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='macro'
    )))
    f1_micro = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='micro'
    )))
    f1_weighted = float('{:.4f}'.format(f1_score(
        labels, labels_pred,
        average='weighted'
    )))

    return accuracy, f1_macro, f1_micro, f1_weighted
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
import anndata
|
|
4
|
+
import collections
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sklearn.metrics import auc
|
|
7
|
+
from sklearn.metrics import precision_recall_curve
|
|
8
|
+
|
|
9
|
+
# Cell Cell Communication
def ccc_metrics(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="score", top_prop=0.05):
    """Score predicted cell-cell interactions against a ground-truth table.

    Parameters
    ----------
    adata : AnnData
        Must carry DataFrames in ``adata.uns[ccc_pred]`` / ``adata.uns[ccc_target]``
        and the join columns in ``adata.uns["merge_keys"]``.
    ccc_pred, ccc_target : str
        Keys into ``adata.uns`` for the prediction and ground-truth tables.
    score : str
        Column holding the predicted interaction score.
    top_prop : float
        Fraction of the ground-truth table taken as "top ranked" for the
        odds-ratio computation.

    Returns
    -------
    tuple of float
        (precision-recall AUC, sigmoid-squashed odds ratio), each rounded
        to 4 decimal places.
    """
    # Precision-recall AUC
    gt = join_truth_and_pred(adata, ccc_pred, ccc_target, score)
    precision, recall, _ = precision_recall_curve(
        gt["response"], gt[score], pos_label=1
    )

    auc_score = auc(recall, precision)

    # Odds Ratio
    gt = gt.sort_values(score, ascending=False)
    # NOTE(review): top_n is sized from the ground-truth table while the
    # mask below is applied to the joined table `gt` — confirm both always
    # have the same number of rows.
    top_n = int(adata.uns[ccc_target].shape[0] * top_prop)

    # assign the top rank interactions to 1
    a = np.zeros(len(gt[score]))
    a[0:top_n] = 1
    gt.loc[:, ["top_n"]] = a

    top = gt[gt["top_n"] == 1]
    tp = np.sum(top.response == 1)
    fp = np.sum(top.response == 0)

    bot = gt[gt["top_n"] == 0]
    fn = np.sum(bot.response == 1)
    tn = np.sum(bot.response == 0)

    # Odds ratio of a true interaction landing in the top-ranked set.
    numerator = tp * tn
    denominator = fp * fn
    if denominator == 0:
        if numerator == 0:
            # undefined
            oddsratio_score = np.nan
        else:
            # perfect score
            oddsratio_score = np.inf
    else:
        oddsratio_score = numerator / denominator
    # Squash [0, inf] into [0, 1] so the metric is comparable across runs.
    oddsratio_score = _sigmoid_transform(oddsratio_score)

    return float('{:.4f}'.format(auc_score)), float('{:.4f}'.format(oddsratio_score))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Join predictions to target
def join_truth_and_pred(adata, ccc_pred="ccc_pred", ccc_target="ccc_target", score="lrscore"):
    """Left-join predicted interactions onto the ground-truth table.

    Rows of the target with no matching prediction get response 0 and a
    score just below the smallest observed one, so they rank last.
    """
    keys = list(adata.uns["merge_keys"])
    joined = adata.uns[ccc_target].merge(adata.uns[ccc_pred], on=keys, how="left")

    joined.loc[joined["response"].isna(), "response"] = 0
    floor_score = np.nanmin(joined[score]) - np.finfo(float).eps
    joined.loc[joined[score].isna(), score] = floor_score

    return joined
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _sigmoid_transform(x):
|
|
64
|
+
return 1 - 1 / (1 + x / 2)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def aggregate_method_scores(adata, how, ccc_pred="LIANA", score="score"):
    """Collapse per-method CCC scores into one row per interaction key.

    *how* is any pandas aggregation name or callable (e.g. 'mean', 'max').
    """
    keys = list(adata.uns["merge_keys"])
    grouped = adata.uns[ccc_pred].groupby(keys)
    aggregated = grouped.agg(score=(score, how))
    return aggregated.reset_index()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from sklearn.metrics import adjusted_rand_score as ARI
|
|
2
|
+
from sklearn.metrics import normalized_mutual_info_score as NMI
|
|
3
|
+
from sklearn.metrics import silhouette_score
|
|
4
|
+
from sklearn.metrics import fowlkes_mallows_score as FM
|
|
5
|
+
|
|
6
|
+
def clustering_metrics(labels, labels_pred, embedding):
    """Compute clustering quality scores for a predicted partition.

    Returns (silhouette on *embedding* vs. true labels, NMI, ARI,
    Fowlkes-Mallows), each rounded to 4 decimal places; also prints them.
    """
    raw_scores = (
        silhouette_score(embedding, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
        FM(labels, labels_pred),
    )
    asw_score, nmi_score, ari_score, fm_score = (
        float('{:.4f}'.format(value)) for value in raw_scores
    )

    print(
        "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nFowlkes Mallows: %.4f"
        % (asw_score, nmi_score, ari_score, fm_score)
    )
    return asw_score, nmi_score, ari_score, fm_score
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from scipy import sparse
|
|
2
|
+
import importlib
|
|
3
|
+
import numbers
|
|
4
|
+
import anndata
|
|
5
|
+
import scanpy as sc
|
|
6
|
+
import sklearn.metrics
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import re
|
|
10
|
+
import warnings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def imputation_metrics(adata, denoised_layer, train='train', test='test'):
    """Evaluate a denoised/imputed expression matrix against held-out counts.

    Parameters
    ----------
    adata : AnnData
        Carries the raw train/test count matrices in ``adata.obsm`` and the
        denoised matrix in ``adata.layers[denoised_layer]``.
    denoised_layer : str
        Layer name of the model's denoised output.
    train, test : str
        Keys into ``adata.obsm`` for the training and held-out matrices.

    Returns
    -------
    tuple of float
        (MSE on log-normalized data, Poisson NLL on depth-matched counts),
        each rounded to 4 decimal places.
    """
    # Mean-squared error on normalized, log1p-transformed data
    test_adata = anndata.AnnData(X=adata.obsm[test], obs=adata.obs, var=adata.var)
    denoised_adata = anndata.AnnData(
        X=adata.layers[denoised_layer], obs=adata.obs, var=adata.var
    )

    # scaling and transformation
    target_sum = 10000

    sc.pp.normalize_total(test_adata, target_sum)
    sc.pp.log1p(test_adata)

    sc.pp.normalize_total(denoised_adata, target_sum)
    sc.pp.log1p(denoised_adata)

    mse = sklearn.metrics.mean_squared_error(
        toarray(test_adata.X), toarray(denoised_adata.X)
    )

    # Poisson loss
    test_data = adata.obsm[test]
    denoised_data = adata.layers[denoised_layer]

    # scaling: match the denoised matrix to the test split's sequencing depth
    initial_sum = adata.obsm[train].sum()
    target_sum = test_data.sum()
    denoised_data = denoised_data * target_sum / initial_sum

    # BUG FIX: poisson_nll_loss(y_pred, y_true) expects the prediction
    # first; the original call passed the held-out test data as y_pred.
    poisson = poisson_nll_loss(toarray(denoised_data), toarray(test_data))

    return float('{:.4f}'.format(mse)), float('{:.4f}'.format(poisson))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float:
|
|
51
|
+
return (y_pred - y_true * np.log(y_pred + 1e-6)).mean()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def toarray(x):
    """Convert an array-like to a np.ndarray.

    Parameters
    ----------
    x : array-like
        Array-like to be converted
    Returns
    -------
    x : np.ndarray
    """
    if is_SparseDataFrame(x):
        x = x.to_coo().toarray()
    elif is_SparseSeries(x):
        x = x.to_dense().to_numpy()
    elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)):
        x = x.to_numpy()
    elif isinstance(x, sparse.spmatrix):
        x = x.toarray()
    elif isinstance(x, np.matrix):
        x = x.A
    elif isinstance(x, list):
        x_out = []
        for xi in x:
            try:
                xi = toarray(xi)
            except TypeError:
                # recursed too far
                pass
            x_out.append(xi)
        # BUG FIX: `_check_numpy_dtype` was never defined in this module, so
        # list inputs raised NameError.  Let numpy infer the dtype, falling
        # back to an object array for ragged input.
        try:
            x = np.array(x_out)
        except ValueError:
            x = np.array(x_out, dtype=object)
    elif isinstance(x, (np.ndarray, numbers.Number)):
        pass
    else:
        raise TypeError("Expected array-like. Got {}".format(type(x)))
    return x


def is_SparseSeries(X):
    """True when X is a legacy pandas.SparseSeries (removed in pandas >= 1.0)."""
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseSeries class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            return isinstance(X, pd.SparseSeries)
        except AttributeError:
            # Modern pandas no longer exposes SparseSeries at all.
            return False


def is_SparseDataFrame(X):
    """True when X is a legacy pandas.SparseDataFrame (removed in pandas >= 1.0)."""
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseDataFrame class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            return isinstance(X, pd.SparseDataFrame)
        except AttributeError:
            return False
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def is_sparse_dataframe(x):
    """True when *x* is a modern pandas DataFrame backed by sparse columns."""
    if not isinstance(x, pd.DataFrame) or is_SparseDataFrame(x):
        return False
    try:
        x.sparse  # the .sparse accessor only exists on sparse-backed frames
    except AttributeError:
        return False
    return True
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def is_sparse_series(x):
    """True when *x* is a modern pandas Series with a sparse dtype."""
    if not isinstance(x, pd.Series) or is_SparseSeries(x):
        return False
    try:
        x.sparse  # the .sparse accessor only exists on sparse-dtype series
    except AttributeError:
        return False
    return True
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def dataframe_to_sparse(x, fill_value=0.0):
    """Convert a dense DataFrame into a sparse-backed one, keeping its labels."""
    coo = sparse.coo_matrix(x.values)
    result = pd.DataFrame.sparse.from_spmatrix(coo, index=x.index, columns=x.columns)
    result.sparse.fill_value = fill_value
    return result
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0):
    """Coerce *X* into a pandas DataFrame with sparse-backed columns.

    Accepts scipy sparse matrices, legacy SparseDataFrames, dense frames and
    anything pd.DataFrame() understands; optionally relabels both axes.
    """
    if sparse.issparse(X):
        result = pd.DataFrame.sparse.from_spmatrix(X)
        result.sparse.fill_value = default_fill_value
    else:
        needs_wrapping = is_SparseDataFrame(X) or not isinstance(X, pd.DataFrame)
        dense = pd.DataFrame(X) if needs_wrapping else X
        result = dataframe_to_sparse(dense, fill_value=default_fill_value)
    if columns is not None:
        result.columns = columns
    if index is not None:
        result.index = index
    return result
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from scib.metrics import metrics
|
|
2
|
+
|
|
3
|
+
# https://github.com/theislab/scib/blob/main/scib/metrics/metrics.py
# https://scib.readthedocs.io/en/latest/api/scib.metrics.metrics_all.html
def integration_metrics(adata, adata_int, batch_key='batch', label_key='cell_type', species="mouse"):
    """Run the full scib benchmarking suite on an integration result.

    Computes batch-correction metrics (graph connectivity, batch ASW, PCR,
    kBET) and biological-conservation metrics (NMI/ARI vs. labels, cell-type
    ASW, cell-cycle conservation, isolated labels, HVG overlap), then adds
    their mean as 'Biological Conservation'.

    :param adata: unintegrated, preprocessed anndata object
    :param adata_int: integrated anndata object
    :param batch_key: name of batch column in adata.obs and adata_int.obs
    :param label_key: name of biological label (cell type) column
    :param species: organism passed to the cell-cycle metric
    :return: dict mapping metric name to value rounded to 4 decimals
    """
    metrics_all = metrics(
        adata,
        adata_int,
        batch_key=batch_key,
        label_key=label_key,
        cluster_nmi=None,
        ari_=True,
        nmi_=True,
        nmi_method='arithmetic',
        nmi_dir=None,
        silhouette_=True,
        si_metric='euclidean',
        pcr_=True,
        cell_cycle_=True,
        organism=species,
        hvg_score_=True,
        isolated_labels_=True,
        isolated_labels_f1_=True,
        isolated_labels_asw_=True,
        n_isolated=True,
        graph_conn_=True,
        trajectory_=False,
        kBET_=True,
    )

    # Metrics that could not be computed come back as NaN; count them as 0.
    raw_values = metrics_all.fillna(0).to_dict()[0]
    metrics_dict = {key: float('{:.4f}'.format(value)) for key, value in raw_values.items()}

    biological_conservation_metrics = [
        'NMI_cluster/label', 'ARI_cluster/label', 'ASW_label',
        'cell_cycle_conservation', 'isolated_label_F1',
        'isolated_label_silhouette', 'hvg_overlap',
    ]
    bio_mean = sum(metrics_dict[key] for key in biological_conservation_metrics)
    bio_mean /= len(biological_conservation_metrics)
    metrics_dict['Biological Conservation'] = float('{:.4f}'.format(bio_mean))

    return metrics_dict
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
import scanpy as sc
|
|
6
|
+
import scipy.io
|
|
7
|
+
import scib
|
|
8
|
+
import muon as mu
|
|
9
|
+
from muon import MuData
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def multimodal_metrics(mdata, embed, mod1='rna', batch='group', label_key='cell_type'):
    """Score a joint multimodal embedding with the scib metrics suite.

    Parameters
    ----------
    mdata : MuData
        Multimodal container; must hold the joint embedding in
        ``mdata.obsm[embed]`` and a neighbor graph in
        ``mdata.obsp["connectivities"]``.
    embed : str
        Key of the joint embedding in ``mdata.obsm``.
    mod1 : str
        Modality whose per-cell metadata provides the batch/label columns;
        obs columns are expected to be named ``"<mod1>:<col>"``.
    batch, label_key : str
        Column suffixes for the batch and cell-type annotations.

    Returns
    -------
    dict
        scib metric name -> value rounded to 4 decimals, plus an averaged
        'Biological Conservation' summary score.
    """
    # Wrap the embedding in a plain AnnData so scib can consume it.
    scib_anndata = sc.AnnData(mdata.obsm[embed]).copy()
    scib_anndata.obs = mdata.obs.copy()
    scib_anndata.obsp["connectivities"] = mdata.obsp["connectivities"].copy()
    scib_anndata.obsm[embed] = mdata.obsm[embed].copy()
    scib_anndata = scib_anndata[~scib_anndata.obs[f"{mod1}:{batch}"].isna()] # Remove NaN in batch
    scib_anndata = scib_anndata[~scib_anndata.obs[f"{mod1}:{label_key}"].isna()] # Remove NaN in cell type label
    scib_anndata.obs[f"{mod1}:{batch}"] = scib_anndata.obs[f"{mod1}:{batch}"].astype("category")
    scib_anndata.obs[f"{mod1}:{label_key}"] = scib_anndata.obs[f"{mod1}:{label_key}"].astype("category")

    # The same object is passed as both "unintegrated" and "integrated"
    # input, so only embedding/graph-based metrics are requested below.
    metrics = scib.metrics.metrics(
        scib_anndata,
        scib_anndata,
        batch_key=f"{mod1}:{batch}",
        label_key=f"{mod1}:{label_key}",
        embed=embed,
        ari_=True,
        nmi_=True,
        silhouette_=True,
        graph_conn_=True,
        isolated_labels_asw_=True,
    )

    # NOTE(review): several keys below (e.g. cell_cycle_conservation,
    # hvg_overlap) are not computed by the metrics() call above; fillna(0)
    # makes them contribute 0 to the average — confirm this is intended.
    biological_conservation_metrics = ['NMI_cluster/label', 'ARI_cluster/label', 'ASW_label', 'cell_cycle_conservation','isolated_label_F1', 'isolated_label_silhouette', 'hvg_overlap']
    metrics = metrics.fillna(0).to_dict()[0]

    # Round every metric to 4 decimal places.
    for key, value in metrics.items():
        metrics[key] = float('{:.4f}'.format(value))

    bc_total = 0
    for key in biological_conservation_metrics:
        bc_total += metrics[key]
    biological_conservation_score = float('{:.4f}'.format(bc_total/len(biological_conservation_metrics)))

    metrics['Biological Conservation'] = biological_conservation_score
    # Drop the temporary AnnData copy promptly to free memory.
    scib_anndata = None

    return metrics
|