dora-singlecell 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
1
+ Metadata-Version: 2.4
2
+ Name: dora-singlecell
3
+ Version: 0.1.0
4
+ Summary: DORA: latent trajectory model for single-cell drug response (PyTorch).
5
+ Requires-Python: >=3.9
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy>=1.20
8
+ Requires-Dist: scipy>=1.7
9
+ Requires-Dist: scikit-learn>=1.2
10
+ Requires-Dist: pandas>=1.3
11
+ Requires-Dist: torch>=1.12
12
+ Requires-Dist: scanpy>=1.9
13
+ Requires-Dist: joblib>=1.2
14
+ Requires-Dist: tqdm>=4.60
15
+ Requires-Dist: torchmetrics>=0.11
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7.0; extra == "dev"
18
+
19
+ # dora-singlecell
20
+
21
+ PyTorch implementation of **DORA**, a latent-trajectory model for single-cell drug response: drug and cell embeddings, dose response, and a gene decoder, with utilities for AnnData / perturbation-style datasets.
22
+
23
+ **PyPI package name:** `dora-singlecell`
24
+ **Import name:** `dora`
25
+
26
+ ## Installation
27
+
28
+ ### From PyPI (after you publish)
29
+
30
+ ```bash
31
+ pip install dora-singlecell
32
+ ```
33
+
34
+ ### From GitHub (before or instead of PyPI)
35
+
36
+
37
+ ```bash
38
+ pip install git+https://github.com/LBiophyEvo/dora-singlecell.git@main
39
+ ```
40
+
41
+ For a local editable install while developing:
42
+
43
+ ```bash
44
+ git clone https://github.com/LBiophyEvo/dora-singlecell.git
45
+ cd dora-singlecell
46
+ pip install -e .
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ - Load datasets
52
+ ```python
53
+ from dora import CustomDataset_mask
54
+
55
+ # Example: load preprocessed data (paths must match your layout; see utils.dataset_selection)
56
+ # First load the adata, the prepared dataset (arranged dose-response gene expression), drug features, cell features, and the defined dose trajectory
57
+ # For example, for the Sci-Plex dataset
58
+ dosages_standard = [0.0, 0.001, 0.01, 0.1, 1.0]
59
+ train_dataset = CustomDataset_mask(adata=adata, dataset=dataset_train, feature_dict_drug= feature_dict_drug, feature_dict_cell=feature_dict_cell, dosages_standard=dosages_standard)
60
+
61
+ ```
62
+
63
+ - Build the model
64
+ ```python
65
+ from dora import DORA
66
+ dosage_len = len(dosages_standard)
67
+ hparam = {
68
+ 'lr': 1e-2,
69
+ 'wd': 4e-5,
70
+ 'dim_hid': 32,
71
+ 'dep_hid': 3,
72
+ 'nb_layer': 5,
73
+ 'n_drugs': 188,
74
+ 'n_cells': 3,
75
+ 'n_genes': dim_cell_feature,
76
+ 'dim_drug_feature': dim_drug_feature,
77
+ 'dim_cell_feature': dim_cell_feature,
78
+ 'batch_size': 128,
79
+ 'max_epoch': 700,
80
+ 'device': device,
81
+ 'cell_dim_hid': 128,
82
+ 'module': 1,
83
+ 'drug_dose_f': False,
84
+ 'max_patience': 100,
85
+ 'last_layer': 'linear',
86
+ 'step_size_lr': 35,
87
+ 'batch_norm': True,
88
+ 'param_pen': 0,
89
+ }
90
+ model = DORA(num_genes = hparam['n_genes'],
91
+ num_drugs = hparam['n_drugs'],
92
+ num_cells = hparam['n_cells'],
93
+ genes= genes,
94
+ dosage_len = dosage_len,
95
+ hparam=hparam,
96
+
97
+ )
98
+
99
+ ```
100
+
101
+ Training and evaluation helpers live in `dora.train`, `dora.eval`, `dora.get_latent_util`, and `dora.train_clf_test_adam`.
102
+
103
+ ## Project layout
104
+
105
+ ```
106
+ .
107
+ ├── pyproject.toml # package metadata & dependencies (name: dora-singlecell)
108
+ ├── README.md
109
+ └── dora/ # importable Python package
110
+ ├── __init__.py
111
+ ├── model.py # DORA, MLP, losses, dose modules
112
+ ├── utils.py # CustomDataset_mask, data loading
113
+ ├── train.py # train the model
114
+ ├── eval.py # eval the model
115
+ ├── get_latent_util.py # extract the embeddings
116
+ └── train_clf_test_adam.py # fine tune the model
117
+ ```
118
+
119
+ ## Requirements
120
+
121
+ - Python ≥ 3.9
122
+ - PyTorch, scanpy, scikit-learn, numpy, scipy, pandas, joblib, tqdm, torchmetrics (see `pyproject.toml` for versions).
123
+
124
+
125
+ ## Citation
126
+
127
+ If you use this code in a publication, cite the associated paper (add reference when available).
128
+
129
+ ## License
130
+
131
+ MIT.
@@ -0,0 +1,113 @@
1
+ # dora-singlecell
2
+
3
+ PyTorch implementation of **DORA**, a latent-trajectory model for single-cell drug response: drug and cell embeddings, dose response, and a gene decoder, with utilities for AnnData / perturbation-style datasets.
4
+
5
+ **PyPI package name:** `dora-singlecell`
6
+ **Import name:** `dora`
7
+
8
+ ## Installation
9
+
10
+ ### From PyPI (after you publish)
11
+
12
+ ```bash
13
+ pip install dora-singlecell
14
+ ```
15
+
16
+ ### From GitHub (before or instead of PyPI)
17
+
18
+
19
+ ```bash
20
+ pip install git+https://github.com/LBiophyEvo/dora-singlecell.git@main
21
+ ```
22
+
23
+ For a local editable install while developing:
24
+
25
+ ```bash
26
+ git clone https://github.com/LBiophyEvo/dora-singlecell.git
27
+ cd dora-singlecell
28
+ pip install -e .
29
+ ```
30
+
31
+ ## Quick start
32
+
33
+ - Load datasets
34
+ ```python
35
+ from dora import CustomDataset_mask
36
+
37
+ # Example: load preprocessed data (paths must match your layout; see utils.dataset_selection)
38
+ # First load the adata, the prepared dataset (arranged dose-response gene expression), drug features, cell features, and the defined dose trajectory
39
+ # For example, for the Sci-Plex dataset
40
+ dosages_standard = [0.0, 0.001, 0.01, 0.1, 1.0]
41
+ train_dataset = CustomDataset_mask(adata=adata, dataset=dataset_train, feature_dict_drug= feature_dict_drug, feature_dict_cell=feature_dict_cell, dosages_standard=dosages_standard)
42
+
43
+ ```
44
+
45
+ - Build the model
46
+ ```python
47
+ from dora import DORA
48
+ dosage_len = len(dosages_standard)
49
+ hparam = {
50
+ 'lr': 1e-2,
51
+ 'wd': 4e-5,
52
+ 'dim_hid': 32,
53
+ 'dep_hid': 3,
54
+ 'nb_layer': 5,
55
+ 'n_drugs': 188,
56
+ 'n_cells': 3,
57
+ 'n_genes': dim_cell_feature,
58
+ 'dim_drug_feature': dim_drug_feature,
59
+ 'dim_cell_feature': dim_cell_feature,
60
+ 'batch_size': 128,
61
+ 'max_epoch': 700,
62
+ 'device': device,
63
+ 'cell_dim_hid': 128,
64
+ 'module': 1,
65
+ 'drug_dose_f': False,
66
+ 'max_patience': 100,
67
+ 'last_layer': 'linear',
68
+ 'step_size_lr': 35,
69
+ 'batch_norm': True,
70
+ 'param_pen': 0,
71
+ }
72
+ model = DORA(num_genes = hparam['n_genes'],
73
+ num_drugs = hparam['n_drugs'],
74
+ num_cells = hparam['n_cells'],
75
+ genes= genes,
76
+ dosage_len = dosage_len,
77
+ hparam=hparam,
78
+
79
+ )
80
+
81
+ ```
82
+
83
+ Training and evaluation helpers live in `dora.train`, `dora.eval`, `dora.get_latent_util`, and `dora.train_clf_test_adam`.
84
+
85
+ ## Project layout
86
+
87
+ ```
88
+ .
89
+ ├── pyproject.toml # package metadata & dependencies (name: dora-singlecell)
90
+ ├── README.md
91
+ └── dora/ # importable Python package
92
+ ├── __init__.py
93
+ ├── model.py # DORA, MLP, losses, dose modules
94
+ ├── utils.py # CustomDataset_mask, data loading
95
+ ├── train.py # train the model
96
+ ├── eval.py # eval the model
97
+ ├── get_latent_util.py # extract the embeddings
98
+ └── train_clf_test_adam.py # fine tune the model
99
+ ```
100
+
101
+ ## Requirements
102
+
103
+ - Python ≥ 3.9
104
+ - PyTorch, scanpy, scikit-learn, numpy, scipy, pandas, joblib, tqdm, torchmetrics (see `pyproject.toml` for versions).
105
+
106
+
107
+ ## Citation
108
+
109
+ If you use this code in a publication, cite the associated paper (add reference when available).
110
+
111
+ ## License
112
+
113
+ MIT.
@@ -0,0 +1,34 @@
1
+ """
2
+ DORA single-cell perturbation model and data utilities.
3
+
4
+ After ``pip install -e .`` from the project root, import with ``import dora`` or
5
+ ``from dora import DORA, CustomDataset_mask``.
6
+ """
7
+
8
+ from .model import (
9
+ Basic_ff,
10
+ DORA,
11
+ GeneralizedSigmoid,
12
+ MLP,
13
+ rrmse,
14
+ rrmse_penality,
15
+ )
16
+ from .utils import (
17
+ CustomDataset_mask,
18
+ SubDataset,
19
+ dataset_selection,
20
+ get_normaled_cell,
21
+ )
22
+
23
+ __all__ = [
24
+ "Basic_ff",
25
+ "DORA",
26
+ "GeneralizedSigmoid",
27
+ "MLP",
28
+ "CustomDataset_mask",
29
+ "SubDataset",
30
+ "dataset_selection",
31
+ "get_normaled_cell",
32
+ "rrmse",
33
+ "rrmse_penality",
34
+ ]
@@ -0,0 +1,64 @@
1
+ """
2
+ Classification-oriented metrics for binary response / phenotype prediction.
3
+
4
+ Used alongside training scripts that output probabilities; pairs with the same
5
+ ``calculate_accuracy`` helper in ``train_clf_test_adam`` when scoring phenotypic heads.
6
+ """
7
+
8
+ import numpy as np
9
+ from sklearn.metrics import (
10
+ accuracy_score,
11
+ auc,
12
+ precision_recall_curve,
13
+ precision_score,
14
+ recall_score,
15
+ roc_auc_score,
16
+ )
17
+
18
+
19
def calculate_accuracy(test_prob, test_label):
    """
    Summarize binary predictions given probabilities and ground-truth labels.

    Parameters
    ----------
    test_prob : array-like
        Predicted probabilities (or scores in [0, 1]) per sample.
    test_label : array-like
        Binary ground-truth labels (same length as ``test_prob``).

    Returns
    -------
    list of float
        ``[ROC-AUC, PR-AUC (area under precision–recall curve), accuracy,
        precision, recall]``. If there are no positive labels in ``test_label``,
        ROC/PR/precision/recall are returned as 0. If the model predicts no
        positives, precision and recall are 0 while ROC-AUC and PR-AUC are
        still computed from the raw scores.

    Notes
    -----
    Hard labels are obtained with threshold 0.5 on ``test_prob``.
    """
    # Accept plain Python lists as well as numpy arrays (.sum() below
    # requires array semantics).
    test_prob = np.asarray(test_prob)
    test_label = np.asarray(test_label)

    # Vectorized hard-label thresholding at 0.5.
    pred_label = (test_prob > 0.5).astype(int)
    ACC = accuracy_score(test_label, pred_label)

    # Without any positive ground-truth labels, ROC-AUC / PR-AUC / precision /
    # recall are undefined; report them as 0 and keep accuracy.
    if test_label.sum() == 0:
        return [0, 0, ACC, 0, 0]

    # Score-based metrics are computable whenever positives exist
    # (previously this block was duplicated across two branches).
    precision, recall, _thresholds = precision_recall_curve(test_label, test_prob)
    roc = roc_auc_score(test_label, test_prob)
    pr_auc = auc(recall, precision)

    # Threshold-based precision/recall are only meaningful when the model
    # predicts at least one positive; otherwise report them as 0.
    if pred_label.sum() > 0:
        PREC = precision_score(test_label, pred_label)
        TPR = recall_score(test_label, pred_label)
        return [roc, pr_auc, ACC, PREC, TPR]

    return [roc, pr_auc, ACC, 0, 0]
@@ -0,0 +1,155 @@
1
+ """
2
+ Extract latent embeddings from a trained DORA-style model and package them as AnnData.
3
+
4
+ ``get_latent`` runs the encoder/trajectory stack with dose information disabled
5
+ (``drugs_dose=None``), collects per-step latent vectors for masked samples, and
6
+ attaches cell / drug / dose metadata via the dataset’s sklearn encoders.
7
+ """
8
+
9
+ import numpy as np
10
+ import scanpy as sc
11
+ import torch
12
+ from scipy.stats import pearsonr
13
+ from torchmetrics import R2Score
14
+
15
+
16
def compute_r2(y_true, y_pred):
    """
    R² between two 1D tensors using torchmetrics (GPU-friendly).

    Parameters
    ----------
    y_true, y_pred : torch.Tensor
        Same shape; typically one cell’s gene vector vs reconstruction.

    Returns
    -------
    float
        R² score. Predictions are clamped to a large finite range for
        numerical stability; NaNs in ``y_pred`` are not explicitly handled
        (torchmetrics may propagate NaN).
    """
    # Guard against runaway reconstruction values before scoring.
    clamped_pred = torch.clamp(y_pred, -3e12, 3e12)
    scorer = R2Score().to(y_true.device)
    scorer.update(clamped_pred, y_true)
    return scorer.compute().item()
35
+
36
+
37
def _decode_labels(index_arr, encoder):
    """
    Map integer category codes back to their names via a fitted OneHotEncoder.

    Builds the one-hot matrix of shape ``(n_samples, n_categories)`` that the
    sklearn encoder's ``inverse_transform`` expects, then decodes it.
    Sizing by ``len(index_arr)`` fixes a latent bug where the drug decode
    allocated its one-hot matrix with the *cell* index count (the two arrays
    happen to have equal length, which masked the mistake).
    """
    onehot = np.zeros((len(index_arr), len(encoder.categories_[0])))
    for i in range(len(index_arr)):
        onehot[i, index_arr[i]] = 1
    return encoder.inverse_transform(onehot)


def get_latent(model, datasets_val, genes, train_dataset):
    """
    Iterate validation batches, collect latent states and reconstruction quality.

    Parameters
    ----------
    model : torch.nn.Module
        Trained model with ``predict(..., return_emb=True)`` (e.g. DORA).
    datasets_val : DataLoader
        Batches matching ``CustomDataset_mask`` layout (7 tensors per batch).
    genes : torch.Tensor
        Full gene matrix indexed by ``indices`` (same convention as training).
    train_dataset : CustomDataset_mask
        Provides ``encoder_cell`` / ``encoder_drug`` to map integer ids back to names.

    Returns
    -------
    anndata.AnnData
        ``.X`` = stacked latent vectors; ``.obs`` contains ``cell_id``,
        ``pert_iname``, ``pert_dose``, composite ``pert_cell_dose``, and per-row
        ``r2_score`` from gene reconstruction.
    """
    r2_loss = []
    pearson_loss = []
    latent_dict = {
        "emb_all": [],
        "cell_id": [],
        "pert_iname": [],
        "drug_dose": [],
        "response_label": [],
        "r2_score": [],
    }
    model.eval()
    for data in datasets_val:
        data = [tensor.to(model.device) for tensor in data]

        # Batch layout follows CustomDataset_mask (7 tensors, in this order).
        drugs, cells, drugs_dose, drugs_feature, cell_feature, indices, masks = data[:7]

        # Latent trajectory without dose conditioning (baseline for embedding export).
        gene_reconstructions_arr, emb_arr, _ = model.predict(
            drugs,
            drugs_feature,
            cell_feature,
            indices,
            cells=cells,
            drugs_dose=None,
            return_emb=True,
        )

        for id_g, gene_reconstructions in enumerate(gene_reconstructions_arr):
            # Trajectory step id_g maps to column id_g + 1 of indices/masks
            # (column 0 is the starting point, same convention as training).
            id_gene = indices[:, (id_g + 1)]
            mask = masks[:, (id_g + 1)]

            if mask.sum() > 0:
                gene_step = genes[id_gene][mask == 1]
                gene_reconstructions_left = gene_reconstructions[mask == 1]

                # Per-cell reconstruction quality at this dose step.
                r2_iter = [
                    compute_r2(gene_step[i, :], gene_reconstructions_left[i, :])
                    for i in range(len(gene_step))
                ]
                r2_loss += r2_iter

                y_true = gene_step.cpu().detach().numpy()
                y_pred = gene_reconstructions_left.cpu().detach().numpy()

                pearson_loss += [
                    pearsonr(y_true[i, :], y_pred[i, :])[0] for i in range(len(gene_step))
                ]
                latent_dict["emb_all"].append(emb_arr[id_g][mask == 1].cpu().detach().numpy())
                latent_dict["cell_id"].append(cells[mask == 1].cpu().detach().numpy())
                latent_dict["pert_iname"].append(drugs[mask == 1].cpu().detach().numpy())
                latent_dict["drug_dose"].append(
                    drugs_dose[:, (id_g + 1)][mask == 1].cpu().detach().numpy()
                )
                latent_dict["r2_score"].append(r2_iter)

    print("the R2 on gene reconstruction:", np.mean(r2_loss))
    print("the pearson on gene reconstruction", np.mean(pearson_loss))

    # Decode integer cell / drug indices back to string names.
    index_cell = np.vstack(latent_dict["cell_id"])
    cells_name = _decode_labels(index_cell, train_dataset.encoder_cell)

    index_drugs = np.vstack(latent_dict["pert_iname"])
    drugs_name = _decode_labels(index_drugs, train_dataset.encoder_drug)

    dosages = np.hstack(latent_dict["drug_dose"])

    adata_emb = sc.AnnData(np.vstack(latent_dict["emb_all"]))
    adata_emb.obs["cell_id"] = cells_name.flatten()
    adata_emb.obs["pert_iname"] = drugs_name.flatten()
    adata_emb.obs["pert_dose"] = [str(el) for el in dosages]

    adata_emb.obs["pert_cell_dose"] = (
        adata_emb.obs["cell_id"].astype(str)
        + "_"
        + adata_emb.obs["pert_iname"].astype(str)
        + "_"
        + adata_emb.obs["pert_dose"].astype(str)
    )
    adata_emb.obs["r2_score"] = np.hstack(latent_dict["r2_score"])
    return adata_emb