omicsync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omicsync/__init__.py +33 -0
- omicsync/core/__init__.py +25 -0
- omicsync/core/dataset.py +507 -0
- omicsync/core/modality.py +398 -0
- omicsync/core/sample_index.py +200 -0
- omicsync/integration/__init__.py +11 -0
- omicsync/integration/concat.py +146 -0
- omicsync/integration/mofa.py +279 -0
- omicsync/integration/sklearn_compat.py +178 -0
- omicsync/loaders/__init__.py +19 -0
- omicsync/loaders/csv.py +147 -0
- omicsync/loaders/geo.py +111 -0
- omicsync/loaders/open_targets.py +239 -0
- omicsync/loaders/tcga.py +251 -0
- omicsync/normalisation/__init__.py +5 -0
- omicsync/normalisation/cnv.py +97 -0
- omicsync/normalisation/methylation.py +131 -0
- omicsync/normalisation/mutations.py +123 -0
- omicsync/normalisation/protein.py +54 -0
- omicsync/normalisation/rna.py +182 -0
- omicsync/utils/__init__.py +32 -0
- omicsync/utils/barcode.py +165 -0
- omicsync/utils/logging.py +44 -0
- omicsync/utils/validation.py +152 -0
- omicsync-0.1.0.dist-info/METADATA +188 -0
- omicsync-0.1.0.dist-info/RECORD +29 -0
- omicsync-0.1.0.dist-info/WHEEL +5 -0
- omicsync-0.1.0.dist-info/licenses/LICENSE +21 -0
- omicsync-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Simple concatenation strategies for multi-omics integration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional, Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.core.dataset import OmicsDataset
|
|
11
|
+
from omicsync.utils.logging import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger("integration.concat")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def simple_concat(
    dataset: OmicsDataset,
    modalities: Optional[Sequence[str]] = None,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Flatten the selected modalities of *dataset* into one feature matrix.

    All of the work is delegated to ``dataset.to_dataframe``; this function
    only forwards its arguments, logs the resulting shape and returns the
    frame's underlying array.

    Parameters
    ----------
    dataset:
        The :class:`~omicsync.core.dataset.OmicsDataset` to flatten.
    modalities:
        Modality names to include, passed through unchanged. ``None`` uses
        all modalities.
    fill_missing:
        Value for missing entries, passed through unchanged (default 0.0).

    Returns
    -------
    numpy.ndarray
        Shape (n_samples, total_features).
    """
    combined = dataset.to_dataframe(
        modalities=modalities,
        fill_missing=fill_missing,
    )
    shape = combined.shape
    logger.info("simple_concat: output shape %s.", shape)
    return combined.values
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def weighted_concat(
    dataset: OmicsDataset,
    weights: Optional[Dict[str, float]] = None,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Concatenate modalities with per-modality feature scaling.

    Each modality's feature columns are prefixed with ``"<name>__"`` and
    multiplied by the corresponding weight, then all modalities are
    outer-joined on their index so a sample present in any modality is kept.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    weights:
        Mapping from modality name to scalar weight. Modalities not listed
        receive a weight of 1.0. Keys that do not name a modality of
        *dataset* have no effect; they are reported with a warning.
    fill_missing:
        Value for missing entries. It is applied after weighting, so the
        fill value itself is not scaled.

    Returns
    -------
    numpy.ndarray
        Shape (n_samples, total_features); ``(0, 0)`` when the dataset has
        no modalities.
    """
    weights = weights or {}

    # A mistyped modality name would otherwise silently leave that modality
    # at weight 1.0, so surface it instead of ignoring it.
    unknown = [name for name in weights if name not in dataset._modalities]
    if unknown:
        logger.warning(
            "weighted_concat: ignoring weights for unknown modalities %s.",
            unknown,
        )

    frames: List[pd.DataFrame] = [
        mod.data.add_prefix(f"{name}__") * weights.get(name, 1.0)
        for name, mod in dataset._modalities.items()
    ]

    if not frames:
        return np.empty((0, 0))

    df = frames[0].join(frames[1:], how="outer").fillna(fill_missing)
    logger.info("weighted_concat: output shape %s.", df.shape)
    return df.values
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def pca_concat(
    dataset: OmicsDataset,
    n_components_per_modality: int = 50,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Reduce each modality by PCA then concatenate the scores.

    Requires ``scikit-learn``.

    Every modality is restricted to ``dataset.common_samples``, filled,
    mean-imputed, reduced independently with PCA (``random_state=0``) and
    the per-modality score matrices are stacked column-wise.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    n_components_per_modality:
        Number of PCA components to retain per modality. The value is capped
        at both the modality's feature count and the number of samples,
        measured after imputation.
    fill_missing:
        Value for missing entries before PCA.

    Returns
    -------
    numpy.ndarray
        Shape (n_samples, total_components), where ``total_components`` is
        the sum of the (possibly capped) component counts of all modalities.
        With no modalities the result has zero columns.

    Raises
    ------
    ImportError
        If ``scikit-learn`` is not installed.
    """
    try:
        from sklearn.decomposition import PCA
        from sklearn.impute import SimpleImputer
    except ImportError as exc:
        raise ImportError(
            "scikit-learn is required for pca_concat(). "
            "Install it with: pip install scikit-learn"
        ) from exc

    all_samples = dataset.common_samples
    parts: List[np.ndarray] = []

    for name, mod in dataset._modalities.items():
        data = mod.data.reindex(all_samples).fillna(fill_missing).values.astype(float)

        # Only has work to do when NaNs survive the fillna above
        # (e.g. fill_missing is itself NaN).
        imputer = SimpleImputer(strategy="mean")
        data = imputer.fit_transform(data)

        # Cap AFTER imputation: by default SimpleImputer discards columns
        # that were entirely missing, so the pre-imputation width can
        # overstate the feature count and make PCA reject n_components.
        n_comp = min(n_components_per_modality, data.shape[1], data.shape[0])

        pca = PCA(n_components=n_comp, random_state=0)
        scores = pca.fit_transform(data)
        parts.append(scores)
        logger.info(
            "pca_concat: %r → %d components (%.1f%% variance).",
            name,
            n_comp,
            100.0 * pca.explained_variance_ratio_.sum(),
        )

    if not parts:
        # np.concatenate rejects an empty sequence; mirror weighted_concat
        # and hand back an empty matrix instead of crashing.
        return np.empty((len(all_samples), 0))

    result = np.concatenate(parts, axis=1)
    logger.info("pca_concat: final shape %s.", result.shape)
    return result
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""MOFA2 wrapper for multi-omics factor analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from omicsync.core.dataset import OmicsDataset
|
|
12
|
+
from omicsync.utils.logging import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger("integration.mofa")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MOFA2Wrapper:
    """Wrapper around ``mofapy2`` for factor analysis of multi-omics data.

    Intended call order is :meth:`prepare`, then :meth:`train`, then any of
    the ``get_*`` / ``top_features`` accessors. ``prepare`` and ``train``
    return *self* so they can be chained.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    n_factors:
        Number of latent factors to learn (default 10).
    convergence_mode:
        ``"fast"``, ``"medium"``, or ``"slow"`` (default ``"fast"``).
        Passed to mofapy2 unchanged; it is not validated here.
    use_gpu:
        Whether to use GPU acceleration (requires CUDA; default ``False``).
    seed:
        Random seed for reproducibility (default 42).

    Raises
    ------
    ImportError
        If ``mofapy2`` is not installed.
    """

    def __init__(
        self,
        dataset: "OmicsDataset",
        n_factors: int = 10,
        convergence_mode: str = "fast",
        use_gpu: bool = False,
        seed: int = 42,
    ) -> None:
        # Fail at construction time rather than deep inside train().
        try:
            import mofapy2  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "mofapy2 is required for MOFA2Wrapper. "
                "Install it with: pip install mofapy2"
            ) from exc

        self.dataset = dataset
        self.n_factors = n_factors
        self.convergence_mode = convergence_mode
        self.use_gpu = use_gpu
        self.seed = seed

        # Populated by prepare() / train().
        self._model = None
        self._prepared_data: Optional[Dict] = None
        self._samples: Optional[List[str]] = None
        self._views: Optional[List[str]] = None

    def prepare(self) -> "MOFA2Wrapper":
        """Format the dataset for mofapy2.

        Stores the result of ``dataset.to_mofa2()`` and caches its view
        names and the sample names of its first group.

        Returns
        -------
        MOFA2Wrapper
            *self*, for method chaining.
        """
        mofa_input = self.dataset.to_mofa2()
        self._prepared_data = mofa_input
        self._views = mofa_input["views"]
        self._samples = mofa_input["samples"][0]  # single group
        logger.info(
            "MOFA2Wrapper.prepare: %d views, %d samples.",
            len(self._views),
            len(self._samples),
        )
        return self

    def train(self, output_path: Optional[Union[str, Path]] = None) -> "MOFA2Wrapper":
        """Train the MOFA2 model.

        All views are fitted with a gaussian likelihood, without view or
        group scaling, using spike-and-slab weights and ARD on both factors
        and weights.

        Parameters
        ----------
        output_path:
            If provided, save the trained model to this HDF5 file path.

        Returns
        -------
        MOFA2Wrapper
            *self*, for method chaining.

        Raises
        ------
        RuntimeError
            If :meth:`prepare` has not been called first.
        """
        if self._prepared_data is None:
            raise RuntimeError("Call prepare() before train().")

        from mofapy2.run.entry_point import entry_point

        ent = entry_point()

        data = self._prepared_data["data"]
        groups = self._prepared_data["groups"]
        views = self._prepared_data["views"]
        samples = self._prepared_data["samples"]

        ent.set_data_options(scale_groups=False, scale_views=False)
        ent.set_data_matrix(
            data,
            likelihoods=["gaussian"] * len(views),
            views_names=views,
            groups_names=groups,
            samples_names=samples,
            features_names=[
                self.dataset._modalities[v].feature_ids.tolist() for v in views
            ],
        )
        ent.set_model_options(
            factors=self.n_factors,
            spikeslab_weights=True,
            ard_factors=True,
            ard_weights=True,
        )
        ent.set_train_options(
            convergence_mode=self.convergence_mode,
            gpu_mode=self.use_gpu,
            seed=self.seed,
            verbose=False,
        )
        ent.build()
        ent.run()

        self._model = ent.model

        if output_path is not None:
            output_path = Path(output_path)
            ent.save(str(output_path))
            logger.info("MOFA2Wrapper: model saved to %s.", output_path)

        logger.info("MOFA2Wrapper: training complete.")
        return self

    def get_factors(self) -> pd.DataFrame:
        """Return factor scores for all samples.

        Returns
        -------
        pandas.DataFrame
            Shape (n_samples, n_factors); index = sample IDs,
            columns = ``Factor1``, ``Factor2``, ...

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        self._check_trained()
        Z = self._model.nodes["Z"].getExpectation()
        # Two layouts are accepted:
        #   * a 2-D ndarray (n_samples, n_factors), used as-is;
        #   * anything else is indexed with [0], i.e. treated as
        #     (n_groups, n_samples, n_factors) with a single group.
        # NOTE(review): mofapy2 appears to return the 2-D form — confirm
        # against the installed version.
        if isinstance(Z, np.ndarray) and Z.ndim == 2:
            scores = Z
        else:
            scores = Z[0]
        cols = [f"Factor{i + 1}" for i in range(scores.shape[1])]
        return pd.DataFrame(scores, index=self._samples, columns=cols)

    def get_weights(
        self, modality: Optional[str] = None
    ) -> Union[Dict[str, pd.DataFrame], pd.DataFrame]:
        """Return feature weights.

        Parameters
        ----------
        modality:
            If specified, return weights for that modality only.
            Otherwise return a dict of DataFrames keyed by modality name.

        Returns
        -------
        dict[str, pandas.DataFrame] or pandas.DataFrame
            Each frame is indexed by the modality's feature IDs with
            columns ``Factor1``, ``Factor2``, ...

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        KeyError
            If *modality* is given but is not one of the prepared views.
        """
        self._check_trained()
        views = self._views
        # Validate first so an unknown name fails before any frame is built.
        if modality is not None and modality not in views:
            raise KeyError(f"Modality {modality!r} not found; available: {list(views)}.")

        W = self._model.nodes["W"].getExpectation()
        # W is indexed per view; each entry is used as (n_features, n_factors).
        result: Dict[str, pd.DataFrame] = {}
        for i, view in enumerate(views):
            if modality is not None and view != modality:
                continue  # only materialise the frame that was asked for
            features = self.dataset._modalities[view].feature_ids.tolist()
            cols = [f"Factor{j + 1}" for j in range(W[i].shape[1])]
            result[view] = pd.DataFrame(W[i], index=features, columns=cols)

        if modality is not None:
            return result[modality]
        return result

    def get_variance_explained(self) -> pd.DataFrame:
        """Return R² per factor per modality.

        Returns
        -------
        pandas.DataFrame
            Shape (n_factors, n_modalities); index = ``Factor1``, ...,
            columns = view names.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        self._check_trained()
        r2 = self._model.calculate_variance_explained()
        if isinstance(r2, list):
            if r2 and isinstance(r2[0], np.ndarray) and r2[0].ndim == 2:
                # Per-group layout: one (n_views, n_factors) array per group.
                # Only the first group is used, matching the single-group
                # assumption made in prepare().
                # NOTE(review): this is the layout mofapy2 appears to
                # return — confirm against the installed version.
                data = r2[0].T
            else:
                # Per-view layout: v[0] is an (n_factors, k) block for each
                # view; blocks are placed side by side.
                data = np.concatenate([v[0] for v in r2], axis=1)
        else:
            data = r2
        cols = self._views
        idx = [f"Factor{i + 1}" for i in range(data.shape[0])]
        return pd.DataFrame(data, index=idx, columns=cols)

    def plot_variance_explained(self) -> None:
        """Plot a bar chart of variance explained per factor per modality.

        Requires ``matplotlib``. The figure is displayed with
        ``plt.show()``; nothing is returned.

        Raises
        ------
        ImportError
            If ``matplotlib`` is not installed.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as exc:
            raise ImportError(
                "matplotlib is required for plot_variance_explained(). "
                "Install it with: pip install matplotlib"
            ) from exc

        r2 = self.get_variance_explained()
        # Transposed so each view is one group of bars.
        r2.T.plot(kind="bar", figsize=(10, 4))
        plt.ylabel("Variance Explained (R²)")
        plt.title("MOFA2 Variance Explained per View")
        plt.tight_layout()
        plt.show()

    def top_features(
        self, factor: int, modality: str, n: int = 20
    ) -> pd.DataFrame:
        """Return the top *n* features by absolute weight for a factor.

        Parameters
        ----------
        factor:
            1-based factor index.
        modality:
            Modality name.
        n:
            Number of top features to return.

        Returns
        -------
        pandas.DataFrame
            Columns, in order: ``feature``, ``abs_weight``, ``weight``;
            rows sorted by ``abs_weight`` descending.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        KeyError
            If *modality* is not one of the prepared views.
        ValueError
            If *factor* is outside the range of learned factors.
        """
        self._check_trained()
        weights = self.get_weights(modality)
        col = f"Factor{factor}"
        if col not in weights.columns:
            raise ValueError(
                f"Factor {factor} not found; available 1–{len(weights.columns)}."
            )
        signed = weights[col]
        # Keep the signed and absolute values in the same rows and sort the
        # table as a whole. Looking the signed weight up again by label
        # would return extra rows when feature IDs are not unique.
        table = pd.DataFrame({
            "feature": signed.index,
            "abs_weight": signed.abs().to_numpy(),
            "weight": signed.to_numpy(),
        })
        table = table.sort_values("abs_weight", ascending=False).head(n)
        return table.reset_index(drop=True)

    def _check_trained(self) -> None:
        """Raise ``RuntimeError`` unless :meth:`train` has set a model."""
        if self._model is None:
            raise RuntimeError("Model not trained yet. Call prepare() then train().")
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""scikit-learn Pipeline compatibility for OmicsDataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import List, Optional, Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
10
|
+
|
|
11
|
+
from omicsync.core.dataset import OmicsDataset
|
|
12
|
+
from omicsync.utils.logging import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger("integration.sklearn_compat")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class OmicsSyncTransformer(BaseEstimator, TransformerMixin):
    """sklearn-compatible transformer for :class:`~omicsync.core.dataset.OmicsDataset`.

    During :meth:`fit` the dataset is optionally sample-aligned and
    normalised, then flattened to record the output feature names.
    :meth:`transform` flattens a dataset and reorders its columns to those
    names, returning a numpy array (or a DataFrame after
    ``set_output(transform='pandas')``).

    Parameters
    ----------
    align: bool
        Whether to align samples across modalities during fit (default ``True``).
    normalize: bool
        Whether to apply per-modality normalisation during fit (default ``True``).
    modalities: list[str] or None
        Modality names to include. ``None`` uses all modalities.
    fill_missing: float
        Value used for missing entries in the output (default ``0.0``).

    Examples
    --------
    >>> from sklearn.pipeline import Pipeline
    >>> from omicsync.integration.sklearn_compat import OmicsSyncTransformer
    >>> pipe = Pipeline([
    ...     ('omicsync', OmicsSyncTransformer()),
    ...     ('classifier', SomeClassifier()),
    ... ])
    """

    def __init__(
        self,
        align: bool = True,
        normalize: bool = True,
        modalities: Optional[Sequence[str]] = None,
        fill_missing: float = 0.0,
    ) -> None:
        # Constructor parameters, stored verbatim.
        self.align = align
        self.normalize = normalize
        self.modalities = modalities
        self.fill_missing = fill_missing

        # Fitted / configured state.
        self._feature_names: Optional[List[str]] = None
        self._output_transform: Optional[str] = None

    @staticmethod
    def _require_dataset(X) -> None:
        """Raise ``TypeError`` unless *X* is an ``OmicsDataset``."""
        if isinstance(X, OmicsDataset):
            return
        raise TypeError(
            f"OmicsSyncTransformer expects an OmicsDataset, got {type(X).__name__}."
        )

    def _to_frame(self, X: OmicsDataset) -> pd.DataFrame:
        """Flatten *X* with this transformer's modality and fill settings."""
        # A falsy selection (None or empty) means "all modalities".
        selected = list(self.modalities) if self.modalities else None
        return X.to_dataframe(modalities=selected, fill_missing=self.fill_missing)

    def fit(self, X: OmicsDataset, y=None) -> "OmicsSyncTransformer":
        """Record the output feature names produced by *X*.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored (present for sklearn API compatibility).

        Returns
        -------
        OmicsSyncTransformer
            *self*.

        Raises
        ------
        TypeError
            If *X* is not an ``OmicsDataset``.
        """
        self._require_dataset(X)

        # NOTE(review): both calls are made on the caller's dataset and
        # their return values are discarded, so they presumably modify X
        # in place — confirm against OmicsDataset.
        if self.align:
            X.align_samples(strategy="intersection")
        if self.normalize:
            X.normalize(per_modality=True)

        names = self._to_frame(X).columns.tolist()
        self._feature_names = names
        logger.info(
            "OmicsSyncTransformer.fit: %d features learned.", len(self._feature_names)
        )
        return self

    def transform(self, X: OmicsDataset, y=None) -> np.ndarray:
        """Flatten *X* and align its columns to the fitted feature names.

        Columns seen during fit but absent from *X* are created and filled
        with ``fill_missing``; columns not seen during fit are dropped.

        NOTE(review): this method does not call ``align_samples`` or
        ``normalize`` itself; only :meth:`fit` does.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
            Shape (n_samples, n_features). Returns a DataFrame if
            ``set_output(transform='pandas')`` was called.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not been called.
        TypeError
            If *X* is not an ``OmicsDataset``.
        """
        if self._feature_names is None:
            raise RuntimeError("fit() must be called before transform().")
        self._require_dataset(X)

        frame = self._to_frame(X).reindex(
            columns=self._feature_names,
            fill_value=self.fill_missing,
        )

        return frame if self._output_transform == "pandas" else frame.values

    def fit_transform(self, X: OmicsDataset, y=None, **fit_params) -> np.ndarray:
        """Fit and transform in one step.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored.
        **fit_params:
            Accepted for sklearn API compatibility; not used.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
        """
        fitted = self.fit(X, y)
        return fitted.transform(X, y)

    def get_feature_names_out(
        self, input_features=None
    ) -> np.ndarray:
        """Return feature names with modality prefix.

        Parameters
        ----------
        input_features:
            Ignored (present for sklearn API compatibility).

        Returns
        -------
        numpy.ndarray of str
            E.g. ``["rna__EGFR", "mut__TP53", ...]``.

        Raises
        ------
        RuntimeError
            If fit() has not been called.
        """
        names = self._feature_names
        if names is None:
            raise RuntimeError("fit() must be called before get_feature_names_out().")
        return np.array(names, dtype=object)

    def set_output(self, *, transform: Optional[str] = None) -> "OmicsSyncTransformer":
        """Set the output format for :meth:`transform`.

        Parameters
        ----------
        transform:
            ``"pandas"`` to return a DataFrame; ``None`` for numpy array.

        Returns
        -------
        OmicsSyncTransformer
            *self*.

        Raises
        ------
        ValueError
            If *transform* is neither ``None`` nor ``"pandas"``.
        """
        allowed = (None, "pandas")
        if transform in allowed:
            self._output_transform = transform
            return self
        raise ValueError(f"Unknown transform format {transform!r}. Valid: None, 'pandas'.")
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Data loaders for omicsync."""
|
|
2
|
+
|
|
3
|
+
from omicsync.loaders.csv import load_csv, load_multimodal_csv
|
|
4
|
+
from omicsync.loaders.tcga import load_tcga_files, download_tcga_manifest
|
|
5
|
+
from omicsync.loaders.geo import load_geo
|
|
6
|
+
from omicsync.loaders.open_targets import (
|
|
7
|
+
load_open_targets_targets,
|
|
8
|
+
add_open_targets_annotations,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"load_csv",
|
|
13
|
+
"load_multimodal_csv",
|
|
14
|
+
"load_tcga_files",
|
|
15
|
+
"download_tcga_manifest",
|
|
16
|
+
"load_geo",
|
|
17
|
+
"load_open_targets_targets",
|
|
18
|
+
"add_open_targets_annotations",
|
|
19
|
+
]
|