omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ """Simple concatenation strategies for multi-omics integration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Sequence
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.core.dataset import OmicsDataset
11
+ from omicsync.utils.logging import get_logger
12
+
13
+ logger = get_logger("integration.concat")
14
+
15
+
16
def simple_concat(
    dataset: OmicsDataset,
    modalities: Optional[Sequence[str]] = None,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Flatten a dataset into a single 2-D numpy array.

    The work is delegated to ``dataset.to_dataframe``; this function only
    logs the resulting shape and strips the pandas wrapper.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    modalities:
        Names of the modalities to include, forwarded unchanged to
        ``to_dataframe``.  ``None`` selects every modality.
    fill_missing:
        Replacement for missing entries, forwarded unchanged to
        ``to_dataframe`` (default 0.0).

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the combined frame, i.e. shape
        (n_samples, total_features).
    """
    combined = dataset.to_dataframe(
        modalities=modalities,
        fill_missing=fill_missing,
    )
    matrix = combined.values
    logger.info("simple_concat: output shape %s.", combined.shape)
    return matrix
42
+
43
+
44
def weighted_concat(
    dataset: OmicsDataset,
    weights: Optional[Dict[str, float]] = None,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Scale every modality by a scalar weight, then join them column-wise.

    Each modality's columns are renamed to ``<modality>__<column>`` and
    multiplied by that modality's weight.  The scaled frames are then
    outer-joined on their index, and the gaps left by the join are filled
    with *fill_missing*.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    weights:
        Mapping from modality name to scalar weight.  Modalities that are
        not listed are multiplied by 1.0; keys that match no modality are
        ignored.
    fill_missing:
        Value written into missing entries.  It is applied after the
        scaling, so it is not itself multiplied by any weight.

    Returns
    -------
    numpy.ndarray
        Shape (n_samples, total_features), or an empty ``(0, 0)`` array
        when the dataset holds no modalities.
    """
    scale_for = weights or {}

    scaled_frames: List[pd.DataFrame] = [
        modality.data.add_prefix(f"{name}__") * scale_for.get(name, 1.0)
        for name, modality in dataset._modalities.items()
    ]

    # Nothing to join: hand back an empty matrix.
    if not scaled_frames:
        return np.empty((0, 0))

    first, remaining = scaled_frames[0], scaled_frames[1:]
    merged = first.join(remaining, how="outer")
    merged = merged.fillna(fill_missing)

    logger.info("weighted_concat: output shape %s.", merged.shape)
    return merged.values
84
+
85
+
86
def pca_concat(
    dataset: OmicsDataset,
    n_components_per_modality: int = 50,
    fill_missing: float = 0.0,
) -> np.ndarray:
    """Reduce each modality by PCA then concatenate the scores.

    Requires ``scikit-learn``.

    Every modality is reindexed to ``dataset.common_samples`` so that all
    score matrices share the same rows, reduced independently, and the
    per-modality scores are stacked side by side.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    n_components_per_modality:
        Upper bound on the number of PCA components kept per modality.
        The number actually used is additionally capped at the modality's
        feature count and at the number of samples.
    fill_missing:
        Value for missing entries before PCA.  If this is NaN the entries
        are instead mean-imputed per feature by ``SimpleImputer``.

    Returns
    -------
    numpy.ndarray
        Shape (n_samples, sum of the components kept per modality).  The
        width equals ``n_components_per_modality × n_modalities`` only
        when no modality hits one of the caps.  An empty ``(0, 0)`` array
        is returned when the dataset holds no modalities.

    Raises
    ------
    ImportError
        If ``scikit-learn`` is not installed.
    """
    try:
        from sklearn.decomposition import PCA
        from sklearn.impute import SimpleImputer
    except ImportError as exc:
        raise ImportError(
            "scikit-learn is required for pca_concat(). "
            "Install it with: pip install scikit-learn"
        ) from exc

    all_samples = dataset.common_samples
    parts: List[np.ndarray] = []

    for name, mod in dataset._modalities.items():
        data = mod.data.reindex(all_samples).fillna(fill_missing).values.astype(float)

        # Only has an effect when NaNs survive the fill above, i.e. when
        # fill_missing itself is NaN.
        imputer = SimpleImputer(strategy="mean")
        data = imputer.fit_transform(data)

        # Cap the component count *after* imputation: SimpleImputer can
        # discard features that were entirely missing, so the pre-imputation
        # column count may overstate what PCA is allowed to request.
        n_comp = min(n_components_per_modality, data.shape[1], data.shape[0])

        pca = PCA(n_components=n_comp, random_state=0)
        scores = pca.fit_transform(data)
        parts.append(scores)
        logger.info(
            "pca_concat: %r → %d components (%.1f%% variance).",
            name,
            n_comp,
            100.0 * pca.explained_variance_ratio_.sum(),
        )

    # np.concatenate rejects an empty list; mirror weighted_concat's result
    # for a dataset without modalities instead of crashing.
    if not parts:
        return np.empty((0, 0))

    result = np.concatenate(parts, axis=1)
    logger.info("pca_concat: final shape %s.", result.shape)
    return result
@@ -0,0 +1,279 @@
1
+ """MOFA2 wrapper for multi-omics factor analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from omicsync.core.dataset import OmicsDataset
12
+ from omicsync.utils.logging import get_logger
13
+
14
+ logger = get_logger("integration.mofa")
15
+
16
+
17
class MOFA2Wrapper:
    """Wrapper around ``mofapy2`` for factor analysis of multi-omics data.

    Typical use is ``MOFA2Wrapper(ds).prepare().train()`` followed by the
    ``get_*`` accessors.  The wrapper handles a single sample group: only
    the first group's sample names are kept by :meth:`prepare`.

    Parameters
    ----------
    dataset:
        An :class:`~omicsync.core.dataset.OmicsDataset`.
    n_factors:
        Number of latent factors to learn (default 10).
    convergence_mode:
        ``"fast"``, ``"medium"``, or ``"slow"`` (default ``"fast"``).
        Passed through to mofapy2 unchanged.
    use_gpu:
        Whether to use GPU acceleration (requires CUDA; default ``False``).
    seed:
        Random seed for reproducibility (default 42).

    Raises
    ------
    ImportError
        If ``mofapy2`` is not installed.
    """

    def __init__(
        self,
        dataset: OmicsDataset,
        n_factors: int = 10,
        convergence_mode: str = "fast",
        use_gpu: bool = False,
        seed: int = 42,
    ) -> None:
        # Fail at construction time rather than deep inside train().
        try:
            import mofapy2  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "mofapy2 is required for MOFA2Wrapper. "
                "Install it with: pip install mofapy2"
            ) from exc

        self.dataset = dataset
        self.n_factors = n_factors
        self.convergence_mode = convergence_mode
        self.use_gpu = use_gpu
        self.seed = seed

        # Populated by prepare() / train().
        self._model = None
        self._prepared_data: Optional[Dict] = None
        self._samples: Optional[List[str]] = None
        self._views: Optional[List[str]] = None

    def prepare(self) -> "MOFA2Wrapper":
        """Format the dataset for mofapy2.

        Stores the output of ``dataset.to_mofa2()`` together with its view
        names and the sample names of the first group.

        Returns
        -------
        MOFA2Wrapper
            *self*, for method chaining.
        """
        mofa_input = self.dataset.to_mofa2()
        self._prepared_data = mofa_input
        self._views = mofa_input["views"]
        self._samples = mofa_input["samples"][0]  # single group
        logger.info(
            "MOFA2Wrapper.prepare: %d views, %d samples.",
            len(self._views),
            len(self._samples),
        )
        return self

    def train(self, output_path: Optional[Union[str, Path]] = None) -> "MOFA2Wrapper":
        """Train the MOFA2 model.

        All views are fitted with a Gaussian likelihood and without view
        or group scaling.

        Parameters
        ----------
        output_path:
            If provided, save the trained model to this HDF5 file path.

        Returns
        -------
        MOFA2Wrapper
            *self*, for method chaining.

        Raises
        ------
        RuntimeError
            If :meth:`prepare` has not been called first.
        """
        if self._prepared_data is None:
            raise RuntimeError("Call prepare() before train().")

        from mofapy2.run.entry_point import entry_point

        ent = entry_point()

        data = self._prepared_data["data"]
        groups = self._prepared_data["groups"]
        views = self._prepared_data["views"]
        samples = self._prepared_data["samples"]

        ent.set_data_options(scale_groups=False, scale_views=False)
        ent.set_data_matrix(
            data,
            likelihoods=["gaussian"] * len(views),
            views_names=views,
            groups_names=groups,
            samples_names=samples,
            features_names=[
                self.dataset._modalities[v].feature_ids.tolist() for v in views
            ],
        )
        ent.set_model_options(
            factors=self.n_factors,
            spikeslab_weights=True,
            ard_factors=True,
            ard_weights=True,
        )
        ent.set_train_options(
            convergence_mode=self.convergence_mode,
            gpu_mode=self.use_gpu,
            seed=self.seed,
            verbose=False,
        )
        ent.build()
        ent.run()

        self._model = ent.model

        if output_path is not None:
            output_path = Path(output_path)
            ent.save(str(output_path))
            logger.info("MOFA2Wrapper: model saved to %s.", output_path)

        logger.info("MOFA2Wrapper: training complete.")
        return self

    def get_factors(self) -> pd.DataFrame:
        """Return factor scores for all samples.

        Returns
        -------
        pandas.DataFrame
            Shape (n_samples, n_factors); index = sample IDs,
            columns = ``Factor1``, ``Factor2``, ...

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        ValueError
            If the ``Z`` expectation is neither 2-D nor 3-D.
        """
        self._check_trained()
        Z = np.asarray(self._model.nodes["Z"].getExpectation())
        # Accept both layouts: a plain (n_samples, n_factors) matrix, or one
        # with a leading group axis (n_groups, n_samples, n_factors) from
        # which the single group is taken.  Indexing a 2-D matrix with [0]
        # would return one sample's row instead of the score matrix.
        # NOTE(review): mofapy2 appears to return the 2-D form — confirm.
        if Z.ndim == 3:
            scores = Z[0]
        elif Z.ndim == 2:
            scores = Z
        else:
            raise ValueError(
                f"Unexpected shape {Z.shape} for the Z expectation; "
                "expected a 2-D or 3-D array."
            )
        cols = [f"Factor{i+1}" for i in range(scores.shape[1])]
        return pd.DataFrame(scores, index=self._samples, columns=cols)

    def _weights_frame(self, matrix, view: str) -> pd.DataFrame:
        """Label one view's (n_features, n_factors) weight matrix."""
        features = self.dataset._modalities[view].feature_ids.tolist()
        cols = [f"Factor{j+1}" for j in range(matrix.shape[1])]
        return pd.DataFrame(matrix, index=features, columns=cols)

    def get_weights(
        self, modality: Optional[str] = None
    ) -> Union[Dict[str, pd.DataFrame], pd.DataFrame]:
        """Return feature weights.

        Parameters
        ----------
        modality:
            If specified, return weights for that modality only.
            Otherwise return a dict of DataFrames keyed by modality name.

        Returns
        -------
        dict[str, pandas.DataFrame] or pandas.DataFrame
            Each frame has one row per feature and one ``FactorN`` column
            per factor.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        KeyError
            If *modality* is not one of the prepared views.
        """
        self._check_trained()
        W = self._model.nodes["W"].getExpectation()
        # W: list of arrays (n_features, n_factors) per view
        views = list(self._views)

        if modality is not None:
            if modality not in views:
                raise KeyError(f"Modality {modality!r} not found; available: {views}.")
            # Build only the requested frame; omics views can be very wide.
            return self._weights_frame(W[views.index(modality)], modality)

        return {view: self._weights_frame(W[i], view) for i, view in enumerate(views)}

    def get_variance_explained(self) -> pd.DataFrame:
        """Return R² per factor per modality.

        Returns
        -------
        pandas.DataFrame
            Shape (n_factors, n_modalities).

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        ValueError
            If the matrix returned by the model does not have one column
            per view after orientation.
        """
        self._check_trained()
        r2 = self._model.calculate_variance_explained()
        views = list(self._views)

        if isinstance(r2, (list, tuple)):
            # One entry per sample group; this wrapper is single-group, so
            # use the first.  The entry is taken to be laid out as
            # (n_views, n_factors) and is transposed so that rows are factors.
            # NOTE(review): layout per mofapy2's BayesNet — confirm against
            # the installed version.
            data = np.atleast_2d(np.asarray(r2[0], dtype=float)).T
        else:
            # A bare array is used as given: rows = factors, columns = views.
            data = np.asarray(r2)

        if data.ndim != 2 or data.shape[1] != len(views):
            raise ValueError(
                f"Variance-explained matrix has shape {data.shape}, "
                f"which does not match the {len(views)} prepared view(s)."
            )

        idx = [f"Factor{i+1}" for i in range(data.shape[0])]
        return pd.DataFrame(data, index=idx, columns=views)

    def plot_variance_explained(self) -> None:
        """Plot a bar chart of variance explained per factor per modality.

        Requires ``matplotlib``.  The figure is displayed with
        ``plt.show()``; nothing is returned.

        Raises
        ------
        ImportError
            If ``matplotlib`` is not installed.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as exc:
            raise ImportError(
                "matplotlib is required for plot_variance_explained(). "
                "Install it with: pip install matplotlib"
            ) from exc

        r2 = self.get_variance_explained()
        # Transposed so that each view is one group of bars.
        r2.T.plot(kind="bar", figsize=(10, 4))
        plt.ylabel("Variance Explained (R²)")
        plt.title("MOFA2 Variance Explained per View")
        plt.tight_layout()
        plt.show()

    def top_features(
        self, factor: int, modality: str, n: int = 20
    ) -> pd.DataFrame:
        """Return the top *n* features by absolute weight for a factor.

        Parameters
        ----------
        factor:
            1-based factor index.
        modality:
            Modality name.
        n:
            Number of top features to return.

        Returns
        -------
        pandas.DataFrame
            Columns, in order: ``feature``, ``abs_weight``, ``weight``;
            rows sorted by descending ``abs_weight``.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        KeyError
            If *modality* is not one of the prepared views.
        ValueError
            If *factor* is outside the range of learned factors.
        """
        self._check_trained()
        weights = self.get_weights(modality)
        col = f"Factor{factor}"
        if col not in weights.columns:
            raise ValueError(
                f"Factor {factor} not found; available 1–{len(weights.columns)}."
            )
        signed = weights[col]
        # Rank by position rather than by label: looking the winners up again
        # with .loc would return extra rows whenever feature IDs repeat.
        order = np.argsort(-signed.abs().to_numpy(), kind="stable")[:n]
        top = signed.iloc[order]
        df = pd.DataFrame({
            "feature": top.index,
            "abs_weight": top.abs().values,
            "weight": top.values,
        })
        return df.reset_index(drop=True)

    def _check_trained(self) -> None:
        """Raise ``RuntimeError`` unless :meth:`train` has stored a model."""
        if self._model is None:
            raise RuntimeError("Model not trained yet. Call prepare() then train().")
@@ -0,0 +1,178 @@
1
+ """scikit-learn Pipeline compatibility for OmicsDataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Optional, Sequence
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+
11
+ from omicsync.core.dataset import OmicsDataset
12
+ from omicsync.utils.logging import get_logger
13
+
14
+ logger = get_logger("integration.sklearn_compat")
15
+
16
+
17
class OmicsSyncTransformer(BaseEstimator, TransformerMixin):
    """Adapter that lets an :class:`~omicsync.core.dataset.OmicsDataset` feed a sklearn pipeline.

    :meth:`fit` optionally aligns samples and normalises the dataset it is
    given and records the resulting column layout.  :meth:`transform`
    flattens a dataset to that layout and returns a numpy array (or a
    DataFrame after ``set_output(transform="pandas")``).

    NOTE(review): alignment and normalisation are only invoked from
    :meth:`fit`, on the dataset passed to it; :meth:`transform` calls
    ``to_dataframe`` and nothing else.  Whether ``align_samples`` and
    ``normalize`` modify the dataset in place is not visible from this
    module — confirm that data seen only by :meth:`transform` is processed
    as intended.

    Parameters
    ----------
    align: bool
        Whether to align samples across modalities during fit (default ``True``).
    normalize: bool
        Whether to apply per-modality normalisation during fit (default ``True``).
    modalities: list[str] or None
        Modality names to include.  ``None`` (or an empty sequence) uses
        all modalities.
    fill_missing: float
        Value used for missing entries in the output (default ``0.0``).

    Examples
    --------
    >>> from sklearn.pipeline import Pipeline
    >>> from omicsync.integration.sklearn_compat import OmicsSyncTransformer
    >>> pipe = Pipeline([
    ...     ('omicsync', OmicsSyncTransformer()),
    ...     ('classifier', SomeClassifier()),
    ... ])
    """

    def __init__(
        self,
        align: bool = True,
        normalize: bool = True,
        modalities: Optional[Sequence[str]] = None,
        fill_missing: float = 0.0,
    ) -> None:
        self.align = align
        self.normalize = normalize
        self.modalities = modalities
        self.fill_missing = fill_missing

        # Fitted / configured state (not constructor parameters).
        self._feature_names: Optional[List[str]] = None
        self._output_transform: Optional[str] = None

    @staticmethod
    def _require_dataset(X) -> None:
        """Reject anything that is not an ``OmicsDataset``."""
        if isinstance(X, OmicsDataset):
            return
        raise TypeError(
            f"OmicsSyncTransformer expects an OmicsDataset, got {type(X).__name__}."
        )

    def _flatten(self, X: OmicsDataset) -> pd.DataFrame:
        """Flatten *X* to one frame using the configured modality selection."""
        selected = list(self.modalities) if self.modalities else None
        return X.to_dataframe(modalities=selected, fill_missing=self.fill_missing)

    def fit(self, X: OmicsDataset, y=None) -> "OmicsSyncTransformer":
        """Record the output column layout produced by *X*.

        When enabled, ``X.align_samples(strategy="intersection")`` and
        ``X.normalize(per_modality=True)`` are called on *X* first.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored (present for sklearn API compatibility).

        Returns
        -------
        OmicsSyncTransformer
            *self*.

        Raises
        ------
        TypeError
            If *X* is not an ``OmicsDataset``.
        """
        self._require_dataset(X)

        if self.align:
            X.align_samples(strategy="intersection")
        if self.normalize:
            X.normalize(per_modality=True)

        fitted_columns = self._flatten(X).columns.tolist()
        self._feature_names = fitted_columns
        logger.info(
            "OmicsSyncTransformer.fit: %d features learned.", len(fitted_columns)
        )
        return self

    def transform(self, X: OmicsDataset, y=None) -> np.ndarray:
        """Flatten *X* and conform its columns to the fitted layout.

        Columns seen during fit but absent from *X* are created and filled
        with ``fill_missing``; columns not seen during fit are dropped.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
            Shape (n_samples, n_features).  Returns a DataFrame if
            ``set_output(transform='pandas')`` was called.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not been called.
        TypeError
            If *X* is not an ``OmicsDataset``.
        """
        if self._feature_names is None:
            raise RuntimeError("fit() must be called before transform().")
        self._require_dataset(X)

        # Align columns to fitted feature names
        conformed = self._flatten(X).reindex(
            columns=self._feature_names,
            fill_value=self.fill_missing,
        )

        return conformed if self._output_transform == "pandas" else conformed.values

    def fit_transform(self, X: OmicsDataset, y=None, **fit_params) -> np.ndarray:
        """Fit and transform in one step.

        Parameters
        ----------
        X:
            An :class:`~omicsync.core.dataset.OmicsDataset`.
        y:
            Ignored.
        **fit_params:
            Accepted for sklearn API compatibility; not used.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
        """
        fitted = self.fit(X, y)
        return fitted.transform(X, y)

    def get_feature_names_out(
        self, input_features=None
    ) -> np.ndarray:
        """Return the fitted output column names.

        Parameters
        ----------
        input_features:
            Ignored (present for sklearn API compatibility).

        Returns
        -------
        numpy.ndarray of str
            E.g. ``["rna__EGFR", "mut__TP53", ...]``.

        Raises
        ------
        RuntimeError
            If fit() has not been called.
        """
        names = self._feature_names
        if names is None:
            raise RuntimeError("fit() must be called before get_feature_names_out().")
        return np.array(names, dtype=object)

    def set_output(self, *, transform: Optional[str] = None) -> "OmicsSyncTransformer":
        """Choose the container type returned by :meth:`transform`.

        Parameters
        ----------
        transform:
            ``"pandas"`` to return a DataFrame; ``None`` for numpy array.

        Returns
        -------
        OmicsSyncTransformer
            *self*.

        Raises
        ------
        ValueError
            If *transform* is neither ``None`` nor ``"pandas"``.
        """
        if transform is not None and transform != "pandas":
            raise ValueError(f"Unknown transform format {transform!r}. Valid: None, 'pandas'.")
        self._output_transform = transform
        return self
@@ -0,0 +1,19 @@
1
+ """Data loaders for omicsync."""
2
+
3
+ from omicsync.loaders.csv import load_csv, load_multimodal_csv
4
+ from omicsync.loaders.tcga import load_tcga_files, download_tcga_manifest
5
+ from omicsync.loaders.geo import load_geo
6
+ from omicsync.loaders.open_targets import (
7
+ load_open_targets_targets,
8
+ add_open_targets_annotations,
9
+ )
10
+
11
# Public API of the loaders package, grouped by the submodule that
# provides each name.
__all__ = [
    # omicsync.loaders.csv
    "load_csv",
    "load_multimodal_csv",
    # omicsync.loaders.tcga
    "load_tcga_files",
    "download_tcga_manifest",
    # omicsync.loaders.geo
    "load_geo",
    # omicsync.loaders.open_targets
    "load_open_targets_targets",
    "add_open_targets_annotations",
]