omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omicsync/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """omicsync — Multi-omics data harmonisation for Python."""
2
+
3
+ from omicsync.core.dataset import OmicsDataset
4
+ from omicsync.core.modality import (
5
+ OmicsModality,
6
+ RNAModality,
7
+ MutationModality,
8
+ MethylationModality,
9
+ CNVModality,
10
+ ProteinModality,
11
+ make_modality,
12
+ )
13
+ from omicsync.core.sample_index import SampleIndex
14
+ from omicsync.utils.logging import set_verbose, get_logger
15
+
16
+ __version__ = "0.1.0"  # package version string; also exported through __all__ below
17
+ __author__ = "Paterson V."  # author metadata; not listed in __all__
18
+ __license__ = "MIT"  # license identifier; not listed in __all__
19
+
20
+ __all__ = [  # names made available by ``from omicsync import *``
21
+ "__version__",
22
+ "OmicsDataset",  # imported above from omicsync.core.dataset
23
+ "OmicsModality",  # this and the next six names are imported above from omicsync.core.modality
24
+ "RNAModality",
25
+ "MutationModality",
26
+ "MethylationModality",
27
+ "CNVModality",
28
+ "ProteinModality",
29
+ "make_modality",
30
+ "SampleIndex",  # imported above from omicsync.core.sample_index
31
+ "set_verbose",  # this and get_logger are imported above from omicsync.utils.logging
32
+ "get_logger",
33
+ ]
@@ -0,0 +1,25 @@
1
+ """Core data structures for omicsync."""
2
+
3
+ from omicsync.core.dataset import OmicsDataset
4
+ from omicsync.core.modality import (
5
+ OmicsModality,
6
+ RNAModality,
7
+ MutationModality,
8
+ MethylationModality,
9
+ CNVModality,
10
+ ProteinModality,
11
+ make_modality,
12
+ )
13
+ from omicsync.core.sample_index import SampleIndex
14
+
15
+ __all__ = [  # names made available by a star-import of this core sub-package
16
+ "OmicsDataset",  # imported above from omicsync.core.dataset
17
+ "OmicsModality",  # this and the next six names are imported above from omicsync.core.modality
18
+ "RNAModality",
19
+ "MutationModality",
20
+ "MethylationModality",
21
+ "CNVModality",
22
+ "ProteinModality",
23
+ "make_modality",
24
+ "SampleIndex",  # imported above from omicsync.core.sample_index
25
+ ]
@@ -0,0 +1,507 @@
1
+ """OmicsDataset: the main user-facing multi-omics container."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional, Sequence, Union
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.core.modality import OmicsModality
11
+ from omicsync.core.sample_index import SampleIndex
12
+ from omicsync.utils.logging import get_logger
13
+
14
+ logger = get_logger("core.dataset")  # module-level logger; OmicsDataset methods emit their info messages through it
15
+
16
+
17
class OmicsDataset:
    """Multi-omics container holding one or more :class:`OmicsModality` objects.

    The dataset keeps a name → modality mapping.  The "mutation" methods
    (``align_samples``, ``normalize``, ``filter_features``, ...) modify the
    contained modalities in place and return *self* so calls can be chained;
    the "export" methods (``to_dataframe``, ``to_dict``, ...) build new
    objects from the current state.

    Parameters
    ----------
    modalities:
        Mapping from modality name (e.g. ``"rna"``) to :class:`OmicsModality`.
        The mapping itself is shallow-copied; the modality objects are shared
        with the caller.
    study_id:
        Study identifier, e.g. ``"TCGA-BRCA"``.
    metadata:
        Arbitrary dataset-level metadata.  ``None`` (or any falsy value) is
        replaced by a fresh empty dict.

    Raises
    ------
    TypeError
        If *modalities* values are not :class:`OmicsModality` instances.
    """

    def __init__(
        self,
        modalities: Dict[str, OmicsModality],
        study_id: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        for name, mod in modalities.items():
            if not isinstance(mod, OmicsModality):
                raise TypeError(
                    f"Expected OmicsModality for modality {name!r}, "
                    f"got {type(mod).__name__}."
                )
        # Copy the mapping so later add/drop calls do not touch the caller's dict.
        self._modalities: Dict[str, OmicsModality] = dict(modalities)
        self.study_id = study_id
        self.metadata: Dict = metadata or {}

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def modality_names(self) -> List[str]:
        """Names of loaded modalities, as a new list in insertion order."""
        return list(self._modalities.keys())

    @property
    def common_samples(self) -> pd.Index:
        """Sample IDs present in *all* modalities.

        Returns an empty :class:`pandas.Index` when the dataset holds no
        modalities; otherwise delegates to :meth:`SampleIndex.align` with the
        ``"intersection"`` strategy.
        """
        if not self._modalities:
            return pd.Index([])
        arrays = [mod.sample_ids for mod in self._modalities.values()]
        return SampleIndex.align(arrays, strategy="intersection")

    @property
    def sample_coverage(self) -> pd.DataFrame:
        """Boolean DataFrame: rows = all samples, columns = modalities.

        ``True`` indicates the sample is present in that modality.  The table
        is produced by :meth:`SampleIndex.summarise`.
        """
        si = SampleIndex()
        return si.summarise(
            {name: mod.sample_ids for name, mod in self._modalities.items()}
        )

    @property
    def n_complete_cases(self) -> int:
        """Number of samples present in every modality."""
        return len(self.common_samples)

    # ------------------------------------------------------------------
    # Mutation methods (all return self for chaining)
    # ------------------------------------------------------------------

    def align_samples(
        self,
        strategy: str = "intersection",
        fill_value: float = np.nan,
    ) -> "OmicsDataset":
        """Retain only samples that are present according to *strategy*.

        Parameters
        ----------
        strategy:
            ``"intersection"`` (default) — keep only samples present in all
            modalities.  ``"union"`` — keep all samples; modalities that do
            not have a sample will have ``fill_value`` for that row, and
            every modality is re-indexed to the same row order.
        fill_value:
            Value used to fill missing samples when ``strategy="union"``.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.

        Raises
        ------
        ValueError
            If *strategy* is not recognised.
        """
        if strategy not in ("intersection", "union"):
            raise ValueError(
                f"Unknown alignment strategy {strategy!r}. "
                "Valid: 'intersection', 'union'."
            )
        if not self._modalities:
            # Nothing to align; mirrors the empty-dataset guard in
            # ``common_samples`` instead of calling SampleIndex.align([]).
            logger.info(
                "align_samples (%s): %d samples across %d modalities.",
                strategy,
                0,
                0,
            )
            return self

        arrays = [mod.sample_ids for mod in self._modalities.values()]
        aligned = SampleIndex.align(arrays, strategy=strategy)

        for mod in self._modalities.values():
            if strategy == "intersection":
                mod.filter_samples(aligned)
                continue

            data = mod.data
            missing = aligned.difference(mod.sample_ids)
            if len(missing) > 0:
                # Append one placeholder row per sample this modality lacks.
                filler = pd.DataFrame(
                    fill_value,
                    index=missing,
                    columns=mod.feature_ids,
                )
                data = pd.concat([data, filler])
            # Always re-index, even when nothing was missing, so that all
            # modalities end up with identical row order after a union.
            mod._data = data.loc[aligned]

        logger.info(
            "align_samples (%s): %d samples across %d modalities.",
            strategy,
            len(aligned),
            len(self._modalities),
        )
        return self

    def normalize(self, per_modality: bool = True) -> "OmicsDataset":
        """Apply default normalisation to each modality in-place.

        The normaliser is chosen from each modality's ``modality_type``:

        * **rna**: :func:`~omicsync.normalisation.rna.detect_and_normalise`
        * **methylation**: :func:`~omicsync.normalisation.methylation.detect_and_normalise`
        * **cnv**: log2-ratio relative to diploid, clipped to [-2, 2]
        * **mutations**: binarise at 0
        * **protein**: z-score per feature

        Modalities whose type is not in the list above are left unchanged.

        Parameters
        ----------
        per_modality:
            If ``False``, skip normalisation (no-op, for API compatibility).

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.
        """
        if not per_modality:
            return self

        # Imported lazily, only when normalisation is actually requested.
        from omicsync.normalisation import rna as rna_norm
        from omicsync.normalisation import methylation as meth_norm
        from omicsync.normalisation import cnv as cnv_norm
        from omicsync.normalisation import mutations as mut_norm
        from omicsync.normalisation import protein as prot_norm

        _dispatch = {
            "rna": rna_norm.detect_and_normalise,
            "methylation": meth_norm.detect_and_normalise,
            "cnv": lambda df: cnv_norm.log2_ratio(cnv_norm.centre_diploid(df)).clip(-2, 2),
            "mutations": lambda df: mut_norm.binarise(df, threshold=0),
            "protein": prot_norm.z_score,
        }

        for name, mod in self._modalities.items():
            fn = _dispatch.get(mod.modality_type)
            if fn is None:
                # Make the silent skip visible when debugging.
                logger.debug(
                    "No default normalisation for modality %r (%s); skipped.",
                    name,
                    mod.modality_type,
                )
                continue
            logger.info("Normalising modality %r (%s).", name, mod.modality_type)
            mod._data = fn(mod.data)

        return self

    def filter_features(
        self,
        min_variance: float = 0.0,
        min_nonzero_frac: float = 0.0,
    ) -> "OmicsDataset":
        """Apply feature filtering to all modalities.

        Both thresholds are forwarded unchanged to each modality's own
        ``filter_features`` method.

        Parameters
        ----------
        min_variance:
            Minimum variance for a feature to be kept.
        min_nonzero_frac:
            Minimum fraction of non-zero values for a feature to be kept.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.
        """
        for mod in self._modalities.values():
            mod.filter_features(
                min_variance=min_variance,
                min_nonzero_frac=min_nonzero_frac,
            )
        return self

    def drop_modality(self, name: str) -> "OmicsDataset":
        """Remove a modality by name.

        Parameters
        ----------
        name:
            Modality name to remove.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.

        Raises
        ------
        KeyError
            If *name* is not in the dataset.
        """
        if name not in self._modalities:
            raise KeyError(
                f"Modality {name!r} not found. "
                f"Available: {self.modality_names}."
            )
        del self._modalities[name]
        logger.info("Dropped modality %r.", name)
        return self

    def add_modality(self, name: str, modality: OmicsModality) -> "OmicsDataset":
        """Add a new modality.

        Parameters
        ----------
        name:
            Name for the new modality.
        modality:
            :class:`OmicsModality` instance to add.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.

        Raises
        ------
        TypeError
            If *modality* is not an :class:`OmicsModality`.
        ValueError
            If *name* is already present.
        """
        if not isinstance(modality, OmicsModality):
            raise TypeError(
                f"Expected OmicsModality, got {type(modality).__name__}."
            )
        if name in self._modalities:
            raise ValueError(
                f"Modality {name!r} already exists. "
                "Use drop_modality() first to replace it."
            )
        self._modalities[name] = modality
        logger.info("Added modality %r (%s).", name, modality.modality_type)
        return self

    def subset_samples(self, sample_ids: Sequence) -> "OmicsDataset":
        """Filter all modalities to the specified samples.

        Parameters
        ----------
        sample_ids:
            Sample IDs to retain; passed as-is to each modality's
            ``filter_samples``.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.
        """
        for mod in self._modalities.values():
            mod.filter_samples(sample_ids)
        return self

    def subset_cancer_types(self, types: Sequence[str]) -> "OmicsDataset":
        """Filter samples by cancer type using the dataset metadata.

        Requires ``metadata["sample_cancer_type"]`` to be a dict mapping
        sample ID to cancer type string.

        Parameters
        ----------
        types:
            Cancer type labels to retain.  A single string is treated as one
            label (exact match), not as a sequence of characters.

        Returns
        -------
        OmicsDataset
            *self*, for method chaining.

        Raises
        ------
        KeyError
            If ``sample_cancer_type`` is not in :attr:`metadata`.
        """
        if "sample_cancer_type" not in self.metadata:
            raise KeyError(
                "metadata['sample_cancer_type'] is not set. "
                "Populate it with a dict mapping sample_id → cancer_type."
            )
        type_map: Dict[str, str] = self.metadata["sample_cancer_type"]
        # Materialise the labels once: ``ct in "BRCA"`` would be a substring
        # test, and a one-shot iterator would be exhausted after one sample.
        wanted = {types} if isinstance(types, str) else set(types)
        keep = [sid for sid, ct in type_map.items() if ct in wanted]
        return self.subset_samples(keep)

    # ------------------------------------------------------------------
    # Export methods
    # ------------------------------------------------------------------

    def to_dataframe(
        self,
        modalities: Optional[Sequence[str]] = None,
        fill_missing: float = np.nan,
    ) -> pd.DataFrame:
        """Return a concatenated samples × features DataFrame.

        Column names are prefixed with the modality name, e.g.
        ``"rna__EGFR"``, ``"mut__TP53"``.  Modalities are combined with an
        outer join on the sample index.

        Parameters
        ----------
        modalities:
            Subset of modality names to include.  ``None`` means all.  A
            single name may be given as a plain string.
        fill_missing:
            Replacement for missing values in the joined table.  ``NaN`` or
            ``None`` leaves them untouched.  Note that the fill is applied to
            the whole result, so NaNs already present in a modality are
            replaced as well, not only the gaps created by the join.

        Returns
        -------
        pandas.DataFrame
            Concatenated feature matrix; an empty DataFrame if no modality
            was selected.

        Raises
        ------
        KeyError
            If a requested modality name is not in the dataset.
        """
        if modalities is None:
            names = self.modality_names
        elif isinstance(modalities, str):
            # A bare string would otherwise be iterated character by character.
            names = [modalities]
        else:
            names = list(modalities)

        frames = []
        for name in names:
            if name not in self._modalities:
                raise KeyError(f"Modality {name!r} not found.")
            frames.append(self._modalities[name].data.add_prefix(f"{name}__"))

        if not frames:
            return pd.DataFrame()

        result = frames[0]
        for frame in frames[1:]:
            result = result.join(frame, how="outer")

        # pd.isna accepts None and non-numeric scalars, unlike np.isnan.
        if fill_missing is not None and not pd.isna(fill_missing):
            result = result.fillna(fill_missing)
        return result

    def to_dict(self) -> Dict[str, pd.DataFrame]:
        """Return a dict mapping modality name to a copy of its DataFrame.

        Returns
        -------
        dict[str, pandas.DataFrame]
        """
        return {name: mod.data.copy() for name, mod in self._modalities.items()}

    def to_mofa2(self) -> Dict[str, Any]:
        """Format data for mofapy2 entry_point input.

        Only samples shared by all modalities (:attr:`common_samples`) are
        exported; each modality is re-indexed to that sample list and cast to
        ``float``.

        Returns
        -------
        dict
            Keys: ``"data"`` (one single-element list per view, holding that
            view's matrix), ``"views"`` (view names), ``"groups"`` (group
            names, single group here), ``"samples"`` (the shared sample IDs,
            nested as ``[[sample_ids]]``).

        Notes
        -----
        MOFA2 expects data as a list of views, each a list of groups, each a
        2D numpy array (samples × features), with NaN for missing values.
        """
        views = self.modality_names
        all_samples = self.common_samples.tolist()

        data_list: List[List[np.ndarray]] = []
        for name in views:
            mod = self._modalities[name]
            mat = mod.data.reindex(all_samples).values.astype(float)
            data_list.append([mat])

        return {
            "data": data_list,
            "views": views,
            "groups": ["group1"],
            # NOTE(review): mofapy2 appears to expect sample names as one list
            # per group; confirm the extra nesting level here is intended.
            "samples": [[all_samples]],
        }

    def to_tensor(self, dtype: Any = None):
        """Return a PyTorch tensor of the concatenated feature matrix.

        The tensor is built from the values of :meth:`to_dataframe` called
        with its defaults.  Requires ``torch`` to be installed.

        Parameters
        ----------
        dtype:
            PyTorch dtype.  Defaults to ``torch.float32``.

        Returns
        -------
        torch.Tensor

        Raises
        ------
        ImportError
            If ``torch`` is not installed.
        """
        try:
            import torch
        except ImportError as exc:
            raise ImportError(
                "torch is required for to_tensor(). "
                "Install it with: pip install torch"
            ) from exc
        if dtype is None:
            dtype = torch.float32
        df = self.to_dataframe()
        return torch.tensor(df.values, dtype=dtype)

    def to_anndata(self):
        """Return an AnnData object with modalities stored in obsm.

        ``X`` is the concatenated matrix from :meth:`to_dataframe` restricted
        to :attr:`common_samples`; each modality is additionally stored under
        ``obsm["X_<name>"]``.  ``study_id`` and the modality names are written
        to ``uns``.  Requires ``anndata`` to be installed.

        Returns
        -------
        anndata.AnnData

        Raises
        ------
        ImportError
            If ``anndata`` is not installed.
        """
        try:
            import anndata as ad
        except ImportError as exc:
            raise ImportError(
                "anndata is required for to_anndata(). "
                "Install it with: pip install anndata"
            ) from exc

        common = self.common_samples
        X = self.to_dataframe().reindex(common).values

        obsm = {}
        for name, mod in self._modalities.items():
            obsm[f"X_{name}"] = mod.data.reindex(common).values

        adata = ad.AnnData(
            X=X,
            obs=pd.DataFrame(index=common),
            obsm=obsm,
        )
        adata.uns["study_id"] = self.study_id
        adata.uns["modalities"] = self.modality_names
        return adata

    def describe(self) -> pd.DataFrame:
        """Return a summary table of all modalities.

        Each row is built from the modality's own ``describe()`` result plus
        a ``"name"`` entry, which becomes the index.  One info-level log line
        with the study ID and modality count is emitted; nothing is printed.

        Returns
        -------
        pandas.DataFrame
            One row per modality, indexed by modality name.  An empty frame
            with an index named ``"name"`` if the dataset has no modalities.
        """
        rows = []
        for name, mod in self._modalities.items():
            # Copy so the object returned by the modality is not modified.
            row = dict(mod.describe())
            row["name"] = name
            rows.append(row)

        if rows:
            df = pd.DataFrame(rows).set_index("name")
        else:
            # set_index("name") would raise KeyError on a column-less frame.
            df = pd.DataFrame(index=pd.Index([], name="name"))
        logger.info("Dataset %r: %d modalities.", self.study_id, len(self._modalities))
        return df

    def __repr__(self) -> str:
        modality_str = ", ".join(
            f"{name}({mod.n_samples}×{mod.n_features})"
            for name, mod in self._modalities.items()
        )
        return (
            f"OmicsDataset("
            f"study_id={self.study_id!r}, "
            f"modalities=[{modality_str}], "
            f"n_common_samples={self.n_complete_cases})"
        )