omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,398 @@
1
+ """OmicsModality base class and modality-specific subclasses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Sequence
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.utils.logging import get_logger
11
+ from omicsync.utils.validation import (
12
+ validate_dataframe,
13
+ validate_modality_type,
14
+ check_value_range,
15
+ validate_sample_ids,
16
+ )
17
+
18
# Module-level logger, obtained from the package logging helper under the
# name "core.modality"; used by the classes below for informational messages.
logger = get_logger("core.modality")
19
+
20
+
21
class OmicsModality:
    """Container for a single omics modality (samples × features).

    The constructor runs the validation helpers on its inputs, stores a
    private copy of *data*, and logs the resulting shape.

    Parameters
    ----------
    data:
        DataFrame indexed by sample IDs, columns are feature IDs.
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.
    source:
        Data source identifier, e.g. ``"tcga"``, ``"geo"``, ``"csv"``.
    metadata:
        Arbitrary key/value metadata stored alongside the data.

    Raises
    ------
    ValueError
        If *modality_type* is invalid or the DataFrame is malformed.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        modality_type: str,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        # Validate before anything is stored so a failed construction leaves
        # no half-initialised state behind.
        validate_modality_type(modality_type)
        validate_dataframe(data, name=f"{modality_type} data")
        validate_sample_ids(data.index.tolist())

        # Work on a private copy: the in-place filters below must not touch
        # the caller's DataFrame.
        self._data = data.copy()
        self.modality_type = modality_type
        self.source = source
        self.metadata: Dict = metadata or {}

        check_value_range(self._data, self.modality_type)
        logger.info(
            "Loaded %s modality from %s: %d samples × %d features.",
            modality_type,
            source,
            self.n_samples,
            self.n_features,
        )

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def data(self) -> pd.DataFrame:
        """The underlying data matrix (samples × features).

        The internal DataFrame itself is returned, not a copy.
        """
        return self._data

    @property
    def n_samples(self) -> int:
        """Number of samples (rows)."""
        return self._data.shape[0]

    @property
    def n_features(self) -> int:
        """Number of features (columns)."""
        return self._data.shape[1]

    @property
    def sample_ids(self) -> pd.Index:
        """Sample identifiers (row index)."""
        return self._data.index

    @property
    def feature_ids(self) -> pd.Index:
        """Feature identifiers (column index)."""
        return self._data.columns

    # ------------------------------------------------------------------
    # Methods
    # ------------------------------------------------------------------

    def filter_features(
        self,
        min_variance: float = 0.0,
        min_nonzero_frac: float = 0.0,
    ) -> "OmicsModality":
        """Remove low-information features in-place and return *self*.

        A threshold that is not strictly positive disables the corresponding
        filter.

        Parameters
        ----------
        min_variance:
            Drop features whose variance is below this threshold.
        min_nonzero_frac:
            Drop features where the fraction of non-zero values is below
            this.  The fraction is taken over all samples; missing (NaN)
            entries are not counted as non-zero.

        Returns
        -------
        OmicsModality
            *self*, for method chaining.
        """
        keep = np.ones(self.n_features, dtype=bool)

        if min_variance > 0.0:
            variances = self._data.var(axis=0, skipna=True)
            # A NaN variance fails the comparison, so such features are dropped.
            keep &= variances.values >= min_variance

        if min_nonzero_frac > 0.0:
            # ``NaN != 0`` evaluates to True, so missing entries have to be
            # masked out explicitly or an all-missing feature would look
            # fully non-zero.
            observed_nonzero = (self._data != 0) & self._data.notna()
            nonzero_frac = observed_nonzero.mean(axis=0)
            keep &= nonzero_frac.values >= min_nonzero_frac

        n_before = self.n_features
        self._data = self._data.loc[:, keep]
        n_after = self.n_features
        logger.info(
            "%s: filtered features %d → %d (kept %.1f%%).",
            self.modality_type,
            n_before,
            n_after,
            # max(..., 1) guards the percentage against a zero-feature matrix.
            100.0 * n_after / max(n_before, 1),
        )
        return self

    def filter_samples(self, sample_ids: Sequence) -> "OmicsModality":
        """Keep only the specified samples in-place and return *self*.

        Requested IDs that are absent from this modality are ignored.

        Parameters
        ----------
        sample_ids:
            Iterable of sample IDs to retain.

        Returns
        -------
        OmicsModality
            *self*, for method chaining.

        Raises
        ------
        ValueError
            If none of the provided IDs are present in this modality.
        """
        requested = pd.Index(sample_ids)
        common = self._data.index.intersection(requested)
        if len(common) == 0:
            raise ValueError(
                f"None of the {len(requested)} requested sample IDs were found "
                f"in {self.modality_type} modality."
            )
        n_before = self.n_samples
        self._data = self._data.loc[common]
        logger.info(
            "%s: filtered samples %d → %d.",
            self.modality_type,
            n_before,
            self.n_samples,
        )
        return self

    def describe(self) -> Dict:
        """Return a summary dictionary of this modality.

        ``value_min``, ``value_max`` and ``value_mean`` are computed over the
        finite entries only and are NaN when there are none.
        ``missing_frac`` is the fraction of NaN entries and is NaN when the
        matrix has no cells at all.

        Returns
        -------
        dict
            Keys: ``modality_type``, ``source``, ``n_samples``,
            ``n_features``, ``value_min``, ``value_max``, ``value_mean``,
            ``missing_frac``.
        """
        vals = self._data.values.ravel().astype(float)
        finite = vals[np.isfinite(vals)]
        nan = float("nan")
        has_finite = finite.size > 0
        return {
            "modality_type": self.modality_type,
            "source": self.source,
            "n_samples": self.n_samples,
            "n_features": self.n_features,
            "value_min": float(finite.min()) if has_finite else nan,
            "value_max": float(finite.max()) if has_finite else nan,
            "value_mean": float(finite.mean()) if has_finite else nan,
            # Taking the mean of an empty array emits a RuntimeWarning;
            # return NaN directly when the matrix has no cells.
            "missing_frac": float(np.isnan(vals).mean()) if vals.size else nan,
        }

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"modality_type={self.modality_type!r}, "
            f"shape=({self.n_samples}, {self.n_features}), "
            f"source={self.source!r})"
        )
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Modality-specific subclasses
210
+ # ---------------------------------------------------------------------------
211
+
212
+
213
class RNAModality(OmicsModality):
    """RNA expression modality (samples × genes).

    Once the base class has been initialised, every finite value in the
    stored matrix is required to be non-negative.

    Parameters
    ----------
    data:
        DataFrame of RNA expression values (samples × genes).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.

    Raises
    ------
    ValueError
        If any finite value in *data* is negative.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        super().__init__(data, modality_type="rna", source=source, metadata=metadata)

        # Only finite entries take part in the sign check; NaN/inf are skipped.
        matrix = self._data.values
        finite_vals = matrix[np.isfinite(matrix)]
        if len(finite_vals) == 0:
            return
        if finite_vals.min() < 0:
            raise ValueError(
                "RNAModality: data contains negative values. "
                "RNA expression values must be non-negative (counts or TPM)."
            )
241
+
242
+
243
class MutationModality(OmicsModality):
    """Somatic mutation modality (samples × genes).

    A thin subclass that fixes ``modality_type`` to ``"mutations"`` and adds
    no checks beyond those of the base class.

    Parameters
    ----------
    data:
        Binary or count-based mutation matrix (samples × genes).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        # Arguments are forwarded positionally: (data, modality_type, source, metadata).
        super().__init__(data, "mutations", source, metadata)
265
+
266
+
267
class MethylationModality(OmicsModality):
    """DNA methylation modality (samples × CpG sites).

    When ``value_type`` is ``"beta"`` the finite values are checked against
    the [0, 1] range, with a tolerance of 0.01 on either side (values below
    -0.01 or above 1.01 are rejected).  M-values (``value_type="mvalue"``)
    are not range-checked by this class.

    Parameters
    ----------
    data:
        Methylation matrix (samples × CpG sites).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    value_type:
        ``"beta"`` (default) or ``"mvalue"``.

    Raises
    ------
    ValueError
        If *value_type* is not recognised, or if beta values fall outside
        the tolerated range.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
        value_type: str = "beta",
    ) -> None:
        allowed_value_types = ("beta", "mvalue")
        if value_type not in allowed_value_types:
            raise ValueError(
                f"value_type must be 'beta' or 'mvalue', got {value_type!r}."
            )
        # Recorded before the base constructor runs, as in the original order.
        self.value_type = value_type
        super().__init__(
            data, modality_type="methylation", source=source, metadata=metadata
        )

        if value_type != "beta":
            return

        # Range check applies to finite entries only.
        matrix = self._data.values
        finite_vals = matrix[np.isfinite(matrix)]
        if len(finite_vals) == 0:
            return

        lowest = finite_vals.min()
        highest = finite_vals.max()
        if lowest < -0.01 or highest > 1.01:
            raise ValueError(
                "MethylationModality (beta): values must be in [0, 1]. "
                f"Got min={lowest:.4f}, max={highest:.4f}. "
                "If these are M-values, set value_type='mvalue'."
            )
311
+
312
+
313
class CNVModality(OmicsModality):
    """Copy-number variation modality (samples × genes/segments).

    A thin subclass that fixes ``modality_type`` to ``"cnv"`` and adds no
    checks beyond those of the base class.

    Parameters
    ----------
    data:
        CNV matrix (samples × genes/segments).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        # Arguments are forwarded positionally: (data, modality_type, source, metadata).
        super().__init__(data, "cnv", source, metadata)
333
+
334
+
335
class ProteinModality(OmicsModality):
    """Protein abundance modality (samples × proteins).

    A thin subclass that fixes ``modality_type`` to ``"protein"`` and adds no
    checks beyond those of the base class.

    Parameters
    ----------
    data:
        Protein abundance matrix (samples × proteins).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        # Arguments are forwarded positionally: (data, modality_type, source, metadata).
        super().__init__(data, "protein", source, metadata)
357
+
358
+
359
# Convenience mapping from modality_type string to subclass.
# make_modality() looks the class up here by its modality_type key.
MODALITY_CLASSES: Dict[str, type] = {
    "rna": RNAModality,
    "mutations": MutationModality,
    "methylation": MethylationModality,
    "cnv": CNVModality,
    "protein": ProteinModality,
}
367
+
368
+
369
def make_modality(
    data: pd.DataFrame,
    modality_type: str,
    source: str = "unknown",
    metadata: Optional[Dict] = None,
    **kwargs,
) -> OmicsModality:
    """Build the :class:`OmicsModality` subclass registered for *modality_type*.

    The type string is validated first, then used as the key into
    ``MODALITY_CLASSES``.

    Parameters
    ----------
    data:
        Feature matrix (samples × features).
    modality_type:
        One of the recognised modality types.
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    **kwargs:
        Extra keyword arguments forwarded unchanged to the subclass
        constructor.

    Returns
    -------
    OmicsModality
        The appropriate subclass instance.
    """
    validate_modality_type(modality_type)
    modality_cls = MODALITY_CLASSES[modality_type]
    instance = modality_cls(data, source=source, metadata=metadata, **kwargs)
    return instance
@@ -0,0 +1,200 @@
1
+ """Sample ID harmonisation logic for multi-omics datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Sequence, Union
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.utils.barcode import truncate_to_participant, truncate_to_sample
11
+ from omicsync.utils.logging import get_logger
12
+
13
# Module-level logger, obtained from the package logging helper under the
# name "core.sample_index".
logger = get_logger("core.sample_index")
14
+
15
# Maps a barcode level name to the callable applied to each barcode at that
# level. "aliquot" uses the identity function, i.e. the barcode is kept as-is.
_LEVEL_FUNCS = {
    "participant": truncate_to_participant,
    "sample": truncate_to_sample,
    "aliquot": lambda x: x,
}
20
+
21
+
22
class SampleIndex:
    """Manages sample ID sets and harmonisation across modalities.

    Parameters
    ----------
    sample_ids:
        Initial set of sample identifiers (optional).  Stored as a
        :class:`pandas.Index`; an empty index is used when omitted.
    """

    def __init__(self, sample_ids: Optional[Sequence] = None) -> None:
        self._ids: pd.Index = (
            pd.Index(sample_ids) if sample_ids is not None else pd.Index([])
        )

    @classmethod
    def from_barcodes(
        cls,
        barcodes: Sequence[str],
        level: str = "participant",
    ) -> "SampleIndex":
        """Create a :class:`SampleIndex` by truncating TCGA barcodes.

        A barcode whose truncation raises ``ValueError`` is kept unchanged
        and a warning is logged for it.

        Parameters
        ----------
        barcodes:
            Full TCGA aliquot barcodes.
        level:
            Truncation level: ``"participant"`` (default), ``"sample"``,
            or ``"aliquot"`` (no truncation).

        Returns
        -------
        SampleIndex

        Raises
        ------
        ValueError
            If *level* is not recognised.
        """
        if level not in _LEVEL_FUNCS:
            raise ValueError(
                f"Unknown barcode level {level!r}. "
                f"Valid levels: {list(_LEVEL_FUNCS)}."
            )
        func = _LEVEL_FUNCS[level]
        truncated = []
        for bc in barcodes:
            try:
                truncated.append(func(bc))
            except ValueError:
                # Best effort: an unparseable barcode is retained verbatim
                # rather than aborting the whole batch.
                logger.warning(
                    "Could not parse barcode %r at level %r; keeping as-is.",
                    bc,
                    level,
                )
                truncated.append(bc)
        idx = cls(truncated)
        logger.info(
            "SampleIndex: %d barcodes → %d unique IDs at level %r.",
            len(barcodes),
            len(set(truncated)),
            level,
        )
        return idx

    @staticmethod
    def align(
        list_of_sample_id_arrays: Sequence[Union[Sequence, pd.Index]],
        strategy: str = "intersection",
    ) -> pd.Index:
        """Find common samples across multiple modalities.

        Parameters
        ----------
        list_of_sample_id_arrays:
            One array/index per modality.
        strategy:
            ``"intersection"`` (default) — samples present in every modality.
            ``"union"`` — all samples seen across any modality.

        Returns
        -------
        pandas.Index
            Aligned sample IDs.

        Raises
        ------
        ValueError
            If *strategy* is unrecognised or the input list is empty.
        """
        if not list_of_sample_id_arrays:
            raise ValueError("list_of_sample_id_arrays must not be empty.")
        if strategy not in ("intersection", "union"):
            raise ValueError(
                f"Unknown strategy {strategy!r}. Valid: 'intersection', 'union'."
            )

        indices = [pd.Index(arr) for arr in list_of_sample_id_arrays]
        # Fold the chosen set operation over the indices, left to right.
        combine = (
            pd.Index.intersection if strategy == "intersection" else pd.Index.union
        )
        result = indices[0]
        for idx in indices[1:]:
            result = combine(result, idx)

        logger.info(
            "SampleIndex.align (%s): %d common samples from %d modalities.",
            strategy,
            len(result),
            len(indices),
        )
        return result

    @staticmethod
    def match_fuzzy(
        ids_a: Sequence[str],
        ids_b: Sequence[str],
    ) -> Dict[str, Optional[str]]:
        """Match IDs across two sets, tolerating minor formatting differences.

        Normalises by stripping surrounding whitespace, converting to
        uppercase and replacing dots with dashes before matching.  When
        several IDs in *ids_b* normalise to the same key, the last one wins.

        Parameters
        ----------
        ids_a:
            Reference ID set.
        ids_b:
            Query ID set.

        Returns
        -------
        dict
            Mapping from each ID in *ids_a* to the best matching ID in
            *ids_b*, or ``None`` if no match found.
        """

        def _normalise(s: str) -> str:
            return s.strip().upper().replace(".", "-")

        norm_b: Dict[str, str] = {_normalise(b): b for b in ids_b}
        result: Dict[str, Optional[str]] = {
            a: norm_b.get(_normalise(a)) for a in ids_a
        }
        n_matched = sum(v is not None for v in result.values())
        # Both counts refer to the distinct reference IDs held in ``result``,
        # so duplicates in *ids_a* cannot skew the reported ratio.
        logger.info("Fuzzy match: %d/%d IDs matched.", n_matched, len(result))
        return result

    def summarise(
        self,
        modality_sample_ids: Dict[str, Sequence],
    ) -> pd.DataFrame:
        """Report how many samples are present in each modality combination.

        Parameters
        ----------
        modality_sample_ids:
            Mapping from modality name to its sample IDs.

        Returns
        -------
        pandas.DataFrame
            Rows are samples (sorted, index named ``sample_id``); columns are
            modality names; values are boolean (``True`` = present).  An
            additional ``n_modalities`` column gives the count of modalities
            present for each sample.
        """
        all_ids = set()
        for ids in modality_sample_ids.values():
            all_ids.update(ids)

        # Build the row index once and derive every membership column from
        # that same index.  Testing membership against the raw set would
        # follow set iteration order, which does not match the sorted row
        # labels and would attach flags to the wrong samples.
        index = pd.Index(sorted(all_ids), name="sample_id")
        df = pd.DataFrame(
            {name: index.isin(ids) for name, ids in modality_sample_ids.items()},
            index=index,
        )
        df["n_modalities"] = df.sum(axis=1)
        return df
@@ -0,0 +1,11 @@
1
+ """Integration methods for multi-omics data fusion."""
2
+
3
+ from omicsync.integration.concat import simple_concat, weighted_concat, pca_concat
4
+ from omicsync.integration.sklearn_compat import OmicsSyncTransformer
5
+
6
# Public names of this module: the three concatenation functions and the
# transformer imported above are what a star-import exposes.
__all__ = [
    "simple_concat",
    "weighted_concat",
    "pca_concat",
    "OmicsSyncTransformer",
]