omicsync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omicsync/__init__.py +33 -0
- omicsync/core/__init__.py +25 -0
- omicsync/core/dataset.py +507 -0
- omicsync/core/modality.py +398 -0
- omicsync/core/sample_index.py +200 -0
- omicsync/integration/__init__.py +11 -0
- omicsync/integration/concat.py +146 -0
- omicsync/integration/mofa.py +279 -0
- omicsync/integration/sklearn_compat.py +178 -0
- omicsync/loaders/__init__.py +19 -0
- omicsync/loaders/csv.py +147 -0
- omicsync/loaders/geo.py +111 -0
- omicsync/loaders/open_targets.py +239 -0
- omicsync/loaders/tcga.py +251 -0
- omicsync/normalisation/__init__.py +5 -0
- omicsync/normalisation/cnv.py +97 -0
- omicsync/normalisation/methylation.py +131 -0
- omicsync/normalisation/mutations.py +123 -0
- omicsync/normalisation/protein.py +54 -0
- omicsync/normalisation/rna.py +182 -0
- omicsync/utils/__init__.py +32 -0
- omicsync/utils/barcode.py +165 -0
- omicsync/utils/logging.py +44 -0
- omicsync/utils/validation.py +152 -0
- omicsync-0.1.0.dist-info/METADATA +188 -0
- omicsync-0.1.0.dist-info/RECORD +29 -0
- omicsync-0.1.0.dist-info/WHEEL +5 -0
- omicsync-0.1.0.dist-info/licenses/LICENSE +21 -0
- omicsync-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
"""OmicsModality base class and modality-specific subclasses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional, Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.utils.logging import get_logger
|
|
11
|
+
from omicsync.utils.validation import (
|
|
12
|
+
validate_dataframe,
|
|
13
|
+
validate_modality_type,
|
|
14
|
+
check_value_range,
|
|
15
|
+
validate_sample_ids,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = get_logger("core.modality")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OmicsModality:
    """Container for a single omics modality (samples × features).

    Parameters
    ----------
    data:
        DataFrame indexed by sample IDs, columns are feature IDs.  A copy is
        stored, so later edits to the caller's frame do not leak in.
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.
    source:
        Data source identifier, e.g. ``"tcga"``, ``"geo"``, ``"csv"``.
    metadata:
        Arbitrary key/value metadata stored alongside the data.

    Raises
    ------
    ValueError
        If *modality_type* is invalid or the DataFrame is malformed.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        modality_type: str,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        # Validate the inputs before anything is stored on the instance.
        validate_modality_type(modality_type)
        validate_dataframe(data, name=f"{modality_type} data")
        validate_sample_ids(data.index.tolist())

        self._data = data.copy()
        self.modality_type = modality_type
        self.source = source
        self.metadata: Dict = metadata or {}

        check_value_range(self._data, self.modality_type)
        logger.info(
            "Loaded %s modality from %s: %d samples × %d features.",
            modality_type,
            source,
            self.n_samples,
            self.n_features,
        )

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def data(self) -> pd.DataFrame:
        """The underlying data matrix (samples × features)."""
        return self._data

    @property
    def n_samples(self) -> int:
        """Number of samples (rows)."""
        return self._data.shape[0]

    @property
    def n_features(self) -> int:
        """Number of features (columns)."""
        return self._data.shape[1]

    @property
    def sample_ids(self) -> pd.Index:
        """Sample identifiers (row index)."""
        return self._data.index

    @property
    def feature_ids(self) -> pd.Index:
        """Feature identifiers (column index)."""
        return self._data.columns

    # ------------------------------------------------------------------
    # Methods
    # ------------------------------------------------------------------

    def _feature_keep_mask(
        self,
        min_variance: float = 0.0,
        min_nonzero_frac: float = 0.0,
    ) -> np.ndarray:
        """Return a boolean array (one entry per feature) of features to keep.

        A threshold of ``0.0`` disables the corresponding criterion.
        """
        keep = np.ones(self.n_features, dtype=bool)

        if min_variance > 0.0:
            variances = self._data.var(axis=0, skipna=True)
            # A NaN variance compares False here, so such features are dropped.
            keep &= variances.values >= min_variance

        if min_nonzero_frac > 0.0:
            # ``NaN != 0`` evaluates to True, so missing entries must be
            # excluded explicitly or they would count as non-zero
            # observations.  The fraction is taken over all samples.
            observed_nonzero = self._data.ne(0) & self._data.notna()
            nonzero_frac = observed_nonzero.mean(axis=0)
            keep &= nonzero_frac.values >= min_nonzero_frac

        return keep

    def filter_features(
        self,
        min_variance: float = 0.0,
        min_nonzero_frac: float = 0.0,
    ) -> "OmicsModality":
        """Remove low-information features in-place and return *self*.

        Parameters
        ----------
        min_variance:
            Drop features whose variance (missing values skipped) is below
            this threshold.  ``0.0`` disables the check.
        min_nonzero_frac:
            Drop features where the fraction of samples holding an observed,
            non-zero value is below this.  Missing values do not count as
            non-zero.  ``0.0`` disables the check.

        Returns
        -------
        OmicsModality
            *self*, for method chaining.
        """
        keep = self._feature_keep_mask(min_variance, min_nonzero_frac)

        n_before = self.n_features
        self._data = self._data.loc[:, keep]
        n_after = self.n_features
        logger.info(
            "%s: filtered features %d → %d (kept %.1f%%).",
            self.modality_type,
            n_before,
            n_after,
            # max(..., 1) avoids a division by zero when there were no features.
            100.0 * n_after / max(n_before, 1),
        )
        return self

    def filter_samples(self, sample_ids: Sequence) -> "OmicsModality":
        """Keep only the specified samples in-place and return *self*.

        Requested IDs that are absent from this modality are ignored.

        Parameters
        ----------
        sample_ids:
            Iterable of sample IDs to retain.

        Returns
        -------
        OmicsModality
            *self*, for method chaining.

        Raises
        ------
        ValueError
            If none of the provided IDs are present in this modality.
        """
        requested = pd.Index(sample_ids)
        common = self._data.index.intersection(requested)
        if len(common) == 0:
            raise ValueError(
                f"None of the {len(requested)} requested sample IDs were found "
                f"in {self.modality_type} modality."
            )
        n_before = self.n_samples
        self._data = self._data.loc[common]
        logger.info(
            "%s: filtered samples %d → %d.",
            self.modality_type,
            n_before,
            self.n_samples,
        )
        return self

    def describe(self) -> Dict:
        """Return a summary dictionary of this modality.

        Returns
        -------
        dict
            Keys: ``modality_type``, ``source``, ``n_samples``,
            ``n_features``, ``value_min``, ``value_max``, ``value_mean``,
            ``missing_frac``.  The three value statistics are computed over
            finite entries only and are NaN when there are none;
            ``missing_frac`` is the fraction of NaN entries (NaN for an
            empty matrix).
        """
        vals = self._data.values.ravel().astype(float)
        finite = vals[np.isfinite(vals)]
        nan = float("nan")
        has_finite = finite.size > 0
        return {
            "modality_type": self.modality_type,
            "source": self.source,
            "n_samples": self.n_samples,
            "n_features": self.n_features,
            "value_min": float(finite.min()) if has_finite else nan,
            "value_max": float(finite.max()) if has_finite else nan,
            "value_mean": float(finite.mean()) if has_finite else nan,
            # Guarded so an empty matrix yields NaN without numpy's
            # "mean of empty slice" warning.
            "missing_frac": float(np.isnan(vals).mean()) if vals.size else nan,
        }

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"modality_type={self.modality_type!r}, "
            f"shape=({self.n_samples}, {self.n_features}), "
            f"source={self.source!r})"
        )
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# Modality-specific subclasses
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class RNAModality(OmicsModality):
    """RNA expression modality (samples × genes).

    Builds an :class:`OmicsModality` with ``modality_type="rna"`` and then
    rejects the data if any finite entry is negative.

    Parameters
    ----------
    data:
        DataFrame of RNA expression values (samples × genes).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.

    Raises
    ------
    ValueError
        If any finite value in *data* is negative.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        super().__init__(data, modality_type="rna", source=source, metadata=metadata)

        matrix = self._data.values
        # Non-finite entries (NaN, ±inf) are excluded from the sign check.
        finite = matrix[np.isfinite(matrix)]
        if finite.size == 0:
            return
        if finite.min() < 0:
            raise ValueError(
                "RNAModality: data contains negative values. "
                "RNA expression values must be non-negative (counts or TPM)."
            )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class MutationModality(OmicsModality):
    """Somatic mutation modality (samples × genes).

    A thin wrapper that fixes ``modality_type`` to ``"mutations"``; it adds
    no checks beyond those performed by :class:`OmicsModality`.

    Parameters
    ----------
    data:
        Binary or count-based mutation matrix (samples × genes).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        super().__init__(
            data,
            modality_type="mutations",
            source=source,
            metadata=metadata,
        )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class MethylationModality(OmicsModality):
    """DNA methylation modality (samples × CpG sites).

    For ``value_type="beta"`` the finite values are checked against the
    [0, 1] range, with a tolerance of 0.01 on either side.  For
    ``value_type="mvalue"`` no range check is applied by this class.

    Parameters
    ----------
    data:
        Methylation matrix (samples × CpG sites).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    value_type:
        ``"beta"`` (default) or ``"mvalue"``.

    Raises
    ------
    ValueError
        If *value_type* is not recognised, or if beta values fall outside
        the tolerated range.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
        value_type: str = "beta",
    ) -> None:
        if value_type not in ("beta", "mvalue"):
            raise ValueError(
                f"value_type must be 'beta' or 'mvalue', got {value_type!r}."
            )
        # Stored before the base constructor runs, as in the original order.
        self.value_type = value_type
        super().__init__(
            data,
            modality_type="methylation",
            source=source,
            metadata=metadata,
        )

        if value_type != "beta":
            return

        matrix = self._data.values
        finite = matrix[np.isfinite(matrix)]
        if finite.size == 0:
            return

        lowest = finite.min()
        highest = finite.max()
        # Slightly widened bounds tolerate small numerical overshoot.
        if lowest < -0.01 or highest > 1.01:
            raise ValueError(
                "MethylationModality (beta): values must be in [0, 1]. "
                f"Got min={lowest:.4f}, max={highest:.4f}. "
                "If these are M-values, set value_type='mvalue'."
            )
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class CNVModality(OmicsModality):
    """Copy-number variation modality (samples × genes/segments).

    A thin wrapper that fixes ``modality_type`` to ``"cnv"``; it adds no
    checks beyond those performed by :class:`OmicsModality`.

    Parameters
    ----------
    data:
        CNV matrix (samples × genes/segments).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        super().__init__(
            data,
            modality_type="cnv",
            source=source,
            metadata=metadata,
        )
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class ProteinModality(OmicsModality):
    """Protein abundance modality (samples × proteins).

    A thin wrapper that fixes ``modality_type`` to ``"protein"``; it adds no
    checks beyond those performed by :class:`OmicsModality`.

    Parameters
    ----------
    data:
        Protein abundance matrix (samples × proteins).
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        source: str = "unknown",
        metadata: Optional[Dict] = None,
    ) -> None:
        super().__init__(
            data,
            modality_type="protein",
            source=source,
            metadata=metadata,
        )
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# Registry mapping each modality_type string to the subclass that handles it.
MODALITY_CLASSES: Dict[str, type] = dict(
    rna=RNAModality,
    mutations=MutationModality,
    methylation=MethylationModality,
    cnv=CNVModality,
    protein=ProteinModality,
)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def make_modality(
    data: pd.DataFrame,
    modality_type: str,
    source: str = "unknown",
    metadata: Optional[Dict] = None,
    **kwargs,
) -> OmicsModality:
    """Build the :class:`OmicsModality` subclass registered for a type.

    The type string is validated first, then looked up in
    ``MODALITY_CLASSES`` and the resulting class is instantiated.

    Parameters
    ----------
    data:
        Feature matrix (samples × features).
    modality_type:
        One of the recognised modality types.
    source:
        Data source identifier.
    metadata:
        Optional metadata dict.
    **kwargs:
        Forwarded unchanged to the subclass constructor.

    Returns
    -------
    OmicsModality
        An instance of the subclass registered for *modality_type*.
    """
    validate_modality_type(modality_type)
    modality_cls = MODALITY_CLASSES[modality_type]
    instance = modality_cls(data, source=source, metadata=metadata, **kwargs)
    return instance
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Sample ID harmonisation logic for multi-omics datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional, Sequence, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.utils.barcode import truncate_to_participant, truncate_to_sample
|
|
11
|
+
from omicsync.utils.logging import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger("core.sample_index")
|
|
14
|
+
|
|
15
|
+
# Barcode truncation function for each supported level name.
_LEVEL_FUNCS = {
    "participant": truncate_to_participant,
    "sample": truncate_to_sample,
    # The aliquot level keeps the barcode unchanged.
    "aliquot": lambda barcode: barcode,
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SampleIndex:
    """Manages sample ID sets and harmonisation across modalities.

    Parameters
    ----------
    sample_ids:
        Initial set of sample identifiers (optional).  When omitted, an
        empty index is stored.
    """

    def __init__(self, sample_ids: Optional[Sequence] = None) -> None:
        self._ids: pd.Index = (
            pd.Index(sample_ids) if sample_ids is not None else pd.Index([])
        )

    @classmethod
    def from_barcodes(
        cls,
        barcodes: Sequence[str],
        level: str = "participant",
    ) -> "SampleIndex":
        """Create a :class:`SampleIndex` by truncating TCGA barcodes.

        A barcode whose truncation raises ``ValueError`` is kept unchanged
        and a warning is logged.  Truncated IDs are stored in input order
        and are not de-duplicated.

        Parameters
        ----------
        barcodes:
            Full TCGA aliquot barcodes.
        level:
            Truncation level: ``"participant"`` (default), ``"sample"``,
            or ``"aliquot"`` (no truncation).

        Returns
        -------
        SampleIndex

        Raises
        ------
        ValueError
            If *level* is not recognised.
        """
        if level not in _LEVEL_FUNCS:
            raise ValueError(
                f"Unknown barcode level {level!r}. "
                f"Valid levels: {list(_LEVEL_FUNCS)}."
            )
        func = _LEVEL_FUNCS[level]
        truncated = []
        for bc in barcodes:
            try:
                truncated.append(func(bc))
            except ValueError:
                # Best effort: keep the barcode rather than drop the sample.
                logger.warning("Could not parse barcode %r at level %r; keeping as-is.", bc, level)
                truncated.append(bc)
        idx = cls(truncated)
        logger.info(
            "SampleIndex: %d barcodes → %d unique IDs at level %r.",
            len(barcodes),
            len(set(truncated)),
            level,
        )
        return idx

    @staticmethod
    def align(
        list_of_sample_id_arrays: Sequence[Union[Sequence, pd.Index]],
        strategy: str = "intersection",
    ) -> pd.Index:
        """Find common samples across multiple modalities.

        Parameters
        ----------
        list_of_sample_id_arrays:
            One array/index per modality.
        strategy:
            ``"intersection"`` (default) — samples present in every modality.
            ``"union"`` — all samples seen across any modality.

        Returns
        -------
        pandas.Index
            Aligned sample IDs.

        Raises
        ------
        ValueError
            If *strategy* is unrecognised or the input list is empty.
        """
        if not list_of_sample_id_arrays:
            raise ValueError("list_of_sample_id_arrays must not be empty.")
        if strategy not in ("intersection", "union"):
            raise ValueError(
                f"Unknown strategy {strategy!r}. Valid: 'intersection', 'union'."
            )

        indices = [pd.Index(arr) for arr in list_of_sample_id_arrays]
        # Fold the remaining indices into the first one pairwise.
        result = indices[0]
        for idx in indices[1:]:
            if strategy == "intersection":
                result = result.intersection(idx)
            else:
                result = result.union(idx)

        logger.info(
            "SampleIndex.align (%s): %d common samples from %d modalities.",
            strategy,
            len(result),
            len(indices),
        )
        return result

    @staticmethod
    def match_fuzzy(
        ids_a: Sequence[str],
        ids_b: Sequence[str],
    ) -> Dict[str, Optional[str]]:
        """Match IDs across two sets, tolerating minor formatting differences.

        Both sides are normalised before comparison: surrounding whitespace
        is stripped, the text is upper-cased, and dots are replaced with
        dashes.  If several IDs in *ids_b* normalise to the same key, the
        last one wins.

        Parameters
        ----------
        ids_a:
            Reference ID set.
        ids_b:
            Query ID set.

        Returns
        -------
        dict
            Mapping from each ID in *ids_a* to the matching ID in *ids_b*,
            or ``None`` if no match found.
        """

        def _normalise(s: str) -> str:
            return s.strip().upper().replace(".", "-")

        norm_b: Dict[str, str] = {_normalise(b): b for b in ids_b}
        result: Dict[str, Optional[str]] = {
            a: norm_b.get(_normalise(a)) for a in ids_a
        }
        n_matched = sum(v is not None for v in result.values())
        logger.info(
            "Fuzzy match: %d/%d IDs matched.", n_matched, len(ids_a)
        )
        return result

    def summarise(
        self,
        modality_sample_ids: Dict[str, Sequence],
    ) -> pd.DataFrame:
        """Report which modalities each sample is present in.

        Parameters
        ----------
        modality_sample_ids:
            Mapping from modality name to its sample IDs.

        Returns
        -------
        pandas.DataFrame
            Rows are samples (sorted, index named ``sample_id``); columns are
            modality names; values are boolean (``True`` = present).  An
            additional ``n_modalities`` column gives the count of modalities
            present for each sample.
        """
        all_ids = set()
        for ids in modality_sample_ids.values():
            all_ids.update(ids)

        # Build the sorted row index once and test membership against that
        # same index.  Testing against the set's own iteration order would
        # attach the flags to the wrong sample IDs.
        ordered = pd.Index(sorted(all_ids), name="sample_id")
        presence = {
            name: ordered.isin(ids)
            for name, ids in modality_sample_ids.items()
        }
        df = pd.DataFrame(presence, index=ordered)
        df["n_modalities"] = df.sum(axis=1)
        return df
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Integration methods for multi-omics data fusion."""
|
|
2
|
+
|
|
3
|
+
from omicsync.integration.concat import simple_concat, weighted_concat, pca_concat
|
|
4
|
+
from omicsync.integration.sklearn_compat import OmicsSyncTransformer
|
|
5
|
+
|
|
6
|
+
# Names re-exported as the public API of this package.
__all__ = [
    # concatenation-based integration
    "simple_concat",
    "weighted_concat",
    "pca_concat",
    # scikit-learn compatible transformer
    "OmicsSyncTransformer",
]
|