omicsync-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omicsync/__init__.py +33 -0
- omicsync/core/__init__.py +25 -0
- omicsync/core/dataset.py +507 -0
- omicsync/core/modality.py +398 -0
- omicsync/core/sample_index.py +200 -0
- omicsync/integration/__init__.py +11 -0
- omicsync/integration/concat.py +146 -0
- omicsync/integration/mofa.py +279 -0
- omicsync/integration/sklearn_compat.py +178 -0
- omicsync/loaders/__init__.py +19 -0
- omicsync/loaders/csv.py +147 -0
- omicsync/loaders/geo.py +111 -0
- omicsync/loaders/open_targets.py +239 -0
- omicsync/loaders/tcga.py +251 -0
- omicsync/normalisation/__init__.py +5 -0
- omicsync/normalisation/cnv.py +97 -0
- omicsync/normalisation/methylation.py +131 -0
- omicsync/normalisation/mutations.py +123 -0
- omicsync/normalisation/protein.py +54 -0
- omicsync/normalisation/rna.py +182 -0
- omicsync/utils/__init__.py +32 -0
- omicsync/utils/barcode.py +165 -0
- omicsync/utils/logging.py +44 -0
- omicsync/utils/validation.py +152 -0
- omicsync-0.1.0.dist-info/METADATA +188 -0
- omicsync-0.1.0.dist-info/RECORD +29 -0
- omicsync-0.1.0.dist-info/WHEEL +5 -0
- omicsync-0.1.0.dist-info/licenses/LICENSE +21 -0
- omicsync-0.1.0.dist-info/top_level.txt +1 -0
omicsync/loaders/tcga.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""TCGA data loaders for omicsync."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Sequence, Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from omicsync.core.dataset import OmicsDataset
|
|
12
|
+
from omicsync.core.modality import make_modality, OmicsModality
|
|
13
|
+
from omicsync.core.sample_index import SampleIndex
|
|
14
|
+
from omicsync.utils.barcode import truncate_to_participant
|
|
15
|
+
from omicsync.utils.logging import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger("loaders.tcga")
|
|
18
|
+
|
|
19
|
+
# Expected filename patterns for each modality within a TCGA data directory.
|
|
20
|
+
# Pattern: {cancer_type}_{modality}.{ext}
|
|
21
|
+
_MODALITY_FILE_PATTERNS = {
|
|
22
|
+
"rna": ["{cancer_type}_rna.tsv", "{cancer_type}_rna.csv",
|
|
23
|
+
"{cancer_type}_rna_fpkm.tsv", "{cancer_type}_htseq_counts.tsv"],
|
|
24
|
+
"mutations": ["{cancer_type}_mutations.tsv", "{cancer_type}_mutations.maf",
|
|
25
|
+
"{cancer_type}_somatic.maf"],
|
|
26
|
+
"methylation": ["{cancer_type}_methylation.tsv", "{cancer_type}_methylation.csv"],
|
|
27
|
+
"cnv": ["{cancer_type}_cnv.tsv", "{cancer_type}_cnv.csv",
|
|
28
|
+
"{cancer_type}_gistic2.tsv"],
|
|
29
|
+
"protein": ["{cancer_type}_protein.tsv", "{cancer_type}_protein.csv",
|
|
30
|
+
"{cancer_type}_rppa.tsv"],
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
# GDC data portal file type identifiers used in the manifest helper
|
|
34
|
+
_GDC_DATA_TYPES = {
|
|
35
|
+
"rna": "Gene Expression Quantification",
|
|
36
|
+
"mutations": "Masked Somatic Mutation",
|
|
37
|
+
"methylation": "Methylation Beta Value",
|
|
38
|
+
"cnv": "Copy Number Segment",
|
|
39
|
+
"protein": "Protein Expression Quantification",
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _find_modality_file(
    data_dir: Path,
    cancer_type: str,
    modality: str,
) -> Optional[Path]:
    """Locate the data file for *modality* inside *data_dir*.

    The known file-name patterns for the modality are tried first, in their
    declared order, with the cancer type upper-cased and then lower-cased.
    If none of them exists, the directory is scanned for any ``.tsv``,
    ``.csv`` or ``.maf`` file whose lower-cased name contains *modality*.

    Parameters
    ----------
    data_dir:
        Directory to search.
    cancer_type:
        Cancer type abbreviation substituted into the file-name patterns.
    modality:
        Modality name, e.g. ``"rna"``.

    Returns
    -------
    pathlib.Path or None
        The first matching regular file, or ``None`` when nothing matches.
    """
    for pattern in _MODALITY_FILE_PATTERNS.get(modality, []):
        for cased in (cancer_type.upper(), cancer_type.lower()):
            candidate = data_dir / pattern.format(cancer_type=cased)
            # Only regular files are usable; a directory with a matching name
            # would fail later when it is handed to the CSV reader.
            if candidate.is_file():
                return candidate

    # Fallback: look for any file containing the modality name.
    # NOTE: this fallback does not consider cancer_type at all.
    # iterdir() yields entries in arbitrary order, so sort them to make the
    # choice reproducible when several files match.
    for entry in sorted(data_dir.iterdir()):
        if (
            entry.is_file()
            and modality in entry.name.lower()
            and entry.suffix in (".tsv", ".csv", ".maf")
        ):
            return entry
    return None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _load_maf_mutations(path: Path) -> pd.DataFrame:
|
|
65
|
+
"""Parse a MAF file into a binary genes × samples mutation matrix."""
|
|
66
|
+
df = pd.read_csv(path, sep="\t", comment="#", low_memory=False)
|
|
67
|
+
|
|
68
|
+
required = {"Hugo_Symbol", "Tumor_Sample_Barcode"}
|
|
69
|
+
missing = required - set(df.columns)
|
|
70
|
+
if missing:
|
|
71
|
+
raise ValueError(
|
|
72
|
+
f"MAF file missing required columns: {missing}. "
|
|
73
|
+
f"Found: {df.columns.tolist()[:15]}."
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
df["sample_id"] = df["Tumor_Sample_Barcode"].apply(
|
|
77
|
+
lambda x: truncate_to_participant(x) if isinstance(x, str) and x.startswith("TCGA") else x
|
|
78
|
+
)
|
|
79
|
+
mat = (
|
|
80
|
+
df.groupby(["sample_id", "Hugo_Symbol"])
|
|
81
|
+
.size()
|
|
82
|
+
.unstack(fill_value=0)
|
|
83
|
+
.clip(upper=1)
|
|
84
|
+
)
|
|
85
|
+
return mat.astype(float)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _load_generic_matrix(path: Path) -> pd.DataFrame:
|
|
89
|
+
"""Load a TSV/CSV as a sample-indexed matrix.
|
|
90
|
+
|
|
91
|
+
Tries to detect whether samples are rows or columns.
|
|
92
|
+
"""
|
|
93
|
+
sep = "\t" if path.suffix in (".tsv", ".maf") else ","
|
|
94
|
+
df = pd.read_csv(path, sep=sep, index_col=0, low_memory=False)
|
|
95
|
+
|
|
96
|
+
# Heuristic: if more than half of values in first column look numeric,
|
|
97
|
+
# samples are rows; otherwise transpose.
|
|
98
|
+
numeric_frac = pd.to_numeric(df.iloc[:, 0], errors="coerce").notna().mean()
|
|
99
|
+
if numeric_frac < 0.5:
|
|
100
|
+
df = df.T
|
|
101
|
+
|
|
102
|
+
df = df.apply(pd.to_numeric, errors="coerce")
|
|
103
|
+
return df
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load_tcga_files(
    data_dir: Union[str, Path],
    cancer_type: str,
    modalities: Sequence[str],
) -> OmicsDataset:
    """Build an :class:`~omicsync.core.dataset.OmicsDataset` from local TCGA files.

    For every requested modality the matching file is located, parsed into a
    matrix and wrapped with ``make_modality``.  A ``.maf`` file found for the
    ``"mutations"`` modality goes through the MAF parser; every other file is
    read as a generic matrix.  When the first five index labels all start
    with ``TCGA``, the labels are passed through ``truncate_to_participant``
    and only the first row per resulting ID is kept.

    Parameters
    ----------
    data_dir:
        Directory holding the TCGA data files.  Expected naming:
        ``{cancer_type}_{modality}.tsv`` or ``{cancer_type}_{modality}.csv``;
        ``.maf`` is also accepted for mutations.
    cancer_type:
        TCGA cancer type abbreviation, e.g. ``"BRCA"``.
    modalities:
        Modality names to load, e.g. ``["rna", "mutations", "methylation"]``.

    Returns
    -------
    OmicsDataset
        Dataset with ``study_id`` set to ``TCGA-<CANCER_TYPE>``.

    Raises
    ------
    FileNotFoundError
        If *data_dir* does not exist.
    ValueError
        If no file is found for a requested modality.
    """
    root = Path(data_dir)
    if not root.exists():
        raise FileNotFoundError(f"Data directory not found: {root}")

    by_modality: Dict[str, OmicsModality] = {}

    for name in modalities:
        source_file = _find_modality_file(root, cancer_type, name)
        if source_file is None:
            expected = str(_MODALITY_FILE_PATTERNS.get(name, []))
            raise ValueError(
                f"No file found for modality {name!r} in {root}. "
                f"Expected patterns like: {expected}"
            )

        logger.info("load_tcga_files: loading %r from %s.", name, source_file.name)

        is_maf = name == "mutations" and source_file.suffix == ".maf"
        parse = _load_maf_mutations if is_maf else _load_generic_matrix
        frame = parse(source_file)

        # Harmonise TCGA barcodes if detected (only the first five labels
        # are inspected).
        barcoded = all(str(label).startswith("TCGA") for label in frame.index[:5])
        if barcoded:
            participants = [truncate_to_participant(str(label)) for label in frame.index]
            frame.index = pd.Index(participants, name=frame.index.name)
            # Several rows can collapse onto one ID; keep the first.
            frame = frame[~frame.index.duplicated(keep="first")]
            logger.info(
                "load_tcga_files: truncated %r sample IDs to participant level.",
                name,
            )

        by_modality[name] = make_modality(frame, modality_type=name, source="tcga")

    dataset = OmicsDataset(by_modality, study_id=f"TCGA-{cancer_type.upper()}")

    # Log a short coverage summary.
    coverage = dataset.sample_coverage
    logger.info(
        "TCGA %s coverage: %d total samples, %d complete cases.",
        cancer_type.upper(),
        len(coverage),
        dataset.n_complete_cases,
    )
    return dataset
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def download_tcga_manifest(
    cancer_type: str,
    modalities: Sequence[str],
    output_dir: Union[str, Path],
) -> None:
    """Emit manual download instructions for TCGA data on the GDC portal.

    Nothing is downloaded.  The function assembles a block of text (account
    and token steps, one section per modality with its GDC data type, a
    portal URL filtered to the ``TCGA-<cancer_type>`` project and a suggested
    save path, then ``gdc-client`` and TCGAbiolinks alternatives).  Each line
    is sent both to the module logger at INFO level and to standard output.

    Parameters
    ----------
    cancer_type:
        TCGA cancer type abbreviation, e.g. ``"BRCA"``.
    modalities:
        Modalities to retrieve, e.g. ``["rna", "mutations"]``.
    output_dir:
        Directory where you plan to save the files (only used in the
        printed text).
    """
    project = cancer_type.upper()
    target_dir = Path(output_dir)

    # The portal link filters on the project only, so it is the same for
    # every modality.
    portal_url = (
        "https://portal.gdc.cancer.gov/repository"
        f"?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-{project}%22%5D%7D%7D%5D%7D"
        "&facetTab=files&searchTableTab=files"
    )

    lines: List[str] = [
        "",
        f"TCGA {project} data download instructions",
        "=" * 50,
        "",
        "To download TCGA data you need a GDC account and an API token.",
        "",
        "1. Create/log into your account at: https://portal.gdc.cancer.gov/",
        "2. Download your API token from:",
        " Profile → Download Token (valid 30 days)",
        "3. Save the token to a file, e.g. ~/gdc-token.txt",
        "",
        f"Data types needed for {project}:",
        "",
    ]

    for modality in modalities:
        data_type = _GDC_DATA_TYPES.get(modality, modality)
        save_path = target_dir / f"{project}_{modality}.tsv"
        lines.extend(
            [
                f" [{modality.upper()}]",
                f" GDC data type : {data_type}",
                f" Portal filter : {portal_url}",
                f" Save to : {save_path}",
                "",
            ]
        )

    lines.extend(
        [
            "Alternative: use the GDC client tool",
            " pip install gdc-client",
            f" gdc-client download -t ~/gdc-token.txt -d {target_dir} <manifest-file>",
            "",
            "Alternative: use TCGAbiolinks (R):",
            " library(TCGAbiolinks)",
            f" query <- GDCquery(project = 'TCGA-{project}', ...)",
            " GDCdownload(query)",
            "",
        ]
    )

    for text in lines:
        logger.info(text)
        print(text)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Copy number variation normalisation utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Tuple
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.utils.logging import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger("normalisation.cnv")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def centre_diploid(df: pd.DataFrame, diploid: float = 2.0) -> pd.DataFrame:
    """Express copy numbers as deviations from the diploid baseline.

    Parameters
    ----------
    df:
        CNV matrix (samples × genes), typically absolute copy-number
        estimates.
    diploid:
        Baseline subtracted from every value (default 2.0).

    Returns
    -------
    pandas.DataFrame
        Float matrix ``df - diploid`` with the index and columns of *df*.
    """
    deviation = np.subtract(df.values.astype(float), diploid)
    logger.info("centre_diploid: subtracted diploid=%.1f from %s.", diploid, df.shape)
    return pd.DataFrame(deviation, index=df.index, columns=df.columns)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def log2_ratio(df: pd.DataFrame, pseudo: float = 1.0) -> pd.DataFrame:
    """Apply ``log2(value + pseudo)`` element-wise to a centred CNV matrix.

    The input is expected to be centred already (a deviation of 0 means
    diploid), so with the default pseudocount a zero deviation maps to 0.
    Entries for which ``value + pseudo`` is not strictly positive, including
    NaN inputs, come out as NaN.

    Parameters
    ----------
    df:
        CNV deviation matrix (output of :func:`centre_diploid`).
    pseudo:
        Pseudocount added before the log2 transform (default 1.0).

    Returns
    -------
    pandas.DataFrame
        Transformed matrix with the index and columns of *df*.
    """
    offset = df.values.astype(float) + pseudo
    # log2 of zero or negative numbers would warn; those cells are
    # overwritten with NaN right after.
    with np.errstate(divide="ignore", invalid="ignore"):
        logged = np.log2(offset)
        logged[~(offset > 0)] = np.nan
    logger.info("log2_ratio: applied log2 to %s.", df.shape)
    return pd.DataFrame(logged, index=df.index, columns=df.columns)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def discretise(
    df: pd.DataFrame,
    thresholds: Tuple[float, float, float, float] = (-1.0, -0.3, 0.3, 1.0),
) -> pd.DataFrame:
    """Discretise copy-number values into -2/-1/0/1/2 states.

    Parameters
    ----------
    df:
        CNV matrix (log2 ratios or deviations from diploid).
    thresholds:
        Four boundary values ``(deep_del, del, amp, high_amp)`` that define
        the five copy-number states:

        * < deep_del      → -2  (deep deletion)
        * < del           → -1  (deletion)
        * <= amp          →  0  (diploid)
        * <= high_amp     →  1  (gain)
        * > high_amp      →  2  (amplification)

    Returns
    -------
    pandas.DataFrame
        Copy-number state matrix stored as floats.  Missing (NaN) inputs stay
        NaN instead of being reported as state 0.

    Raises
    ------
    ValueError
        If *thresholds* does not contain exactly four values.
    """
    if len(thresholds) != 4:
        raise ValueError("thresholds must have exactly 4 values.")
    t1, t2, t3, t4 = thresholds
    data = df.values.astype(float)

    # Start from "diploid" everywhere and overwrite the other bands.
    result = np.zeros_like(data, dtype=float)
    result[data < t1] = -2.0
    result[(data >= t1) & (data < t2)] = -1.0
    result[(data > t3) & (data <= t4)] = 1.0
    result[data > t4] = 2.0
    # NaN fails every comparison above and would otherwise be left at 0,
    # i.e. a missing measurement would be called diploid.
    result[np.isnan(data)] = np.nan

    logger.info("discretise: discretised CNV matrix %s.", df.shape)
    return pd.DataFrame(result, index=df.index, columns=df.columns)
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Methylation normalisation utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from omicsync.utils.logging import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger("normalisation.methylation")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def beta_to_m(df: pd.DataFrame) -> pd.DataFrame:
    """Convert beta values to M-values: log2(beta / (1 - beta)).

    Before the transform every value is clamped to ``[1e-6, 1 - 1e-6]`` so
    that betas of exactly 0 or 1 give finite M-values.

    Parameters
    ----------
    df:
        Beta value matrix (samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        M-value matrix with the index and columns of *df*.

    Raises
    ------
    ValueError
        If any finite value is outside [0, 1].
    """
    betas = df.values.astype(float)

    # Range check on finite entries only; NaN/inf are not validated here.
    observed = betas[np.isfinite(betas)]
    if len(observed) > 0:
        lowest, highest = observed.min(), observed.max()
        if lowest < 0 or highest > 1:
            raise ValueError(
                f"beta_to_m: beta values must be in [0, 1]. "
                f"Got min={lowest:.4f}, max={highest:.4f}. "
                "Clip first with clip_beta()."
            )

    eps = 1e-6
    bounded = np.clip(betas, eps, 1 - eps)
    odds = bounded / (1 - bounded)
    m_values = np.log2(odds)
    logger.info("beta_to_m: converted beta → M-values for %s.", df.shape)
    return pd.DataFrame(m_values, index=df.index, columns=df.columns)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def m_to_beta(df: pd.DataFrame) -> pd.DataFrame:
    """Convert M-values back to beta values: 2^M / (2^M + 1).

    The result is computed as the algebraically equivalent
    ``1 / (1 + 2^-M)``.  With the direct formula, very large M-values
    overflow ``2^M`` to infinity and ``inf / inf`` gives NaN; this form
    saturates to 1.0 (large positive M) or 0.0 (large negative M) instead.

    Parameters
    ----------
    df:
        M-value matrix (samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        Beta value matrix in [0, 1]; NaN inputs stay NaN.
    """
    data = df.values.astype(float)
    # 2^-M overflows to +inf for very negative M; 1 / (1 + inf) is then a
    # well-defined 0.0, so the overflow warning carries no information.
    with np.errstate(over="ignore"):
        result = 1.0 / (1.0 + np.power(2.0, -data))
    logger.info("m_to_beta: converted M-values → beta for %s.", df.shape)
    return pd.DataFrame(result, index=df.index, columns=df.columns)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def clip_beta(
    df: pd.DataFrame,
    low: float = 0.001,
    high: float = 0.999,
) -> pd.DataFrame:
    """Clamp beta values into ``[low, high]`` to avoid extremes near 0 and 1.

    Parameters
    ----------
    df:
        Beta value matrix.
    low:
        Lower clip bound (default 0.001).
    high:
        Upper clip bound (default 0.999).

    Returns
    -------
    pandas.DataFrame
        Clipped float matrix with the index and columns of *df*.
    """
    betas = df.values.astype(float)
    clamped = np.clip(betas, low, high)
    logger.info("clip_beta: clipped to [%.4f, %.4f] for %s.", low, high, df.shape)
    return pd.DataFrame(clamped, index=df.index, columns=df.columns)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def detect_and_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Auto-detect M-values vs beta values and normalise to clipped beta.

    Heuristic: if the smallest finite value is below -0.01 or the largest is
    above 1.01, the matrix is treated as M-values and converted with
    :func:`m_to_beta`; otherwise it is treated as beta values.  Either way
    the result goes through :func:`clip_beta` with its default bounds.

    Parameters
    ----------
    df:
        Methylation matrix (samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        Beta values clipped to [0.001, 0.999].  If *df* has no finite
        values, *df* itself is returned untouched and a warning is logged.
    """
    values = df.values.astype(float)
    observed = values[np.isfinite(values)]
    if observed.size == 0:
        logger.warning("detect_and_normalise: no finite values; skipping.")
        return df

    vmin = observed.min()
    vmax = observed.max()

    within_beta_range = not (vmin < -0.01 or vmax > 1.01)
    if within_beta_range:
        logger.info(
            "detect_and_normalise (methylation): detected beta values "
            "(min=%.4f, max=%.4f); clipping.",
            vmin, vmax,
        )
        return clip_beta(df)

    logger.info(
        "detect_and_normalise (methylation): detected M-values "
        "(min=%.4f, max=%.4f); converting to beta.",
        vmin, vmax,
    )
    return clip_beta(m_to_beta(df))
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Mutation matrix processing utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.utils.logging import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger("normalisation.mutations")
|
|
13
|
+
|
|
14
|
+
# Standard Sequence Ontology consequence terms
CONSEQUENCE_TERMS = frozenset(
    (
        # protein-altering
        "missense_variant",
        "stop_gained",
        "stop_lost",
        "frameshift_variant",
        "start_lost",
        "inframe_insertion",
        "inframe_deletion",
        # splice sites
        "splice_acceptor_variant",
        "splice_donor_variant",
        # remaining terms
        "synonymous_variant",
        "3_prime_UTR_variant",
        "5_prime_UTR_variant",
        "intron_variant",
        "upstream_gene_variant",
        "downstream_gene_variant",
        "non_coding_transcript_variant",
    )
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def binarise(df: pd.DataFrame, threshold: float = 0) -> pd.DataFrame:
    """Turn a mutation matrix into 0/1 flags using a strict threshold.

    Parameters
    ----------
    df:
        Mutation matrix (samples × genes).  May contain counts or continuous
        variant scores.
    threshold:
        Values strictly above this threshold become 1; everything else,
        including NaN, becomes 0.

    Returns
    -------
    pandas.DataFrame
        Binary mutation matrix with dtype float64.
    """
    above = np.greater(df.values.astype(float), threshold)
    flags = above.astype(float)
    mutated_pct = 100.0 * flags.mean()
    logger.info(
        "binarise: threshold=%.2f applied to %s; "
        "%.1f%% mutated entries.",
        threshold,
        df.shape,
        mutated_pct,
    )
    return pd.DataFrame(flags, index=df.index, columns=df.columns)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def filter_by_consequence(
    df: pd.DataFrame,
    consequences: Sequence[str],
    consequence_map: dict | None = None,
) -> pd.DataFrame:
    """Keep only the genes whose mapped consequence is one of *consequences*.

    The function works on gene columns only: a column is kept when
    ``consequence_map`` maps its label to a value listed in *consequences*,
    and dropped otherwise (genes absent from the map are dropped too).  Cell
    values are never modified.

    Parameters
    ----------
    df:
        Mutation matrix (samples × genes).
    consequences:
        Consequence types to retain, e.g. ``["missense_variant", "stop_gained"]``.
        A single string is treated as a one-element list rather than as a
        sequence of characters.
    consequence_map:
        Optional dict mapping gene ID to its predominant consequence.  If
        ``None``, *df* is returned unchanged and a warning is logged.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* restricted to the retained gene columns, in their
        original order.
    """
    if consequence_map is None:
        logger.warning(
            "filter_by_consequence: no consequence_map provided; returning input unchanged."
        )
        return df

    # A bare string would otherwise be searched by substring, and looking up
    # an unmapped gene (None) inside a string raises TypeError.
    wanted = [consequences] if isinstance(consequences, str) else list(consequences)
    # Hash-based membership: one lookup per gene instead of a list scan.
    wanted_set = frozenset(wanted)

    keep = [gene for gene in df.columns if consequence_map.get(gene) in wanted_set]
    n_before = df.shape[1]
    result = df[keep].copy()
    logger.info(
        "filter_by_consequence: kept %d/%d genes matching %s.",
        len(keep),
        n_before,
        wanted,
    )
    return result
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def compute_tmb(df: pd.DataFrame) -> pd.Series:
    """Compute tumour mutation burden as the row-wise sum of the matrix.

    Parameters
    ----------
    df:
        Binary mutation matrix (samples × genes).

    Returns
    -------
    pandas.Series
        Per-sample total named ``"tmb"``, indexed by sample ID.
    """
    per_sample = df.sum(axis=1)
    per_sample.name = "tmb"
    n_samples = len(per_sample)
    logger.info(
        "compute_tmb: TMB computed for %d samples; mean=%.2f.", n_samples, per_sample.mean()
    )
    return per_sample
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Protein abundance normalisation utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from omicsync.utils.logging import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger("normalisation.protein")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def z_score(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise each protein (column) to zero mean and unit deviation.

    Means and standard deviations ignore NaN entries.  A column whose
    standard deviation is exactly zero is divided by 1 instead, so its
    values all become 0.

    Parameters
    ----------
    df:
        Protein abundance matrix (samples × proteins).

    Returns
    -------
    pandas.DataFrame
        Z-scored matrix with the index and columns of *df*.
    """
    values = df.values.astype(float)
    column_mean = np.nanmean(values, axis=0, keepdims=True)
    column_sd = np.nanstd(values, axis=0, keepdims=True)
    # Guard against division by zero for constant columns.
    column_sd[column_sd == 0] = 1.0
    scaled = (values - column_mean) / column_sd
    logger.info("z_score (protein): applied to %s.", df.shape)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def median_centring(df: pd.DataFrame) -> pd.DataFrame:
    """Subtract from each protein (column) its median across samples.

    Medians ignore NaN entries.

    Parameters
    ----------
    df:
        Protein abundance matrix (samples × proteins).

    Returns
    -------
    pandas.DataFrame
        Median-centred matrix with the index and columns of *df*.
    """
    values = df.values.astype(float)
    column_median = np.nanmedian(values, axis=0, keepdims=True)
    centred = np.subtract(values, column_median)
    logger.info("median_centring: applied to %s.", df.shape)
    return pd.DataFrame(centred, index=df.index, columns=df.columns)
|