omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
1
+ """TCGA data loaders for omicsync."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Sequence, Union
8
+
9
+ import pandas as pd
10
+
11
+ from omicsync.core.dataset import OmicsDataset
12
+ from omicsync.core.modality import make_modality, OmicsModality
13
+ from omicsync.core.sample_index import SampleIndex
14
+ from omicsync.utils.barcode import truncate_to_participant
15
+ from omicsync.utils.logging import get_logger
16
+
17
+ logger = get_logger("loaders.tcga")
18
+
19
# Candidate file names for each modality inside a TCGA data directory, listed
# in the order they are tried. ``{cancer_type}`` is a ``str.format`` field.
# General shape: {cancer_type}_{modality}.{ext}
_MODALITY_FILE_PATTERNS = {
    "rna": [
        "{cancer_type}_rna.tsv",
        "{cancer_type}_rna.csv",
        "{cancer_type}_rna_fpkm.tsv",
        "{cancer_type}_htseq_counts.tsv",
    ],
    "mutations": [
        "{cancer_type}_mutations.tsv",
        "{cancer_type}_mutations.maf",
        "{cancer_type}_somatic.maf",
    ],
    "methylation": [
        "{cancer_type}_methylation.tsv",
        "{cancer_type}_methylation.csv",
    ],
    "cnv": [
        "{cancer_type}_cnv.tsv",
        "{cancer_type}_cnv.csv",
        "{cancer_type}_gistic2.tsv",
    ],
    "protein": [
        "{cancer_type}_protein.tsv",
        "{cancer_type}_protein.csv",
        "{cancer_type}_rppa.tsv",
    ],
}

# GDC data portal data-type labels per modality, used by the manifest helper.
_GDC_DATA_TYPES = dict(
    rna="Gene Expression Quantification",
    mutations="Masked Somatic Mutation",
    methylation="Methylation Beta Value",
    cnv="Copy Number Segment",
    protein="Protein Expression Quantification",
)
41
+
42
+
43
def _find_modality_file(
    data_dir: Path,
    cancer_type: str,
    modality: str,
) -> Optional[Path]:
    """Locate the data file for *modality* inside *data_dir*.

    The known file-name patterns for the modality are tried first, in their
    declared order, with the cancer type upper-cased and then lower-cased.
    If none of them exists, the directory is scanned for a regular file whose
    lower-cased name contains *modality* and whose extension is ``.tsv``,
    ``.csv`` or ``.maf`` (compared case-insensitively).

    Parameters
    ----------
    data_dir:
        Directory to search.
    cancer_type:
        Cancer type abbreviation substituted into the file-name patterns.
    modality:
        Modality name; unknown modalities skip straight to the fallback scan.

    Returns
    -------
    pathlib.Path or None
        The first matching path, or ``None`` when nothing matches.
    """
    case_variants = (cancer_type.upper(), cancer_type.lower())
    for pattern in _MODALITY_FILE_PATTERNS.get(modality, []):
        for variant in case_variants:
            candidate = data_dir / pattern.format(cancer_type=variant)
            if candidate.exists():
                return candidate

    # Fallback: any file mentioning the modality name. ``iterdir`` yields
    # entries in arbitrary order, so sort to make the choice reproducible.
    # NOTE(review): this scan ignores *cancer_type*, so it can pick up a file
    # that belongs to a different cohort stored in the same directory.
    for entry in sorted(data_dir.iterdir()):
        if (
            entry.is_file()
            and modality in entry.name.lower()
            and entry.suffix.lower() in (".tsv", ".csv", ".maf")
        ):
            return entry
    return None
62
+
63
+
64
def _load_maf_mutations(path: Path) -> pd.DataFrame:
    """Read a MAF file and build a binary mutation matrix.

    The returned frame has one row per sample ID and one column per
    ``Hugo_Symbol``; a cell is 1.0 when the file holds at least one record
    for that sample/gene pair and 0.0 otherwise.

    Parameters
    ----------
    path:
        Tab-separated MAF file. Lines starting with ``#`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed by sample ID with gene symbols as columns.

    Raises
    ------
    ValueError
        If ``Hugo_Symbol`` or ``Tumor_Sample_Barcode`` is missing.
    """
    maf = pd.read_csv(path, sep="\t", comment="#", low_memory=False)

    missing = {"Hugo_Symbol", "Tumor_Sample_Barcode"} - set(maf.columns)
    if missing:
        raise ValueError(
            f"MAF file missing required columns: {missing}. "
            f"Found: {maf.columns.tolist()[:15]}."
        )

    def _to_sample_id(barcode):
        # Only string barcodes that start with "TCGA" are shortened;
        # everything else (including missing values) passes through as-is.
        if isinstance(barcode, str) and barcode.startswith("TCGA"):
            return truncate_to_participant(barcode)
        return barcode

    maf["sample_id"] = maf["Tumor_Sample_Barcode"].apply(_to_sample_id)

    # Count records per (sample, gene) pair, pivot genes into columns and cap
    # the counts at 1 to obtain presence/absence.
    pair_counts = maf.groupby(["sample_id", "Hugo_Symbol"]).size()
    presence = pair_counts.unstack(fill_value=0).clip(upper=1)
    return presence.astype(float)
86
+
87
+
88
def _load_generic_matrix(path: Path) -> pd.DataFrame:
    """Load a TSV/CSV file as a numeric matrix.

    The first column of the file becomes the index. The table is transposed
    when fewer than half of the values in the first data column parse as
    numbers; every cell is then coerced to numeric, with unparseable cells
    becoming NaN.

    Parameters
    ----------
    path:
        File to read. ``.tsv`` and ``.maf`` (any letter case) are read as
        tab-separated; every other extension as comma-separated.

    Returns
    -------
    pandas.DataFrame
        Numeric matrix, possibly transposed relative to the file layout.

    Raises
    ------
    ValueError
        If the file holds only an index column, so there is no data column
        on which to base the orientation heuristic.
    """
    delimiter = "\t" if path.suffix.lower() in (".tsv", ".maf") else ","
    table = pd.read_csv(path, sep=delimiter, index_col=0, low_memory=False)

    # Without this guard ``iloc[:, 0]`` below fails with an opaque IndexError.
    if table.shape[1] == 0:
        raise ValueError(
            f"No data columns found in {path}; expected an index column "
            "followed by at least one data column."
        )

    # Heuristic: a mostly non-numeric first data column triggers a transpose.
    numeric_frac = pd.to_numeric(table.iloc[:, 0], errors="coerce").notna().mean()
    if numeric_frac < 0.5:
        table = table.T

    return table.apply(pd.to_numeric, errors="coerce")
104
+
105
+
106
def load_tcga_files(
    data_dir: Union[str, Path],
    cancer_type: str,
    modalities: Sequence[str],
) -> OmicsDataset:
    """Load TCGA data from local files into an :class:`~omicsync.core.dataset.OmicsDataset`.

    Parameters
    ----------
    data_dir:
        Path to directory containing TCGA data files. Expected naming:
        ``{cancer_type}_{modality}.tsv`` or ``{cancer_type}_{modality}.csv``.
        For mutations, ``.maf`` files are also supported.
    cancer_type:
        TCGA cancer type abbreviation, e.g. ``"BRCA"``.
    modalities:
        Modality names to load, e.g. ``["rna", "mutations", "methylation"]``.
        A single name given as a plain string is treated as a one-element
        list.

    Returns
    -------
    OmicsDataset
        Dataset holding one modality per requested name, with
        ``study_id`` set to ``"TCGA-<CANCER_TYPE>"``.

    Raises
    ------
    FileNotFoundError
        If *data_dir* does not exist.
    NotADirectoryError
        If *data_dir* exists but is not a directory.
    ValueError
        If no file is found for a requested modality.
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        raise FileNotFoundError(f"Data directory not found: {data_dir}")
    if not data_dir.is_dir():
        # Fail early with a clear message instead of deep inside the scan.
        raise NotADirectoryError(f"Data path is not a directory: {data_dir}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(modalities, str):
        modalities = [modalities]

    loaded: Dict[str, OmicsModality] = {}

    for modality in modalities:
        path = _find_modality_file(data_dir, cancer_type, modality)
        if path is None:
            raise ValueError(
                f"No file found for modality {modality!r} in {data_dir}. "
                f"Expected patterns like: {_MODALITY_FILE_PATTERNS.get(modality, [])}"
            )

        logger.info("load_tcga_files: loading %r from %s.", modality, path.name)

        if modality == "mutations" and path.suffix.lower() == ".maf":
            df = _load_maf_mutations(path)
        else:
            df = _load_generic_matrix(path)

        # Harmonise TCGA barcodes if detected. Only the first few labels are
        # inspected; an empty index must not count as "all TCGA".
        leading_ids = df.index[:5]
        if len(leading_ids) > 0 and all(
            str(idx).startswith("TCGA") for idx in leading_ids
        ):
            df.index = pd.Index(
                [truncate_to_participant(str(i)) for i in df.index],
                name=df.index.name,
            )
            # Truncation can collapse several barcodes onto one ID; keep the
            # first row for each.
            df = df[~df.index.duplicated(keep="first")]
            logger.info(
                "load_tcga_files: truncated %r sample IDs to participant level.",
                modality,
            )

        loaded[modality] = make_modality(df, modality_type=modality, source="tcga")

    dataset = OmicsDataset(loaded, study_id=f"TCGA-{cancer_type.upper()}")

    # Log a short coverage report.
    coverage = dataset.sample_coverage
    logger.info(
        "TCGA %s coverage: %d total samples, %d complete cases.",
        cancer_type.upper(),
        len(coverage),
        dataset.n_complete_cases,
    )
    return dataset
183
+
184
+
185
def download_tcga_manifest(
    cancer_type: str,
    modalities: Sequence[str],
    output_dir: Union[str, Path],
) -> None:
    """Emit manual download instructions for TCGA data on the GDC portal.

    Nothing is downloaded. The function assembles a block of text (account
    and token steps, one section per modality, and two alternative tools)
    and sends every line both to the module logger and to standard output.

    Parameters
    ----------
    cancer_type:
        TCGA cancer type abbreviation, e.g. ``"BRCA"``; upper-cased for use
        in the text.
    modalities:
        Modalities to list, e.g. ``["rna", "mutations"]``. Names without a
        known GDC data type are shown verbatim.
    output_dir:
        Directory that appears in the suggested save paths and commands.
    """
    project = cancer_type.upper()
    output_dir = Path(output_dir)

    lines = [
        "",
        f"TCGA {project} data download instructions",
        "=" * 50,
        "",
        "To download TCGA data you need a GDC account and an API token.",
        "",
        "1. Create/log into your account at: https://portal.gdc.cancer.gov/",
        "2. Download your API token from:",
        " Profile → Download Token (valid 30 days)",
        "3. Save the token to a file, e.g. ~/gdc-token.txt",
        "",
        f"Data types needed for {project}:",
        "",
    ]

    # The portal link filters on the project only, so it is the same for
    # every modality and can be built once.
    portal_url = (
        "https://portal.gdc.cancer.gov/repository"
        f"?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-{project}%22%5D%7D%7D%5D%7D"
        "&facetTab=files&searchTableTab=files"
    )

    for modality in modalities:
        gdc_type = _GDC_DATA_TYPES.get(modality, modality)
        target = output_dir / f"{project}_{modality}.tsv"
        lines.extend(
            (
                f" [{modality.upper()}]",
                f" GDC data type : {gdc_type}",
                f" Portal filter : {portal_url}",
                f" Save to : {target}",
                "",
            )
        )

    lines.extend(
        (
            "Alternative: use the GDC client tool",
            " pip install gdc-client",
            f" gdc-client download -t ~/gdc-token.txt -d {output_dir} <manifest-file>",
            "",
            "Alternative: use TCGAbiolinks (R):",
            " library(TCGAbiolinks)",
            f" query <- GDCquery(project = 'TCGA-{project}', ...)",
            " GDCdownload(query)",
            "",
        )
    )

    for text in lines:
        logger.info(text)
        print(text)
@@ -0,0 +1,5 @@
1
+ """Normalisation methods for each omics modality."""
2
+
3
+ from omicsync.normalisation import cnv, methylation, mutations, protein, rna
4
+
5
+ __all__ = ["rna", "methylation", "cnv", "mutations", "protein"]
@@ -0,0 +1,97 @@
1
+ """Copy number variation normalisation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Tuple
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.utils.logging import get_logger
11
+
12
+ logger = get_logger("normalisation.cnv")
13
+
14
+
15
def centre_diploid(df: pd.DataFrame, diploid: float = 2.0) -> pd.DataFrame:
    """Express copy number as a deviation from the diploid baseline.

    Parameters
    ----------
    df:
        CNV matrix, documented as samples × genes with absolute copy-number
        estimates. Values are cast to float.
    diploid:
        Baseline subtracted from every cell (default 2.0).

    Returns
    -------
    pandas.DataFrame
        ``df - diploid`` as floats, with the index and columns of *df*.
    """
    copy_number = df.values.astype(float)
    deviation = pd.DataFrame(
        copy_number - diploid,
        index=df.index,
        columns=df.columns,
    )
    logger.info("centre_diploid: subtracted diploid=%.1f from %s.", diploid, df.shape)
    return deviation
34
+
35
+
36
def log2_ratio(df: pd.DataFrame, pseudo: float = 1.0) -> pd.DataFrame:
    """Apply ``log2(value + pseudo)`` to a centred copy-number matrix.

    The input is expected to hold deviations from diploid (zero meaning
    diploid), so the pseudocount keeps a zero deviation finite.

    Parameters
    ----------
    df:
        CNV deviation matrix (output of :func:`centre_diploid`).
    pseudo:
        Pseudocount added to every cell before the log (default 1.0).

    Returns
    -------
    pandas.DataFrame
        Transformed matrix. Cells where ``value + pseudo`` is not strictly
        positive (or is NaN) come back as NaN.
    """
    shifted = df.values.astype(float) + pseudo
    is_positive = shifted > 0
    # The log is evaluated everywhere and masked afterwards, so warnings for
    # non-positive cells are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        logged = np.log2(shifted)
        transformed = np.where(is_positive, logged, np.nan)
    logger.info("log2_ratio: applied log2 to %s.", df.shape)
    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
60
+
61
+
62
def discretise(
    df: pd.DataFrame,
    thresholds: Tuple[float, float, float, float] = (-1.0, -0.3, 0.3, 1.0),
) -> pd.DataFrame:
    """Discretise copy-number values into -2/-1/0/1/2 states.

    Parameters
    ----------
    df:
        CNV matrix (log2 ratios or deviations from diploid).
    thresholds:
        Four non-decreasing boundary values
        ``(deep_del, del, amp, high_amp)`` that define the five states:

        * ``x < deep_del``          → -2 (deep deletion)
        * ``deep_del <= x < del``   → -1 (deletion)
        * ``del <= x <= amp``       →  0 (diploid)
        * ``amp < x <= high_amp``   →  1 (gain)
        * ``x > high_amp``          →  2 (amplification)

    Returns
    -------
    pandas.DataFrame
        State matrix stored as floats. Missing input values stay NaN rather
        than being reported as diploid.

    Raises
    ------
    ValueError
        If *thresholds* does not hold exactly four values, or the values
        are not in non-decreasing order.
    """
    if len(thresholds) != 4:
        raise ValueError("thresholds must have exactly 4 values.")
    deep_del, single_del, gain, high_amp = thresholds
    # Out-of-order boundaries would make the masks below overlap.
    if not deep_del <= single_del <= gain <= high_amp:
        raise ValueError("thresholds must be in non-decreasing order.")

    data = df.values.astype(float)
    states = np.zeros_like(data, dtype=float)
    states[data < deep_del] = -2.0
    states[(data >= deep_del) & (data < single_del)] = -1.0
    states[(data > gain) & (data <= high_amp)] = 1.0
    states[data > high_amp] = 2.0
    # Every comparison with NaN is False, so a missing cell would otherwise
    # keep the initial 0 and be indistinguishable from a diploid call.
    states[np.isnan(data)] = np.nan
    logger.info("discretise: discretised CNV matrix %s.", df.shape)
    return pd.DataFrame(states, index=df.index, columns=df.columns)
@@ -0,0 +1,131 @@
1
+ """Methylation normalisation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from omicsync.utils.logging import get_logger
9
+
10
+ logger = get_logger("normalisation.methylation")
11
+
12
+
13
def beta_to_m(df: pd.DataFrame) -> pd.DataFrame:
    """Turn beta values into M-values via ``log2(beta / (1 - beta))``.

    Before the transform every value is clipped to ``[1e-6, 1 - 1e-6]`` so
    that exact 0 and 1 give finite results.

    Parameters
    ----------
    df:
        Beta value matrix (documented as samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        M-value matrix with the index and columns of *df*.

    Raises
    ------
    ValueError
        If any finite value lies outside [0, 1].
    """
    beta = df.values.astype(float)
    observed = beta[np.isfinite(beta)]
    if len(observed) > 0:
        lowest, highest = observed.min(), observed.max()
        if lowest < 0 or highest > 1:
            raise ValueError(
                f"beta_to_m: beta values must be in [0, 1]. "
                f"Got min={lowest:.4f}, max={highest:.4f}. "
                "Clip first with clip_beta()."
            )

    eps = 1e-6
    bounded = np.clip(beta, eps, 1 - eps)
    m_values = np.log2(bounded / (1 - bounded))
    logger.info("beta_to_m: converted beta → M-values for %s.", df.shape)
    return pd.DataFrame(m_values, index=df.index, columns=df.columns)
44
+
45
+
46
def m_to_beta(df: pd.DataFrame) -> pd.DataFrame:
    """Convert M-values back to beta values: 2^M / (2^M + 1).

    The ratio is evaluated in the algebraically equivalent form
    ``1 / (1 + 2^-M)``. The textbook form overflows to ``inf / inf`` (NaN)
    for very large M; this form saturates at 1.0 instead, and at 0.0 for
    very negative M.

    Parameters
    ----------
    df:
        M-value matrix (documented as samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        Beta value matrix in [0, 1]; NaN inputs stay NaN.
    """
    m_values = df.values.astype(float)
    # 2^-M may overflow to inf for very negative M; 1 / inf is the correct
    # limit (0.0), so the overflow warning is suppressed.
    with np.errstate(over="ignore"):
        beta = 1.0 / (1.0 + np.exp2(-m_values))
    logger.info("m_to_beta: converted M-values → beta for %s.", df.shape)
    return pd.DataFrame(beta, index=df.index, columns=df.columns)
64
+
65
+
66
def clip_beta(
    df: pd.DataFrame,
    low: float = 0.001,
    high: float = 0.999,
) -> pd.DataFrame:
    """Bound beta values away from the extremes 0 and 1.

    Parameters
    ----------
    df:
        Beta value matrix; values are cast to float.
    low:
        Smallest value kept (default 0.001); anything below is raised to it.
    high:
        Largest value kept (default 0.999); anything above is lowered to it.

    Returns
    -------
    pandas.DataFrame
        Clipped matrix with the index and columns of *df*.
    """
    beta = df.values.astype(float)
    bounded = np.clip(beta, low, high)
    logger.info("clip_beta: clipped to [%.4f, %.4f] for %s.", low, high, df.shape)
    return pd.DataFrame(bounded, index=df.index, columns=df.columns)
90
+
91
+
92
def detect_and_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Guess whether *df* holds M-values or beta values and return clipped beta.

    Heuristic: when the smallest finite value is below -0.01 or the largest
    is above 1.01, the matrix is treated as M-values and converted with
    :func:`m_to_beta`. Otherwise it is taken to be beta already. Either way
    the result passes through :func:`clip_beta` with its default bounds.

    Parameters
    ----------
    df:
        Methylation matrix (documented as samples × CpG sites).

    Returns
    -------
    pandas.DataFrame
        Clipped beta values. If *df* has no finite value at all, *df* itself
        is returned untouched after a warning.
    """
    values = df.values.astype(float)
    finite_mask = np.isfinite(values)
    if not finite_mask.any():
        logger.warning("detect_and_normalise: no finite values; skipping.")
        return df

    observed = values[finite_mask]
    vmin, vmax = observed.min(), observed.max()

    # The small tolerance lets slightly out-of-range beta values still count
    # as beta.
    looks_like_m_values = vmin < -0.01 or vmax > 1.01
    if looks_like_m_values:
        logger.info(
            "detect_and_normalise (methylation): detected M-values "
            "(min=%.4f, max=%.4f); converting to beta.",
            vmin, vmax,
        )
        return clip_beta(m_to_beta(df))

    logger.info(
        "detect_and_normalise (methylation): detected beta values "
        "(min=%.4f, max=%.4f); clipping.",
        vmin, vmax,
    )
    return clip_beta(df)
@@ -0,0 +1,123 @@
1
+ """Mutation matrix processing utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Sequence
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.utils.logging import get_logger
11
+
12
+ logger = get_logger("normalisation.mutations")
13
+
14
# Standard Sequence Ontology consequence terms
CONSEQUENCE_TERMS = frozenset(
    (
        "missense_variant",
        "stop_gained",
        "stop_lost",
        "frameshift_variant",
        "splice_acceptor_variant",
        "splice_donor_variant",
        "start_lost",
        "inframe_insertion",
        "inframe_deletion",
        "synonymous_variant",
        "3_prime_UTR_variant",
        "5_prime_UTR_variant",
        "intron_variant",
        "upstream_gene_variant",
        "downstream_gene_variant",
        "non_coding_transcript_variant",
    )
)
33
+
34
+
35
def binarise(df: pd.DataFrame, threshold: float = 0) -> pd.DataFrame:
    """Reduce a mutation matrix to presence/absence.

    Parameters
    ----------
    df:
        Mutation matrix (documented as samples × genes) holding counts or
        continuous variant scores.
    threshold:
        Cells strictly greater than this become 1.0. Every other cell,
        including NaN, becomes 0.0.

    Returns
    -------
    pandas.DataFrame
        Binary float64 matrix with the index and columns of *df*.
    """
    above = df.values.astype(float) > threshold
    binary = above.astype(float)
    mutated_pct = 100.0 * binary.mean()
    logger.info(
        "binarise: threshold=%.2f applied to %s; "
        "%.1f%% mutated entries.",
        threshold,
        df.shape,
        mutated_pct,
    )
    return pd.DataFrame(binary, index=df.index, columns=df.columns)
60
+
61
+
62
def filter_by_consequence(
    df: pd.DataFrame,
    consequences: Sequence[str],
    consequence_map: dict | None = None,
) -> pd.DataFrame:
    """Keep only the genes whose mapped consequence is one of *consequences*.

    The filter works on whole columns: a gene is kept when
    ``consequence_map`` assigns it a consequence listed in *consequences*,
    and dropped otherwise (including genes absent from the map). Cell
    values are never modified.

    Parameters
    ----------
    df:
        Mutation matrix (documented as samples × genes).
    consequences:
        Consequence types to retain, e.g.
        ``["missense_variant", "stop_gained"]``. A single term passed as a
        plain string is treated as a one-element list.
    consequence_map:
        Optional dict mapping gene ID to its predominant consequence. If
        ``None``, *df* is returned unchanged with a warning.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* restricted to the retained gene columns, in their
        original order.
    """
    if consequence_map is None:
        logger.warning(
            "filter_by_consequence: no consequence_map provided; returning input unchanged."
        )
        return df

    # ``x in "some_term"`` would be a substring test, so wrap a bare string.
    # Materialising also lets one-shot iterables survive repeated lookups.
    if isinstance(consequences, str):
        wanted = [consequences]
    else:
        wanted = list(consequences)
    wanted_set = frozenset(wanted)  # O(1) membership per gene

    keep = [gene for gene in df.columns if consequence_map.get(gene) in wanted_set]
    n_before = df.shape[1]
    result = df[keep].copy()
    logger.info(
        "filter_by_consequence: kept %d/%d genes matching %s.",
        len(keep),
        n_before,
        wanted,
    )
    return result
104
+
105
+
106
def compute_tmb(df: pd.DataFrame) -> pd.Series:
    """Sum each row of a mutation matrix to get a per-sample mutation burden.

    Parameters
    ----------
    df:
        Binary mutation matrix (documented as samples × genes).

    Returns
    -------
    pandas.Series
        Row sums named ``"tmb"``, indexed like *df*.
    """
    per_sample = df.sum(axis=1)
    burden = per_sample.rename("tmb")
    logger.info(
        "compute_tmb: TMB computed for %d samples; mean=%.2f.",
        len(burden),
        burden.mean(),
    )
    return burden
@@ -0,0 +1,54 @@
1
+ """Protein abundance normalisation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from omicsync.utils.logging import get_logger
9
+
10
+ logger = get_logger("normalisation.protein")
11
+
12
+
13
def z_score(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise every column to zero mean and unit standard deviation.

    Mean and standard deviation are computed per column with NaN values
    ignored. A column whose standard deviation is exactly 0 is divided by 1
    instead, so it comes out as all zeros.

    Parameters
    ----------
    df:
        Protein abundance matrix (documented as samples × proteins).

    Returns
    -------
    pandas.DataFrame
        Z-scored matrix with the index and columns of *df*.
    """
    abundance = df.values.astype(float)
    centre = np.nanmean(abundance, axis=0, keepdims=True)
    spread = np.nanstd(abundance, axis=0, keepdims=True)
    # Avoid dividing by zero for constant columns.
    spread[spread == 0] = 1.0
    standardised = (abundance - centre) / spread
    logger.info("z_score (protein): applied to %s.", df.shape)
    return pd.DataFrame(standardised, index=df.index, columns=df.columns)
35
+
36
+
37
def median_centring(df: pd.DataFrame) -> pd.DataFrame:
    """Subtract each column's median from that column.

    Medians are taken per column with NaN values ignored.

    Parameters
    ----------
    df:
        Protein abundance matrix (documented as samples × proteins).

    Returns
    -------
    pandas.DataFrame
        Median-centred matrix with the index and columns of *df*.
    """
    abundance = df.values.astype(float)
    centred = abundance - np.nanmedian(abundance, axis=0, keepdims=True)
    logger.info("median_centring: applied to %s.", df.shape)
    return pd.DataFrame(centred, index=df.index, columns=df.columns)