omicsync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omicsync/__init__.py +33 -0
- omicsync/core/__init__.py +25 -0
- omicsync/core/dataset.py +507 -0
- omicsync/core/modality.py +398 -0
- omicsync/core/sample_index.py +200 -0
- omicsync/integration/__init__.py +11 -0
- omicsync/integration/concat.py +146 -0
- omicsync/integration/mofa.py +279 -0
- omicsync/integration/sklearn_compat.py +178 -0
- omicsync/loaders/__init__.py +19 -0
- omicsync/loaders/csv.py +147 -0
- omicsync/loaders/geo.py +111 -0
- omicsync/loaders/open_targets.py +239 -0
- omicsync/loaders/tcga.py +251 -0
- omicsync/normalisation/__init__.py +5 -0
- omicsync/normalisation/cnv.py +97 -0
- omicsync/normalisation/methylation.py +131 -0
- omicsync/normalisation/mutations.py +123 -0
- omicsync/normalisation/protein.py +54 -0
- omicsync/normalisation/rna.py +182 -0
- omicsync/utils/__init__.py +32 -0
- omicsync/utils/barcode.py +165 -0
- omicsync/utils/logging.py +44 -0
- omicsync/utils/validation.py +152 -0
- omicsync-0.1.0.dist-info/METADATA +188 -0
- omicsync-0.1.0.dist-info/RECORD +29 -0
- omicsync-0.1.0.dist-info/WHEEL +5 -0
- omicsync-0.1.0.dist-info/licenses/LICENSE +21 -0
- omicsync-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""RNA-seq normalisation methods."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from scipy import stats
|
|
8
|
+
|
|
9
|
+
from omicsync.utils.logging import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger("normalisation.rna")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def log1p_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``log(1 + x)`` of every value in *df*.

    Parameters
    ----------
    df:
        Expression matrix (samples × features). Values are cast to float
        before the transform and are expected to be non-negative.

    Returns
    -------
    pandas.DataFrame
        Transformed matrix carrying the index and columns of *df*.
    """
    as_float = df.values.astype(float)
    transformed = np.log1p(as_float)
    logger.info("log1p_normalise: applied to %s.", df.shape)
    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def tpm_to_log1p(df: pd.DataFrame) -> pd.DataFrame:
    """Log-transform a TPM matrix with ``log1p``.

    Parameters
    ----------
    df:
        TPM expression matrix (samples × genes).

    Returns
    -------
    pandas.DataFrame
        The result of :func:`log1p_normalise` applied to *df*.
    """
    logger.info("tpm_to_log1p: applying log1p to TPM matrix %s.", df.shape)
    # The transform itself is delegated to the generic log1p helper.
    log_tpm = log1p_normalise(df)
    return log_tpm
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def counts_to_tpm(df: pd.DataFrame, gene_lengths: pd.Series) -> pd.DataFrame:
    """Convert raw counts to TPM using gene lengths.

    Counts are first divided by gene length in kilobases (reads per
    kilobase), then each sample (row) is scaled so its values sum to one
    million. A sample whose reads-per-kilobase total is zero is left as
    all zeros.

    Parameters
    ----------
    df:
        Raw count matrix (samples × genes).
    gene_lengths:
        Gene lengths in base pairs, indexed by gene ID matching *df* columns.

    Returns
    -------
    pandas.DataFrame
        TPM matrix with the index and columns of *df*.

    Raises
    ------
    ValueError
        If gene lengths are missing for any column in *df*, or if any
        required gene length is NaN, infinite, zero or negative.
    """
    missing = df.columns.difference(gene_lengths.index)
    if len(missing) > 0:
        raise ValueError(
            f"Gene lengths missing for {len(missing)} genes: {missing[:5].tolist()}..."
        )
    lengths = gene_lengths.reindex(df.columns).values.astype(float)

    # A zero, negative or non-finite length would turn the division below
    # into inf/NaN and silently corrupt the whole sample's scaling factor.
    usable = np.isfinite(lengths) & (lengths > 0)
    if not usable.all():
        invalid = df.columns[~usable]
        raise ValueError(
            f"Gene lengths must be finite and positive; invalid for "
            f"{len(invalid)} genes: {invalid[:5].tolist()}..."
        )

    rpk = df.values.astype(float) / (lengths / 1e3)
    scaling = rpk.sum(axis=1, keepdims=True) / 1e6
    # Guard against division by zero for samples with no reads at all.
    tpm = rpk / np.where(scaling == 0, 1.0, scaling)
    logger.info("counts_to_tpm: converted %s to TPM.", df.shape)
    return pd.DataFrame(tpm, index=df.index, columns=df.columns)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def quantile_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Quantile normalise *df* so every sample shares one value distribution.

    Each sample (row) is sorted, the sorted rows are averaged rank by rank
    across samples, and every value is then replaced by the average for its
    rank within its own row.

    Parameters
    ----------
    df:
        Expression matrix (samples × features).

    Returns
    -------
    pandas.DataFrame
        Quantile-normalised matrix with the index and columns of *df*.
    """
    matrix = df.values.astype(float)

    # order[i, r] is the column holding the r-th smallest value of row i.
    order = np.argsort(matrix, axis=1)
    ranked = np.sort(matrix, axis=1)
    rank_means = ranked.mean(axis=0)

    normalised = np.empty_like(matrix)
    # Iterating a 2-D array yields row views, so the writes land in place.
    for out_row, positions in zip(normalised, order):
        out_row[positions] = rank_means

    logger.info("quantile_normalise: applied to %s.", df.shape)
    return pd.DataFrame(normalised, index=df.index, columns=df.columns)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def z_score(df: pd.DataFrame, axis: int = 0) -> pd.DataFrame:
    """Standardise *df* to zero mean and unit standard deviation.

    Means and standard deviations are computed with NaN-aware reductions,
    so NaN entries do not affect the statistics.

    Parameters
    ----------
    df:
        Expression matrix.
    axis:
        ``0`` to z-score per feature (column), ``1`` to z-score per sample (row).

    Returns
    -------
    pandas.DataFrame
        Z-scored matrix. Constant features/samples are set to 0.
    """
    matrix = df.values.astype(float)
    centre = np.nanmean(matrix, axis=axis, keepdims=True)
    spread = np.nanstd(matrix, axis=axis, keepdims=True)
    # A zero spread means a constant slice: divide by 1 so it maps to 0
    # instead of producing 0/0.
    spread[spread == 0] = 1.0
    standardised = (matrix - centre) / spread
    logger.info("z_score: applied along axis=%d to %s.", axis, df.shape)
    return pd.DataFrame(standardised, index=df.index, columns=df.columns)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def detect_and_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Auto-detect RNA value type and apply appropriate normalisation.

    Heuristic (evaluated in this order):

    * No finite non-negative values → nothing to inspect, return as-is.
    * Any finite negative value → cannot be counts or TPM, so the matrix is
      assumed to be already normalised and is returned as-is.
    * max > 50 and median > 5 → assume raw counts, apply log1p.
    * max > 0.1 → assume TPM-like values, apply log1p.
    * Otherwise → assume already normalised, return as-is.

    Parameters
    ----------
    df:
        RNA expression matrix (samples × features).

    Returns
    -------
    pandas.DataFrame
        Normalised matrix, or *df* itself when no transform is applied.
    """
    vals = df.values.ravel().astype(float)
    finite_vals = vals[np.isfinite(vals)]
    finite = finite_vals[finite_vals >= 0]
    if len(finite) == 0:
        logger.warning("detect_and_normalise: no finite non-negative values; skipping.")
        return df

    # log1p is undefined below -1 and is only meaningful for non-negative
    # abundances; transforming centred/z-scored data would inject NaN/-inf.
    if len(finite) < len(finite_vals):
        logger.info(
            "detect_and_normalise: negative values present (min=%.4f); "
            "assuming already normalised; returning as-is.",
            finite_vals.min(),
        )
        return df

    vmax = finite.max()
    vmedian = np.median(finite)

    if vmax > 50 and vmedian > 5:
        logger.info(
            "detect_and_normalise: detected raw counts (max=%.1f, median=%.2f); "
            "applying log1p.",
            vmax, vmedian,
        )
        return log1p_normalise(df)
    elif vmax > 0.1:
        logger.info(
            "detect_and_normalise: detected TPM-like values (max=%.1f, median=%.2f); "
            "applying log1p.",
            vmax, vmedian,
        )
        return tpm_to_log1p(df)
    else:
        logger.info(
            "detect_and_normalise: values appear already normalised "
            "(max=%.4f, median=%.4f); returning as-is.",
            vmax, vmedian,
        )
        return df
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Utility modules for omicsync."""
|
|
2
|
+
|
|
3
|
+
from omicsync.utils.logging import get_logger, set_verbose
|
|
4
|
+
from omicsync.utils.validation import (
|
|
5
|
+
validate_dataframe,
|
|
6
|
+
validate_modality_type,
|
|
7
|
+
check_value_range,
|
|
8
|
+
validate_sample_ids,
|
|
9
|
+
)
|
|
10
|
+
from omicsync.utils.barcode import (
|
|
11
|
+
parse_barcode,
|
|
12
|
+
truncate_to_participant,
|
|
13
|
+
truncate_to_sample,
|
|
14
|
+
is_tumour,
|
|
15
|
+
is_normal,
|
|
16
|
+
batch_parse,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"get_logger",
|
|
21
|
+
"set_verbose",
|
|
22
|
+
"validate_dataframe",
|
|
23
|
+
"validate_modality_type",
|
|
24
|
+
"check_value_range",
|
|
25
|
+
"validate_sample_ids",
|
|
26
|
+
"parse_barcode",
|
|
27
|
+
"truncate_to_participant",
|
|
28
|
+
"truncate_to_sample",
|
|
29
|
+
"is_tumour",
|
|
30
|
+
"is_normal",
|
|
31
|
+
"batch_parse",
|
|
32
|
+
]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""TCGA barcode parsing utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import List, Sequence
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
# TCGA barcode structure:
|
|
10
|
+
# TCGA-{TSS}-{Participant}-{Sample}{Vial}-{Portion}{Analyte}-{Plate}-{Centre}
|
|
11
|
+
# e.g. TCGA-02-0001-01A-01R-0177-13
|
|
12
|
+
#      0    1  2    3   4   5    6  (dash-split index)
|
|
13
|
+
# Sample type codes: 01-09 = tumour; 10-19 = normal; 20-29 = control
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def parse_barcode(barcode: str) -> dict:
    """Split a TCGA barcode into its named component fields.

    Parameters
    ----------
    barcode:
        A full or partial TCGA barcode, e.g.
        ``"TCGA-02-0001-01A-01R-0177-13"``. Surrounding whitespace is
        ignored and the ``TCGA-`` prefix is matched case-insensitively.

    Returns
    -------
    dict
        Keys: ``project``, ``tss``, ``participant``, ``sample``, ``vial``,
        ``portion``, ``analyte``, ``plate``, ``centre``. Fields absent from
        a truncated barcode are ``None``.

    Raises
    ------
    ValueError
        If the barcode does not start with ``"TCGA-"``.
    """
    barcode = barcode.strip()
    if not barcode.upper().startswith("TCGA-"):
        raise ValueError(f"Not a valid TCGA barcode: {barcode!r}")

    def _split_code(token):
        # First two characters are the numeric code; anything after them is
        # the suffix (vial letter / analyte letter).
        if token is None:
            return None, None
        suffix = token[2:] if len(token) > 2 else None
        return token[:2], suffix

    fields = barcode.split("-")
    # Pad short barcodes so all seven positions can be unpacked at once.
    fields = fields + [None] * (7 - len(fields))
    (project, tss, participant, sample_vial,
     portion_analyte, plate, centre) = fields[:7]

    sample, vial = _split_code(sample_vial)
    portion, analyte = _split_code(portion_analyte)

    return {
        "project": project,
        "tss": tss,
        "participant": participant,
        "sample": sample,
        "vial": vial,
        "portion": portion,
        "analyte": analyte,
        "plate": plate,
        "centre": centre,
    }
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def truncate_to_participant(barcode: str) -> str:
    """Return the participant-level ID (first three dash-separated fields).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode. Surrounding whitespace is ignored.

    Returns
    -------
    str
        E.g. ``"TCGA-02-0001"``.

    Raises
    ------
    ValueError
        If the barcode has fewer than three dash-separated fields.
    """
    # Only the first three fields matter; the remainder stays unsplit.
    leading = barcode.strip().split("-", 3)[:3]
    if len(leading) == 3:
        return "-".join(leading)
    raise ValueError(
        f"Barcode {barcode!r} does not contain enough fields to extract "
        "a participant ID."
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def truncate_to_sample(barcode: str) -> str:
    """Return the sample-level ID (first four dash-separated fields).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode. Surrounding whitespace is ignored.

    Returns
    -------
    str
        E.g. ``"TCGA-02-0001-01A"`` (through the sample code and vial).

    Raises
    ------
    ValueError
        If the barcode has fewer than four dash-separated fields.
    """
    # Only the first four fields matter; the remainder stays unsplit.
    leading = barcode.strip().split("-", 4)[:4]
    if len(leading) == 4:
        return "-".join(leading)
    raise ValueError(
        f"Barcode {barcode!r} does not contain enough fields to extract "
        "a sample ID."
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def is_tumour(barcode: str) -> bool:
    """Tell whether *barcode* carries a tumour sample-type code (1 to 9).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    bool
        ``True`` when the first two characters of the fourth dash-separated
        field parse as an integer from 1 to 9; ``False`` when the field is
        absent, not an integer, or outside that range.
    """
    fields = barcode.strip().split("-")
    if len(fields) >= 4:
        try:
            code = int(fields[3][:2])
        except ValueError:
            return False
        return code in range(1, 10)
    return False
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def is_normal(barcode: str) -> bool:
    """Tell whether *barcode* carries a normal sample-type code (10 to 19).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    bool
        ``True`` when the first two characters of the fourth dash-separated
        field parse as an integer from 10 to 19; ``False`` when the field is
        absent, not an integer, or outside that range.
    """
    fields = barcode.strip().split("-")
    if len(fields) >= 4:
        try:
            code = int(fields[3][:2])
        except ValueError:
            return False
        return code in range(10, 20)
    return False
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def batch_parse(barcodes: Sequence[str]) -> pd.DataFrame:
    """Parse a sequence of TCGA barcodes into a DataFrame.

    Parameters
    ----------
    barcodes:
        Iterable of TCGA barcode strings.

    Returns
    -------
    pandas.DataFrame
        One row per barcode, in input order. Columns are ``barcode``, the
        keys of :func:`parse_barcode`, then the ``is_tumour`` and
        ``is_normal`` boolean columns. Barcodes rejected by
        :func:`parse_barcode` keep their row with every parsed field set to
        ``None``. The columns are present even when *barcodes* is empty.
    """
    field_names = (
        "project", "tss", "participant", "sample",
        "vial", "portion", "analyte", "plate", "centre",
    )
    columns = ["barcode", *field_names, "is_tumour", "is_normal"]

    rows = []
    for bc in barcodes:
        try:
            row = parse_barcode(bc)
        except ValueError:
            # Keep the row so output stays aligned with the input sequence.
            row = dict.fromkeys(field_names)
        row["barcode"] = bc
        row["is_tumour"] = is_tumour(bc)
        row["is_normal"] = is_normal(bc)
        rows.append(row)

    # Passing the column list explicitly fixes the order and guarantees the
    # schema for an empty input, which would otherwise yield no columns.
    return pd.DataFrame(rows, columns=columns)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Consistent logging setup for omicsync."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
# Package-level logger; child loggers returned by get_logger() hang off it.
_logger = logging.getLogger("omicsync")

# Configure only once: skip if a handler is already attached to this logger.
if not _logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s [%(levelname)s] omicsync: %(message)s",
            datefmt="%H:%M:%S",
        )
    )
    _logger.addHandler(_handler)
    # Quiet by default; set_verbose(True) lowers the threshold to INFO.
    _logger.setLevel(logging.WARNING)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_logger(name: Optional[str] = None) -> logging.Logger:
    """Fetch the package logger, or a named child of it.

    Parameters
    ----------
    name:
        Optional child name, e.g. ``"loaders.csv"``. When omitted or empty,
        the package-level ``omicsync`` logger is returned.

    Returns
    -------
    logging.Logger
        The logger named ``omicsync.<name>``, or the package logger.
    """
    if not name:
        return _logger
    child_name = f"omicsync.{name}"
    return logging.getLogger(child_name)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def set_verbose(verbose: bool) -> None:
    """Switch the package logger between INFO and WARNING level.

    Parameters
    ----------
    verbose:
        ``True`` to enable INFO logging, ``False`` to restore WARNING level.
    """
    if verbose:
        _logger.setLevel(logging.INFO)
    else:
        _logger.setLevel(logging.WARNING)
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Input validation helpers for omicsync."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from omicsync.utils.logging import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger("utils.validation")
|
|
13
|
+
|
|
14
|
+
# Modality names accepted by validate_modality_type().
VALID_MODALITY_TYPES = frozenset(
    ("cnv", "methylation", "mutations", "protein", "rna")
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def validate_dataframe(
    df: pd.DataFrame,
    name: str,
    min_samples: int = 1,
    min_features: int = 1,
) -> None:
    """Check that *df* is a DataFrame with enough rows and columns.

    Parameters
    ----------
    df:
        Object to validate.
    name:
        Human-readable name used in error messages.
    min_samples:
        Minimum number of rows required.
    min_features:
        Minimum number of columns required.

    Raises
    ------
    TypeError
        If *df* is not a :class:`pandas.DataFrame`.
    ValueError
        If *df* has fewer rows than *min_samples* or fewer columns than
        *min_features*. The row count is checked first.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"{name} must be a pandas DataFrame, got {type(df).__name__}.")

    n_rows, n_cols = df.shape
    if n_rows < min_samples:
        raise ValueError(
            f"{name} must have at least {min_samples} sample(s); got {n_rows}."
        )
    if n_cols < min_features:
        raise ValueError(
            f"{name} must have at least {min_features} feature(s); got {n_cols}."
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def validate_modality_type(modality_type: str) -> None:
    """Reject any *modality_type* that is not a recognised modality name.

    Parameters
    ----------
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.

    Raises
    ------
    ValueError
        If *modality_type* is not in ``VALID_MODALITY_TYPES``.
    """
    if modality_type in VALID_MODALITY_TYPES:
        return
    allowed = sorted(VALID_MODALITY_TYPES)
    raise ValueError(
        f"Unknown modality_type {modality_type!r}. "
        f"Valid types: {allowed}."
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def check_value_range(df: pd.DataFrame, modality_type: str) -> None:
    """Warn if values in *df* look unusual for the given modality type.

    This function only logs warnings; it never modifies *df*. Values are
    coerced to float first so that object-dtype and nullable-dtype frames
    holding numbers are checked rather than crashing the finiteness test.
    A frame that cannot be converted to float is reported with a warning
    and skipped.

    Parameters
    ----------
    df:
        Feature matrix (samples × features).
    modality_type:
        One of the recognised omicsync modality types. Types without a
        dedicated check (e.g. ``"cnv"``) produce no warnings.
    """
    try:
        values = df.to_numpy(dtype=float, na_value=np.nan).ravel()
    except (TypeError, ValueError):
        logger.warning(
            "DataFrame contains non-numeric values; skipping value range check."
        )
        return

    finite = values[np.isfinite(values)]
    if len(finite) == 0:
        logger.warning("DataFrame contains no finite values.")
        return

    vmin, vmax, vmean = finite.min(), finite.max(), finite.mean()

    if modality_type == "rna":
        if vmin < 0:
            logger.warning(
                "RNA modality: found negative values (min=%.4f). "
                "Expected non-negative counts or expression values.",
                vmin,
            )
    elif modality_type == "methylation":
        # The 0.05 margin tolerates small overshoot around the nominal range.
        if vmax > 1.05 or vmin < -1.05:
            logger.warning(
                "Methylation modality: values outside [-1, 1] detected "
                "(min=%.4f, max=%.4f). Beta values should be in [0, 1]; "
                "M-values typically in [-5, 5].",
                vmin,
                vmax,
            )
    elif modality_type == "mutations":
        unique = np.unique(finite)
        if not np.all(np.isin(unique, [0.0, 1.0])):
            logger.warning(
                "Mutation modality: non-binary values detected. "
                "Consider calling binarise() first."
            )
    elif modality_type == "protein":
        if abs(vmean) > 10:
            logger.warning(
                "Protein modality: mean value %.4f seems high. "
                "Consider z-score normalisation.",
                vmean,
            )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def validate_sample_ids(ids: Sequence) -> None:
    """Check sample IDs for duplicates, missing values, and empty strings.

    Parameters
    ----------
    ids:
        Sequence of sample identifiers. Identifiers must be hashable.

    Raises
    ------
    ValueError
        If any ID is missing (``None``, NaN of any float type, ``pd.NA`` or
        ``NaT``), is an empty or whitespace-only string, or is duplicated.
        Missing and empty IDs are reported at the first offending position;
        duplicates are reported once each, after the whole sequence has
        been scanned.
    """
    seen: set = set()
    reported: set = set()
    duplicates: list = []
    for idx, sid in enumerate(ids):
        # pd.isna covers NumPy float NaNs, pd.NA and NaT as well as Python
        # float NaN; the is_scalar guard keeps tuple IDs out of it.
        if sid is None or (pd.api.types.is_scalar(sid) and pd.isna(sid)):
            raise ValueError(f"Sample ID at position {idx} is NaN/None.")
        if isinstance(sid, str) and sid.strip() == "":
            raise ValueError(f"Sample ID at position {idx} is an empty string.")
        if sid in seen:
            # List each duplicated ID once, however often it repeats.
            if sid not in reported:
                reported.add(sid)
                duplicates.append(sid)
        else:
            seen.add(sid)
    if duplicates:
        raise ValueError(f"Duplicate sample IDs found: {duplicates[:10]}.")
|