omicsync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ """RNA-seq normalisation methods."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from scipy import stats
8
+
9
+ from omicsync.utils.logging import get_logger
10
+
11
+ logger = get_logger("normalisation.rna")
12
+
13
+
14
def log1p_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``log(1 + x)`` of every value in the matrix.

    Parameters
    ----------
    df:
        Expression matrix (samples × features). Values are cast to float;
        they are expected to be non-negative (this is not checked).

    Returns
    -------
    pandas.DataFrame
        Transformed matrix carrying the index and columns of *df*.
    """
    as_float = df.values.astype(float)
    transformed = np.log1p(as_float)
    logger.info("log1p_normalise: applied to %s.", df.shape)
    return pd.DataFrame(transformed, index=df.index, columns=df.columns)
30
+
31
+
32
def tpm_to_log1p(df: pd.DataFrame) -> pd.DataFrame:
    """Log-transform a TPM matrix with ``log1p``.

    Parameters
    ----------
    df:
        TPM expression matrix (samples × genes).

    Returns
    -------
    pandas.DataFrame
        log1p(TPM) matrix, as produced by :func:`log1p_normalise`.
    """
    logger.info("tpm_to_log1p: applying log1p to TPM matrix %s.", df.shape)
    # The transform itself is shared with the generic log1p helper.
    transformed = log1p_normalise(df)
    return transformed
47
+
48
+
49
def counts_to_tpm(df: pd.DataFrame, gene_lengths: pd.Series) -> pd.DataFrame:
    """Convert raw counts to TPM using gene lengths.

    Counts are first divided by gene length in kilobases (reads per
    kilobase, RPK); each sample's RPK values are then divided by that
    sample's RPK total expressed in millions.

    Parameters
    ----------
    df:
        Raw count matrix (samples × genes).
    gene_lengths:
        Gene lengths in base pairs, indexed by gene ID matching *df* columns.

    Returns
    -------
    pandas.DataFrame
        TPM matrix with the index and columns of *df*. A sample whose RPK
        total is zero is left as all zeros rather than divided by zero.

    Raises
    ------
    ValueError
        If gene lengths are missing for any column in *df*, or if any
        required gene length is NaN, infinite, zero or negative.
    """
    missing = df.columns.difference(gene_lengths.index)
    if len(missing) > 0:
        raise ValueError(
            f"Gene lengths missing for {len(missing)} genes: {missing[:5].tolist()}..."
        )
    lengths = gene_lengths.reindex(df.columns).values.astype(float)

    # A zero, negative or non-finite length would turn the RPK of that gene
    # into inf/NaN, which then poisons the per-sample scaling sum and hence
    # every value in the row. Fail loudly instead of returning garbage.
    invalid = ~(np.isfinite(lengths) & (lengths > 0))
    if invalid.any():
        bad_genes = df.columns[invalid]
        raise ValueError(
            f"Gene lengths must be positive and finite; invalid for "
            f"{len(bad_genes)} genes: {bad_genes[:5].tolist()}..."
        )

    rpk = df.values.astype(float) / (lengths / 1e3)
    scaling = rpk.sum(axis=1, keepdims=True) / 1e6
    # Substitute 1.0 for a zero scaling factor so all-zero samples stay zero.
    tpm = rpk / np.where(scaling == 0, 1.0, scaling)
    logger.info("counts_to_tpm: converted %s to TPM.", df.shape)
    return pd.DataFrame(tpm, index=df.index, columns=df.columns)
80
+
81
+
82
def quantile_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Give every sample the same value distribution via quantile normalisation.

    Each sample (row) is sorted, the sorted rows are averaged rank by rank
    across samples, and every value is replaced by the average for its rank
    within its own sample. Tied values within a sample receive different
    rank averages, in the order ``np.argsort`` places them.

    Parameters
    ----------
    df:
        Expression matrix (samples × features).

    Returns
    -------
    pandas.DataFrame
        Quantile-normalised matrix with the index and columns of *df*.
    """
    values = np.array(df.values, dtype=float)

    order = np.argsort(values, axis=1)
    ranked = np.sort(values, axis=1)
    # Mean across samples at each rank position (one value per feature slot).
    rank_means = ranked.mean(axis=0)

    normalised = np.empty_like(values)
    for out_row, positions in zip(normalised, order):
        # positions[k] is the column holding this sample's k-th smallest value.
        out_row[positions] = rank_means

    logger.info("quantile_normalise: applied to %s.", df.shape)
    return pd.DataFrame(normalised, index=df.index, columns=df.columns)
108
+
109
+
110
def z_score(df: pd.DataFrame, axis: int = 0) -> pd.DataFrame:
    """Standardise the matrix to zero mean and unit standard deviation.

    Means and standard deviations are computed with NaN-aware reductions.

    Parameters
    ----------
    df:
        Expression matrix.
    axis:
        ``0`` to z-score per feature (column), ``1`` to z-score per sample (row).

    Returns
    -------
    pandas.DataFrame
        Z-scored matrix. Constant features/samples are set to 0.
    """
    values = df.values.astype(float)
    centre = np.nanmean(values, axis=axis, keepdims=True)
    spread = np.nanstd(values, axis=axis, keepdims=True)
    # A zero spread means a constant slice; dividing by 1 leaves it at 0.
    spread[spread == 0] = 1.0
    standardised = (values - centre) / spread
    logger.info("z_score: applied along axis=%d to %s.", axis, df.shape)
    return pd.DataFrame(standardised, index=df.index, columns=df.columns)
132
+
133
+
134
def detect_and_normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Auto-detect RNA value type and apply appropriate normalisation.

    The maximum and median are computed over the finite, non-negative
    values of *df* only; the chosen transform is then applied to the whole
    matrix.

    Heuristic (evaluated in this order):

    * No finite non-negative values → log a warning, return *df* as-is.
    * max > 50 and median > 5 → assume raw counts, apply log1p.
    * Otherwise, max > 0.1 → assume TPM-like values, apply log1p.
    * Otherwise (max ≤ 0.1) → assume already normalised, return *df* as-is.

    Parameters
    ----------
    df:
        RNA expression matrix (samples × features).

    Returns
    -------
    pandas.DataFrame
        Normalised matrix. In the two "as-is" cases the input object itself
        is returned, not a copy.
    """
    vals = df.values.ravel().astype(float)
    # Negative and non-finite entries are ignored when judging the scale.
    finite = vals[np.isfinite(vals) & (vals >= 0)]
    if len(finite) == 0:
        logger.warning("detect_and_normalise: no finite non-negative values; skipping.")
        return df

    vmax = finite.max()
    vmedian = np.median(finite)

    if vmax > 50 and vmedian > 5:
        logger.info(
            "detect_and_normalise: detected raw counts (max=%.1f, median=%.2f); "
            "applying log1p.",
            vmax, vmedian,
        )
        return log1p_normalise(df)

    if vmax > 0.1:
        logger.info(
            "detect_and_normalise: detected TPM-like values (max=%.1f, median=%.2f); "
            "applying log1p.",
            vmax, vmedian,
        )
        return tpm_to_log1p(df)

    logger.info(
        "detect_and_normalise: values appear already normalised "
        "(max=%.4f, median=%.4f); returning as-is.",
        vmax, vmedian,
    )
    return df
@@ -0,0 +1,32 @@
1
+ """Utility modules for omicsync."""
2
+
3
+ from omicsync.utils.logging import get_logger, set_verbose
4
+ from omicsync.utils.validation import (
5
+ validate_dataframe,
6
+ validate_modality_type,
7
+ check_value_range,
8
+ validate_sample_ids,
9
+ )
10
+ from omicsync.utils.barcode import (
11
+ parse_barcode,
12
+ truncate_to_participant,
13
+ truncate_to_sample,
14
+ is_tumour,
15
+ is_normal,
16
+ batch_parse,
17
+ )
18
+
19
# Public names re-exported from the omicsync.utils submodules.
__all__ = [
    # from omicsync.utils.logging
    "get_logger",
    "set_verbose",
    # from omicsync.utils.validation
    "validate_dataframe",
    "validate_modality_type",
    "check_value_range",
    "validate_sample_ids",
    # from omicsync.utils.barcode
    "parse_barcode",
    "truncate_to_participant",
    "truncate_to_sample",
    "is_tumour",
    "is_normal",
    "batch_parse",
]
@@ -0,0 +1,165 @@
1
+ """TCGA barcode parsing utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Sequence
6
+
7
+ import pandas as pd
8
+
9
# TCGA barcode structure:
#   TCGA-{TSS}-{Participant}-{Sample}{Vial}-{Portion}{Analyte}-{Plate}-{Centre}
#   e.g. TCGA-02-0001-01A-01R-0177-13
# Dash-split index of each field:
#   0 = project ("TCGA"), 1 = TSS, 2 = participant, 3 = sample + vial,
#   4 = portion + analyte, 5 = plate, 6 = centre
# Sample type codes: 01-09 = tumour; 10-19 = normal; 20-29 = control
14
+
15
+
16
def parse_barcode(barcode: str) -> dict:
    """Split a TCGA barcode into its named component fields.

    Parameters
    ----------
    barcode:
        A full TCGA aliquot barcode, e.g. ``"TCGA-02-0001-01A-01R-0177-13"``.
        Surrounding whitespace is ignored.

    Returns
    -------
    dict
        Keys: ``project``, ``tss``, ``participant``, ``sample``, ``vial``,
        ``portion``, ``analyte``, ``plate``, ``centre``. Fields absent from
        a shortened barcode are ``None``.

    Raises
    ------
    ValueError
        If the barcode does not start with ``"TCGA-"`` (case-insensitive).
    """
    cleaned = barcode.strip()
    if not cleaned.upper().startswith("TCGA-"):
        raise ValueError(f"Not a valid TCGA barcode: {cleaned!r}")

    fields = cleaned.split("-")
    n_fields = len(fields)

    def whole(position):
        # Entire dash-separated field, or None when the barcode is too short.
        return fields[position] if position < n_fields else None

    def head(position):
        # First two characters of a combined field (sample / portion code).
        return fields[position][:2] if position < n_fields else None

    def tail(position):
        # Whatever follows the two-character code (vial / analyte), if any.
        if position < n_fields and len(fields[position]) > 2:
            return fields[position][2:]
        return None

    return {
        "project": whole(0),
        "tss": whole(1),
        "participant": whole(2),
        "sample": head(3),
        "vial": tail(3),
        "portion": head(4),
        "analyte": tail(4),
        "plate": whole(5),
        "centre": whole(6),
    }
53
+
54
+
55
def truncate_to_participant(barcode: str) -> str:
    """Cut a barcode down to its participant-level ID.

    The result is the first three dash-separated fields joined by dashes.

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    str
        E.g. ``"TCGA-02-0001"``.

    Raises
    ------
    ValueError
        If the barcode has fewer than three dash-separated fields.
    """
    # Only the first three fields matter, so stop splitting after them.
    fields = barcode.strip().split("-", 3)
    if len(fields) >= 3:
        return "-".join(fields[:3])
    raise ValueError(
        f"Barcode {barcode!r} does not contain enough fields to extract "
        "a participant ID."
    )
75
+
76
+
77
def truncate_to_sample(barcode: str) -> str:
    """Cut a barcode down to its sample-level ID (through sample + vial).

    The result is the first four dash-separated fields joined by dashes.

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    str
        E.g. ``"TCGA-02-0001-01A"``.

    Raises
    ------
    ValueError
        If the barcode has fewer than four dash-separated fields.
    """
    # Only the first four fields matter, so stop splitting after them.
    fields = barcode.strip().split("-", 4)
    if len(fields) >= 4:
        return "-".join(fields[:4])
    raise ValueError(
        f"Barcode {barcode!r} does not contain enough fields to extract "
        "a sample ID."
    )
97
+
98
+
99
def is_tumour(barcode: str) -> bool:
    """Tell whether the barcode's sample type code is a tumour code (01-09).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    bool
        ``True`` for sample codes 1 through 9; ``False`` otherwise,
        including when the barcode has no sample field or the code is not
        an integer.
    """
    fields = barcode.strip().split("-")
    if len(fields) >= 4:
        try:
            code = int(fields[3][:2])
        except ValueError:
            return False
        return 1 <= code <= 9
    return False
115
+
116
+
117
def is_normal(barcode: str) -> bool:
    """Tell whether the barcode's sample type code is a normal code (10-19).

    Parameters
    ----------
    barcode:
        Full or partial TCGA barcode.

    Returns
    -------
    bool
        ``True`` for sample codes 10 through 19; ``False`` otherwise,
        including when the barcode has no sample field or the code is not
        an integer.
    """
    fields = barcode.strip().split("-")
    if len(fields) >= 4:
        try:
            code = int(fields[3][:2])
        except ValueError:
            return False
        return 10 <= code <= 19
    return False
133
+
134
+
135
def batch_parse(barcodes: Sequence[str]) -> pd.DataFrame:
    """Parse a sequence of TCGA barcodes into a DataFrame.

    Parameters
    ----------
    barcodes:
        Iterable of TCGA barcode strings.

    Returns
    -------
    pandas.DataFrame
        One row per barcode with the columns ``barcode``, the keys of
        :func:`parse_barcode`, ``is_tumour`` and ``is_normal``, in that
        order. The columns are present even when *barcodes* is empty.
        Entries that :func:`parse_barcode` rejects, and entries that are
        not strings (e.g. NaN), get ``None`` in every parsed field; a
        non-string entry is also flagged ``False`` for both booleans.
    """
    field_names = [
        "project", "tss", "participant", "sample",
        "vial", "portion", "analyte", "plate", "centre",
    ]
    columns = ["barcode", *field_names, "is_tumour", "is_normal"]

    rows = []
    for bc in barcodes:
        if isinstance(bc, str):
            try:
                row = parse_barcode(bc)
            except ValueError:
                row = dict.fromkeys(field_names)
            tumour, normal = is_tumour(bc), is_normal(bc)
        else:
            # Non-strings have no .strip()/.split(); record them as
            # unparseable instead of aborting the whole batch.
            row = dict.fromkeys(field_names)
            tumour = normal = False
        row["barcode"] = bc
        row["is_tumour"] = tumour
        row["is_normal"] = normal
        rows.append(row)

    # Passing the column list fixes both the order and the schema, so an
    # empty input still yields a frame with the expected columns.
    return pd.DataFrame(rows, columns=columns)
@@ -0,0 +1,44 @@
1
+ """Consistent logging setup for omicsync."""
2
+
3
+ import logging
4
+ from typing import Optional
5
+
6
# Package-level logger; child loggers are named "omicsync.<name>".
_logger = logging.getLogger("omicsync")

# Install the stream handler and default level only when the logger has no
# handlers yet, so running this module again does not stack duplicates.
if not _logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s] omicsync: %(message)s",
                          datefmt="%H:%M:%S")
    )
    _logger.addHandler(_handler)
    # Default to WARNING; set_verbose() switches this logger to INFO.
    _logger.setLevel(logging.WARNING)
16
+
17
+
18
def get_logger(name: Optional[str] = None) -> logging.Logger:
    """Fetch the package logger, or a named child of it.

    Parameters
    ----------
    name:
        Optional child name, e.g. ``"loaders.csv"``. When omitted or empty
        the package-level logger is returned.

    Returns
    -------
    logging.Logger
        The logger named ``"omicsync.<name>"``, or the ``"omicsync"``
        logger itself.
    """
    if not name:
        return _logger
    return logging.getLogger(f"omicsync.{name}")
33
+
34
+
35
def set_verbose(verbose: bool) -> None:
    """Switch the package logger between INFO and WARNING level.

    Parameters
    ----------
    verbose:
        ``True`` to enable INFO logging, ``False`` to restore WARNING level.
    """
    if verbose:
        _logger.setLevel(logging.INFO)
    else:
        _logger.setLevel(logging.WARNING)
@@ -0,0 +1,152 @@
1
+ """Input validation helpers for omicsync."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Sequence
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from omicsync.utils.logging import get_logger
11
+
12
+ logger = get_logger("utils.validation")
13
+
14
# Modality identifiers accepted by validate_modality_type().
VALID_MODALITY_TYPES = frozenset(
    ("cnv", "methylation", "mutations", "protein", "rna")
)
17
+
18
+
19
def validate_dataframe(
    df: pd.DataFrame,
    name: str,
    min_samples: int = 1,
    min_features: int = 1,
) -> None:
    """Check that *df* is a DataFrame with at least the required shape.

    Parameters
    ----------
    df:
        DataFrame to validate.
    name:
        Human-readable name used in error messages.
    min_samples:
        Minimum number of rows required.
    min_features:
        Minimum number of columns required.

    Raises
    ------
    TypeError
        If *df* is not a :class:`pandas.DataFrame`.
    ValueError
        If the DataFrame has too few rows or too few columns. Rows are
        checked before columns.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"{name} must be a pandas DataFrame, got {type(df).__name__}.")

    n_rows, n_cols = df.shape
    if n_rows < min_samples:
        raise ValueError(
            f"{name} must have at least {min_samples} sample(s); got {n_rows}."
        )
    if n_cols < min_features:
        raise ValueError(
            f"{name} must have at least {min_features} feature(s); got {n_cols}."
        )
55
+
56
+
57
def validate_modality_type(modality_type: str) -> None:
    """Reject any *modality_type* that is not in ``VALID_MODALITY_TYPES``.

    Parameters
    ----------
    modality_type:
        One of ``"rna"``, ``"mutations"``, ``"methylation"``, ``"cnv"``,
        ``"protein"``.

    Raises
    ------
    ValueError
        If *modality_type* is unrecognised.
    """
    if modality_type in VALID_MODALITY_TYPES:
        return
    raise ValueError(
        f"Unknown modality_type {modality_type!r}. "
        f"Valid types: {sorted(VALID_MODALITY_TYPES)}."
    )
76
+
77
+
78
def check_value_range(df: pd.DataFrame, modality_type: str) -> None:
    """Warn if values in *df* look unusual for the given modality type.

    Only log warnings are emitted; nothing is raised for out-of-range
    values and *df* is not modified. Modality types without a dedicated
    check below are accepted silently.

    Parameters
    ----------
    df:
        Feature matrix (samples × features). Values must be convertible
        to float.
    modality_type:
        One of the recognised omicsync modality types.
    """
    # Cast to float before np.isfinite: on an object-dtype frame (e.g.
    # numeric columns that contain None) the ufunc raises TypeError.
    values = df.values.astype(float).ravel()
    finite = values[np.isfinite(values)]
    if len(finite) == 0:
        logger.warning("DataFrame contains no finite values.")
        return

    vmin, vmax, vmean = finite.min(), finite.max(), finite.mean()

    if modality_type == "rna":
        if vmin < 0:
            logger.warning(
                "RNA modality: found negative values (min=%.4f). "
                "Expected non-negative counts or expression values.",
                vmin,
            )
    elif modality_type == "methylation":
        # The 0.05 margin tolerates values slightly beyond ±1.
        if vmax > 1.05 or vmin < -1.05:
            logger.warning(
                "Methylation modality: values outside [-1, 1] detected "
                "(min=%.4f, max=%.4f). Beta values should be in [0, 1]; "
                "M-values typically in [-5, 5].",
                vmin,
                vmax,
            )
    elif modality_type == "mutations":
        unique = np.unique(finite)
        if not np.all(np.isin(unique, [0.0, 1.0])):
            logger.warning(
                "Mutation modality: non-binary values detected. "
                "Consider calling binarise() first."
            )
    elif modality_type == "protein":
        if abs(vmean) > 10:
            logger.warning(
                "Protein modality: mean value %.4f seems high. "
                "Consider z-score normalisation.",
                vmean,
            )
126
+
127
+
128
def validate_sample_ids(ids: Sequence) -> None:
    """Reject sample IDs that are missing, blank, or repeated.

    Missing and blank IDs raise as soon as they are encountered;
    duplicates are collected over the whole sequence and reported together.

    Parameters
    ----------
    ids:
        Sequence of sample identifiers.

    Raises
    ------
    ValueError
        If any ID is NaN, ``None``, an empty/whitespace-only string, or
        duplicated. At most the first ten repeated IDs are listed.
    """
    observed: set = set()
    repeated: list = []
    for position, sample_id in enumerate(ids):
        is_missing = sample_id is None or (
            isinstance(sample_id, float) and np.isnan(sample_id)
        )
        if is_missing:
            raise ValueError(f"Sample ID at position {position} is NaN/None.")
        if isinstance(sample_id, str) and not sample_id.strip():
            raise ValueError(f"Sample ID at position {position} is an empty string.")
        if sample_id in observed:
            # Every repeat occurrence is recorded, not just the first.
            repeated.append(sample_id)
        else:
            observed.add(sample_id)
    if repeated:
        raise ValueError(f"Duplicate sample IDs found: {repeated[:10]}.")