msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/aggregate/__init__.py +10 -0
- msreport/aggregate/condense.py +9 -0
- msreport/aggregate/pivot.py +14 -5
- msreport/aggregate/summarize.py +14 -4
- msreport/analyze.py +67 -5
- msreport/export.py +9 -15
- msreport/fasta.py +9 -2
- msreport/helper/__init__.py +18 -0
- msreport/impute.py +18 -10
- msreport/isobar.py +11 -14
- msreport/normalize.py +95 -10
- msreport/peptidoform.py +21 -11
- msreport/plot/__init__.py +3 -3
- msreport/plot/distribution.py +2 -1
- msreport/plot/quality.py +1 -1
- msreport/qtable.py +44 -20
- msreport/reader.py +321 -40
- msreport/rinterface/limma.py +1 -1
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/METADATA +20 -2
- msreport-0.0.31.dist-info/RECORD +38 -0
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/WHEEL +1 -1
- msreport-0.0.29.dist-info/RECORD +0 -38
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.29.dist-info → msreport-0.0.31.dist-info}/top_level.txt +0 -0
msreport/__init__.py
CHANGED
msreport/aggregate/__init__.py
CHANGED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""A comprehensive set of tools for aggregating and reshaping tabular proteomics data.
|
|
2
|
+
|
|
3
|
+
The `aggregate` module contains submodules that offer functionalities to transform
|
|
4
|
+
data from lower levels of abstraction (e.g. ions, peptides) to higher levels (e.g.
|
|
5
|
+
peptides, proteins, PTMs) through various summarization and condensation techniques.
|
|
6
|
+
It also includes methods for reshaping tables from "long" to "wide" format, a common
|
|
7
|
+
prerequisite for aggregation. The MaxLFQ algorithm is integrated for specific
|
|
8
|
+
quantitative summarizations, enabling users to build customized, higher-level data
|
|
9
|
+
tables.
|
|
10
|
+
"""
|
msreport/aggregate/condense.py
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
"""Low-level functions for aggregating numerical and string data.
|
|
2
|
+
|
|
3
|
+
This module defines fundamental "condenser" functions that operate directly on NumPy
|
|
4
|
+
arrays. These functions are designed to be applied to groups of data, performing
|
|
5
|
+
operations such as summing values, finding maximum/minimum, counting or joining unique
|
|
6
|
+
elements, and calculating abundance profiles. It includes the core implementations for
|
|
7
|
+
MaxLFQ summation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
import numpy as np
|
|
2
11
|
|
|
3
12
|
import msreport.helper.maxlfq as MAXLFQ
|
msreport/aggregate/pivot.py
CHANGED
|
@@ -1,4 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
"""Functionalities for reshaping tabular quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
This module offers methods to transform data from a "long" format into a "wide" format,
|
|
4
|
+
which is a common and often necessary step before aggregation or analysis. It supports
|
|
5
|
+
pivoting data based on specified index and grouping columns, and can handle both
|
|
6
|
+
quantitative values and annotation columns.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Iterable
|
|
2
10
|
|
|
3
11
|
import pandas as pd
|
|
4
12
|
|
|
@@ -12,11 +20,12 @@ def pivot_table(
|
|
|
12
20
|
group_by: str,
|
|
13
21
|
annotation_columns: Iterable[str],
|
|
14
22
|
pivoting_columns: Iterable[str],
|
|
15
|
-
):
|
|
23
|
+
) -> pd.DataFrame:
|
|
16
24
|
"""Generates a pivoted table in wide format.
|
|
17
25
|
|
|
18
26
|
Args:
|
|
19
|
-
|
|
27
|
+
long_table: Dataframe in long format that is used to generate a table in wide
|
|
28
|
+
format.
|
|
20
29
|
index: One or multiple column names that are used to group the table for
|
|
21
30
|
pivoting.
|
|
22
31
|
group_by: Column that is used to split the table on its unique entries.
|
|
@@ -58,7 +67,7 @@ def pivot_table(
|
|
|
58
67
|
|
|
59
68
|
|
|
60
69
|
def pivot_column(
|
|
61
|
-
table: pd.DataFrame, index:
|
|
70
|
+
table: pd.DataFrame, index: str | Iterable[str], group_by: str, values: str
|
|
62
71
|
) -> pd.DataFrame:
|
|
63
72
|
"""Returns a reshaped dataframe, generated by pivoting the table on one column.
|
|
64
73
|
|
|
@@ -98,7 +107,7 @@ def pivot_column(
|
|
|
98
107
|
|
|
99
108
|
|
|
100
109
|
def join_unique(
|
|
101
|
-
table: pd.DataFrame, index:
|
|
110
|
+
table: pd.DataFrame, index: str | Iterable[str], values: str
|
|
102
111
|
) -> pd.DataFrame:
|
|
103
112
|
"""Returns a new dataframe with unique values from a column and grouped by 'index'.
|
|
104
113
|
|
msreport/aggregate/summarize.py
CHANGED
|
@@ -1,4 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
"""High-level functions for aggregating quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
This module offers functions to summarize data from a lower level of abstraction (e.g.
|
|
4
|
+
ions, peptides) to a higher level (e.g., peptides, proteins, PTMs). It operates directly
|
|
5
|
+
on pandas DataFrames, allowing users to specify a grouping column and the columns to be
|
|
6
|
+
summarized. These functions often leverage low-level condenser operations defined in
|
|
7
|
+
`msreport.aggregate.condense`. It includes specific functions for MaxLFQ summation, as
|
|
8
|
+
well as general counting, joining, and summing of columns.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Callable, Iterable, Optional
|
|
2
12
|
|
|
3
13
|
import numpy as np
|
|
4
14
|
import pandas as pd
|
|
@@ -10,7 +20,7 @@ from msreport.helper import find_sample_columns
|
|
|
10
20
|
def count_unique(
|
|
11
21
|
table: pd.DataFrame,
|
|
12
22
|
group_by: str,
|
|
13
|
-
input_column:
|
|
23
|
+
input_column: str | Iterable[str],
|
|
14
24
|
output_column: str = "Unique counts",
|
|
15
25
|
is_sorted: bool = False,
|
|
16
26
|
) -> pd.DataFrame:
|
|
@@ -55,7 +65,7 @@ def count_unique(
|
|
|
55
65
|
def join_unique(
|
|
56
66
|
table: pd.DataFrame,
|
|
57
67
|
group_by: str,
|
|
58
|
-
input_column:
|
|
68
|
+
input_column: str | Iterable[str],
|
|
59
69
|
output_column: str = "Unique values",
|
|
60
70
|
sep: str = ";",
|
|
61
71
|
is_sorted: bool = False,
|
|
@@ -215,7 +225,7 @@ def sum_columns_maxlfq(
|
|
|
215
225
|
def aggregate_unique_groups(
|
|
216
226
|
table: pd.DataFrame,
|
|
217
227
|
group_by: str,
|
|
218
|
-
columns_to_aggregate:
|
|
228
|
+
columns_to_aggregate: str | Iterable[str],
|
|
219
229
|
condenser: Callable,
|
|
220
230
|
is_sorted: bool,
|
|
221
231
|
) -> tuple[np.ndarray, np.ndarray]:
|
msreport/analyze.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Tools for post-processing and statistical analysis of `Qtable` data.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
All functions in this module take a `Qtable` object and modify its data in place. The
|
|
4
|
+
module provides functionality for data evaluation, normalization, imputation of missing
|
|
5
|
+
values, and statistical testing, including integration with R's LIMMA package.
|
|
6
|
+
"""
|
|
4
7
|
|
|
5
8
|
import warnings
|
|
6
9
|
from typing import Iterable, Optional, Protocol, Sequence
|
|
7
10
|
|
|
8
11
|
import numpy as np
|
|
9
12
|
import pandas as pd
|
|
13
|
+
from typing_extensions import Self
|
|
10
14
|
|
|
11
15
|
import msreport.normalize
|
|
12
16
|
from msreport.errors import OptionalDependencyError
|
|
@@ -24,7 +28,7 @@ except OptionalDependencyError as err:
|
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
class Transformer(Protocol):
|
|
27
|
-
def fit(self, table: pd.DataFrame) ->
|
|
31
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
28
32
|
"""Fits the Transformer and returns a fitted Transformer instance."""
|
|
29
33
|
|
|
30
34
|
def is_fitted(self) -> bool:
|
|
@@ -35,7 +39,7 @@ class Transformer(Protocol):
|
|
|
35
39
|
|
|
36
40
|
|
|
37
41
|
class CategoryTransformer(Protocol):
|
|
38
|
-
def fit(self, table: pd.DataFrame) ->
|
|
42
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
39
43
|
"""Fits the Transformer and returns a fitted Transformer instance."""
|
|
40
44
|
|
|
41
45
|
def is_fitted(self) -> bool:
|
|
@@ -162,7 +166,7 @@ def validate_proteins(
|
|
|
162
166
|
|
|
163
167
|
|
|
164
168
|
def apply_transformer(
|
|
165
|
-
qtable:
|
|
169
|
+
qtable: Qtable,
|
|
166
170
|
transformer: Transformer,
|
|
167
171
|
tag: str,
|
|
168
172
|
exclude_invalid: bool,
|
|
@@ -205,6 +209,64 @@ def apply_transformer(
|
|
|
205
209
|
qtable.data[data_table.columns] = data_table
|
|
206
210
|
|
|
207
211
|
|
|
212
|
+
def apply_category_transformer(
|
|
213
|
+
qtable: Qtable,
|
|
214
|
+
transformer: CategoryTransformer,
|
|
215
|
+
tag: str,
|
|
216
|
+
exclude_invalid: bool,
|
|
217
|
+
remove_invalid: bool,
|
|
218
|
+
new_tag: Optional[str] = None,
|
|
219
|
+
) -> None:
|
|
220
|
+
"""Apply a category transformer to Qtable columns selected by tag.
|
|
221
|
+
|
|
222
|
+
Args:
|
|
223
|
+
qtable: A Qtable instance, to which the transformer is applied.
|
|
224
|
+
transformer: The CategoryTransformer to apply.
|
|
225
|
+
tag: The tag used to identify the columns for applying the transformer.
|
|
226
|
+
exclude_invalid: Exclude invalid values from the transformation.
|
|
227
|
+
remove_invalid: Remove invalid values from the table after the transformation.
|
|
228
|
+
new_tag: Optional, if specified then the tag is replaced with this value in the
|
|
229
|
+
column names and the transformed data is stored to these new columns.
|
|
230
|
+
|
|
231
|
+
Raises:
|
|
232
|
+
KeyError: If the category column of the `transformer` is not found in the
|
|
233
|
+
`qtable.data`.
|
|
234
|
+
ValueError: If no sample columns are found for the specified tag.
|
|
235
|
+
"""
|
|
236
|
+
category_column = transformer.get_category_column()
|
|
237
|
+
if category_column not in qtable.data.columns:
|
|
238
|
+
raise KeyError(
|
|
239
|
+
f'The category column "{category_column}" in the transformer '
|
|
240
|
+
f"is not found in `qtable.data`."
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
valid = qtable.data["Valid"]
|
|
244
|
+
samples = qtable.get_samples()
|
|
245
|
+
sample_columns = find_sample_columns(qtable.data, tag, samples)
|
|
246
|
+
|
|
247
|
+
if not sample_columns:
|
|
248
|
+
raise ValueError(f"No sample columns found for tag '{tag}'.")
|
|
249
|
+
|
|
250
|
+
if new_tag is not None:
|
|
251
|
+
sample_columns = [c.replace(tag, new_tag) for c in sample_columns]
|
|
252
|
+
column_mapping = dict(zip(samples, sample_columns))
|
|
253
|
+
|
|
254
|
+
data_table = qtable.make_sample_table(tag, samples_as_columns=True)
|
|
255
|
+
data_table[category_column] = qtable.data[category_column]
|
|
256
|
+
|
|
257
|
+
if exclude_invalid:
|
|
258
|
+
data_table.loc[valid, :] = transformer.transform(data_table.loc[valid, :])
|
|
259
|
+
else:
|
|
260
|
+
data_table = transformer.transform(data_table)
|
|
261
|
+
data_table = data_table.drop(columns=[category_column])
|
|
262
|
+
|
|
263
|
+
if remove_invalid:
|
|
264
|
+
data_table[~valid] = np.nan
|
|
265
|
+
|
|
266
|
+
data_table.columns = [column_mapping[s] for s in data_table.columns]
|
|
267
|
+
qtable.data[data_table.columns] = data_table
|
|
268
|
+
|
|
269
|
+
|
|
208
270
|
def normalize_expression(
|
|
209
271
|
qtable: Qtable,
|
|
210
272
|
normalizer: Transformer,
|
msreport/export.py
CHANGED
|
@@ -1,19 +1,13 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
'Leading proteins',
|
|
8
|
-
'Protein entry name',
|
|
9
|
-
'Fasta header',
|
|
10
|
-
'Protein length',
|
|
11
|
-
'iBAQ peptides',
|
|
12
|
-
'Sequence coverage',
|
|
13
|
-
], dtype='object')
|
|
1
|
+
"""Exporting of proteomics data from `Qtable` into external formats.
|
|
2
|
+
|
|
3
|
+
This module offers functionalities to convert and save `Qtable` data into files
|
|
4
|
+
compatible with external tools (Amica and Perseus), and to create sequence coverage maps
|
|
5
|
+
in HTML format. While most functions operate on `Qtable` instances, some may accept
|
|
6
|
+
other data structures.
|
|
14
7
|
"""
|
|
15
8
|
|
|
16
9
|
import os
|
|
10
|
+
import pathlib
|
|
17
11
|
import warnings
|
|
18
12
|
from collections import defaultdict as ddict
|
|
19
13
|
from typing import Iterable, Optional, Protocol, Sequence
|
|
@@ -99,7 +93,7 @@ def contaminants_to_clipboard(qtable: Qtable) -> None:
|
|
|
99
93
|
|
|
100
94
|
def to_perseus_matrix(
|
|
101
95
|
qtable: Qtable,
|
|
102
|
-
directory,
|
|
96
|
+
directory: str | pathlib.Path,
|
|
103
97
|
table_name: str = "perseus_matrix.tsv",
|
|
104
98
|
) -> None:
|
|
105
99
|
"""Exports a qtable to a perseus matrix file in tsv format.
|
|
@@ -151,7 +145,7 @@ def to_perseus_matrix(
|
|
|
151
145
|
|
|
152
146
|
def to_amica(
|
|
153
147
|
qtable: Qtable,
|
|
154
|
-
directory,
|
|
148
|
+
directory: str | pathlib.Path,
|
|
155
149
|
table_name: str = "amica_table.tsv",
|
|
156
150
|
design_name: str = "amica_design.tsv",
|
|
157
151
|
) -> None:
|
msreport/fasta.py
CHANGED
|
@@ -1,11 +1,18 @@
|
|
|
1
|
+
"""Functionalities for import and access to protein sequence databases from FASTA files.
|
|
2
|
+
|
|
3
|
+
This module serves as an interface to the `profasta` library, offering a convenient way
|
|
4
|
+
to generate a `profasta.db.ProteinDatabase` from one or multiple FASTA files. It
|
|
5
|
+
supports custom FASTA header parsing through a configurable header parser.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import pathlib
|
|
2
|
-
from typing import Iterable
|
|
9
|
+
from typing import Iterable
|
|
3
10
|
|
|
4
11
|
from profasta.db import ProteinDatabase
|
|
5
12
|
|
|
6
13
|
|
|
7
14
|
def import_protein_database(
|
|
8
|
-
fasta_path:
|
|
15
|
+
fasta_path: str | pathlib.Path | Iterable[str | pathlib.Path],
|
|
9
16
|
header_parser: str = "uniprot",
|
|
10
17
|
) -> ProteinDatabase:
|
|
11
18
|
"""Generates a protein database from one or a list of fasta files.
|
msreport/helper/__init__.py
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
"""A collection of widely used helper and utility functions.
|
|
2
|
+
|
|
3
|
+
This module re-exports commonly used functions from various `msreport.helper`
|
|
4
|
+
submodules for convenience.
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
from .calc import (
|
|
2
8
|
calculate_monoisotopic_mass,
|
|
3
9
|
calculate_sequence_coverage,
|
|
@@ -21,3 +27,15 @@ from .temp import (
|
|
|
21
27
|
extract_modifications,
|
|
22
28
|
modify_peptide,
|
|
23
29
|
)
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"apply_intensity_cutoff",
|
|
33
|
+
"find_columns",
|
|
34
|
+
"find_sample_columns",
|
|
35
|
+
"guess_design",
|
|
36
|
+
"intensities_in_logspace",
|
|
37
|
+
"keep_rows_by_partial_match",
|
|
38
|
+
"remove_rows_by_partial_match",
|
|
39
|
+
"rename_mq_reporter_channels",
|
|
40
|
+
"rename_sample_columns",
|
|
41
|
+
]
|
msreport/impute.py
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
|
-
|
|
1
|
+
"""Transformer classes for imputing missing values in quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
This module defines transformer classes that can be fitted to a table containing
|
|
4
|
+
quantitative values to learn imputation parameters. Once fitted, these transformers can
|
|
5
|
+
then be applied to another table to transform it by filling in missing values. The
|
|
6
|
+
transformation returns a new copy of the table with the imputed values, leaving the
|
|
7
|
+
original table unchanged.
|
|
8
|
+
"""
|
|
2
9
|
|
|
3
10
|
from typing import Any, Optional
|
|
4
11
|
|
|
5
12
|
import numpy as np
|
|
6
13
|
import pandas as pd
|
|
14
|
+
from typing_extensions import Self
|
|
7
15
|
|
|
8
16
|
from msreport.errors import NotFittedError
|
|
9
17
|
|
|
@@ -42,7 +50,7 @@ class FixedValueImputer:
|
|
|
42
50
|
self.column_wise = column_wise
|
|
43
51
|
self._sample_fill_values: dict[str, float] = {}
|
|
44
52
|
|
|
45
|
-
def fit(self, table: pd.DataFrame) ->
|
|
53
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
46
54
|
"""Fits the FixedValueImputer.
|
|
47
55
|
|
|
48
56
|
Args:
|
|
@@ -79,7 +87,7 @@ class FixedValueImputer:
|
|
|
79
87
|
Returns:
|
|
80
88
|
'table' with imputed missing values.
|
|
81
89
|
"""
|
|
82
|
-
|
|
90
|
+
_confirm_is_fitted(self)
|
|
83
91
|
|
|
84
92
|
_table = table.copy()
|
|
85
93
|
for column in _table.columns:
|
|
@@ -108,7 +116,7 @@ class GaussianImputer:
|
|
|
108
116
|
self.sigma = sigma
|
|
109
117
|
self.seed = seed
|
|
110
118
|
|
|
111
|
-
def fit(self, table: pd.DataFrame) ->
|
|
119
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
112
120
|
"""Fits the GaussianImputer, although this is not necessary.
|
|
113
121
|
|
|
114
122
|
Args:
|
|
@@ -134,7 +142,7 @@ class GaussianImputer:
|
|
|
134
142
|
Returns:
|
|
135
143
|
'table' with imputed missing values.
|
|
136
144
|
"""
|
|
137
|
-
|
|
145
|
+
_confirm_is_fitted(self)
|
|
138
146
|
np.random.seed(self.seed)
|
|
139
147
|
|
|
140
148
|
_table = table.copy()
|
|
@@ -182,9 +190,9 @@ class PerseusImputer:
|
|
|
182
190
|
self.std_width = std_width
|
|
183
191
|
self.column_wise = column_wise
|
|
184
192
|
self.seed = seed
|
|
185
|
-
self._column_params: dict[str, dict] = {}
|
|
193
|
+
self._column_params: dict[str, dict[str, float]] = {}
|
|
186
194
|
|
|
187
|
-
def fit(self, table: pd.DataFrame) ->
|
|
195
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
188
196
|
"""Fits the PerseusImputer.
|
|
189
197
|
|
|
190
198
|
Args:
|
|
@@ -223,7 +231,7 @@ class PerseusImputer:
|
|
|
223
231
|
Returns:
|
|
224
232
|
'table' with imputed missing values.
|
|
225
233
|
"""
|
|
226
|
-
|
|
234
|
+
_confirm_is_fitted(self)
|
|
227
235
|
np.random.seed(self.seed)
|
|
228
236
|
|
|
229
237
|
_table = table.copy()
|
|
@@ -239,7 +247,7 @@ class PerseusImputer:
|
|
|
239
247
|
return _table
|
|
240
248
|
|
|
241
249
|
|
|
242
|
-
def
|
|
250
|
+
def _confirm_is_fitted(imputer: Any, msg: Optional[str] = None) -> None:
|
|
243
251
|
"""Perform is_fitted validation for imputer instances.
|
|
244
252
|
|
|
245
253
|
Checks if the imputer is fitted by verifying the presence of fitted attributes
|
|
@@ -266,7 +274,7 @@ def confirm_is_fitted(imputer: Any, msg: Optional[str] = None) -> None:
|
|
|
266
274
|
raise NotFittedError(msg % {"name": type(imputer).__name__})
|
|
267
275
|
|
|
268
276
|
|
|
269
|
-
def _calculate_integer_below_min(table) -> int:
|
|
277
|
+
def _calculate_integer_below_min(table: pd.DataFrame) -> int:
|
|
270
278
|
minimal_value = np.nanmin(table.to_numpy().flatten())
|
|
271
279
|
below_minimal = np.floor(minimal_value)
|
|
272
280
|
if minimal_value <= below_minimal:
|
msreport/isobar.py
CHANGED
|
@@ -1,34 +1,31 @@
|
|
|
1
|
-
|
|
1
|
+
"""Provides a transformer class for processing isobarically labeled proteomics data.
|
|
2
|
+
|
|
3
|
+
This module defines the `IsotopeImpurityCorrecter` class for processing of isobaric
|
|
4
|
+
(e.g., TMT, iTRAQ) reporter intensities. This transformer must be fitted with an isotope
|
|
5
|
+
impurity matrix to correct interference in reporter intensities. Once fitted, the
|
|
6
|
+
transformer can then be applied to a table containing reporter ion intensities to adjust
|
|
7
|
+
its intensity values. The transformation returns a new copy of the table with the
|
|
8
|
+
processed values, leaving the original table unchanged.
|
|
9
|
+
"""
|
|
2
10
|
|
|
3
11
|
import functools
|
|
4
|
-
from typing import Protocol
|
|
5
12
|
|
|
6
13
|
import numpy as np
|
|
7
14
|
import pandas as pd
|
|
8
15
|
import scipy
|
|
16
|
+
from typing_extensions import Self
|
|
9
17
|
|
|
10
18
|
import msreport.helper
|
|
11
19
|
from msreport.errors import NotFittedError
|
|
12
20
|
|
|
13
21
|
|
|
14
|
-
class Transformer(Protocol):
|
|
15
|
-
def fit(self, table: pd.DataFrame) -> Transformer:
|
|
16
|
-
"""Fits the Transformer and returns a fitted Transformer instance."""
|
|
17
|
-
|
|
18
|
-
def is_fitted(self) -> bool:
|
|
19
|
-
"""Returns True if the Transformer has been fitted."""
|
|
20
|
-
|
|
21
|
-
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
22
|
-
"""Transform values in 'table'."""
|
|
23
|
-
|
|
24
|
-
|
|
25
22
|
class IsotopeImpurityCorrecter:
|
|
26
23
|
"""Corrects isotope impurity interference in isobaric reporter expression values."""
|
|
27
24
|
|
|
28
25
|
def __init__(self):
|
|
29
26
|
self._impurity_matrix = None
|
|
30
27
|
|
|
31
|
-
def fit(self, impurity_matrix: np.ndarray) ->
|
|
28
|
+
def fit(self, impurity_matrix: np.ndarray) -> Self:
|
|
32
29
|
"""Fits the isotope impurity correcter to a given impurity matrix.
|
|
33
30
|
|
|
34
31
|
Args:
|
msreport/normalize.py
CHANGED
|
@@ -1,4 +1,16 @@
|
|
|
1
|
-
|
|
1
|
+
"""Transformer classes for normalizing and transforming quantitative proteomics data.
|
|
2
|
+
|
|
3
|
+
This module defines various transformer classes for normalizing and scaling quantitative
|
|
4
|
+
values in tabular data. Examples include normalizers like median, mode, and LOWESS, as
|
|
5
|
+
well as scalers such as PercentageScaler and ZScoreScaler. A specialized
|
|
6
|
+
`CategoricalNormalizer` is also provided, which, when appropriately fitted and applied,
|
|
7
|
+
can be used for complex transformations such as iBAQ or site-to-protein normalization.
|
|
8
|
+
|
|
9
|
+
These transformers can be fitted to a table containing quantitative values to learn
|
|
10
|
+
parameters. Once fitted, they can then be applied to another table to adjust its values.
|
|
11
|
+
The transformation returns a new copy of the table with the normalized/scaled values,
|
|
12
|
+
leaving the original table unchanged.
|
|
13
|
+
"""
|
|
2
14
|
|
|
3
15
|
from typing import Callable, Iterable, Optional, Protocol
|
|
4
16
|
|
|
@@ -79,7 +91,7 @@ class FixedValueNormalizer:
|
|
|
79
91
|
Raises:
|
|
80
92
|
NotFittedError: If the FixedValueNormalizer has not been fitted yet.
|
|
81
93
|
"""
|
|
82
|
-
|
|
94
|
+
_confirm_is_fitted(self)
|
|
83
95
|
return self._sample_fits.copy()
|
|
84
96
|
|
|
85
97
|
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -95,7 +107,7 @@ class FixedValueNormalizer:
|
|
|
95
107
|
Raises:
|
|
96
108
|
NotFittedError: If the FixedValueNormalizer has not been fitted yet.
|
|
97
109
|
"""
|
|
98
|
-
|
|
110
|
+
_confirm_is_fitted(self)
|
|
99
111
|
|
|
100
112
|
_table = table.copy()
|
|
101
113
|
for column in _table.columns:
|
|
@@ -195,7 +207,7 @@ class ValueDependentNormalizer:
|
|
|
195
207
|
Raises:
|
|
196
208
|
NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
|
|
197
209
|
"""
|
|
198
|
-
|
|
210
|
+
_confirm_is_fitted(self)
|
|
199
211
|
return self._sample_fits.copy()
|
|
200
212
|
|
|
201
213
|
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -211,7 +223,7 @@ class ValueDependentNormalizer:
|
|
|
211
223
|
Raises:
|
|
212
224
|
NotFittedError: If the ValueDependentNormalizer has not been fitted yet.
|
|
213
225
|
"""
|
|
214
|
-
|
|
226
|
+
_confirm_is_fitted(self)
|
|
215
227
|
|
|
216
228
|
_table = table.copy()
|
|
217
229
|
for column in _table.columns:
|
|
@@ -250,6 +262,59 @@ class ValueDependentNormalizer:
|
|
|
250
262
|
self._sample_fits[sample] = sample_fit
|
|
251
263
|
|
|
252
264
|
|
|
265
|
+
class SumNormalizer:
|
|
266
|
+
"""Normalizer that uses the sum of all values in each sample for normalization.
|
|
267
|
+
|
|
268
|
+
Expects log2-transformed intensity values. To obtain normalization factors, the sum
|
|
269
|
+
of non-log2-transformed values is calculated for each sample, then divided by the
|
|
270
|
+
average of all sample sums and log2-transformed.
|
|
271
|
+
"""
|
|
272
|
+
|
|
273
|
+
def __init__(self):
|
|
274
|
+
"""Initializes the SumNormalizer."""
|
|
275
|
+
self._sample_fits: dict[str, float] = {}
|
|
276
|
+
|
|
277
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
278
|
+
"""Fits the SumNormalizer and returns a fitted instance.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
table: Dataframe used to calculate normalization values for each column.
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
Returns the instance itself.
|
|
285
|
+
"""
|
|
286
|
+
_sums = np.power(2, table).sum()
|
|
287
|
+
_log2_fits = np.log2(_sums.divide(_sums.mean()))
|
|
288
|
+
self._sample_fits = _log2_fits.to_dict()
|
|
289
|
+
return self
|
|
290
|
+
|
|
291
|
+
def is_fitted(self) -> bool:
|
|
292
|
+
"""Returns True if the Transformer has been fitted."""
|
|
293
|
+
return True if self._sample_fits else False
|
|
294
|
+
|
|
295
|
+
def get_fits(self) -> dict[str, float]:
|
|
296
|
+
"""Returns a dictionary containing the fitted center values per sample.
|
|
297
|
+
|
|
298
|
+
Raises:
|
|
299
|
+
NotFittedError: If the SumNormalizer has not been fitted yet.
|
|
300
|
+
"""
|
|
301
|
+
_confirm_is_fitted(self)
|
|
302
|
+
return self._sample_fits.copy()
|
|
303
|
+
|
|
304
|
+
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
305
|
+
"""Transform values in table."""
|
|
306
|
+
_confirm_is_fitted(self)
|
|
307
|
+
|
|
308
|
+
_table = table.copy()
|
|
309
|
+
for column in _table.columns:
|
|
310
|
+
column_data = np.array(_table[column], dtype=float)
|
|
311
|
+
mask = np.isfinite(column_data)
|
|
312
|
+
column_data[mask] = column_data[mask] - self._sample_fits[column]
|
|
313
|
+
|
|
314
|
+
_table[column] = column_data
|
|
315
|
+
return _table
|
|
316
|
+
|
|
317
|
+
|
|
253
318
|
class MedianNormalizer(FixedValueNormalizer):
|
|
254
319
|
"""A FixedValueNormalizer that uses the median as the fitting function.
|
|
255
320
|
|
|
@@ -346,7 +411,7 @@ class CategoricalNormalizer:
|
|
|
346
411
|
Raises:
|
|
347
412
|
NotFittedError: If the CategoricalNormalizer has not been fitted yet.
|
|
348
413
|
"""
|
|
349
|
-
|
|
414
|
+
_confirm_is_fitted(self)
|
|
350
415
|
return self._fitted_table.copy()
|
|
351
416
|
|
|
352
417
|
def get_category_column(self) -> str:
|
|
@@ -367,7 +432,7 @@ class CategoricalNormalizer:
|
|
|
367
432
|
table.
|
|
368
433
|
NotFittedError: If the CategoricalNormalizer has not been fitted yet.
|
|
369
434
|
"""
|
|
370
|
-
|
|
435
|
+
_confirm_is_fitted(self)
|
|
371
436
|
|
|
372
437
|
original_index = table.index
|
|
373
438
|
table = table.set_index(self.get_category_column(), drop=True, inplace=False)
|
|
@@ -396,11 +461,11 @@ class PercentageScaler:
|
|
|
396
461
|
return self
|
|
397
462
|
|
|
398
463
|
def is_fitted(self) -> bool:
|
|
399
|
-
"""Always returns True because the
|
|
464
|
+
"""Always returns True because the Scaler does not need to be fitted."""
|
|
400
465
|
return True
|
|
401
466
|
|
|
402
467
|
def get_fits(self) -> dict:
|
|
403
|
-
"""Returns
|
|
468
|
+
"""Returns an empty dictionary."""
|
|
404
469
|
return {}
|
|
405
470
|
|
|
406
471
|
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -457,7 +522,27 @@ class ZscoreScaler:
|
|
|
457
522
|
return scaled_table
|
|
458
523
|
|
|
459
524
|
|
|
460
|
-
|
|
525
|
+
class Log2Transformer:
|
|
526
|
+
"""Apply log2 transformation to column values."""
|
|
527
|
+
|
|
528
|
+
def fit(self, table: pd.DataFrame) -> Self:
|
|
529
|
+
"""Returns the instance itself."""
|
|
530
|
+
return self
|
|
531
|
+
|
|
532
|
+
def is_fitted(self) -> bool:
|
|
533
|
+
"""Always returns True because the transformer does not need to be fitted."""
|
|
534
|
+
return True
|
|
535
|
+
|
|
536
|
+
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
537
|
+
"""Applies a log2 transformation to each column of the table.
|
|
538
|
+
|
|
539
|
+
Zero values are replaced with NaN before the transformation to avoid an error
|
|
540
|
+
during the log2 calculation.
|
|
541
|
+
"""
|
|
542
|
+
return pd.DataFrame(np.log2(table.replace({0: np.nan})))
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def _confirm_is_fitted(
|
|
461
546
|
normalizer: AbstractTransformer, msg: Optional[str] = None
|
|
462
547
|
) -> None:
|
|
463
548
|
"""Perform is_fitted validation for normalizer instances.
|