msreport 0.0.26__tar.gz → 0.0.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {msreport-0.0.26 → msreport-0.0.28}/PKG-INFO +7 -3
- {msreport-0.0.26 → msreport-0.0.28}/README.md +1 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport/__init__.py +4 -6
- {msreport-0.0.26 → msreport-0.0.28}/msreport/aggregate/condense.py +1 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport/aggregate/pivot.py +1 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport/aggregate/summarize.py +2 -2
- {msreport-0.0.26 → msreport-0.0.28}/msreport/analyze.py +171 -38
- {msreport-0.0.26 → msreport-0.0.28}/msreport/errors.py +1 -2
- {msreport-0.0.26 → msreport-0.0.28}/msreport/export.py +16 -13
- {msreport-0.0.26 → msreport-0.0.28}/msreport/fasta.py +2 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport/helper/__init__.py +7 -7
- {msreport-0.0.26 → msreport-0.0.28}/msreport/helper/calc.py +29 -24
- {msreport-0.0.26 → msreport-0.0.28}/msreport/helper/maxlfq.py +2 -2
- {msreport-0.0.26 → msreport-0.0.28}/msreport/helper/table.py +5 -6
- {msreport-0.0.26 → msreport-0.0.28}/msreport/impute.py +7 -8
- {msreport-0.0.26 → msreport-0.0.28}/msreport/isobar.py +10 -9
- {msreport-0.0.26 → msreport-0.0.28}/msreport/normalize.py +54 -36
- {msreport-0.0.26 → msreport-0.0.28}/msreport/peptidoform.py +6 -4
- msreport-0.0.28/msreport/plot/__init__.py +41 -0
- msreport-0.0.28/msreport/plot/_partial_plots.py +159 -0
- msreport-0.0.28/msreport/plot/comparison.py +490 -0
- msreport-0.0.28/msreport/plot/distribution.py +253 -0
- msreport-0.0.28/msreport/plot/multivariate.py +355 -0
- msreport-0.0.28/msreport/plot/quality.py +431 -0
- msreport-0.0.28/msreport/plot/style.py +286 -0
- msreport-0.0.28/msreport/plot/style_sheets/msreport-notebook.mplstyle +57 -0
- msreport-0.0.28/msreport/plot/style_sheets/seaborn-whitegrid.mplstyle +45 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport/qtable.py +109 -17
- {msreport-0.0.26 → msreport-0.0.28}/msreport/reader.py +73 -79
- {msreport-0.0.26 → msreport-0.0.28}/msreport/rinterface/__init__.py +2 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport/rinterface/limma.py +2 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport/rinterface/rinstaller.py +3 -3
- {msreport-0.0.26 → msreport-0.0.28}/msreport.egg-info/PKG-INFO +7 -3
- {msreport-0.0.26 → msreport-0.0.28}/msreport.egg-info/SOURCES.txt +10 -1
- {msreport-0.0.26 → msreport-0.0.28}/msreport.egg-info/requires.txt +6 -1
- {msreport-0.0.26 → msreport-0.0.28}/pyproject.toml +40 -1
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_analyze.py +115 -18
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_peptidoform.py +2 -1
- msreport-0.0.28/tests/test_plot.py +144 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_qtable.py +90 -23
- msreport-0.0.26/msreport/plot.py +0 -1132
- {msreport-0.0.26 → msreport-0.0.28}/LICENSE.txt +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport/aggregate/__init__.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport/helper/temp.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport/rinterface/rscripts/limma.R +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport.egg-info/dependency_links.txt +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/msreport.egg-info/top_level.txt +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/setup.cfg +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/setup.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_export.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_helper.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_impute.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_isobar.py +0 -0
- {msreport-0.0.26 → msreport-0.0.28}/tests/test_maxlfq.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: msreport
|
|
3
|
-
Version: 0.0.26
|
|
3
|
+
Version: 0.0.28
|
|
4
4
|
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
5
|
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -19,11 +19,15 @@ Requires-Dist: pandas>=1.4.4
|
|
|
19
19
|
Requires-Dist: profasta>=0.0.4
|
|
20
20
|
Requires-Dist: pyteomics>=4.6.0
|
|
21
21
|
Requires-Dist: pyyaml>=6.0.0
|
|
22
|
-
Requires-Dist: rpy2
|
|
22
|
+
Requires-Dist: rpy2!=3.5.13,>=3.5.3
|
|
23
23
|
Requires-Dist: scikit-learn>=1.0.0
|
|
24
24
|
Requires-Dist: scipy>=1.9.1
|
|
25
25
|
Requires-Dist: seaborn>=0.12.0
|
|
26
26
|
Requires-Dist: statsmodels>=0.13.2
|
|
27
|
+
Requires-Dist: typing_extensions>=4
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
27
31
|
Dynamic: license-file
|
|
28
32
|
|
|
29
33
|
[](https://www.repostatus.org/#wip)
|
|
@@ -117,7 +121,7 @@ command as described above.
|
|
|
117
121
|
### Additional requirements
|
|
118
122
|
|
|
119
123
|
MsReport provides an interface to the R package LIMMA for differential expression
|
|
120
|
-
analysis, which requires a local installation of R (R version
|
|
124
|
+
analysis, which requires a local installation of R (R version 4.0 or higher) and the
|
|
121
125
|
system environment variable "R_HOME" to be set to the R home directory. Note that it
|
|
122
126
|
might be necessary to restart the computer after adding the "R_HOME" variable. The R
|
|
123
127
|
home directory can also be found from within R by using the command below, and might
|
|
@@ -89,7 +89,7 @@ command as described above.
|
|
|
89
89
|
### Additional requirements
|
|
90
90
|
|
|
91
91
|
MsReport provides an interface to the R package LIMMA for differential expression
|
|
92
|
-
analysis, which requires a local installation of R (R version
|
|
92
|
+
analysis, which requires a local installation of R (R version 4.0 or higher) and the
|
|
93
93
|
system environment variable "R_HOME" to be set to the R home directory. Note that it
|
|
94
94
|
might be necessary to restart the computer after adding the "R_HOME" variable. The R
|
|
95
95
|
home directory can also be found from within R by using the command below, and might
|
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
from msreport.qtable import Qtable
|
|
2
|
-
from msreport.reader import MaxQuantReader, FragPipeReader, SpectronautReader
|
|
3
|
-
|
|
4
|
-
from msreport.fasta import import_protein_database
|
|
5
|
-
|
|
6
1
|
import msreport.analyze
|
|
7
2
|
import msreport.export
|
|
8
3
|
import msreport.impute
|
|
9
4
|
import msreport.normalize
|
|
10
5
|
import msreport.plot
|
|
11
6
|
import msreport.reader
|
|
7
|
+
from msreport.fasta import import_protein_database
|
|
8
|
+
from msreport.qtable import Qtable
|
|
9
|
+
from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
|
|
12
10
|
|
|
13
|
-
__version__ = "0.0.26"
|
|
11
|
+
__version__ = "0.0.28"
|
|
@@ -71,7 +71,7 @@ def maximum_per_column(array: np.ndarray) -> np.ndarray:
|
|
|
71
71
|
return np.array([maximum(i) for i in array.transpose()])
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def minimum(array: np.ndarray) ->
|
|
74
|
+
def minimum(array: np.ndarray) -> float:
|
|
75
75
|
"""Returns the lowest finite value from one or multiple columns."""
|
|
76
76
|
array = array.flatten()
|
|
77
77
|
if np.isfinite(array).any():
|
|
@@ -218,7 +218,7 @@ def aggregate_unique_groups(
|
|
|
218
218
|
columns_to_aggregate: Union[str, Iterable],
|
|
219
219
|
condenser: Callable,
|
|
220
220
|
is_sorted: bool,
|
|
221
|
-
) ->
|
|
221
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
222
222
|
"""Aggregates column(s) by applying a condenser function to unique groups.
|
|
223
223
|
|
|
224
224
|
The function returns two arrays containing the aggregated values and the
|
|
@@ -256,7 +256,7 @@ def aggregate_unique_groups(
|
|
|
256
256
|
|
|
257
257
|
def _prepare_grouping_indices(
|
|
258
258
|
table: pd.DataFrame, group_by: str, is_sorted: bool
|
|
259
|
-
) ->
|
|
259
|
+
) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
|
|
260
260
|
"""Prepares start indices and names of unique groups from a sorted dataframe.
|
|
261
261
|
|
|
262
262
|
Args:
|
|
@@ -1,14 +1,16 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""The analyze module contains methods for analysing quantification results."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
import warnings
|
|
6
|
+
from typing import Iterable, Optional, Protocol, Sequence
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
9
|
import pandas as pd
|
|
9
10
|
|
|
10
11
|
import msreport.normalize
|
|
11
12
|
import msreport.rinterface
|
|
13
|
+
from msreport.helper import find_sample_columns
|
|
12
14
|
from msreport.qtable import Qtable
|
|
13
15
|
|
|
14
16
|
|
|
@@ -33,8 +35,8 @@ class CategoryTransformer(Protocol):
|
|
|
33
35
|
def transform(self, table: pd.DataFrame) -> pd.DataFrame:
|
|
34
36
|
"""Transform values in 'table'."""
|
|
35
37
|
|
|
36
|
-
def get_category_column(self
|
|
37
|
-
"""Returns the
|
|
38
|
+
def get_category_column(self) -> str:
|
|
39
|
+
"""Returns the name of the category column."""
|
|
38
40
|
|
|
39
41
|
|
|
40
42
|
def analyze_missingness(qtable: Qtable) -> None:
|
|
@@ -75,6 +77,7 @@ def analyze_missingness(qtable: Qtable) -> None:
|
|
|
75
77
|
def validate_proteins(
|
|
76
78
|
qtable: Qtable,
|
|
77
79
|
min_peptides: int = 0,
|
|
80
|
+
min_spectral_counts: int = 0,
|
|
78
81
|
remove_contaminants: bool = True,
|
|
79
82
|
min_events: Optional[int] = None,
|
|
80
83
|
max_missing: Optional[int] = None,
|
|
@@ -84,12 +87,13 @@ def validate_proteins(
|
|
|
84
87
|
Adds an additional column "Valid" to the qtable, containing Boolean values.
|
|
85
88
|
|
|
86
89
|
Requires expression columns to be set. Depending on the arguments requires the
|
|
87
|
-
columns "Total peptides", "Potential contaminant", and
|
|
88
|
-
"Missing experiment_name" and "Events experiment_name".
|
|
90
|
+
columns "Total peptides", "Spectral count Combined", "Potential contaminant", and
|
|
91
|
+
the experiment columns "Missing experiment_name" and "Events experiment_name".
|
|
89
92
|
|
|
90
93
|
Args:
|
|
91
94
|
qtable: A Qtable instance.
|
|
92
95
|
min_peptides: Minimum number of unique peptides, default 0.
|
|
96
|
+
min_spectral_counts: Minimum number of combined spectral counts, default 0.
|
|
93
97
|
remove_contaminants: If true, the "Potential contaminant" column is used to
|
|
94
98
|
remove invalid entries, default True. If no "Potential contaminant" column
|
|
95
99
|
is present 'remove_contaminants' is ignored.
|
|
@@ -107,6 +111,16 @@ def validate_proteins(
|
|
|
107
111
|
[valid_entries, qtable["Total peptides"] >= min_peptides], axis=0
|
|
108
112
|
)
|
|
109
113
|
|
|
114
|
+
if min_spectral_counts > 0:
|
|
115
|
+
if "Spectral count Combined" not in qtable:
|
|
116
|
+
raise KeyError(
|
|
117
|
+
"'Spectral count Combined' column not present in qtable.data"
|
|
118
|
+
)
|
|
119
|
+
valid_entries = np.all(
|
|
120
|
+
[valid_entries, qtable["Spectral count Combined"] >= min_spectral_counts],
|
|
121
|
+
axis=0,
|
|
122
|
+
)
|
|
123
|
+
|
|
110
124
|
# TODO: not tested from here #
|
|
111
125
|
if remove_contaminants:
|
|
112
126
|
if "Potential contaminant" not in qtable:
|
|
@@ -138,6 +152,50 @@ def validate_proteins(
|
|
|
138
152
|
qtable["Valid"] = valid_entries
|
|
139
153
|
|
|
140
154
|
|
|
155
|
+
def apply_transformer(
|
|
156
|
+
qtable: msreport.Qtable,
|
|
157
|
+
transformer: Transformer,
|
|
158
|
+
tag: str,
|
|
159
|
+
exclude_invalid: bool,
|
|
160
|
+
remove_invalid: bool,
|
|
161
|
+
new_tag: Optional[str] = None,
|
|
162
|
+
) -> None:
|
|
163
|
+
"""Applies a transformer to the values of a Qtable selected with the tag parameter.
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
qtable: A Qtable instance, to which the transformer is applied.
|
|
167
|
+
transformer: The transformer to apply.
|
|
168
|
+
tag: The tag used to identify the columns for applying the transformer.
|
|
169
|
+
exclude_invalid: Exclude invalid values from the transformation.
|
|
170
|
+
remove_invalid: Remove invalid values from the table after the transformation.
|
|
171
|
+
new_tag: Optional, if specified then the tag is replaced with this value in the
|
|
172
|
+
column names and the transformed data is stored to these new columns.
|
|
173
|
+
"""
|
|
174
|
+
valid = qtable.data["Valid"]
|
|
175
|
+
samples = qtable.get_samples()
|
|
176
|
+
sample_columns = find_sample_columns(qtable.data, tag, samples)
|
|
177
|
+
|
|
178
|
+
if not sample_columns:
|
|
179
|
+
raise ValueError(f"No sample columns found for tag '{tag}'.")
|
|
180
|
+
|
|
181
|
+
if new_tag is not None:
|
|
182
|
+
sample_columns = [c.replace(tag, new_tag) for c in sample_columns]
|
|
183
|
+
column_mapping = dict(zip(samples, sample_columns))
|
|
184
|
+
|
|
185
|
+
data_table = qtable.make_sample_table(tag, samples_as_columns=True)
|
|
186
|
+
|
|
187
|
+
if exclude_invalid:
|
|
188
|
+
data_table[valid] = transformer.transform(data_table[valid])
|
|
189
|
+
else:
|
|
190
|
+
data_table = transformer.transform(data_table)
|
|
191
|
+
|
|
192
|
+
if remove_invalid:
|
|
193
|
+
data_table[~valid] = np.nan
|
|
194
|
+
|
|
195
|
+
data_table.columns = [column_mapping[s] for s in data_table.columns]
|
|
196
|
+
qtable.data[data_table.columns] = data_table
|
|
197
|
+
|
|
198
|
+
|
|
141
199
|
def normalize_expression(
|
|
142
200
|
qtable: Qtable,
|
|
143
201
|
normalizer: Transformer,
|
|
@@ -168,11 +226,9 @@ def normalize_expression(
|
|
|
168
226
|
raw_data = table[sample_columns]
|
|
169
227
|
if not normalizer.is_fitted():
|
|
170
228
|
if exclude_invalid:
|
|
171
|
-
|
|
229
|
+
normalizer.fit(raw_data[table["Valid"]])
|
|
172
230
|
else:
|
|
173
|
-
|
|
174
|
-
fit_data = raw_data[valid_mask]
|
|
175
|
-
normalizer = normalizer.fit(fit_data)
|
|
231
|
+
normalizer = normalizer.fit(raw_data)
|
|
176
232
|
|
|
177
233
|
transformed_data = normalizer.transform(raw_data)
|
|
178
234
|
qtable[expression_columns] = transformed_data[sample_columns]
|
|
@@ -180,7 +236,7 @@ def normalize_expression(
|
|
|
180
236
|
|
|
181
237
|
def create_site_to_protein_normalizer(
|
|
182
238
|
qtable: Qtable, category_column: str = "Representative protein"
|
|
183
|
-
) -> msreport.
|
|
239
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
184
240
|
"""Creates a fitted `CategoricalNormalizer` for site-to-protein normalization.
|
|
185
241
|
|
|
186
242
|
The `CategoricalNormalizer` is fitted to protein expression profiles of the provided
|
|
@@ -200,8 +256,8 @@ def create_site_to_protein_normalizer(
|
|
|
200
256
|
samples_as_columns=True,
|
|
201
257
|
features=[category_column],
|
|
202
258
|
)
|
|
203
|
-
completely_quantified = (
|
|
204
|
-
|
|
259
|
+
completely_quantified = ~reference_expression[qtable.get_samples()].isna().any(
|
|
260
|
+
axis=1
|
|
205
261
|
)
|
|
206
262
|
reference_expression = reference_expression[completely_quantified]
|
|
207
263
|
|
|
@@ -221,7 +277,7 @@ def create_ibaq_transformer(
|
|
|
221
277
|
qtable: Qtable,
|
|
222
278
|
category_column: str = "Representative protein",
|
|
223
279
|
ibaq_column: str = "iBAQ peptides",
|
|
224
|
-
) -> msreport.
|
|
280
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
225
281
|
"""Creates a fitted `CategoricalNormalizer` for iBAQ transformation.
|
|
226
282
|
|
|
227
283
|
The `CategoricalNormalizer` is fitted to iBAQ peptide counts of the provided
|
|
@@ -247,7 +303,7 @@ def create_ibaq_transformer(
|
|
|
247
303
|
ibaq_factor_values[ibaq_factor_values < 1] = 1
|
|
248
304
|
ibaq_factor_values = np.log2(ibaq_factor_values)
|
|
249
305
|
|
|
250
|
-
reference_table = pd.DataFrame(
|
|
306
|
+
reference_table = pd.DataFrame(dict.fromkeys(sample_columns, ibaq_factor_values))
|
|
251
307
|
reference_table[category_column] = category_values
|
|
252
308
|
|
|
253
309
|
normalizer = msreport.normalize.CategoricalNormalizer(category_column)
|
|
@@ -368,7 +424,15 @@ def calculate_multi_group_comparison(
|
|
|
368
424
|
correspond to entries from qtable.design["Experiment"].
|
|
369
425
|
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
370
426
|
used for calculating the group comparisons; default True.
|
|
427
|
+
|
|
428
|
+
Raises:
|
|
429
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
430
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
431
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
432
|
+
are allowed.
|
|
371
433
|
"""
|
|
434
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
435
|
+
|
|
372
436
|
table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
|
|
373
437
|
comparison_tag = " vs "
|
|
374
438
|
|
|
@@ -421,7 +485,7 @@ def two_group_comparison(
|
|
|
421
485
|
|
|
422
486
|
def calculate_multi_group_limma(
|
|
423
487
|
qtable: Qtable,
|
|
424
|
-
experiment_pairs:
|
|
488
|
+
experiment_pairs: Sequence[Iterable[str]],
|
|
425
489
|
exclude_invalid: bool = True,
|
|
426
490
|
batch: bool = False,
|
|
427
491
|
limma_trend: bool = True,
|
|
@@ -437,8 +501,7 @@ def calculate_multi_group_limma(
|
|
|
437
501
|
|
|
438
502
|
Requires that expression columns are set, and expression values are log2 transformed.
|
|
439
503
|
All rows with missing values are ignored, impute missing values to allow
|
|
440
|
-
differential expression analysis of all rows.
|
|
441
|
-
"Representative protein" is used as the index.
|
|
504
|
+
differential expression analysis of all rows.
|
|
442
505
|
|
|
443
506
|
Args:
|
|
444
507
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -454,7 +517,19 @@ def calculate_multi_group_limma(
|
|
|
454
517
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
455
518
|
variance during calculation of the moderated t-statistics, refer to
|
|
456
519
|
limma.eBayes for details; default True.
|
|
520
|
+
|
|
521
|
+
Raises:
|
|
522
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
523
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
524
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
525
|
+
are allowed.
|
|
526
|
+
KeyError: If the "Batch" column is not present in the qtable.design when
|
|
527
|
+
'batch' is set to True.
|
|
528
|
+
ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
|
|
529
|
+
is set to True.
|
|
457
530
|
"""
|
|
531
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
532
|
+
|
|
458
533
|
# TODO: not tested #
|
|
459
534
|
if batch and "Batch" not in qtable.get_design():
|
|
460
535
|
raise KeyError(
|
|
@@ -468,10 +543,8 @@ def calculate_multi_group_limma(
|
|
|
468
543
|
)
|
|
469
544
|
|
|
470
545
|
design = qtable.get_design()
|
|
471
|
-
table = qtable.make_expression_table(
|
|
472
|
-
|
|
473
|
-
)
|
|
474
|
-
table = table.set_index("Representative protein")
|
|
546
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
547
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
475
548
|
comparison_tag = " vs "
|
|
476
549
|
|
|
477
550
|
if exclude_invalid:
|
|
@@ -487,7 +560,7 @@ def calculate_multi_group_limma(
|
|
|
487
560
|
experiment_to_r[experiment] = f".EXPERIMENT__{i:04d}"
|
|
488
561
|
r_to_experiment = {v: k for k, v in experiment_to_r.items()}
|
|
489
562
|
|
|
490
|
-
r_experiment_pairs = []
|
|
563
|
+
r_experiment_pairs: list[str] = []
|
|
491
564
|
for exp1, exp2 in experiment_pairs:
|
|
492
565
|
r_experiment_pairs.append(f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}")
|
|
493
566
|
|
|
@@ -504,7 +577,7 @@ def calculate_multi_group_limma(
|
|
|
504
577
|
limma_result.rename(columns=mapping, inplace=True)
|
|
505
578
|
|
|
506
579
|
limma_table = pd.DataFrame(index=table.index)
|
|
507
|
-
limma_table = limma_table.join(limma_results.values())
|
|
580
|
+
limma_table = limma_table.join(list(limma_results.values()))
|
|
508
581
|
limma_table.fillna(np.nan, inplace=True)
|
|
509
582
|
qtable.add_expression_features(limma_table)
|
|
510
583
|
|
|
@@ -516,7 +589,7 @@ def calculate_multi_group_limma(
|
|
|
516
589
|
|
|
517
590
|
def calculate_two_group_limma(
|
|
518
591
|
qtable: Qtable,
|
|
519
|
-
experiment_pair:
|
|
592
|
+
experiment_pair: Sequence[str],
|
|
520
593
|
exclude_invalid: bool = True,
|
|
521
594
|
limma_trend: bool = True,
|
|
522
595
|
) -> None:
|
|
@@ -529,8 +602,7 @@ def calculate_two_group_limma(
|
|
|
529
602
|
|
|
530
603
|
Requires that expression columns are set, and expression values are log2
|
|
531
604
|
transformed. All rows with missing values are ignored, impute missing values to
|
|
532
|
-
allow differential expression analysis of all rows.
|
|
533
|
-
column "Representative protein" is used as the index.
|
|
605
|
+
allow differential expression analysis of all rows.
|
|
534
606
|
|
|
535
607
|
Args:
|
|
536
608
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -541,27 +613,30 @@ def calculate_two_group_limma(
|
|
|
541
613
|
used for the differential expression analysis; default True.
|
|
542
614
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
543
615
|
variances; default True.
|
|
616
|
+
Raises:
|
|
617
|
+
ValueError: If 'experiment_pair' contains invalid entries. The experiment pair
|
|
618
|
+
must have exactly two entries and the two entries must not be the same. Both
|
|
619
|
+
experiments must be present in qtable.design.
|
|
544
620
|
"""
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
)
|
|
621
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
622
|
+
|
|
623
|
+
# TODO: LIMMA function not tested #
|
|
624
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
549
625
|
comparison_tag = " vs "
|
|
550
626
|
|
|
551
627
|
if exclude_invalid:
|
|
552
628
|
valid = qtable["Valid"]
|
|
553
629
|
else:
|
|
554
|
-
valid = np.full(
|
|
630
|
+
valid = np.full(table.shape[0], True)
|
|
555
631
|
|
|
556
632
|
samples_to_experiment = {}
|
|
557
633
|
for experiment in experiment_pair:
|
|
558
|
-
mapping =
|
|
634
|
+
mapping = dict.fromkeys(qtable.get_samples(experiment), experiment)
|
|
559
635
|
samples_to_experiment.update(mapping)
|
|
560
636
|
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
table =
|
|
564
|
-
table = table.set_index("Representative protein")
|
|
637
|
+
# Keep only samples that are present in the 'experiment_pair'
|
|
638
|
+
table = table[samples_to_experiment.keys()]
|
|
639
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
565
640
|
not_nan = table.isna().sum(axis=1) == 0
|
|
566
641
|
|
|
567
642
|
mask = np.all([valid, not_nan], axis=0)
|
|
@@ -583,4 +658,62 @@ def calculate_two_group_limma(
|
|
|
583
658
|
limma_table.rename(columns=mapping, inplace=True)
|
|
584
659
|
qtable.add_expression_features(limma_table)
|
|
585
660
|
|
|
586
|
-
|
|
661
|
+
|
|
662
|
+
def _validate_experiment_pairs(
|
|
663
|
+
qtable: Qtable, exp_pairs: Iterable[Iterable[str]]
|
|
664
|
+
) -> None:
|
|
665
|
+
"""Validates that experiment pairs are valid and raises an error if not.
|
|
666
|
+
|
|
667
|
+
- All 'exp_pairs' entries must have a length of exactly 2.
|
|
668
|
+
- All experiments must be present in the qtable.design.
|
|
669
|
+
- No duplicate experiments are allowed in a pair.
|
|
670
|
+
- No duplicate experiment pairs are allowed.
|
|
671
|
+
|
|
672
|
+
Args:
|
|
673
|
+
qtable: Qtable instance containing experiment data.
|
|
674
|
+
exp_pairs: Iterable of experiment pairs to validate.
|
|
675
|
+
|
|
676
|
+
Raises:
|
|
677
|
+
ValueError: If any of the validation checks fail.
|
|
678
|
+
"""
|
|
679
|
+
all_experiments = {exp for pair in exp_pairs for exp in pair}
|
|
680
|
+
missing_experiments = all_experiments - set(qtable.get_experiments())
|
|
681
|
+
if missing_experiments:
|
|
682
|
+
raise ValueError(
|
|
683
|
+
f"Experiments '{missing_experiments}' not found in qtable.design."
|
|
684
|
+
)
|
|
685
|
+
for experiment_pair in exp_pairs:
|
|
686
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
687
|
+
|
|
688
|
+
if len(list(exp_pairs)) != len({tuple(pair) for pair in exp_pairs}):
|
|
689
|
+
raise ValueError(
|
|
690
|
+
f"Some experiment pairs in {exp_pairs} have been specified multiple "
|
|
691
|
+
"times. Each pair must occur only once."
|
|
692
|
+
)
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _validate_experiment_pair(qtable: Qtable, exp_pair: Iterable[str]) -> None:
|
|
696
|
+
"""Validates the experiment pair is valid and raises an error if not.
|
|
697
|
+
|
|
698
|
+
- The experiment pair must contain exactly two entries
|
|
699
|
+
- The two entries of the experiment pair must be different.
|
|
700
|
+
- Both experiments must be present in the qtable.design.
|
|
701
|
+
|
|
702
|
+
Args:
|
|
703
|
+
qtable: Qtable instance containing experiment data.
|
|
704
|
+
exp_pair: Iterable containing the two experiment names to validate.
|
|
705
|
+
|
|
706
|
+
Raises:
|
|
707
|
+
ValueError: If any of the validation checks fail.
|
|
708
|
+
"""
|
|
709
|
+
if len(list(exp_pair)) != 2:
|
|
710
|
+
raise ValueError(
|
|
711
|
+
f"Experiment pair '{exp_pair}' contains more than two entries."
|
|
712
|
+
)
|
|
713
|
+
if len(list(exp_pair)) != len(set(exp_pair)):
|
|
714
|
+
raise ValueError(f"Experiment pair '{exp_pair}' contains the same entry twice.")
|
|
715
|
+
if set(exp_pair) - set(qtable.get_experiments()):
|
|
716
|
+
raise ValueError(
|
|
717
|
+
f"Experiments '{set(exp_pair) - set(qtable.get_experiments())}' "
|
|
718
|
+
"not found in qtable.design."
|
|
719
|
+
)
|
|
@@ -13,10 +13,10 @@ Index([
|
|
|
13
13
|
], dtype='object')
|
|
14
14
|
"""
|
|
15
15
|
|
|
16
|
-
from collections import defaultdict as ddict
|
|
17
16
|
import os
|
|
18
|
-
from typing import Iterable, Optional, Protocol
|
|
19
17
|
import warnings
|
|
18
|
+
from collections import defaultdict as ddict
|
|
19
|
+
from typing import Iterable, Optional, Protocol, Sequence
|
|
20
20
|
|
|
21
21
|
import numpy as np
|
|
22
22
|
import pandas as pd
|
|
@@ -88,7 +88,7 @@ def contaminants_to_clipboard(qtable: Qtable) -> None:
|
|
|
88
88
|
|
|
89
89
|
for column_tag in column_tags:
|
|
90
90
|
columns.extend(helper.find_sample_columns(data, column_tag, samples))
|
|
91
|
-
columns =
|
|
91
|
+
columns = [c for c in columns if c in data.columns]
|
|
92
92
|
|
|
93
93
|
contaminants = qtable["Potential contaminant"]
|
|
94
94
|
data = data.loc[contaminants, columns]
|
|
@@ -135,10 +135,10 @@ def to_perseus_matrix(
|
|
|
135
135
|
numeric_columns = set(numeric_columns).difference(expression_columns)
|
|
136
136
|
numeric_columns = set(numeric_columns).difference(categorical_columns)
|
|
137
137
|
|
|
138
|
-
column_categories = ddict(lambda: default_category)
|
|
139
|
-
column_categories.update(
|
|
140
|
-
column_categories.update(
|
|
141
|
-
column_categories.update(
|
|
138
|
+
column_categories: ddict[str, str] = ddict(lambda: default_category)
|
|
139
|
+
column_categories.update(dict.fromkeys(numeric_columns, "N"))
|
|
140
|
+
column_categories.update(dict.fromkeys(categorical_columns, "C"))
|
|
141
|
+
column_categories.update(dict.fromkeys(expression_columns, "E"))
|
|
142
142
|
|
|
143
143
|
column_annotation = [column_categories[column] for column in table.columns]
|
|
144
144
|
column_annotation[0] = f"{annotation_row_prefix}{column_annotation[0]}"
|
|
@@ -219,6 +219,7 @@ def write_html_coverage_map(
|
|
|
219
219
|
"change in a future release."
|
|
220
220
|
),
|
|
221
221
|
FutureWarning,
|
|
222
|
+
stacklevel=2,
|
|
222
223
|
)
|
|
223
224
|
# Get protein information from the protein database
|
|
224
225
|
protein_entry = protein_db[protein_id]
|
|
@@ -314,8 +315,8 @@ def _amica_table_from(qtable: Qtable) -> pd.DataFrame:
|
|
|
314
315
|
sample_columns = helper.find_sample_columns(
|
|
315
316
|
amica_table, tag, qtable.get_samples()
|
|
316
317
|
)
|
|
317
|
-
non_sample_columns = set(columns).difference(set(sample_columns))
|
|
318
|
-
amica_table.drop(non_sample_columns, inplace=True, axis=1)
|
|
318
|
+
non_sample_columns = list(set(columns).difference(set(sample_columns)))
|
|
319
|
+
amica_table.drop(columns=non_sample_columns, inplace=True, axis=1)
|
|
319
320
|
|
|
320
321
|
# Log transform columns if necessary
|
|
321
322
|
for tag in intensity_column_tags:
|
|
@@ -437,7 +438,7 @@ def _generate_html_sequence_map(
|
|
|
437
438
|
highlights = highlights if highlights is not None else {}
|
|
438
439
|
sequence_length = len(sequence)
|
|
439
440
|
|
|
440
|
-
def write_row_index(pos: int, strings: list)
|
|
441
|
+
def write_row_index(pos: int, strings: list):
|
|
441
442
|
ndigits = len(str(sequence_length))
|
|
442
443
|
row_index = str(pos + 1).rjust(ndigits)
|
|
443
444
|
html_entry = '<FONT COLOR="#000000">' + row_index + " " + "</FONT>"
|
|
@@ -457,7 +458,7 @@ def _generate_html_sequence_map(
|
|
|
457
458
|
|
|
458
459
|
in_covered_region: bool = False
|
|
459
460
|
strings = []
|
|
460
|
-
strings.append(
|
|
461
|
+
strings.append('<FONT COLOR="#606060">') # Set default text color to grey
|
|
461
462
|
write_row_index(0, strings)
|
|
462
463
|
for pos, character in enumerate(sequence):
|
|
463
464
|
if pos in coverage_start_idx:
|
|
@@ -483,13 +484,15 @@ def _generate_html_sequence_map(
|
|
|
483
484
|
if pos in coverage_stop_idx:
|
|
484
485
|
in_covered_region = False
|
|
485
486
|
close_coverage_region(strings)
|
|
486
|
-
strings.append(
|
|
487
|
+
strings.append("</FONT>")
|
|
487
488
|
|
|
488
489
|
html_sequence_block = "".join(strings)
|
|
489
490
|
return html_sequence_block
|
|
490
491
|
|
|
491
492
|
|
|
492
|
-
def _find_covered_region_boundaries(
|
|
493
|
+
def _find_covered_region_boundaries(
|
|
494
|
+
coverage_mask: Sequence[bool],
|
|
495
|
+
) -> list[tuple[int, int]]:
|
|
493
496
|
"""Returns a list of boundaries from continuously covered regions in a protein.
|
|
494
497
|
|
|
495
498
|
Args:
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import pathlib
|
|
2
2
|
from typing import Iterable, Union
|
|
3
3
|
|
|
4
|
-
|
|
5
4
|
from profasta.db import ProteinDatabase
|
|
6
5
|
|
|
7
6
|
|
|
@@ -24,5 +23,7 @@ def import_protein_database(
|
|
|
24
23
|
database = ProteinDatabase()
|
|
25
24
|
paths = [fasta_path] if isinstance(fasta_path, (str, pathlib.Path)) else fasta_path
|
|
26
25
|
for path in paths:
|
|
26
|
+
if isinstance(path, pathlib.Path):
|
|
27
|
+
path = path.as_posix()
|
|
27
28
|
database.add_fasta(path, header_parser=header_parser, overwrite=True)
|
|
28
29
|
return database
|
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
from .calc import (
|
|
2
|
-
|
|
2
|
+
calculate_monoisotopic_mass,
|
|
3
|
+
calculate_sequence_coverage,
|
|
3
4
|
calculate_tryptic_ibaq_peptides,
|
|
4
5
|
make_coverage_mask,
|
|
5
|
-
|
|
6
|
-
calculate_monoisotopic_mass,
|
|
6
|
+
mode,
|
|
7
7
|
)
|
|
8
8
|
from .table import (
|
|
9
9
|
apply_intensity_cutoff,
|
|
10
|
-
guess_design,
|
|
11
|
-
intensities_in_logspace,
|
|
12
10
|
find_columns,
|
|
13
11
|
find_sample_columns,
|
|
12
|
+
guess_design,
|
|
13
|
+
intensities_in_logspace,
|
|
14
|
+
join_tables,
|
|
14
15
|
keep_rows_by_partial_match,
|
|
15
16
|
remove_rows_by_partial_match,
|
|
16
|
-
join_tables,
|
|
17
|
-
rename_sample_columns,
|
|
18
17
|
rename_mq_reporter_channels,
|
|
18
|
+
rename_sample_columns,
|
|
19
19
|
)
|
|
20
20
|
from .temp import (
|
|
21
21
|
extract_modifications,
|