msreport 0.0.27__tar.gz → 0.0.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {msreport-0.0.27 → msreport-0.0.28}/PKG-INFO +6 -3
- {msreport-0.0.27 → msreport-0.0.28}/README.md +1 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/__init__.py +4 -6
- {msreport-0.0.27 → msreport-0.0.28}/msreport/aggregate/condense.py +1 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/aggregate/pivot.py +1 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport/aggregate/summarize.py +2 -2
- {msreport-0.0.27 → msreport-0.0.28}/msreport/analyze.py +103 -35
- {msreport-0.0.27 → msreport-0.0.28}/msreport/errors.py +1 -2
- {msreport-0.0.27 → msreport-0.0.28}/msreport/export.py +16 -13
- {msreport-0.0.27 → msreport-0.0.28}/msreport/fasta.py +2 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/helper/__init__.py +7 -7
- {msreport-0.0.27 → msreport-0.0.28}/msreport/helper/calc.py +14 -15
- {msreport-0.0.27 → msreport-0.0.28}/msreport/helper/maxlfq.py +2 -2
- {msreport-0.0.27 → msreport-0.0.28}/msreport/helper/table.py +5 -6
- {msreport-0.0.27 → msreport-0.0.28}/msreport/impute.py +4 -3
- {msreport-0.0.27 → msreport-0.0.28}/msreport/isobar.py +10 -9
- {msreport-0.0.27 → msreport-0.0.28}/msreport/normalize.py +2 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/peptidoform.py +6 -4
- msreport-0.0.28/msreport/plot/__init__.py +41 -0
- msreport-0.0.28/msreport/plot/_partial_plots.py +159 -0
- msreport-0.0.28/msreport/plot/comparison.py +490 -0
- msreport-0.0.28/msreport/plot/distribution.py +253 -0
- msreport-0.0.28/msreport/plot/multivariate.py +355 -0
- msreport-0.0.28/msreport/plot/quality.py +431 -0
- msreport-0.0.28/msreport/plot/style.py +286 -0
- msreport-0.0.28/msreport/plot/style_sheets/msreport-notebook.mplstyle +57 -0
- msreport-0.0.28/msreport/plot/style_sheets/seaborn-whitegrid.mplstyle +45 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport/qtable.py +109 -17
- {msreport-0.0.27 → msreport-0.0.28}/msreport/reader.py +73 -79
- {msreport-0.0.27 → msreport-0.0.28}/msreport/rinterface/__init__.py +2 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/rinterface/limma.py +2 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport/rinterface/rinstaller.py +3 -3
- {msreport-0.0.27 → msreport-0.0.28}/msreport.egg-info/PKG-INFO +6 -3
- {msreport-0.0.27 → msreport-0.0.28}/msreport.egg-info/SOURCES.txt +9 -1
- {msreport-0.0.27 → msreport-0.0.28}/msreport.egg-info/requires.txt +5 -1
- {msreport-0.0.27 → msreport-0.0.28}/pyproject.toml +39 -1
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_analyze.py +72 -17
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_peptidoform.py +2 -1
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_plot.py +24 -1
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_qtable.py +90 -23
- msreport-0.0.27/msreport/plot.py +0 -1134
- {msreport-0.0.27 → msreport-0.0.28}/LICENSE.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport/aggregate/__init__.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport/helper/temp.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport/rinterface/rscripts/limma.R +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport.egg-info/dependency_links.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/msreport.egg-info/top_level.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/setup.cfg +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/setup.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_export.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_helper.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_impute.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_isobar.py +0 -0
- {msreport-0.0.27 → msreport-0.0.28}/tests/test_maxlfq.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: msreport
|
|
3
|
-
Version: 0.0.27
|
|
3
|
+
Version: 0.0.28
|
|
4
4
|
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
5
|
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -19,12 +19,15 @@ Requires-Dist: pandas>=1.4.4
|
|
|
19
19
|
Requires-Dist: profasta>=0.0.4
|
|
20
20
|
Requires-Dist: pyteomics>=4.6.0
|
|
21
21
|
Requires-Dist: pyyaml>=6.0.0
|
|
22
|
-
Requires-Dist: rpy2
|
|
22
|
+
Requires-Dist: rpy2!=3.5.13,>=3.5.3
|
|
23
23
|
Requires-Dist: scikit-learn>=1.0.0
|
|
24
24
|
Requires-Dist: scipy>=1.9.1
|
|
25
25
|
Requires-Dist: seaborn>=0.12.0
|
|
26
26
|
Requires-Dist: statsmodels>=0.13.2
|
|
27
27
|
Requires-Dist: typing_extensions>=4
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
28
31
|
Dynamic: license-file
|
|
29
32
|
|
|
30
33
|
[![Project Status: WIP](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
|
|
@@ -118,7 +121,7 @@ command as described above.
|
|
|
118
121
|
### Additional requirements
|
|
119
122
|
|
|
120
123
|
MsReport provides an interface to the R package LIMMA for differential expression
|
|
121
|
-
analysis, which requires a local installation of R (R version
|
|
124
|
+
analysis, which requires a local installation of R (R version 4.0 or higher) and the
|
|
122
125
|
system environment variable "R_HOME" to be set to the R home directory. Note that it
|
|
123
126
|
might be necessary to restart the computer after adding the "R_HOME" variable. The R
|
|
124
127
|
home directory can also be found from within R by using the command below, and might
|
|
@@ -89,7 +89,7 @@ command as described above.
|
|
|
89
89
|
### Additional requirements
|
|
90
90
|
|
|
91
91
|
MsReport provides an interface to the R package LIMMA for differential expression
|
|
92
|
-
analysis, which requires a local installation of R (R version
|
|
92
|
+
analysis, which requires a local installation of R (R version 4.0 or higher) and the
|
|
93
93
|
system environment variable "R_HOME" to be set to the R home directory. Note that it
|
|
94
94
|
might be necessary to restart the computer after adding the "R_HOME" variable. The R
|
|
95
95
|
home directory can also be found from within R by using the command below, and might
|
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
from msreport.qtable import Qtable
|
|
2
|
-
from msreport.reader import MaxQuantReader, FragPipeReader, SpectronautReader
|
|
3
|
-
|
|
4
|
-
from msreport.fasta import import_protein_database
|
|
5
|
-
|
|
6
1
|
import msreport.analyze
|
|
7
2
|
import msreport.export
|
|
8
3
|
import msreport.impute
|
|
9
4
|
import msreport.normalize
|
|
10
5
|
import msreport.plot
|
|
11
6
|
import msreport.reader
|
|
7
|
+
from msreport.fasta import import_protein_database
|
|
8
|
+
from msreport.qtable import Qtable
|
|
9
|
+
from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
|
|
12
10
|
|
|
13
|
-
__version__ = "0.0.27"
|
|
11
|
+
__version__ = "0.0.28"
|
|
@@ -71,7 +71,7 @@ def maximum_per_column(array: np.ndarray) -> np.ndarray:
|
|
|
71
71
|
return np.array([maximum(i) for i in array.transpose()])
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def minimum(array: np.ndarray) ->
|
|
74
|
+
def minimum(array: np.ndarray) -> float:
|
|
75
75
|
"""Returns the lowest finite value from one or multiple columns."""
|
|
76
76
|
array = array.flatten()
|
|
77
77
|
if np.isfinite(array).any():
|
|
@@ -218,7 +218,7 @@ def aggregate_unique_groups(
|
|
|
218
218
|
columns_to_aggregate: Union[str, Iterable],
|
|
219
219
|
condenser: Callable,
|
|
220
220
|
is_sorted: bool,
|
|
221
|
-
) ->
|
|
221
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
222
222
|
"""Aggregates column(s) by applying a condenser function to unique groups.
|
|
223
223
|
|
|
224
224
|
The function returns two arrays containing the aggregated values and the
|
|
@@ -256,7 +256,7 @@ def aggregate_unique_groups(
|
|
|
256
256
|
|
|
257
257
|
def _prepare_grouping_indices(
|
|
258
258
|
table: pd.DataFrame, group_by: str, is_sorted: bool
|
|
259
|
-
) ->
|
|
259
|
+
) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
|
|
260
260
|
"""Prepares start indices and names of unique groups from a sorted dataframe.
|
|
261
261
|
|
|
262
262
|
Args:
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""The analyze module contains methods for analysing quantification results."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
import warnings
|
|
6
|
+
from typing import Iterable, Optional, Protocol, Sequence
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
9
|
import pandas as pd
|
|
@@ -10,6 +11,7 @@ import pandas as pd
|
|
|
10
11
|
import msreport.normalize
|
|
11
12
|
import msreport.rinterface
|
|
12
13
|
from msreport.helper import find_sample_columns
|
|
14
|
+
from msreport.qtable import Qtable
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class Transformer(Protocol):
|
|
@@ -234,7 +236,7 @@ def normalize_expression(
|
|
|
234
236
|
|
|
235
237
|
def create_site_to_protein_normalizer(
|
|
236
238
|
qtable: Qtable, category_column: str = "Representative protein"
|
|
237
|
-
) -> msreport.
|
|
239
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
238
240
|
"""Creates a fitted `CategoricalNormalizer` for site-to-protein normalization.
|
|
239
241
|
|
|
240
242
|
The `CategoricalNormalizer` is fitted to protein expression profiles of the provided
|
|
@@ -254,8 +256,8 @@ def create_site_to_protein_normalizer(
|
|
|
254
256
|
samples_as_columns=True,
|
|
255
257
|
features=[category_column],
|
|
256
258
|
)
|
|
257
|
-
completely_quantified = (
|
|
258
|
-
|
|
259
|
+
completely_quantified = ~reference_expression[qtable.get_samples()].isna().any(
|
|
260
|
+
axis=1
|
|
259
261
|
)
|
|
260
262
|
reference_expression = reference_expression[completely_quantified]
|
|
261
263
|
|
|
@@ -275,7 +277,7 @@ def create_ibaq_transformer(
|
|
|
275
277
|
qtable: Qtable,
|
|
276
278
|
category_column: str = "Representative protein",
|
|
277
279
|
ibaq_column: str = "iBAQ peptides",
|
|
278
|
-
) -> msreport.
|
|
280
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
279
281
|
"""Creates a fitted `CategoricalNormalizer` for iBAQ transformation.
|
|
280
282
|
|
|
281
283
|
The `CategoricalNormalizer` is fitted to iBAQ peptide counts of the provided
|
|
@@ -301,7 +303,7 @@ def create_ibaq_transformer(
|
|
|
301
303
|
ibaq_factor_values[ibaq_factor_values < 1] = 1
|
|
302
304
|
ibaq_factor_values = np.log2(ibaq_factor_values)
|
|
303
305
|
|
|
304
|
-
reference_table = pd.DataFrame(
|
|
306
|
+
reference_table = pd.DataFrame(dict.fromkeys(sample_columns, ibaq_factor_values))
|
|
305
307
|
reference_table[category_column] = category_values
|
|
306
308
|
|
|
307
309
|
normalizer = msreport.normalize.CategoricalNormalizer(category_column)
|
|
@@ -422,7 +424,15 @@ def calculate_multi_group_comparison(
|
|
|
422
424
|
correspond to entries from qtable.design["Experiment"].
|
|
423
425
|
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
424
426
|
used for calculating the group comparisons; default True.
|
|
427
|
+
|
|
428
|
+
Raises:
|
|
429
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
430
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
431
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
432
|
+
are allowed.
|
|
425
433
|
"""
|
|
434
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
435
|
+
|
|
426
436
|
table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
|
|
427
437
|
comparison_tag = " vs "
|
|
428
438
|
|
|
@@ -475,7 +485,7 @@ def two_group_comparison(
|
|
|
475
485
|
|
|
476
486
|
def calculate_multi_group_limma(
|
|
477
487
|
qtable: Qtable,
|
|
478
|
-
experiment_pairs:
|
|
488
|
+
experiment_pairs: Sequence[Iterable[str]],
|
|
479
489
|
exclude_invalid: bool = True,
|
|
480
490
|
batch: bool = False,
|
|
481
491
|
limma_trend: bool = True,
|
|
@@ -491,8 +501,7 @@ def calculate_multi_group_limma(
|
|
|
491
501
|
|
|
492
502
|
Requires that expression columns are set, and expression values are log2 transformed
|
|
493
503
|
All rows with missing values are ignored, impute missing values to allow
|
|
494
|
-
differential expression analysis of all rows.
|
|
495
|
-
"Representative protein" is used as the index.
|
|
504
|
+
differential expression analysis of all rows.
|
|
496
505
|
|
|
497
506
|
Args:
|
|
498
507
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -510,13 +519,17 @@ def calculate_multi_group_limma(
|
|
|
510
519
|
limma.eBayes for details; default True.
|
|
511
520
|
|
|
512
521
|
Raises:
|
|
522
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
523
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
524
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
525
|
+
are allowed.
|
|
513
526
|
KeyError: If the "Batch" column is not present in the qtable.design when
|
|
514
527
|
'batch' is set to True.
|
|
515
528
|
ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
|
|
516
529
|
is set to True.
|
|
517
|
-
ValueError: If the same experiment pair has been specified multiple times in
|
|
518
|
-
'experiment_pairs'.
|
|
519
530
|
"""
|
|
531
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
532
|
+
|
|
520
533
|
# TODO: not tested #
|
|
521
534
|
if batch and "Batch" not in qtable.get_design():
|
|
522
535
|
raise KeyError(
|
|
@@ -528,17 +541,10 @@ def calculate_multi_group_limma(
|
|
|
528
541
|
"When using calculate_multi_group_limma(batch=True), not all values from"
|
|
529
542
|
' qtable.design["Batch"] are allowed to be identical.'
|
|
530
543
|
)
|
|
531
|
-
if len(list(experiment_pairs)) != len(set(experiment_pairs)):
|
|
532
|
-
raise ValueError(
|
|
533
|
-
"The same experiment pair has been specified multiple times."
|
|
534
|
-
" Each entry in the `experiment_pairs` argument must be unique."
|
|
535
|
-
)
|
|
536
544
|
|
|
537
545
|
design = qtable.get_design()
|
|
538
|
-
table = qtable.make_expression_table(
|
|
539
|
-
|
|
540
|
-
)
|
|
541
|
-
table = table.set_index("Representative protein")
|
|
546
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
547
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
542
548
|
comparison_tag = " vs "
|
|
543
549
|
|
|
544
550
|
if exclude_invalid:
|
|
@@ -554,7 +560,7 @@ def calculate_multi_group_limma(
|
|
|
554
560
|
experiment_to_r[experiment] = f".EXPERIMENT__{i:04d}"
|
|
555
561
|
r_to_experiment = {v: k for k, v in experiment_to_r.items()}
|
|
556
562
|
|
|
557
|
-
r_experiment_pairs = []
|
|
563
|
+
r_experiment_pairs: list[str] = []
|
|
558
564
|
for exp1, exp2 in experiment_pairs:
|
|
559
565
|
r_experiment_pairs.append(f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}")
|
|
560
566
|
|
|
@@ -583,7 +589,7 @@ def calculate_multi_group_limma(
|
|
|
583
589
|
|
|
584
590
|
def calculate_two_group_limma(
|
|
585
591
|
qtable: Qtable,
|
|
586
|
-
experiment_pair:
|
|
592
|
+
experiment_pair: Sequence[str],
|
|
587
593
|
exclude_invalid: bool = True,
|
|
588
594
|
limma_trend: bool = True,
|
|
589
595
|
) -> None:
|
|
@@ -596,8 +602,7 @@ def calculate_two_group_limma(
|
|
|
596
602
|
|
|
597
603
|
Requires that expression columns are set, and expression values are log2
|
|
598
604
|
transformed. All rows with missing values are ignored, impute missing values to
|
|
599
|
-
allow differential expression analysis of all rows.
|
|
600
|
-
column "Representative protein" is used as the index.
|
|
605
|
+
allow differential expression analysis of all rows.
|
|
601
606
|
|
|
602
607
|
Args:
|
|
603
608
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -608,27 +613,30 @@ def calculate_two_group_limma(
|
|
|
608
613
|
used for the differential expression analysis; default True.
|
|
609
614
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
610
615
|
variances; default True.
|
|
616
|
+
Raises:
|
|
617
|
+
ValueError: If 'experiment_pair' contains invalid entries. The experiment pair
|
|
618
|
+
must have exactly two entries and the two entries must not be the same. Both
|
|
619
|
+
experiments must be present in qtable.design.
|
|
611
620
|
"""
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
)
|
|
621
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
622
|
+
|
|
623
|
+
# TODO: LIMMA function not tested #
|
|
624
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
616
625
|
comparison_tag = " vs "
|
|
617
626
|
|
|
618
627
|
if exclude_invalid:
|
|
619
628
|
valid = qtable["Valid"]
|
|
620
629
|
else:
|
|
621
|
-
valid = np.full(
|
|
630
|
+
valid = np.full(table.shape[0], True)
|
|
622
631
|
|
|
623
632
|
samples_to_experiment = {}
|
|
624
633
|
for experiment in experiment_pair:
|
|
625
|
-
mapping =
|
|
634
|
+
mapping = dict.fromkeys(qtable.get_samples(experiment), experiment)
|
|
626
635
|
samples_to_experiment.update(mapping)
|
|
627
636
|
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
table =
|
|
631
|
-
table = table.set_index("Representative protein")
|
|
637
|
+
# Keep only samples that are present in the 'experiment_pair'
|
|
638
|
+
table = table[samples_to_experiment.keys()]
|
|
639
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
632
640
|
not_nan = table.isna().sum(axis=1) == 0
|
|
633
641
|
|
|
634
642
|
mask = np.all([valid, not_nan], axis=0)
|
|
@@ -649,3 +657,63 @@ def calculate_two_group_limma(
|
|
|
649
657
|
mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
|
|
650
658
|
limma_table.rename(columns=mapping, inplace=True)
|
|
651
659
|
qtable.add_expression_features(limma_table)
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _validate_experiment_pairs(
|
|
663
|
+
qtable: Qtable, exp_pairs: Iterable[Iterable[str]]
|
|
664
|
+
) -> None:
|
|
665
|
+
"""Validates that experiment pairs are valid and raises an error if not.
|
|
666
|
+
|
|
667
|
+
- All 'exp_pairs' entries must have a length of exactly 2.
|
|
668
|
+
- All experiments must be present in the qtable.design.
|
|
669
|
+
- No duplicate experiments are allowed in a pair.
|
|
670
|
+
- No duplicate experiment pairs are allowed.
|
|
671
|
+
|
|
672
|
+
Args:
|
|
673
|
+
qtable: Qtable instance containing experiment data.
|
|
674
|
+
exp_pairs: Iterable of experiment pairs to validate.
|
|
675
|
+
|
|
676
|
+
Raises:
|
|
677
|
+
ValueError: If any of the validation checks fail.
|
|
678
|
+
"""
|
|
679
|
+
all_experiments = {exp for pair in exp_pairs for exp in pair}
|
|
680
|
+
missing_experiments = all_experiments - set(qtable.get_experiments())
|
|
681
|
+
if missing_experiments:
|
|
682
|
+
raise ValueError(
|
|
683
|
+
f"Experiments '{missing_experiments}' not found in qtable.design."
|
|
684
|
+
)
|
|
685
|
+
for experiment_pair in exp_pairs:
|
|
686
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
687
|
+
|
|
688
|
+
if len(list(exp_pairs)) != len({tuple(pair) for pair in exp_pairs}):
|
|
689
|
+
raise ValueError(
|
|
690
|
+
f"Some experiment pairs in {exp_pairs} have been specified multiple "
|
|
691
|
+
"times. Each pair must occur only once."
|
|
692
|
+
)
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _validate_experiment_pair(qtable: Qtable, exp_pair: Iterable[str]) -> None:
|
|
696
|
+
"""Validates the experiment pair is valid and raises an error if not.
|
|
697
|
+
|
|
698
|
+
- The experiment pair must contain exactly two entries
|
|
699
|
+
- The two entries of the experiment pair must be different.
|
|
700
|
+
- Both experiments must be present in the qtable.design.
|
|
701
|
+
|
|
702
|
+
Args:
|
|
703
|
+
qtable: Qtable instance containing experiment data.
|
|
704
|
+
experiment_pairs: Iterable of experiment pairs to validate.
|
|
705
|
+
|
|
706
|
+
Raises:
|
|
707
|
+
ValueError: If any of the validation checks fail.
|
|
708
|
+
"""
|
|
709
|
+
if len(list(exp_pair)) != 2:
|
|
710
|
+
raise ValueError(
|
|
711
|
+
f"Experiment pair '{exp_pair}' contains more than two entries."
|
|
712
|
+
)
|
|
713
|
+
if len(list(exp_pair)) != len(set(exp_pair)):
|
|
714
|
+
raise ValueError(f"Experiment pair '{exp_pair}' contains the same entry twice.")
|
|
715
|
+
if set(exp_pair) - set(qtable.get_experiments()):
|
|
716
|
+
raise ValueError(
|
|
717
|
+
f"Experiments '{set(exp_pair) - set(qtable.get_experiments())}' "
|
|
718
|
+
"not found in qtable.design."
|
|
719
|
+
)
|
|
@@ -13,10 +13,10 @@ Index([
|
|
|
13
13
|
], dtype='object')
|
|
14
14
|
"""
|
|
15
15
|
|
|
16
|
-
from collections import defaultdict as ddict
|
|
17
16
|
import os
|
|
18
|
-
from typing import Iterable, Optional, Protocol
|
|
19
17
|
import warnings
|
|
18
|
+
from collections import defaultdict as ddict
|
|
19
|
+
from typing import Iterable, Optional, Protocol, Sequence
|
|
20
20
|
|
|
21
21
|
import numpy as np
|
|
22
22
|
import pandas as pd
|
|
@@ -88,7 +88,7 @@ def contaminants_to_clipboard(qtable: Qtable) -> None:
|
|
|
88
88
|
|
|
89
89
|
for column_tag in column_tags:
|
|
90
90
|
columns.extend(helper.find_sample_columns(data, column_tag, samples))
|
|
91
|
-
columns =
|
|
91
|
+
columns = [c for c in columns if c in data.columns]
|
|
92
92
|
|
|
93
93
|
contaminants = qtable["Potential contaminant"]
|
|
94
94
|
data = data.loc[contaminants, columns]
|
|
@@ -135,10 +135,10 @@ def to_perseus_matrix(
|
|
|
135
135
|
numeric_columns = set(numeric_columns).difference(expression_columns)
|
|
136
136
|
numeric_columns = set(numeric_columns).difference(categorical_columns)
|
|
137
137
|
|
|
138
|
-
column_categories = ddict(lambda: default_category)
|
|
139
|
-
column_categories.update(
|
|
140
|
-
column_categories.update(
|
|
141
|
-
column_categories.update(
|
|
138
|
+
column_categories: ddict[str, str] = ddict(lambda: default_category)
|
|
139
|
+
column_categories.update(dict.fromkeys(numeric_columns, "N"))
|
|
140
|
+
column_categories.update(dict.fromkeys(categorical_columns, "C"))
|
|
141
|
+
column_categories.update(dict.fromkeys(expression_columns, "E"))
|
|
142
142
|
|
|
143
143
|
column_annotation = [column_categories[column] for column in table.columns]
|
|
144
144
|
column_annotation[0] = f"{annotation_row_prefix}{column_annotation[0]}"
|
|
@@ -219,6 +219,7 @@ def write_html_coverage_map(
|
|
|
219
219
|
"change in a future release."
|
|
220
220
|
),
|
|
221
221
|
FutureWarning,
|
|
222
|
+
stacklevel=2,
|
|
222
223
|
)
|
|
223
224
|
# Get protein information from the protein database
|
|
224
225
|
protein_entry = protein_db[protein_id]
|
|
@@ -314,8 +315,8 @@ def _amica_table_from(qtable: Qtable) -> pd.DataFrame:
|
|
|
314
315
|
sample_columns = helper.find_sample_columns(
|
|
315
316
|
amica_table, tag, qtable.get_samples()
|
|
316
317
|
)
|
|
317
|
-
non_sample_columns = set(columns).difference(set(sample_columns))
|
|
318
|
-
amica_table.drop(non_sample_columns, inplace=True, axis=1)
|
|
318
|
+
non_sample_columns = list(set(columns).difference(set(sample_columns)))
|
|
319
|
+
amica_table.drop(columns=non_sample_columns, inplace=True, axis=1)
|
|
319
320
|
|
|
320
321
|
# Log transform columns if necessary
|
|
321
322
|
for tag in intensity_column_tags:
|
|
@@ -437,7 +438,7 @@ def _generate_html_sequence_map(
|
|
|
437
438
|
highlights = highlights if highlights is not None else {}
|
|
438
439
|
sequence_length = len(sequence)
|
|
439
440
|
|
|
440
|
-
def write_row_index(pos: int, strings: list)
|
|
441
|
+
def write_row_index(pos: int, strings: list):
|
|
441
442
|
ndigits = len(str(sequence_length))
|
|
442
443
|
row_index = str(pos + 1).rjust(ndigits)
|
|
443
444
|
html_entry = '<FONT COLOR="#000000">' + row_index + " " + "</FONT>"
|
|
@@ -457,7 +458,7 @@ def _generate_html_sequence_map(
|
|
|
457
458
|
|
|
458
459
|
in_covered_region: bool = False
|
|
459
460
|
strings = []
|
|
460
|
-
strings.append(
|
|
461
|
+
strings.append('<FONT COLOR="#606060">') # Set default text color to grey
|
|
461
462
|
write_row_index(0, strings)
|
|
462
463
|
for pos, character in enumerate(sequence):
|
|
463
464
|
if pos in coverage_start_idx:
|
|
@@ -483,13 +484,15 @@ def _generate_html_sequence_map(
|
|
|
483
484
|
if pos in coverage_stop_idx:
|
|
484
485
|
in_covered_region = False
|
|
485
486
|
close_coverage_region(strings)
|
|
486
|
-
strings.append(
|
|
487
|
+
strings.append("</FONT>")
|
|
487
488
|
|
|
488
489
|
html_sequence_block = "".join(strings)
|
|
489
490
|
return html_sequence_block
|
|
490
491
|
|
|
491
492
|
|
|
492
|
-
def _find_covered_region_boundaries(
|
|
493
|
+
def _find_covered_region_boundaries(
|
|
494
|
+
coverage_mask: Sequence[bool],
|
|
495
|
+
) -> list[tuple[int, int]]:
|
|
493
496
|
"""Returns a list of boundaries from continuously covered regions in a protein.
|
|
494
497
|
|
|
495
498
|
Args:
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import pathlib
|
|
2
2
|
from typing import Iterable, Union
|
|
3
3
|
|
|
4
|
-
|
|
5
4
|
from profasta.db import ProteinDatabase
|
|
6
5
|
|
|
7
6
|
|
|
@@ -24,5 +23,7 @@ def import_protein_database(
|
|
|
24
23
|
database = ProteinDatabase()
|
|
25
24
|
paths = [fasta_path] if isinstance(fasta_path, (str, pathlib.Path)) else fasta_path
|
|
26
25
|
for path in paths:
|
|
26
|
+
if isinstance(path, pathlib.Path):
|
|
27
|
+
path = path.as_posix()
|
|
27
28
|
database.add_fasta(path, header_parser=header_parser, overwrite=True)
|
|
28
29
|
return database
|
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
from .calc import (
|
|
2
|
-
|
|
2
|
+
calculate_monoisotopic_mass,
|
|
3
|
+
calculate_sequence_coverage,
|
|
3
4
|
calculate_tryptic_ibaq_peptides,
|
|
4
5
|
make_coverage_mask,
|
|
5
|
-
|
|
6
|
-
calculate_monoisotopic_mass,
|
|
6
|
+
mode,
|
|
7
7
|
)
|
|
8
8
|
from .table import (
|
|
9
9
|
apply_intensity_cutoff,
|
|
10
|
-
guess_design,
|
|
11
|
-
intensities_in_logspace,
|
|
12
10
|
find_columns,
|
|
13
11
|
find_sample_columns,
|
|
12
|
+
guess_design,
|
|
13
|
+
intensities_in_logspace,
|
|
14
|
+
join_tables,
|
|
14
15
|
keep_rows_by_partial_match,
|
|
15
16
|
remove_rows_by_partial_match,
|
|
16
|
-
join_tables,
|
|
17
|
-
rename_sample_columns,
|
|
18
17
|
rename_mq_reporter_channels,
|
|
18
|
+
rename_sample_columns,
|
|
19
19
|
)
|
|
20
20
|
from .temp import (
|
|
21
21
|
extract_modifications,
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
import
|
|
2
|
-
from typing import Iterable
|
|
1
|
+
from typing import Iterable, Sequence
|
|
3
2
|
|
|
4
3
|
import numpy as np
|
|
5
|
-
import scipy.stats
|
|
6
|
-
import scipy.optimize
|
|
7
|
-
|
|
8
4
|
import pyteomics.mass
|
|
9
5
|
import pyteomics.parser
|
|
6
|
+
import scipy.optimize
|
|
7
|
+
import scipy.stats
|
|
10
8
|
|
|
11
9
|
|
|
12
|
-
def mode(values: Iterable) -> float:
|
|
10
|
+
def mode(values: Sequence) -> float:
|
|
13
11
|
"""Calculate the mode by using kernel-density estimation.
|
|
14
12
|
|
|
15
13
|
Args:
|
|
@@ -19,25 +17,26 @@ def mode(values: Iterable) -> float:
|
|
|
19
17
|
Returns:
|
|
20
18
|
The estimated mode. If no finite values are present, returns nan.
|
|
21
19
|
"""
|
|
22
|
-
|
|
23
|
-
finite_values = values[np.isfinite(values)]
|
|
20
|
+
finite_values = np.asarray(values)[np.isfinite(values)]
|
|
24
21
|
if len(finite_values) == 0:
|
|
25
22
|
return np.nan
|
|
26
23
|
elif len(np.unique(finite_values)) == 1:
|
|
27
24
|
return np.unique(finite_values)[0]
|
|
28
25
|
|
|
29
26
|
kde = scipy.stats.gaussian_kde(finite_values)
|
|
30
|
-
|
|
27
|
+
|
|
28
|
+
def _minimum_function(x):
|
|
29
|
+
return -kde(x)[0]
|
|
31
30
|
|
|
32
31
|
min_slice, max_sclice = np.percentile(finite_values, (2, 98))
|
|
33
32
|
slice_step = 0.2
|
|
34
33
|
brute_optimize_result = scipy.optimize.brute(
|
|
35
|
-
|
|
34
|
+
_minimum_function, [slice(min_slice, max_sclice + slice_step, slice_step)]
|
|
36
35
|
)
|
|
37
36
|
rough_minimum = brute_optimize_result[0]
|
|
38
37
|
|
|
39
38
|
local_optimize_result = scipy.optimize.minimize(
|
|
40
|
-
|
|
39
|
+
_minimum_function, x0=rough_minimum, method="BFGS"
|
|
41
40
|
)
|
|
42
41
|
fine_minimum = local_optimize_result.x[0]
|
|
43
42
|
return fine_minimum
|
|
@@ -91,8 +90,8 @@ def calculate_monoisotopic_mass(protein_sequence: str) -> float:
|
|
|
91
90
|
|
|
92
91
|
|
|
93
92
|
def make_coverage_mask(
|
|
94
|
-
protein_length: int, peptide_positions:
|
|
95
|
-
) -> np.
|
|
93
|
+
protein_length: int, peptide_positions: Iterable[Iterable[int]]
|
|
94
|
+
) -> np.ndarray:
|
|
96
95
|
"""Returns a Boolean array with True for positions present in 'peptide_positions'.
|
|
97
96
|
|
|
98
97
|
Args:
|
|
@@ -109,8 +108,8 @@ def make_coverage_mask(
|
|
|
109
108
|
|
|
110
109
|
|
|
111
110
|
def calculate_sequence_coverage(
|
|
112
|
-
protein_length: int, peptide_positions:
|
|
113
|
-
) ->
|
|
111
|
+
protein_length: int, peptide_positions: Iterable[Iterable[int]], ndigits: int = 1
|
|
112
|
+
) -> float:
|
|
114
113
|
"""Calculates the protein sequence coverage given a list of peptide positions.
|
|
115
114
|
|
|
116
115
|
Args:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
|
-
from typing import Callable
|
|
3
2
|
import warnings
|
|
3
|
+
from typing import Callable
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
|
|
@@ -125,7 +125,7 @@ def calculate_pairwise_mode_log_ratio_matrix(
|
|
|
125
125
|
|
|
126
126
|
def prepare_coefficient_matrix(
|
|
127
127
|
ratio_matrix: np.ndarray,
|
|
128
|
-
) ->
|
|
128
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
129
129
|
"""Prepares coefficients, ratios, and initial row indices from a log ratio matrix.
|
|
130
130
|
|
|
131
131
|
Args:
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Iterable, Union
|
|
2
|
+
from typing import Iterable, Sequence, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
@@ -63,7 +63,7 @@ def intensities_in_logspace(data: Union[pd.DataFrame, np.ndarray, Iterable]) ->
|
|
|
63
63
|
"""
|
|
64
64
|
data = np.array(data, dtype=float)
|
|
65
65
|
mask = np.isfinite(data)
|
|
66
|
-
return np.all(data[mask].flatten() <= 64)
|
|
66
|
+
return bool(np.all(data[mask].flatten() <= 64))
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
def rename_sample_columns(table: pd.DataFrame, mapping: dict[str, str]) -> pd.DataFrame:
|
|
@@ -102,7 +102,7 @@ def rename_sample_columns(table: pd.DataFrame, mapping: dict[str, str]) -> pd.Da
|
|
|
102
102
|
|
|
103
103
|
|
|
104
104
|
def rename_mq_reporter_channels(
|
|
105
|
-
table: pd.DataFrame, channel_names:
|
|
105
|
+
table: pd.DataFrame, channel_names: Sequence[str]
|
|
106
106
|
) -> None:
|
|
107
107
|
"""Renames reporter channel numbers with sample names.
|
|
108
108
|
|
|
@@ -157,8 +157,7 @@ def find_columns(
|
|
|
157
157
|
Returns:
|
|
158
158
|
A list of column names.
|
|
159
159
|
"""
|
|
160
|
-
|
|
161
|
-
matched_columns = np.array(table.columns)[matches].tolist()
|
|
160
|
+
matched_columns = [col for col in table.columns if substring in col]
|
|
162
161
|
if must_be_substring:
|
|
163
162
|
matched_columns = [col for col in matched_columns if col != substring]
|
|
164
163
|
return matched_columns
|
|
@@ -255,7 +254,7 @@ def remove_rows_by_partial_match(
|
|
|
255
254
|
|
|
256
255
|
|
|
257
256
|
def join_tables(
|
|
258
|
-
tables:
|
|
257
|
+
tables: Sequence[pd.DataFrame], reset_index: bool = False
|
|
259
258
|
) -> pd.DataFrame:
|
|
260
259
|
"""Returns a joined dataframe.
|
|
261
260
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
@@ -51,7 +52,7 @@ class FixedValueImputer:
|
|
|
51
52
|
Returns the fitted FixedValueImputer instance.
|
|
52
53
|
"""
|
|
53
54
|
if self.strategy == "constant":
|
|
54
|
-
fill_values =
|
|
55
|
+
fill_values = dict.fromkeys(table.columns, self.fill_value)
|
|
55
56
|
elif self.strategy == "below":
|
|
56
57
|
if self.column_wise:
|
|
57
58
|
fill_values = {}
|
|
@@ -59,7 +60,7 @@ class FixedValueImputer:
|
|
|
59
60
|
fill_values[column] = _calculate_integer_below_min(table[column])
|
|
60
61
|
else:
|
|
61
62
|
int_below_min = _calculate_integer_below_min(table)
|
|
62
|
-
fill_values =
|
|
63
|
+
fill_values = dict.fromkeys(table.columns, int_below_min)
|
|
63
64
|
self._sample_fill_values = fill_values
|
|
64
65
|
return self
|
|
65
66
|
|