msreport 0.0.27__tar.gz → 0.0.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport-0.0.29/PKG-INFO +136 -0
- msreport-0.0.29/README.md +97 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/__init__.py +4 -6
- {msreport-0.0.27 → msreport-0.0.29}/msreport/aggregate/condense.py +1 -1
- {msreport-0.0.27 → msreport-0.0.29}/msreport/aggregate/pivot.py +1 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/aggregate/summarize.py +2 -2
- {msreport-0.0.27 → msreport-0.0.29}/msreport/analyze.py +117 -36
- {msreport-0.0.27 → msreport-0.0.29}/msreport/errors.py +5 -2
- {msreport-0.0.27 → msreport-0.0.29}/msreport/export.py +16 -13
- {msreport-0.0.27 → msreport-0.0.29}/msreport/fasta.py +2 -1
- {msreport-0.0.27 → msreport-0.0.29}/msreport/helper/__init__.py +7 -7
- {msreport-0.0.27 → msreport-0.0.29}/msreport/helper/calc.py +14 -15
- {msreport-0.0.27 → msreport-0.0.29}/msreport/helper/maxlfq.py +2 -2
- {msreport-0.0.27 → msreport-0.0.29}/msreport/helper/table.py +5 -6
- {msreport-0.0.27 → msreport-0.0.29}/msreport/impute.py +4 -3
- {msreport-0.0.27 → msreport-0.0.29}/msreport/isobar.py +10 -9
- {msreport-0.0.27 → msreport-0.0.29}/msreport/normalize.py +2 -1
- {msreport-0.0.27 → msreport-0.0.29}/msreport/peptidoform.py +6 -4
- msreport-0.0.29/msreport/plot/__init__.py +41 -0
- msreport-0.0.29/msreport/plot/_partial_plots.py +159 -0
- msreport-0.0.29/msreport/plot/comparison.py +490 -0
- msreport-0.0.29/msreport/plot/distribution.py +253 -0
- msreport-0.0.29/msreport/plot/multivariate.py +355 -0
- msreport-0.0.29/msreport/plot/quality.py +431 -0
- msreport-0.0.29/msreport/plot/style.py +286 -0
- msreport-0.0.29/msreport/plot/style_sheets/msreport-notebook.mplstyle +57 -0
- msreport-0.0.29/msreport/plot/style_sheets/seaborn-whitegrid.mplstyle +45 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/qtable.py +109 -17
- {msreport-0.0.27 → msreport-0.0.29}/msreport/reader.py +235 -86
- msreport-0.0.29/msreport/rinterface/__init__.py +16 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/rinterface/limma.py +2 -1
- {msreport-0.0.27 → msreport-0.0.29}/msreport/rinterface/rinstaller.py +3 -3
- msreport-0.0.29/msreport.egg-info/PKG-INFO +136 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport.egg-info/SOURCES.txt +9 -1
- {msreport-0.0.27 → msreport-0.0.29}/msreport.egg-info/requires.txt +10 -1
- msreport-0.0.29/pyproject.toml +105 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_analyze.py +74 -17
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_peptidoform.py +2 -1
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_plot.py +24 -1
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_qtable.py +90 -23
- msreport-0.0.27/PKG-INFO +0 -129
- msreport-0.0.27/README.md +0 -100
- msreport-0.0.27/msreport/plot.py +0 -1134
- msreport-0.0.27/msreport/rinterface/__init__.py +0 -3
- msreport-0.0.27/msreport.egg-info/PKG-INFO +0 -129
- msreport-0.0.27/pyproject.toml +0 -56
- {msreport-0.0.27 → msreport-0.0.29}/LICENSE.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/aggregate/__init__.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/helper/temp.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport/rinterface/rscripts/limma.R +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport.egg-info/dependency_links.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/msreport.egg-info/top_level.txt +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/setup.cfg +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/setup.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_export.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_helper.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_impute.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_isobar.py +0 -0
- {msreport-0.0.27 → msreport-0.0.29}/tests/test_maxlfq.py +0 -0
msreport-0.0.29/PKG-INFO
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msreport
|
|
3
|
+
Version: 0.0.29
|
|
4
|
+
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
|
+
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: homepage, https://github.com/hollenstein/msreport
|
|
8
|
+
Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
|
|
9
|
+
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE.txt
|
|
19
|
+
Requires-Dist: adjustText<1.0.0,>=0.7.0
|
|
20
|
+
Requires-Dist: matplotlib>=3.5.2
|
|
21
|
+
Requires-Dist: numpy>=1.21.5
|
|
22
|
+
Requires-Dist: pandas>=1.4.4
|
|
23
|
+
Requires-Dist: profasta>=0.0.4
|
|
24
|
+
Requires-Dist: pyteomics>=4.6.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
26
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
27
|
+
Requires-Dist: scipy>=1.9.1
|
|
28
|
+
Requires-Dist: seaborn>=0.12.0
|
|
29
|
+
Requires-Dist: statsmodels>=0.13.2
|
|
30
|
+
Requires-Dist: typing_extensions>=4
|
|
31
|
+
Provides-Extra: r
|
|
32
|
+
Requires-Dist: rpy2!=3.5.13,>=3.5.3; extra == "r"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
36
|
+
Provides-Extra: test
|
|
37
|
+
Requires-Dist: pytest>=8.3.5; extra == "test"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# MsReport
|
|
41
|
+
|
|
42
|
+
[](https://www.repostatus.org/#wip)
|
|
43
|
+

|
|
44
|
+
[](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
|
|
45
|
+
|
|
46
|
+
**MsReport** is a Python library for post-processing quantitative proteomics data from
|
|
47
|
+
bottom-up mass spectrometry experiments.
|
|
48
|
+
|
|
49
|
+
## Table of Contents
|
|
50
|
+
|
|
51
|
+
- [What is MsReport?](#what-is-msreport)
|
|
52
|
+
- [Key features of MsReport](#key-features-of-msreport)
|
|
53
|
+
- [Installation](#installation)
|
|
54
|
+
- [Installation when using Anaconda](#installation-when-using-anaconda)
|
|
55
|
+
- [Additional requirements](#additional-requirements)
|
|
56
|
+
- [Optional Dependencies](#optional-dependencies)
|
|
57
|
+
- [Development status](#development-status)
|
|
58
|
+
|
|
59
|
+
## What is MsReport?
|
|
60
|
+
|
|
61
|
+
MsReport is a Python library designed to simplify the post-processing and analysis of quantitative proteomics data from bottom-up mass spectrometry experiments. It provides a high-level, abstraction-focused API for efficient and standardized workflows. The modular design of the library provides the flexibility to meet project-specific data processing needs and customize workflows as required.
|
|
62
|
+
|
|
63
|
+
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
64
|
+
|
|
65
|
+
### Key features of MsReport
|
|
66
|
+
|
|
67
|
+
#### Data Import and Standardization
|
|
68
|
+
|
|
69
|
+
The `reader` module provides software-specific reader classes for importing data from MaxQuant, FragPipe, and Spectronaut that enable the import of protein, peptide and ion tables. During the import process, these classes transform table column names and table values into a standardized format to ensure that the rest of the library can operate in a tool-agnostic manner.
|
|
70
|
+
|
|
71
|
+
#### Data management
|
|
72
|
+
|
|
73
|
+
The `qtable` module provides a structured approach to managing quantitative data through its central `Qtable` class. This class combines quantitative data with an experimental design table that defines the relationship between samples and experimental conditions. The quantitative data is stored in a wide format, where each sample's measurements are stored in separate columns. The `Qtable` class serves as the foundation for data analysis workflows in MsReport, providing the standardized data structure used by the `analyze`, `plot`, and `export` modules.
|
|
74
|
+
|
|
75
|
+
#### Data processing and analysis
|
|
76
|
+
|
|
77
|
+
The `analyze` module provides tools for post-processing of mass spectrometry data generated by software such as MaxQuant, FragPipe, or Spectronaut. It includes functions for filtering, normalization, imputation of missing values, and statistical testing. The library integrates with the R package LIMMA to enable differential expression analysis.
|
|
78
|
+
|
|
79
|
+
> [!NOTE]
|
|
80
|
+
> In order to use the R integration you need to install msreport with optional dependencies, see [Optional Dependencies](#optional-dependencies) for more information.
|
|
81
|
+
|
|
82
|
+
#### Data visualization
|
|
83
|
+
|
|
84
|
+
The `plot` module supports the generation of visualizations for quality control and data analysis. It includes functions for creating various plots, such as intensity and ratio distributions, heatmaps, volcano plots, and PCA plots.
|
|
85
|
+
|
|
86
|
+
#### Data export
|
|
87
|
+
|
|
88
|
+
Finally, the `export` module enables the conversion and export into formats compatible with external tools. This includes generating input files for [Amica](https://bioapps.maxperutzlabs.ac.at/app/amica) and exporting tables for easier integration with Perseus.
|
|
89
|
+
|
|
90
|
+
## Installation
|
|
91
|
+
|
|
92
|
+
If you do not already have a Python installation, we recommend installing the [Anaconda distribution](https://www.anaconda.com/download) or [Miniconda](https://docs.anaconda.com/free/miniconda/index.html) distribution from Continuum Analytics, which already contains a large number of popular Python packages for Data Science. Alternatively, you can also get Python from the [Python homepage](https://www.python.org/downloads/windows). Note that MsReport requires Python version 3.10 or higher.
|
|
93
|
+
|
|
94
|
+
The following command will install MsReport and its dependencies by using a wheel file.
|
|
95
|
+
|
|
96
|
+
```shell
|
|
97
|
+
pip install msreport
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
To uninstall the MsReport library use:
|
|
101
|
+
|
|
102
|
+
```shell
|
|
103
|
+
pip uninstall msreport
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Installation when using Anaconda
|
|
107
|
+
|
|
108
|
+
To install the MsReport library using Anaconda, you need to either activate a custom conda environment or install it into the default base environment. Open the Anaconda Navigator, activate the desired conda environment or use the base environment, and then open a command line by running the "CMD.exe" application. Finally, use the `pip install` command as before.
|
|
109
|
+
|
|
110
|
+
### Optional Dependencies
|
|
111
|
+
|
|
112
|
+
#### R Integration
|
|
113
|
+
|
|
114
|
+
MsReport provides an interface to the R package LIMMA for differential expression analysis. To use this functionality, you need:
|
|
115
|
+
|
|
116
|
+
- A local installation of **R (version 4.0 or higher)**.
|
|
117
|
+
- The system environment variable R_HOME set to the R home directory.
|
|
118
|
+
- An installation of msreport with the optional dependencies for R integration:
|
|
119
|
+
|
|
120
|
+
```shell
|
|
121
|
+
pip install msreport[R]
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
#### Setting the R_HOME environment variable
|
|
125
|
+
|
|
126
|
+
On Windows, you may need to restart your computer after modifying the system environment variables for the changes to take effect. To find the R home directory, you can run the following command in R:
|
|
127
|
+
|
|
128
|
+
```R
|
|
129
|
+
normalizePath(R.home("home"))
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
For example, the R home directory might look like this on Windows: `C:\Program Files\R\R-4.2.1`
|
|
133
|
+
|
|
134
|
+
## Development status
|
|
135
|
+
|
|
136
|
+
MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# MsReport
|
|
2
|
+
|
|
3
|
+
[](https://www.repostatus.org/#wip)
|
|
4
|
+

|
|
5
|
+
[](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
|
|
6
|
+
|
|
7
|
+
**MsReport** is a Python library for post-processing quantitative proteomics data from
|
|
8
|
+
bottom-up mass spectrometry experiments.
|
|
9
|
+
|
|
10
|
+
## Table of Contents
|
|
11
|
+
|
|
12
|
+
- [What is MsReport?](#what-is-msreport)
|
|
13
|
+
- [Key features of MsReport](#key-features-of-msreport)
|
|
14
|
+
- [Installation](#installation)
|
|
15
|
+
- [Installation when using Anaconda](#installation-when-using-anaconda)
|
|
16
|
+
- [Additional requirements](#additional-requirements)
|
|
17
|
+
- [Optional Dependencies](#optional-dependencies)
|
|
18
|
+
- [Development status](#development-status)
|
|
19
|
+
|
|
20
|
+
## What is MsReport?
|
|
21
|
+
|
|
22
|
+
MsReport is a Python library designed to simplify the post-processing and analysis of quantitative proteomics data from bottom-up mass spectrometry experiments. It provides a high-level, abstraction-focused API for efficient and standardized workflows. The modular design of the library provides the flexibility to meet project-specific data processing needs and customize workflows as required.
|
|
23
|
+
|
|
24
|
+
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
25
|
+
|
|
26
|
+
### Key features of MsReport
|
|
27
|
+
|
|
28
|
+
#### Data Import and Standardization
|
|
29
|
+
|
|
30
|
+
The `reader` module provides software-specific reader classes for importing data from MaxQuant, FragPipe, and Spectronaut that enable the import of protein, peptide and ion tables. During the import process, these classes transform table column names and table values into a standardized format to ensure that the rest of the library can operate in a tool-agnostic manner.
|
|
31
|
+
|
|
32
|
+
#### Data management
|
|
33
|
+
|
|
34
|
+
The `qtable` module provides a structured approach to managing quantitative data through its central `Qtable` class. This class combines quantitative data with an experimental design table that defines the relationship between samples and experimental conditions. The quantitative data is stored in a wide format, where each sample's measurements are stored in separate columns. The `Qtable` class serves as the foundation for data analysis workflows in MsReport, providing the standardized data structure used by the `analyze`, `plot`, and `export` modules.
|
|
35
|
+
|
|
36
|
+
#### Data processing and analysis
|
|
37
|
+
|
|
38
|
+
The `analyze` module provides tools for post-processing of mass spectrometry data generated by software such as MaxQuant, FragPipe, or Spectronaut. It includes functions for filtering, normalization, imputation of missing values, and statistical testing. The library integrates with the R package LIMMA to enable differential expression analysis.
|
|
39
|
+
|
|
40
|
+
> [!NOTE]
|
|
41
|
+
> In order to use the R integration you need to install msreport with optional dependencies, see [Optional Dependencies](#optional-dependencies) for more information.
|
|
42
|
+
|
|
43
|
+
#### Data visualization
|
|
44
|
+
|
|
45
|
+
The `plot` module supports the generation of visualizations for quality control and data analysis. It includes functions for creating various plots, such as intensity and ratio distributions, heatmaps, volcano plots, and PCA plots.
|
|
46
|
+
|
|
47
|
+
#### Data export
|
|
48
|
+
|
|
49
|
+
Finally, the `export` module enables the conversion and export into formats compatible with external tools. This includes generating input files for [Amica](https://bioapps.maxperutzlabs.ac.at/app/amica) and exporting tables for easier integration with Perseus.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
If you do not already have a Python installation, we recommend installing the [Anaconda distribution](https://www.anaconda.com/download) or [Miniconda](https://docs.anaconda.com/free/miniconda/index.html) distribution from Continuum Analytics, which already contains a large number of popular Python packages for Data Science. Alternatively, you can also get Python from the [Python homepage](https://www.python.org/downloads/windows). Note that MsReport requires Python version 3.10 or higher.
|
|
54
|
+
|
|
55
|
+
The following command will install MsReport and its dependencies by using a wheel file.
|
|
56
|
+
|
|
57
|
+
```shell
|
|
58
|
+
pip install msreport
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
To uninstall the MsReport library use:
|
|
62
|
+
|
|
63
|
+
```shell
|
|
64
|
+
pip uninstall msreport
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Installation when using Anaconda
|
|
68
|
+
|
|
69
|
+
To install the MsReport library using Anaconda, you need to either activate a custom conda environment or install it into the default base environment. Open the Anaconda Navigator, activate the desired conda environment or use the base environment, and then open a command line by running the "CMD.exe" application. Finally, use the `pip install` command as before.
|
|
70
|
+
|
|
71
|
+
### Optional Dependencies
|
|
72
|
+
|
|
73
|
+
#### R Integration
|
|
74
|
+
|
|
75
|
+
MsReport provides an interface to the R package LIMMA for differential expression analysis. To use this functionality, you need:
|
|
76
|
+
|
|
77
|
+
- A local installation of **R (version 4.0 or higher)**.
|
|
78
|
+
- The system environment variable R_HOME set to the R home directory.
|
|
79
|
+
- An installation of msreport with the optional dependencies for R integration:
|
|
80
|
+
|
|
81
|
+
```shell
|
|
82
|
+
pip install msreport[R]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
#### Setting the R_HOME environment variable
|
|
86
|
+
|
|
87
|
+
On Windows, you may need to restart your computer after modifying the system environment variables for the changes to take effect. To find the R home directory, you can run the following command in R:
|
|
88
|
+
|
|
89
|
+
```R
|
|
90
|
+
normalizePath(R.home("home"))
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
For example, the R home directory might look like this on Windows: `C:\Program Files\R\R-4.2.1`
|
|
94
|
+
|
|
95
|
+
## Development status
|
|
96
|
+
|
|
97
|
+
MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
|
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
from msreport.qtable import Qtable
|
|
2
|
-
from msreport.reader import MaxQuantReader, FragPipeReader, SpectronautReader
|
|
3
|
-
|
|
4
|
-
from msreport.fasta import import_protein_database
|
|
5
|
-
|
|
6
1
|
import msreport.analyze
|
|
7
2
|
import msreport.export
|
|
8
3
|
import msreport.impute
|
|
9
4
|
import msreport.normalize
|
|
10
5
|
import msreport.plot
|
|
11
6
|
import msreport.reader
|
|
7
|
+
from msreport.fasta import import_protein_database
|
|
8
|
+
from msreport.qtable import Qtable
|
|
9
|
+
from msreport.reader import FragPipeReader, MaxQuantReader, SpectronautReader
|
|
12
10
|
|
|
13
|
-
__version__ = "0.0.
|
|
11
|
+
__version__ = "0.0.29"
|
|
@@ -71,7 +71,7 @@ def maximum_per_column(array: np.ndarray) -> np.ndarray:
|
|
|
71
71
|
return np.array([maximum(i) for i in array.transpose()])
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def minimum(array: np.ndarray) ->
|
|
74
|
+
def minimum(array: np.ndarray) -> float:
|
|
75
75
|
"""Returns the lowest finite value from one or multiple columns."""
|
|
76
76
|
array = array.flatten()
|
|
77
77
|
if np.isfinite(array).any():
|
|
@@ -218,7 +218,7 @@ def aggregate_unique_groups(
|
|
|
218
218
|
columns_to_aggregate: Union[str, Iterable],
|
|
219
219
|
condenser: Callable,
|
|
220
220
|
is_sorted: bool,
|
|
221
|
-
) ->
|
|
221
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
222
222
|
"""Aggregates column(s) by applying a condenser function to unique groups.
|
|
223
223
|
|
|
224
224
|
The function returns two arrays containing the aggregated values and the
|
|
@@ -256,7 +256,7 @@ def aggregate_unique_groups(
|
|
|
256
256
|
|
|
257
257
|
def _prepare_grouping_indices(
|
|
258
258
|
table: pd.DataFrame, group_by: str, is_sorted: bool
|
|
259
|
-
) ->
|
|
259
|
+
) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
|
|
260
260
|
"""Prepares start indices and names of unique groups from a sorted dataframe.
|
|
261
261
|
|
|
262
262
|
Args:
|
|
@@ -1,15 +1,26 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""The analyze module contains methods for analysing quantification results."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
import warnings
|
|
6
|
+
from typing import Iterable, Optional, Protocol, Sequence
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
9
|
import pandas as pd
|
|
9
10
|
|
|
10
11
|
import msreport.normalize
|
|
11
|
-
|
|
12
|
+
from msreport.errors import OptionalDependencyError
|
|
12
13
|
from msreport.helper import find_sample_columns
|
|
14
|
+
from msreport.qtable import Qtable
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import msreport.rinterface
|
|
18
|
+
|
|
19
|
+
_rinterface_available = True
|
|
20
|
+
_rinterface_error = ""
|
|
21
|
+
except OptionalDependencyError as err:
|
|
22
|
+
_rinterface_available = False
|
|
23
|
+
_rinterface_error = str(err)
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
class Transformer(Protocol):
|
|
@@ -234,7 +245,7 @@ def normalize_expression(
|
|
|
234
245
|
|
|
235
246
|
def create_site_to_protein_normalizer(
|
|
236
247
|
qtable: Qtable, category_column: str = "Representative protein"
|
|
237
|
-
) -> msreport.
|
|
248
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
238
249
|
"""Creates a fitted `CategoricalNormalizer` for site-to-protein normalization.
|
|
239
250
|
|
|
240
251
|
The `CategoricalNormalizer` is fitted to protein expression profiles of the provided
|
|
@@ -254,8 +265,8 @@ def create_site_to_protein_normalizer(
|
|
|
254
265
|
samples_as_columns=True,
|
|
255
266
|
features=[category_column],
|
|
256
267
|
)
|
|
257
|
-
completely_quantified = (
|
|
258
|
-
|
|
268
|
+
completely_quantified = ~reference_expression[qtable.get_samples()].isna().any(
|
|
269
|
+
axis=1
|
|
259
270
|
)
|
|
260
271
|
reference_expression = reference_expression[completely_quantified]
|
|
261
272
|
|
|
@@ -275,7 +286,7 @@ def create_ibaq_transformer(
|
|
|
275
286
|
qtable: Qtable,
|
|
276
287
|
category_column: str = "Representative protein",
|
|
277
288
|
ibaq_column: str = "iBAQ peptides",
|
|
278
|
-
) -> msreport.
|
|
289
|
+
) -> msreport.normalize.CategoricalNormalizer:
|
|
279
290
|
"""Creates a fitted `CategoricalNormalizer` for iBAQ transformation.
|
|
280
291
|
|
|
281
292
|
The `CategoricalNormalizer` is fitted to iBAQ peptide counts of the provided
|
|
@@ -301,7 +312,7 @@ def create_ibaq_transformer(
|
|
|
301
312
|
ibaq_factor_values[ibaq_factor_values < 1] = 1
|
|
302
313
|
ibaq_factor_values = np.log2(ibaq_factor_values)
|
|
303
314
|
|
|
304
|
-
reference_table = pd.DataFrame(
|
|
315
|
+
reference_table = pd.DataFrame(dict.fromkeys(sample_columns, ibaq_factor_values))
|
|
305
316
|
reference_table[category_column] = category_values
|
|
306
317
|
|
|
307
318
|
normalizer = msreport.normalize.CategoricalNormalizer(category_column)
|
|
@@ -422,7 +433,15 @@ def calculate_multi_group_comparison(
|
|
|
422
433
|
correspond to entries from qtable.design["Experiment"].
|
|
423
434
|
exclude_invalid: If true, the column "Valid" is used to determine which rows are
|
|
424
435
|
used for calculating the group comparisons; default True.
|
|
436
|
+
|
|
437
|
+
Raises:
|
|
438
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
439
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
440
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
441
|
+
are allowed.
|
|
425
442
|
"""
|
|
443
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
444
|
+
|
|
426
445
|
table = qtable.make_expression_table(samples_as_columns=True, features=["Valid"])
|
|
427
446
|
comparison_tag = " vs "
|
|
428
447
|
|
|
@@ -475,7 +494,7 @@ def two_group_comparison(
|
|
|
475
494
|
|
|
476
495
|
def calculate_multi_group_limma(
|
|
477
496
|
qtable: Qtable,
|
|
478
|
-
experiment_pairs:
|
|
497
|
+
experiment_pairs: Sequence[Iterable[str]],
|
|
479
498
|
exclude_invalid: bool = True,
|
|
480
499
|
batch: bool = False,
|
|
481
500
|
limma_trend: bool = True,
|
|
@@ -491,8 +510,7 @@ def calculate_multi_group_limma(
|
|
|
491
510
|
|
|
492
511
|
Requires that expression columns are set, and expression values are log2 transformed
|
|
493
512
|
All rows with missing values are ignored, impute missing values to allow
|
|
494
|
-
differential expression analysis of all rows.
|
|
495
|
-
"Representative protein" is used as the index.
|
|
513
|
+
differential expression analysis of all rows.
|
|
496
514
|
|
|
497
515
|
Args:
|
|
498
516
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -510,13 +528,19 @@ def calculate_multi_group_limma(
|
|
|
510
528
|
limma.eBayes for details; default True.
|
|
511
529
|
|
|
512
530
|
Raises:
|
|
531
|
+
ValueError: If 'experiment_pairs' contains invalid entries. Each experiment pair
|
|
532
|
+
must have exactly two entries and the two entries must not be the same. All
|
|
533
|
+
experiments must be present in qtable.design. No duplicate experiment pairs
|
|
534
|
+
are allowed.
|
|
513
535
|
KeyError: If the "Batch" column is not present in the qtable.design when
|
|
514
536
|
'batch' is set to True.
|
|
515
537
|
ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
|
|
516
538
|
is set to True.
|
|
517
|
-
ValueError: If the same experiment pair has been specified multiple times in
|
|
518
|
-
'experiment_pairs'.
|
|
519
539
|
"""
|
|
540
|
+
if not _rinterface_available:
|
|
541
|
+
raise OptionalDependencyError(_rinterface_error)
|
|
542
|
+
|
|
543
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
520
544
|
# TODO: not tested #
|
|
521
545
|
if batch and "Batch" not in qtable.get_design():
|
|
522
546
|
raise KeyError(
|
|
@@ -528,17 +552,10 @@ def calculate_multi_group_limma(
|
|
|
528
552
|
"When using calculate_multi_group_limma(batch=True), not all values from"
|
|
529
553
|
' qtable.design["Batch"] are allowed to be identical.'
|
|
530
554
|
)
|
|
531
|
-
if len(list(experiment_pairs)) != len(set(experiment_pairs)):
|
|
532
|
-
raise ValueError(
|
|
533
|
-
"The same experiment pair has been specified multiple times."
|
|
534
|
-
" Each entry in the `experiment_pairs` argument must be unique."
|
|
535
|
-
)
|
|
536
555
|
|
|
537
556
|
design = qtable.get_design()
|
|
538
|
-
table = qtable.make_expression_table(
|
|
539
|
-
|
|
540
|
-
)
|
|
541
|
-
table = table.set_index("Representative protein")
|
|
557
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
558
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
542
559
|
comparison_tag = " vs "
|
|
543
560
|
|
|
544
561
|
if exclude_invalid:
|
|
@@ -554,7 +571,7 @@ def calculate_multi_group_limma(
|
|
|
554
571
|
experiment_to_r[experiment] = f".EXPERIMENT__{i:04d}"
|
|
555
572
|
r_to_experiment = {v: k for k, v in experiment_to_r.items()}
|
|
556
573
|
|
|
557
|
-
r_experiment_pairs = []
|
|
574
|
+
r_experiment_pairs: list[str] = []
|
|
558
575
|
for exp1, exp2 in experiment_pairs:
|
|
559
576
|
r_experiment_pairs.append(f"{experiment_to_r[exp1]}-{experiment_to_r[exp2]}")
|
|
560
577
|
|
|
@@ -583,7 +600,7 @@ def calculate_multi_group_limma(
|
|
|
583
600
|
|
|
584
601
|
def calculate_two_group_limma(
|
|
585
602
|
qtable: Qtable,
|
|
586
|
-
experiment_pair:
|
|
603
|
+
experiment_pair: Sequence[str],
|
|
587
604
|
exclude_invalid: bool = True,
|
|
588
605
|
limma_trend: bool = True,
|
|
589
606
|
) -> None:
|
|
@@ -596,8 +613,7 @@ def calculate_two_group_limma(
|
|
|
596
613
|
|
|
597
614
|
Requires that expression columns are set, and expression values are log2
|
|
598
615
|
transformed. All rows with missing values are ignored, impute missing values to
|
|
599
|
-
allow differential expression analysis of all rows.
|
|
600
|
-
column "Representative protein" is used as the index.
|
|
616
|
+
allow differential expression analysis of all rows.
|
|
601
617
|
|
|
602
618
|
Args:
|
|
603
619
|
qtable: Qtable instance that contains expression values for differential
|
|
@@ -608,27 +624,32 @@ def calculate_two_group_limma(
|
|
|
608
624
|
used for the differential expression analysis; default True.
|
|
609
625
|
limma_trend: If true, an intensity-dependent trend is fitted to the prior
|
|
610
626
|
variances; default True.
|
|
627
|
+
Raises:
|
|
628
|
+
ValueError: If 'experiment_pair' contains invalid entries. The experiment pair
|
|
629
|
+
must have exactly two entries and the two entries must not be the same. Both
|
|
630
|
+
experiments must be present in qtable.design.
|
|
611
631
|
"""
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
)
|
|
632
|
+
if not _rinterface_available:
|
|
633
|
+
raise OptionalDependencyError(_rinterface_error)
|
|
634
|
+
|
|
635
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
636
|
+
# TODO: LIMMA function not tested #
|
|
637
|
+
table = qtable.make_expression_table(samples_as_columns=True)
|
|
616
638
|
comparison_tag = " vs "
|
|
617
639
|
|
|
618
640
|
if exclude_invalid:
|
|
619
641
|
valid = qtable["Valid"]
|
|
620
642
|
else:
|
|
621
|
-
valid = np.full(
|
|
643
|
+
valid = np.full(table.shape[0], True)
|
|
622
644
|
|
|
623
645
|
samples_to_experiment = {}
|
|
624
646
|
for experiment in experiment_pair:
|
|
625
|
-
mapping =
|
|
647
|
+
mapping = dict.fromkeys(qtable.get_samples(experiment), experiment)
|
|
626
648
|
samples_to_experiment.update(mapping)
|
|
627
649
|
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
table =
|
|
631
|
-
table = table.set_index("Representative protein")
|
|
650
|
+
# Keep only samples that are present in the 'experiment_pair'
|
|
651
|
+
table = table[samples_to_experiment.keys()]
|
|
652
|
+
table.index = table.index.astype(str) # It appears that a string is required for R
|
|
632
653
|
not_nan = table.isna().sum(axis=1) == 0
|
|
633
654
|
|
|
634
655
|
mask = np.all([valid, not_nan], axis=0)
|
|
@@ -649,3 +670,63 @@ def calculate_two_group_limma(
|
|
|
649
670
|
mapping = {col: f"{col} {comparison_group}" for col in limma_table.columns}
|
|
650
671
|
limma_table.rename(columns=mapping, inplace=True)
|
|
651
672
|
qtable.add_expression_features(limma_table)
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def _validate_experiment_pairs(
    qtable: Qtable, exp_pairs: Iterable[Iterable[str]]
) -> None:
    """Validates that experiment pairs are valid and raises an error if not.

    - All 'exp_pairs' entries must have a length of exactly 2.
    - All experiments must be returned by qtable.get_experiments().
    - No duplicate experiments are allowed in a pair.
    - No duplicate experiment pairs are allowed. Pairs are compared as ordered
      tuples, so ("A", "B") and ("B", "A") are not considered duplicates.

    Args:
        qtable: Qtable instance containing experiment data.
        exp_pairs: Iterable of experiment pairs to validate.

    Raises:
        ValueError: If any of the validation checks fail.
    """
    # Materialize the outer and inner iterables exactly once. The checks below
    # traverse the pairs several times; a one-shot iterable (e.g. a generator)
    # would be exhausted after the first pass and the remaining checks would
    # silently succeed on empty input.
    pairs = [tuple(pair) for pair in exp_pairs]

    all_experiments = {exp for pair in pairs for exp in pair}
    missing_experiments = all_experiments - set(qtable.get_experiments())
    if missing_experiments:
        raise ValueError(
            f"Experiments '{missing_experiments}' not found in qtable.design."
        )
    for experiment_pair in pairs:
        _validate_experiment_pair(qtable, experiment_pair)

    if len(pairs) != len(set(pairs)):
        raise ValueError(
            f"Some experiment pairs in {pairs} have been specified multiple "
            "times. Each pair must occur only once."
        )
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _validate_experiment_pair(qtable: Qtable, exp_pair: Iterable[str]) -> None:
    """Validates a single experiment pair and raises an error if it is invalid.

    - The experiment pair must contain exactly two entries.
    - The two entries of the experiment pair must be different.
    - Both experiments must be returned by qtable.get_experiments().

    Args:
        qtable: Qtable instance containing experiment data.
        exp_pair: Iterable containing the two experiment names to validate.

    Raises:
        ValueError: If any of the validation checks fail.
    """
    # Materialize once so that a one-shot iterable (e.g. a generator) is not
    # exhausted by the length check and then vacuously passes the other checks.
    entries = tuple(exp_pair)
    if len(entries) != 2:
        # The pair can also be too short, so report the actual requirement.
        raise ValueError(
            f"Experiment pair '{entries}' must contain exactly two entries, "
            f"but contains {len(entries)}."
        )
    if len(set(entries)) != len(entries):
        raise ValueError(f"Experiment pair '{entries}' contains the same entry twice.")
    missing_experiments = set(entries) - set(qtable.get_experiments())
    if missing_experiments:
        raise ValueError(
            f"Experiments '{missing_experiments}' not found in qtable.design."
        )
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
class MsreportError(Exception):
|
|
2
|
-
...
|
|
1
|
+
class MsreportError(Exception): ...
|
|
3
2
|
|
|
4
3
|
|
|
5
4
|
class NotFittedError(ValueError, AttributeError):
|
|
@@ -8,3 +7,7 @@ class NotFittedError(ValueError, AttributeError):
|
|
|
8
7
|
|
|
9
8
|
class ProteinsNotInFastaWarning(UserWarning):
|
|
10
9
|
"""Warning raised when queried proteins are absent from a FASTA file."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OptionalDependencyError(ImportError):
|
|
13
|
+
"""Raised when an optional dependency is required but not installed."""
|