msreport 0.0.28__py3-none-any.whl → 0.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msreport/__init__.py +1 -1
- msreport/analyze.py +16 -3
- msreport/errors.py +4 -0
- msreport/reader.py +162 -7
- msreport/rinterface/__init__.py +14 -2
- msreport-0.0.29.dist-info/METADATA +136 -0
- {msreport-0.0.28.dist-info → msreport-0.0.29.dist-info}/RECORD +10 -10
- {msreport-0.0.28.dist-info → msreport-0.0.29.dist-info}/WHEEL +1 -1
- msreport-0.0.28.dist-info/METADATA +0 -132
- {msreport-0.0.28.dist-info → msreport-0.0.29.dist-info}/licenses/LICENSE.txt +0 -0
- {msreport-0.0.28.dist-info → msreport-0.0.29.dist-info}/top_level.txt +0 -0
msreport/__init__.py
CHANGED
msreport/analyze.py
CHANGED
|
@@ -9,10 +9,19 @@ import numpy as np
|
|
|
9
9
|
import pandas as pd
|
|
10
10
|
|
|
11
11
|
import msreport.normalize
|
|
12
|
-
|
|
12
|
+
from msreport.errors import OptionalDependencyError
|
|
13
13
|
from msreport.helper import find_sample_columns
|
|
14
14
|
from msreport.qtable import Qtable
|
|
15
15
|
|
|
16
|
+
try:
|
|
17
|
+
import msreport.rinterface
|
|
18
|
+
|
|
19
|
+
_rinterface_available = True
|
|
20
|
+
_rinterface_error = ""
|
|
21
|
+
except OptionalDependencyError as err:
|
|
22
|
+
_rinterface_available = False
|
|
23
|
+
_rinterface_error = str(err)
|
|
24
|
+
|
|
16
25
|
|
|
17
26
|
class Transformer(Protocol):
|
|
18
27
|
def fit(self, table: pd.DataFrame) -> Transformer:
|
|
@@ -528,8 +537,10 @@ def calculate_multi_group_limma(
|
|
|
528
537
|
ValueError: If all values from qtable.design["Batch"] are identical when 'batch'
|
|
529
538
|
is set to True.
|
|
530
539
|
"""
|
|
531
|
-
|
|
540
|
+
if not _rinterface_available:
|
|
541
|
+
raise OptionalDependencyError(_rinterface_error)
|
|
532
542
|
|
|
543
|
+
_validate_experiment_pairs(qtable, experiment_pairs)
|
|
533
544
|
# TODO: not tested #
|
|
534
545
|
if batch and "Batch" not in qtable.get_design():
|
|
535
546
|
raise KeyError(
|
|
@@ -618,8 +629,10 @@ def calculate_two_group_limma(
|
|
|
618
629
|
must have exactly two entries and the two entries must not be the same. Both
|
|
619
630
|
experiments must be present in qtable.design.
|
|
620
631
|
"""
|
|
621
|
-
|
|
632
|
+
if not _rinterface_available:
|
|
633
|
+
raise OptionalDependencyError(_rinterface_error)
|
|
622
634
|
|
|
635
|
+
_validate_experiment_pair(qtable, experiment_pair)
|
|
623
636
|
# TODO: LIMMA function not tested #
|
|
624
637
|
table = qtable.make_expression_table(samples_as_columns=True)
|
|
625
638
|
comparison_tag = " vs "
|
msreport/errors.py
CHANGED
|
@@ -7,3 +7,7 @@ class NotFittedError(ValueError, AttributeError):
|
|
|
7
7
|
|
|
8
8
|
class ProteinsNotInFastaWarning(UserWarning):
|
|
9
9
|
"""Warning raised when queried proteins are absent from a FASTA file."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OptionalDependencyError(ImportError):
|
|
13
|
+
"""Raised when an optional dependency is required but not installed."""
|
msreport/reader.py
CHANGED
|
@@ -343,7 +343,9 @@ class MaxQuantReader(ResultReader):
|
|
|
343
343
|
Adds new columns to comply with the MsReport convention. "Modified sequence",
|
|
344
344
|
"Modifications columns", "Modification localization string". "Protein reported
|
|
345
345
|
by software" and "Representative protein", both contain the first entry from
|
|
346
|
-
"Leading razor protein".
|
|
346
|
+
"Leading razor protein". "Ion ID" contains unique entries for each ion, which
|
|
347
|
+
are generated by concatenating the "Modified sequence" and "Charge" columns, and
|
|
348
|
+
if present, the "Compensation voltage" column.
|
|
347
349
|
|
|
348
350
|
"Modified sequence" entries contain modifications within square brackets.
|
|
349
351
|
"Modification" entries are strings in the form of "position:modification_tag",
|
|
@@ -376,15 +378,19 @@ class MaxQuantReader(ResultReader):
|
|
|
376
378
|
df["Leading razor protein"]
|
|
377
379
|
)
|
|
378
380
|
df["Representative protein"] = df["Protein reported by software"]
|
|
381
|
+
|
|
379
382
|
if drop_decoy:
|
|
380
383
|
df = self._drop_decoy(df)
|
|
381
384
|
if rename_columns:
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
) # Actually there are no column tags as the table is in long format
|
|
385
|
+
# Actually there are no column tags as the table is in long format
|
|
386
|
+
df = self._rename_columns(df, prefix_tag=True)
|
|
385
387
|
if rewrite_modifications and rename_columns:
|
|
386
388
|
df = self._add_peptide_modification_entries(df)
|
|
387
389
|
df = self._add_modification_localization_string(df)
|
|
390
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
391
|
+
if "Compensation voltage" in df.columns:
|
|
392
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
393
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
388
394
|
return df
|
|
389
395
|
|
|
390
396
|
def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -576,6 +582,7 @@ class FragPipeReader(ResultReader):
|
|
|
576
582
|
"peptides": "combined_peptide.tsv",
|
|
577
583
|
"ions": "combined_ion.tsv",
|
|
578
584
|
"ion_evidence": "ion.tsv",
|
|
585
|
+
"psm_evidence": "psm.tsv",
|
|
579
586
|
}
|
|
580
587
|
isobar_filenames: dict[str, str] = {
|
|
581
588
|
"proteins": "protein.tsv",
|
|
@@ -591,6 +598,13 @@ class FragPipeReader(ResultReader):
|
|
|
591
598
|
"MaxLFQ Intensity",
|
|
592
599
|
]
|
|
593
600
|
column_mapping: dict[str, str] = {
|
|
601
|
+
"Peptide": "Peptide sequence", # PSM
|
|
602
|
+
"Modified Peptide": "Modified sequence", # PSM
|
|
603
|
+
"Protein Start": "Start position", # PSM
|
|
604
|
+
"Protein End": "End position", # PSM
|
|
605
|
+
"Number of Missed Cleavages": "Missed cleavage", # PSM
|
|
606
|
+
"PeptideProphet Probability": "Probability", # PSM
|
|
607
|
+
"Compensation Voltage": "Compensation voltage", # PSM and ion
|
|
594
608
|
"Peptide Sequence": "Peptide sequence", # Peptide and ion
|
|
595
609
|
"Modified Sequence": "Modified sequence", # Modified peptide and ion
|
|
596
610
|
"Start": "Start position", # Peptide and ion
|
|
@@ -741,7 +755,10 @@ class FragPipeReader(ResultReader):
|
|
|
741
755
|
|
|
742
756
|
Adds new columns to comply with the MsReport convention. "Modified sequence"
|
|
743
757
|
and "Modifications columns". "Protein reported by software" and "Representative
|
|
744
|
-
protein", both contain the first entry from "Leading razor protein".
|
|
758
|
+
protein", both contain the first entry from "Leading razor protein". "Ion ID"
|
|
759
|
+
contains unique entries for each ion, which are generated by concatenating the
|
|
760
|
+
"Modified sequence" and "Charge" columns, and if present, the
|
|
761
|
+
"Compensation voltage" column.
|
|
745
762
|
|
|
746
763
|
"Modified sequence" entries contain modifications within square brackets.
|
|
747
764
|
"Modification" entries are strings in the form of "position:modification_text",
|
|
@@ -781,6 +798,11 @@ class FragPipeReader(ResultReader):
|
|
|
781
798
|
if rewrite_modifications and rename_columns:
|
|
782
799
|
df = self._add_peptide_modification_entries(df)
|
|
783
800
|
df = self._add_modification_localization_string(df, prefix_column_tags)
|
|
801
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
802
|
+
if "Compensation voltage" in df.columns:
|
|
803
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
804
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
805
|
+
|
|
784
806
|
return df
|
|
785
807
|
|
|
786
808
|
def import_ion_evidence(
|
|
@@ -795,7 +817,9 @@ class FragPipeReader(ResultReader):
|
|
|
795
817
|
Adds new columns to comply with the MsReport convention. "Modified sequence",
|
|
796
818
|
"Modifications", and "Modification localization string" columns. "Protein
|
|
797
819
|
reported by software" and "Representative protein", both contain the first entry
|
|
798
|
-
from "Leading razor protein".
|
|
820
|
+
from "Leading razor protein". "Ion ID" contains unique entries for each ion,
|
|
821
|
+
which are generated by concatenating the "Modified sequence" and "Charge"
|
|
822
|
+
columns, and if present, the "Compensation voltage" column.
|
|
799
823
|
|
|
800
824
|
"Modified sequence" entries contain modifications within square brackets.
|
|
801
825
|
"Modification" entries are strings in the form of "position:modification_text",
|
|
@@ -848,6 +872,9 @@ class FragPipeReader(ResultReader):
|
|
|
848
872
|
df = pd.concat(ion_tables, ignore_index=True)
|
|
849
873
|
|
|
850
874
|
# --- Process dataframe --- #
|
|
875
|
+
df["Ion ID"] = df["Modified Sequence"] + "_c" + df["Charge"].astype(str)
|
|
876
|
+
if "Compensation Voltage" in df.columns:
|
|
877
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + df["Compensation Voltage"].astype(str)
|
|
851
878
|
# FUTURE: replace this by _add_protein_entries(df, False) if FragPipe adds
|
|
852
879
|
# 'Indistinguishable Proteins' to the ion table.
|
|
853
880
|
df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
|
|
@@ -859,6 +886,76 @@ class FragPipeReader(ResultReader):
|
|
|
859
886
|
df = self._add_modification_localization_string(df, prefix_column_tags)
|
|
860
887
|
return df
|
|
861
888
|
|
|
889
|
+
def import_psm_evidence(
    self,
    filename: Optional[str] = None,
    rename_columns: bool = True,
    rewrite_modifications: bool = True,
) -> pd.DataFrame:
    """Concatenate all "psm.tsv" files and return a processed dataframe.

    Every direct subdirectory of the data directory that contains a file named
    'filename' contributes one table. The name of the subdirectory is written to
    the "Sample" column before the tables are concatenated.

    Adds new columns to comply with the MsReport convention. "Protein reported by
    software" and "Representative protein" both contain the protein ID extracted
    from the "Protein" column. "Mapped proteins" contains the representative
    protein followed by the protein IDs extracted from the FragPipe "Mapped
    Proteins" column, joined with ";".

    Args:
        filename: Allows specifying an alternative filename, otherwise the default
            filename is used.
        rename_columns: If True, columns are renamed according to the MsReport
            convention; default True.
        rewrite_modifications: If True, the "Modified sequence" column is
            regenerated from the "Peptide sequence" and "Assigned Modifications"
            columns according to the MsReport convention, and a "Modifications"
            column is added that contains the amino acid positions of all
            modifications. Requires 'rename_columns' to be true. Default True.

    Returns:
        A DataFrame containing the processed psm evidence tables.
    """
    if filename is None:
        filename = self.default_filenames["psm_evidence"]

    # Collect one PSM table per sample subdirectory.
    psm_table_paths = []
    for path in pathlib.Path(self.data_directory).iterdir():
        psm_table_path = path / filename
        if path.is_dir() and psm_table_path.exists():
            psm_table_paths.append(psm_table_path)

    psm_tables = []
    for filepath in psm_table_paths:
        table = pd.read_csv(filepath, sep="\t", low_memory=False)
        # Only text columns get their missing values replaced by empty strings;
        # all other columns are left as read.
        str_cols = table.select_dtypes(include=["object"]).columns
        table.loc[:, str_cols] = table.loc[:, str_cols].fillna("")

        table["Sample"] = filepath.parent.name
        psm_tables.append(table)
    df = pd.concat(psm_tables, ignore_index=True)

    df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
    df["Representative protein"] = df["Protein reported by software"]
    # A column that ends up non-text after concatenation stringifies missing
    # values as "nan"; normalize those to empty strings as well.
    df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")

    # FP only lists additional mapped proteins in the "Mapped Proteins" column
    # MsReport reports all matching proteins in the "Mapped proteins" column
    mapped_proteins_entries = []
    for protein, mapped_protein_fp in zip(
        df["Representative protein"], df["Mapped Proteins"], strict=True
    ):
        if mapped_protein_fp == "":
            mapped_proteins = [protein]
        else:
            additional_mapped_proteins = _extract_protein_ids(
                mapped_protein_fp.split(", ")
            )
            mapped_proteins = [protein] + additional_mapped_proteins
        mapped_proteins_entries.append(";".join(mapped_proteins))
    df["Mapped proteins"] = mapped_proteins_entries

    if rename_columns:
        df = self._rename_columns(df, prefix_tag=True)
    if rewrite_modifications and rename_columns:
        # "Peptide sequence" only exists after renaming, hence the dependency
        # on 'rename_columns'.
        mod_entries = _generate_modification_entries_from_assigned_modifications(
            df["Peptide sequence"], df["Assigned Modifications"]
        )
        df["Modified sequence"] = mod_entries["Modified sequence"]
        df["Modifications"] = mod_entries["Modifications"]
    return df
|
|
958
|
+
|
|
862
959
|
def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
863
960
|
"""Adds standardized protein entry columns to the data frame.
|
|
864
961
|
|
|
@@ -1319,7 +1416,9 @@ class SpectronautReader(ResultReader):
|
|
|
1319
1416
|
|
|
1320
1417
|
Adds new columns to comply with the MsReport convention. "Protein reported
|
|
1321
1418
|
by software" and "Representative protein", both contain the first entry from
|
|
1322
|
-
"PG.ProteinAccessions".
|
|
1419
|
+
"PG.ProteinAccessions". "Ion ID" contains unique entries for each ion, which are
|
|
1420
|
+
generated by concatenating the "Modified sequence" and "Charge" columns, and if
|
|
1421
|
+
present, the "Compensation voltage" column.
|
|
1323
1422
|
|
|
1324
1423
|
(!) Note that the modified sequence and modification localization probabilities
|
|
1325
1424
|
are currently not processed.
|
|
@@ -1357,6 +1456,11 @@ class SpectronautReader(ResultReader):
|
|
|
1357
1456
|
df = self._add_protein_entries(df)
|
|
1358
1457
|
if rename_columns:
|
|
1359
1458
|
df = self._rename_columns(df, True)
|
|
1459
|
+
df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
|
|
1460
|
+
if "Compensation voltage" in df.columns:
|
|
1461
|
+
_cv = df["Compensation voltage"].astype(str)
|
|
1462
|
+
df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
|
|
1463
|
+
|
|
1360
1464
|
return df
|
|
1361
1465
|
|
|
1362
1466
|
def _tidy_up_sample_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -2141,6 +2245,57 @@ def _generate_modification_entries(
|
|
|
2141
2245
|
return entries
|
|
2142
2246
|
|
|
2143
2247
|
|
|
2248
|
+
def _generate_modification_entries_from_assigned_modifications(
    sequences: Iterable[str],
    assigned_modifications: Iterable[str],
) -> dict[str, list[str]]:
    """Generate modified sequence and modification entries for FragPipe PSMs.

    Each peptide sequence is paired with its FragPipe "Assigned Modifications"
    entry. The entry is parsed into (position, modification) tuples, which are used
    to create the modified sequence with `helper.modify_peptide` and to write a
    modification string.

    Args:
        sequences: Unmodified peptide sequences.
        assigned_modifications: FragPipe "Assigned Modifications" entries, one per
            sequence and in the same order as 'sequences'.

    Returns:
        A dictionary with the keys "Modified sequence" and "Modifications". Both
        values are lists with one entry per input sequence. "Modifications" entries
        are strings in the form of "position:modification", multiple modifications
        are joined with ";".

    Raises:
        ValueError: If 'sequences' and 'assigned_modifications' differ in length.
    """
    modified_sequence_entries: list[str] = []
    modification_entries: list[str] = []
    # strict=True turns a length mismatch into an error instead of silently
    # dropping the surplus entries of the longer input.
    for sequence, modifications_entry in zip(
        sequences, assigned_modifications, strict=True
    ):
        modifications = _extract_fragpipe_assigned_modifications(
            modifications_entry, sequence
        )
        modified_sequence_entries.append(
            helper.modify_peptide(sequence, modifications)
        )
        modification_entries.append(
            ";".join(f"{pos}:{mod}" for pos, mod in modifications)
        )

    return {
        "Modified sequence": modified_sequence_entries,
        "Modifications": modification_entries,
    }
|
|
2268
|
+
|
|
2269
|
+
|
|
2270
|
+
def _extract_fragpipe_assigned_modifications(
|
|
2271
|
+
modifications_entry: str,
|
|
2272
|
+
sequence: str,
|
|
2273
|
+
) -> list[tuple[int, str]]:
|
|
2274
|
+
"""Extracts modifications from a FragPipe "Modifications" entry.
|
|
2275
|
+
|
|
2276
|
+
Example for a modification entry: "N-term(42.0106),8C(57.0215)"
|
|
2277
|
+
|
|
2278
|
+
Returns:
|
|
2279
|
+
A list of tuples, where each tuple contains the position of the modification and
|
|
2280
|
+
the modification text. The position is one-indexed, meaning that the first amino
|
|
2281
|
+
acid position is 1. N-term and C-term are represented as 0 and len(sequence)
|
|
2282
|
+
respectively.
|
|
2283
|
+
"""
|
|
2284
|
+
if modifications_entry == "":
|
|
2285
|
+
return []
|
|
2286
|
+
modifications = []
|
|
2287
|
+
for mod_entry in modifications_entry.split(","):
|
|
2288
|
+
position_entry, modification = mod_entry.split(")")[0].split("(")
|
|
2289
|
+
if position_entry == "N-term":
|
|
2290
|
+
position = 0
|
|
2291
|
+
elif position_entry == "C-term":
|
|
2292
|
+
position = len(sequence)
|
|
2293
|
+
else:
|
|
2294
|
+
position = int(position_entry[:-1])
|
|
2295
|
+
modifications.append((position, modification))
|
|
2296
|
+
return modifications
|
|
2297
|
+
|
|
2298
|
+
|
|
2144
2299
|
def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
|
|
2145
2300
|
"""Extract localization probabilities from a MaxQuant "Probabilities" entry.
|
|
2146
2301
|
|
msreport/rinterface/__init__.py
CHANGED
|
@@ -1,4 +1,16 @@
|
|
|
1
1
|
"""Python interface to custom R scripts."""
|
|
2
2
|
|
|
3
|
-
from .
|
|
4
|
-
|
|
3
|
+
from msreport.errors import OptionalDependencyError
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from .limma import multi_group_limma, two_group_limma
|
|
7
|
+
from .rinstaller import r_package_version
|
|
8
|
+
except ImportError as err:
|
|
9
|
+
raise OptionalDependencyError(
|
|
10
|
+
"R integration is not available. R must be installed and configured before "
|
|
11
|
+
"installing optional R dependencies using 'pip install msreport[R]'. For "
|
|
12
|
+
"more information, see: https://github.com/hollenstein/msreport"
|
|
13
|
+
) from err
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
__all__ = ["multi_group_limma", "two_group_limma", "r_package_version"]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msreport
|
|
3
|
+
Version: 0.0.29
|
|
4
|
+
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
|
+
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: homepage, https://github.com/hollenstein/msreport
|
|
8
|
+
Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
|
|
9
|
+
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE.txt
|
|
19
|
+
Requires-Dist: adjustText<1.0.0,>=0.7.0
|
|
20
|
+
Requires-Dist: matplotlib>=3.5.2
|
|
21
|
+
Requires-Dist: numpy>=1.21.5
|
|
22
|
+
Requires-Dist: pandas>=1.4.4
|
|
23
|
+
Requires-Dist: profasta>=0.0.4
|
|
24
|
+
Requires-Dist: pyteomics>=4.6.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
26
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
27
|
+
Requires-Dist: scipy>=1.9.1
|
|
28
|
+
Requires-Dist: seaborn>=0.12.0
|
|
29
|
+
Requires-Dist: statsmodels>=0.13.2
|
|
30
|
+
Requires-Dist: typing_extensions>=4
|
|
31
|
+
Provides-Extra: r
|
|
32
|
+
Requires-Dist: rpy2!=3.5.13,>=3.5.3; extra == "r"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
36
|
+
Provides-Extra: test
|
|
37
|
+
Requires-Dist: pytest>=8.3.5; extra == "test"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# MsReport
|
|
41
|
+
|
|
42
|
+
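[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)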
|
|
43
|
+

|
|
44
|
+
[![Run tests](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml/badge.svg)](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
|
|
45
|
+
|
|
46
|
+
**MsReport** is a Python library for post-processing quantitative proteomics data from
|
|
47
|
+
bottom-up mass spectrometry experiments.
|
|
48
|
+
|
|
49
|
+
## Table of Contents
|
|
50
|
+
|
|
51
|
+
- [What is MsReport?](#what-is-msreport)
|
|
52
|
+
- [Key features of MsReport](#key-features-of-msreport)
|
|
53
|
+
- [Installation](#installation)
|
|
54
|
+
- [Installation when using Anaconda](#installation-when-using-anaconda)
|
|
55
|
+
- [Additional requirements](#additional-requirements)
|
|
56
|
+
- [Optional Dependencies](#optional-dependencies)
|
|
57
|
+
- [Development status](#development-status)
|
|
58
|
+
|
|
59
|
+
## What is MsReport?
|
|
60
|
+
|
|
61
|
+
MsReport is a Python library designed to simplify the post-processing and analysis of quantitative proteomics data from bottom-up mass spectrometry experiments. It provides a high-level, abstraction-focused API for efficient and standardized workflows. The modular design of the library provides the flexibility to meet project specific data processing needs and customize workflows as required.
|
|
62
|
+
|
|
63
|
+
The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
|
|
64
|
+
|
|
65
|
+
### Key features of MsReport
|
|
66
|
+
|
|
67
|
+
#### Data Import and Standardization
|
|
68
|
+
|
|
69
|
+
The `reader` module provides software-specific reader classes for importing protein, peptide, and ion tables from MaxQuant, FragPipe, and Spectronaut. During the import process, these classes transform column names and table values into a standardized format to ensure that the rest of the library can operate in a tool-agnostic manner.
|
|
70
|
+
|
|
71
|
+
#### Data management
|
|
72
|
+
|
|
73
|
+
The `qtable` module provides a structured approach to managing quantitative data through its central `Qtable` class. This class combines quantitative data with an experimental design table that defines the relationship between samples and experimental conditions. The quantitative data is stored in a wide format, where each sample's measurements are stored in separate columns. The `Qtable` class serves as the foundation for data analysis workflows in MsReport, providing the standardized data structure used by the `analyze`, `plot`, and `export` modules.
|
|
74
|
+
|
|
75
|
+
#### Data processing and analysis
|
|
76
|
+
|
|
77
|
+
The `analyze` module provides tools for post-processing of mass spectrometry data generated by software such as MaxQuant, FragPipe, or Spectronaut. It includes functions for filtering, normalization, imputation of missing values, and statistical testing. The library integrates with the R package LIMMA to enable differential expression analysis.
|
|
78
|
+
|
|
79
|
+
> [!NOTE]
|
|
80
|
+
> In order to use the R integration you need to install msreport with optional dependencies, see [Optional Dependencies](#optional-dependencies) for more information.
|
|
81
|
+
|
|
82
|
+
#### Data visualization
|
|
83
|
+
|
|
84
|
+
The `plot` module supports the generation of visualizations for quality control and data analysis. It includes functions for creating various plots, such as intensity and ratio distributions, heatmaps, volcano plots, and PCA plots.
|
|
85
|
+
|
|
86
|
+
#### Data export
|
|
87
|
+
|
|
88
|
+
Finally, the `export` module enables the conversion and export into formats compatible with external tools. This includes generating input files for [Amica](https://bioapps.maxperutzlabs.ac.at/app/amica) and exporting tables for easier integration with Perseus.
|
|
89
|
+
|
|
90
|
+
## Installation
|
|
91
|
+
|
|
92
|
+
If you do not already have a Python installation, we recommend installing the [Anaconda distribution](https://www.anaconda.com/download) from Anaconda, Inc. (formerly Continuum Analytics), which already contains a large number of popular Python packages for Data Science, or the minimal [Miniconda](https://docs.anaconda.com/free/miniconda/index.html) distribution. Alternatively, you can also get Python from the [Python homepage](https://www.python.org/downloads/windows). Note that MsReport requires Python version 3.10 or higher.
|
|
93
|
+
|
|
94
|
+
The following command will install MsReport and its dependencies by using a wheel file.
|
|
95
|
+
|
|
96
|
+
```shell
|
|
97
|
+
pip install msreport
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
To uninstall the MsReport library use:
|
|
101
|
+
|
|
102
|
+
```shell
|
|
103
|
+
pip uninstall msreport
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Installation when using Anaconda
|
|
107
|
+
|
|
108
|
+
To install the MsReport library using Anaconda, you need to either activate a custom conda environment or install it into the default base environment. Open the Anaconda Navigator, activate the desired conda environment or use the base environment, and then open a command line by running the "CMD.exe" application. Finally, use the `pip install` command as before.
|
|
109
|
+
|
|
110
|
+
### Optional Dependencies
|
|
111
|
+
|
|
112
|
+
#### R Integration
|
|
113
|
+
|
|
114
|
+
MsReport provides an interface to the R package LIMMA for differential expression analysis. To use this functionality, you need:
|
|
115
|
+
|
|
116
|
+
- A local installation of **R (version 4.0 or higher)**.
|
|
117
|
+
- The system environment variable R_HOME set to the R home directory.
|
|
118
|
+
- An installation of msreport that includes the optional dependencies for R integration:
|
|
119
|
+
|
|
120
|
+
```shell
|
|
121
|
+
pip install msreport[R]
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
#### Setting the R_HOME environment variable
|
|
125
|
+
|
|
126
|
+
On Windows, you may need to restart your computer after modifying the system environment variables for the changes to take effect. To find the R home directory, you can run the following command in R:
|
|
127
|
+
|
|
128
|
+
```R
|
|
129
|
+
normalizePath(R.home("home"))
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
For example, the R home directory might look like this on Windows: `C:\Program Files\R\R-4.2.1`
|
|
133
|
+
|
|
134
|
+
## Development status
|
|
135
|
+
|
|
136
|
+
MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
msreport/__init__.py,sha256=
|
|
2
|
-
msreport/analyze.py,sha256=
|
|
3
|
-
msreport/errors.py,sha256=
|
|
1
|
+
msreport/__init__.py,sha256=mwpJ3VkH0wctK3CzSQwaywF-mkOC8revi1Ra82apJ3U,339
|
|
2
|
+
msreport/analyze.py,sha256=I1sfxvXy02AjFcfLRlvC-F_bg0J8ePKoSIU8yDWxLs0,31313
|
|
3
|
+
msreport/errors.py,sha256=X9yFxMiIOCWQdxuqBGr8L7O3vRV2KElXdX1uHbFcZMk,421
|
|
4
4
|
msreport/export.py,sha256=YvY3Nly5JC2CUM-JY1gydU1g2eqnennzToZfQQ5phO0,20156
|
|
5
5
|
msreport/fasta.py,sha256=eXTmA4WGX4dT9wcTw7AdrvybLWG47p7ur48CxIjxjfg,1161
|
|
6
6
|
msreport/impute.py,sha256=bf2Zy8VQNJ0Oh1sKn84Xp9iV5svi_Hp7iHxwRrFBwsI,10327
|
|
@@ -8,7 +8,7 @@ msreport/isobar.py,sha256=m6NhLaKBiItIXuBhly_z2wEslxQGFC2f3-e1bzYXB78,6575
|
|
|
8
8
|
msreport/normalize.py,sha256=K1x3DjL5Rep3t_eDIKIghMr0sAJiROnX6skHnOMPZ_k,20160
|
|
9
9
|
msreport/peptidoform.py,sha256=26USj6WPrMgMIc7LttQ2n6Oq5jo1o7ayUQLR6gsRmZY,12015
|
|
10
10
|
msreport/qtable.py,sha256=0e-TXmuiKBU6W5TL3tz06nNrjtEyT-CI9bvUq8W6qME,26768
|
|
11
|
-
msreport/reader.py,sha256=
|
|
11
|
+
msreport/reader.py,sha256=Dqm3Ii9RKiQ61RNATXIqWzD8eOfQyPQQ7lddl-thLQA,112075
|
|
12
12
|
msreport/aggregate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
msreport/aggregate/condense.py,sha256=eIh5A3RUvXrmoFUjRXagiPl0m-ucuRwYD8kDBI7voVs,5862
|
|
14
14
|
msreport/aggregate/pivot.py,sha256=rn8li-FrtOZS4oWA8COk0uV2m71GCEbNu1ALNoMuHOA,5081
|
|
@@ -27,12 +27,12 @@ msreport/plot/quality.py,sha256=dIo_dpdexEN_vp35WpUTt626E-QJ2qNbJmjUai_8uck,1586
|
|
|
27
27
|
msreport/plot/style.py,sha256=67jWf4uA1ub9RJDu4xhuSoXAW0lbLj6SMP4QXQO76Pc,10591
|
|
28
28
|
msreport/plot/style_sheets/msreport-notebook.mplstyle,sha256=SPYO_7vYT8Ha7tQ0KCTLtykiRQ13-_igAm7kyvsZj1I,1266
|
|
29
29
|
msreport/plot/style_sheets/seaborn-whitegrid.mplstyle,sha256=eC8Zboy8R7ybBwbHPKvKbMIHACystN6X6I0lqm7B80U,833
|
|
30
|
-
msreport/rinterface/__init__.py,sha256=
|
|
30
|
+
msreport/rinterface/__init__.py,sha256=Zs6STvbDqaVZVPRM6iU0kKjq0TWz_2p2ChvNAveRdTA,616
|
|
31
31
|
msreport/rinterface/limma.py,sha256=fxYRUkkJKI-JpDvivjWj8bUS0ug7RRTMnaf2UOgRsXQ,5421
|
|
32
32
|
msreport/rinterface/rinstaller.py,sha256=AGs6NFMSwTLrzrIJz1E5BE5jFUz8eQBHlpM_MWVChzA,1370
|
|
33
33
|
msreport/rinterface/rscripts/limma.R,sha256=gr_yjMm_YoG45irDhWOo6gkRQSTwj_7uU_p3NBRHPm8,4331
|
|
34
|
-
msreport-0.0.
|
|
35
|
-
msreport-0.0.
|
|
36
|
-
msreport-0.0.
|
|
37
|
-
msreport-0.0.
|
|
38
|
-
msreport-0.0.
|
|
34
|
+
msreport-0.0.29.dist-info/licenses/LICENSE.txt,sha256=Pd-b5cKP4n2tFDpdx27qJSIq0d1ok0oEcGTlbtL6QMU,11560
|
|
35
|
+
msreport-0.0.29.dist-info/METADATA,sha256=pI4CU6ol8LHTgkVEAGs-5HrCNefD52o-VdrZakKtwE4,8008
|
|
36
|
+
msreport-0.0.29.dist-info/WHEEL,sha256=ooBFpIzZCPdw3uqIQsOo4qqbA4ZRPxHnOH7peeONza0,91
|
|
37
|
+
msreport-0.0.29.dist-info/top_level.txt,sha256=Drl8mCckJHFIw-Ovh5AnyjKnqvLJltDOBUr1JAcHAlI,9
|
|
38
|
+
msreport-0.0.29.dist-info/RECORD,,
|
|
@@ -1,132 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: msreport
|
|
3
|
-
Version: 0.0.28
|
|
4
|
-
Summary: Post processing and analysis of quantitative proteomics data
|
|
5
|
-
Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
|
|
6
|
-
License: Apache-2.0
|
|
7
|
-
Keywords: mass spectrometry,proteomics,post processing,data analysis
|
|
8
|
-
Classifier: Development Status :: 3 - Alpha
|
|
9
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
-
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
12
|
-
Requires-Python: >=3.9
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
License-File: LICENSE.txt
|
|
15
|
-
Requires-Dist: adjustText<1.0.0,>=0.7.0
|
|
16
|
-
Requires-Dist: matplotlib>=3.5.2
|
|
17
|
-
Requires-Dist: numpy>=1.21.5
|
|
18
|
-
Requires-Dist: pandas>=1.4.4
|
|
19
|
-
Requires-Dist: profasta>=0.0.4
|
|
20
|
-
Requires-Dist: pyteomics>=4.6.0
|
|
21
|
-
Requires-Dist: pyyaml>=6.0.0
|
|
22
|
-
Requires-Dist: rpy2!=3.5.13,>=3.5.3
|
|
23
|
-
Requires-Dist: scikit-learn>=1.0.0
|
|
24
|
-
Requires-Dist: scipy>=1.9.1
|
|
25
|
-
Requires-Dist: seaborn>=0.12.0
|
|
26
|
-
Requires-Dist: statsmodels>=0.13.2
|
|
27
|
-
Requires-Dist: typing_extensions>=4
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: mypy>=1.15.0; extra == "dev"
|
|
30
|
-
Requires-Dist: pytest>=8.3.5; extra == "dev"
|
|
31
|
-
Dynamic: license-file
|
|
32
|
-
|
|
33
|
-
[](https://www.repostatus.org/#wip)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
# MsReport
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
## Introduction
|
|
40
|
-
|
|
41
|
-
MsReport is a python library that allows simple and standardized post processing of
|
|
42
|
-
quantitative proteomics data from bottom up, mass spectrometry experiments. Currently
|
|
43
|
-
working with label free protein quantification reports from MaxQuant and FragPipe is
|
|
44
|
-
fully supported. Other data analysis pipelines can be added by writing a software
|
|
45
|
-
specific reader function.
|
|
46
|
-
|
|
47
|
-
MsReport is primarily developed as a tool for the Mass Spectrometry Facility at the Max
|
|
48
|
-
Perutz Labs (University of Vienna), to allow the generation of Quantitative Protein and
|
|
49
|
-
PTM reports, and to facilitate project specific data analysis tasks.
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
## Release
|
|
53
|
-
|
|
54
|
-
Development is currently in early alpha and the interface is not yet stable.
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
## Scope
|
|
58
|
-
|
|
59
|
-
The `reader` module contains software specific reader classes that provide access to the
|
|
60
|
-
outputs of the respective software. Reader instances allow importing protein and ion
|
|
61
|
-
tables, and provide the ability to standardize column names and data formats during the
|
|
62
|
-
import. To do so, reader classes must know the file structure and naming conventions of
|
|
63
|
-
the respective software.
|
|
64
|
-
|
|
65
|
-
The `qtable` class allows storing and accessing quantitative data from a particular
|
|
66
|
-
level of abstraction, such as proteins or ions, and an experimental design table that
|
|
67
|
-
describes to which experiment a sample belongs to. The quantitative data are in the wide
|
|
68
|
-
format, i.e. the quantification data of each sample is stored in a separate column. The
|
|
69
|
-
`Qtable` allows convenient handling and access to quantitative data through information
|
|
70
|
-
from the experimental design, and represents the data structure used by the `analyze`,
|
|
71
|
-
`plot`, and `export` modules.
|
|
72
|
-
|
|
73
|
-
The `analyze` module provides a high-level interface for post-processing of quantitative
|
|
74
|
-
data, such as filtering valid values, normalization between samples, imputation of
|
|
75
|
-
missing values, and statistical testing with the R package LIMMA.
|
|
76
|
-
|
|
77
|
-
The `plot` module allows generation of quality control and data analysis plots.
|
|
78
|
-
|
|
79
|
-
Using methods from the `export` module allows conversion and export of quantitative data
|
|
80
|
-
into the Amica input format, and generating contaminant tables for the inspection of
|
|
81
|
-
potential contaminants.
|
|
82
|
-
|
|
83
|
-
Additional scripts
|
|
84
|
-
|
|
85
|
-
- The `excel_report` module enables the creation of a formatted Excel protein report
|
|
86
|
-
by using the XlsxReport library.
|
|
87
|
-
- The `benchmark` module contains functions to generate benchmark plots from multiple
|
|
88
|
-
`Qtable` instances, and can be used for method or software comparison.
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
## Install
|
|
92
|
-
|
|
93
|
-
If you do not already have a Python installation, we recommend installing the
|
|
94
|
-
[Anaconda distribution](https://www.continuum.io/downloads) of Continuum Analytics,
|
|
95
|
-
which already contains a large number of popular Python packages for Data Science.
|
|
96
|
-
Alternatively, you can also get Python from the
|
|
97
|
-
[Python homepage](https://www.python.org/downloads/windows). MsReport requires Python
|
|
98
|
-
version 3.9 or higher.
|
|
99
|
-
|
|
100
|
-
You can use pip to install MsReport from the distribution file with the following
|
|
101
|
-
command:
|
|
102
|
-
|
|
103
|
-
```
|
|
104
|
-
pip install msreport-X.Y.Z-py3-none-any.whl
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
To uninstall the MsReport library, type:
|
|
108
|
-
|
|
109
|
-
```
|
|
110
|
-
pip uninstall msreport
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
### Installation when using Anaconda
|
|
115
|
-
If you are using Anaconda, you will need to install the MsReport package into a conda
|
|
116
|
-
environment. Open the Anaconda navigator, activate the conda environment you want to
|
|
117
|
-
use, run the "CMD.exe" application to open a terminal, and then use the pip install
|
|
118
|
-
command as described above.
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
### Additional requirements
|
|
122
|
-
|
|
123
|
-
MsReport provides an interface to the R package LIMMA for differential expression
|
|
124
|
-
analysis, which requires a local installation of R (R version 4.0 or higher) and the
|
|
125
|
-
system environment variable "R_HOME" to be set to the R home directory. Note that it
|
|
126
|
-
might be necessary to restart the computer after adding the "R_HOME" variable. The R
|
|
127
|
-
home directory can also be found from within R by using the command below, and might
|
|
128
|
-
look similar to "C:\Program Files\R\R-4.2.1" on Windows.
|
|
129
|
-
|
|
130
|
-
```
|
|
131
|
-
normalizePath(R.home("home"))
|
|
132
|
-
```
|
|
File without changes
|
|
File without changes
|