msreport 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/reader.py CHANGED
@@ -1,4 +1,4 @@
1
- """ Module for reading result tables from various MS analysis tools and converting them
1
+ """Module for reading result tables from various MS analysis tools and converting them
2
2
  to a standardized format following the MsReport convention.
3
3
 
4
4
  Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
@@ -20,19 +20,19 @@ Unified column names:
20
20
  - iBAQ intensity "sample name"
21
21
  """
22
22
 
23
- from collections import OrderedDict, defaultdict
24
23
  import os
25
- from typing import Any, Callable, Iterable, Optional, Protocol
26
24
  import pathlib
27
25
  import warnings
26
+ from collections import OrderedDict, defaultdict
27
+ from typing import Any, Callable, Iterable, Optional, Protocol
28
28
 
29
29
  import numpy as np
30
30
  import pandas as pd
31
31
 
32
32
  import msreport.helper as helper
33
- from msreport.helper.temp import extract_window_around_position
34
- from msreport.errors import ProteinsNotInFastaWarning
35
33
  import msreport.peptidoform
34
+ from msreport.errors import ProteinsNotInFastaWarning
35
+ from msreport.helper.temp import extract_window_around_position
36
36
 
37
37
 
38
38
  class Protein(Protocol):
@@ -54,6 +54,8 @@ class ProteinDatabase(Protocol):
54
54
  class ResultReader:
55
55
  """Base Reader class, is by itself not functional."""
56
56
 
57
+ data_directory: str
58
+ filenames: dict[str, str]
57
59
  default_filenames: dict[str, str]
58
60
  protected_columns: list[str]
59
61
  column_mapping: dict[str, str]
@@ -61,8 +63,8 @@ class ResultReader:
61
63
  sample_column_tags: list[str]
62
64
 
63
65
  def __init__(self):
64
- self.data_directory: str = ""
65
- self.filenames: dict[str, str] = {}
66
+ self.data_directory = ""
67
+ self.filenames = {}
66
68
 
67
69
  def _read_file(self, which: str, sep: str = "\t") -> pd.DataFrame:
68
70
  """Read a result table.
@@ -183,18 +185,16 @@ class MaxQuantReader(ResultReader):
183
185
  "MS/MS count",
184
186
  "Sequence coverage",
185
187
  ]
186
- column_mapping: dict[str, str] = dict(
187
- [
188
- ("Peptides", "Total peptides"),
189
- ("Sequence coverage [%]", "Sequence coverage"),
190
- ("MS/MS count", "Spectral count Combined"), # proteinGroups, evidence
191
- ("MS/MS Count", "Spectral count Combined"), # peptides
192
- ("Sequence", "Peptide sequence"), # peptides, evidence
193
- ("Sequence length", "Protein length"),
194
- ("Mol. weight [kDa]", "Molecular weight [kDa]"),
195
- ("Experiment", "Sample"),
196
- ]
197
- )
188
+ column_mapping: dict[str, str] = {
189
+ "Peptides": "Total peptides",
190
+ "Sequence coverage [%]": "Sequence coverage",
191
+ "MS/MS count": "Spectral count Combined", # proteinGroups, evidence
192
+ "MS/MS Count": "Spectral count Combined", # peptides
193
+ "Sequence": "Peptide sequence", # peptides, evidence
194
+ "Sequence length": "Protein length",
195
+ "Mol. weight [kDa]": "Molecular weight [kDa]",
196
+ "Experiment": "Sample",
197
+ }
198
198
  column_tag_mapping: OrderedDict[str, str] = OrderedDict(
199
199
  [("MS/MS count", "Spectral count"), ("iBAQ", "iBAQ intensity")]
200
200
  )
@@ -590,20 +590,18 @@ class FragPipeReader(ResultReader):
590
590
  "Intensity",
591
591
  "MaxLFQ Intensity",
592
592
  ]
593
- column_mapping: dict[str, str] = dict(
594
- [
595
- ("Peptide Sequence", "Peptide sequence"), # Peptide and ion
596
- ("Modified Sequence", "Modified sequence"), # Modified peptide and ion
597
- ("Start", "Start position"), # Peptide and ion
598
- ("End", "End position"), # Peptide and ion
599
- ("Combined Total Peptides", "Total peptides"), # From LFQ
600
- ("Total Peptides", "Total peptides"), # From TMT
601
- ("Description", "Protein name"),
602
- ("Protein Length", "Protein length"),
603
- ("Entry Name", "Protein entry name"),
604
- ("Gene", "Gene name"),
605
- ]
606
- )
593
+ column_mapping: dict[str, str] = {
594
+ "Peptide Sequence": "Peptide sequence", # Peptide and ion
595
+ "Modified Sequence": "Modified sequence", # Modified peptide and ion
596
+ "Start": "Start position", # Peptide and ion
597
+ "End": "End position", # Peptide and ion
598
+ "Combined Total Peptides": "Total peptides", # From LFQ
599
+ "Total Peptides": "Total peptides", # From TMT
600
+ "Description": "Protein name",
601
+ "Protein Length": "Protein length",
602
+ "Entry Name": "Protein entry name",
603
+ "Gene": "Gene name",
604
+ }
607
605
  column_tag_mapping: OrderedDict[str, str] = OrderedDict(
608
606
  [
609
607
  ("MaxLFQ Intensity", "LFQ intensity"),
@@ -1038,40 +1036,32 @@ class SpectronautReader(ResultReader):
1038
1036
  "design": "conditionsetup",
1039
1037
  }
1040
1038
  protected_columns: list[str] = []
1041
- column_mapping: dict[str, str] = dict(
1042
- [
1043
- ("R.FileName", "Filename"),
1044
- ("R.Label", "Sample"),
1045
- ("PG.Qvalue", "Protein qvalue"),
1046
- ("PG.Cscore", "Protein cscore"),
1047
- ("PG.NrOfStrippedSequencesIdentified (Experiment-wide)", "Total peptides"),
1048
- ("PG.NrOfPrecursorsIdentified (Experiment-wide)", "Total ions"),
1049
- ("PG.Cscore", "Cscore"),
1050
- ("PEP.StrippedSequence", "Peptide sequence"),
1051
- ("PEP.AllOccurringProteinAccessions", "Mapped proteins"),
1052
- ("EG.ModifiedSequence", "Modified sequence"),
1053
- ("EG.CompensationVoltage", "Compensation voltage"),
1054
- ("EG.Qvalue", "Qvalue"),
1055
- ("EG.ApexRT", "Apex retention time"),
1056
- ("EG.DatapointsPerPeak", "Datapoints per peak"),
1057
- ("EG.FWHM", "FWHM"),
1058
- ("EG.SignalToNoise", "Signal to noise"),
1059
- ("FG.FragmentCount", "Fragment count"),
1060
- ("FG.Charge", "Charge"),
1061
- ("FG.MS1Quantity", "MS1 intensity"),
1062
- ("FG.MS1RawQuantity", "MS1 raw intensity"),
1063
- ("FG.MS2Quantity", "MS2 intensity"),
1064
- ("FG.MS2RawQuantity", "MS2 raw intensity"),
1065
- ("FG.MeasuredMz", "Observed m/z"),
1066
- ("FG.TheoreticalMz", "Theoretical m/z"),
1067
- ("FG.CalibratedMz", "Calibrated m/z"),
1068
- # ("PG.ProteinAccessions", ""),
1069
- # ("EG.HasLocalizationInformation", ""),
1070
- # ("EG.PTMLocalizationProbabilities", ""),
1071
- # ("EG.UsedForProteinGroupQuantity", ""),
1072
- # Modified peptides need to be parsed and rewritten
1073
- ]
1074
- )
1039
+ column_mapping: dict[str, str] = {
1040
+ "R.FileName": "Filename",
1041
+ "R.Label": "Sample",
1042
+ "PG.Qvalue": "Protein qvalue",
1043
+ "PG.Cscore": "Protein cscore",
1044
+ "PG.NrOfStrippedSequencesIdentified (Experiment-wide)": "Total peptides",
1045
+ "PG.NrOfPrecursorsIdentified (Experiment-wide)": "Total ions",
1046
+ "PEP.StrippedSequence": "Peptide sequence",
1047
+ "PEP.AllOccurringProteinAccessions": "Mapped proteins",
1048
+ "EG.ModifiedSequence": "Modified sequence",
1049
+ "EG.CompensationVoltage": "Compensation voltage",
1050
+ "EG.Qvalue": "Qvalue",
1051
+ "EG.ApexRT": "Apex retention time",
1052
+ "EG.DatapointsPerPeak": "Datapoints per peak",
1053
+ "EG.FWHM": "FWHM",
1054
+ "EG.SignalToNoise": "Signal to noise",
1055
+ "FG.FragmentCount": "Fragment count",
1056
+ "FG.Charge": "Charge",
1057
+ "FG.MS1Quantity": "MS1 intensity",
1058
+ "FG.MS1RawQuantity": "MS1 raw intensity",
1059
+ "FG.MS2Quantity": "MS2 intensity",
1060
+ "FG.MS2RawQuantity": "MS2 raw intensity",
1061
+ "FG.MeasuredMz": "Observed m/z",
1062
+ "FG.TheoreticalMz": "Theoretical m/z",
1063
+ "FG.CalibratedMz": "Calibrated m/z",
1064
+ }
1075
1065
  sample_column_tags: list[str] = [
1076
1066
  ".PG.NrOfPrecursorsIdentified",
1077
1067
  ".PG.IBAQ",
@@ -1324,7 +1314,7 @@ class SpectronautReader(ResultReader):
1324
1314
  filename: Optional[str] = None,
1325
1315
  filetag: Optional[str] = None,
1326
1316
  rename_columns: bool = True,
1327
- ) -> None:
1317
+ ) -> pd.DataFrame:
1328
1318
  """Reads an ion evidence file (long format) and returns a processed dataframe.
1329
1319
 
1330
1320
  Adds new columns to comply with the MsReport convention. "Protein reported
@@ -1462,7 +1452,7 @@ def sort_leading_proteins(
1462
1452
  if penalize_contaminants is not None:
1463
1453
  contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
1464
1454
 
1465
- for idx, row in table.iterrows():
1455
+ for _, row in table.iterrows():
1466
1456
  protein_ids = row["Leading proteins"].split(";")
1467
1457
 
1468
1458
  sorting_info = [[] for _ in protein_ids]
@@ -1559,6 +1549,7 @@ def add_protein_annotation(
1559
1549
  warnings.warn(
1560
1550
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1561
1551
  ProteinsNotInFastaWarning,
1552
+ stacklevel=2,
1562
1553
  )
1563
1554
 
1564
1555
  annotations = {}
@@ -1636,9 +1627,10 @@ def add_protein_site_annotation(
1636
1627
  warnings.warn(
1637
1628
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1638
1629
  ProteinsNotInFastaWarning,
1630
+ stacklevel=2,
1639
1631
  )
1640
1632
 
1641
- annotations = {
1633
+ annotations: dict[str, list[str]] = {
1642
1634
  "Modified residue": [],
1643
1635
  "Sequence window": [],
1644
1636
  }
@@ -1702,6 +1694,7 @@ def add_leading_proteins_annotation(
1702
1694
  warnings.warn(
1703
1695
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1704
1696
  ProteinsNotInFastaWarning,
1697
+ stacklevel=2,
1705
1698
  )
1706
1699
 
1707
1700
  annotations = {}
@@ -1853,7 +1846,7 @@ def add_peptide_positions(
1853
1846
  find matching entries in the FASTA files.
1854
1847
  """
1855
1848
  # not tested #
1856
- peptide_positions = {"Start position": [], "End position": []}
1849
+ peptide_positions: dict[str, list[int]] = {"Start position": [], "End position": []}
1857
1850
  proteins_not_in_db = []
1858
1851
  for peptide, protein_id in zip(table[peptide_column], table[protein_column]):
1859
1852
  if protein_id in protein_db:
@@ -1875,6 +1868,7 @@ def add_peptide_positions(
1875
1868
  warnings.warn(
1876
1869
  f"Some peptides could not be annotated: {repr(proteins_not_in_db)}",
1877
1870
  ProteinsNotInFastaWarning,
1871
+ stacklevel=2,
1878
1872
  )
1879
1873
 
1880
1874
 
@@ -1894,10 +1888,10 @@ def add_protein_modifications(table: pd.DataFrame):
1894
1888
  for peptide_site, mod in [m.split(":") for m in mod_entry.split(";")]:
1895
1889
  protein_site = int(peptide_site) + start_pos - 1
1896
1890
  protein_mods.append([str(protein_site), mod])
1897
- protein_mods = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
1891
+ protein_mod_string = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
1898
1892
  else:
1899
- protein_mods = ""
1900
- protein_modification_entries.append(protein_mods)
1893
+ protein_mod_string = ""
1894
+ protein_modification_entries.append(protein_mod_string)
1901
1895
  table["Protein modifications"] = protein_modification_entries
1902
1896
 
1903
1897
 
@@ -2074,7 +2068,7 @@ def _process_protein_entries(
2074
2068
  A dataframe containing the columns "Protein reported by software",
2075
2069
  "Leading proteins", "Representative protein", and "Potential contaminant".
2076
2070
  """
2077
- new_entries = {
2071
+ new_entries: dict[str, list[str | bool]] = {
2078
2072
  "Protein reported by software": [],
2079
2073
  "Representative protein": [],
2080
2074
  "Potential contaminant": [],
@@ -2189,7 +2183,7 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
2189
2183
  ... )
2190
2184
  {'15.9949': {3: 1.0}, '79.9663': {4: 0.334, 6: 0.666}}
2191
2185
  """
2192
- modification_probabilities = {}
2186
+ modification_probabilities: dict[str, dict[int, float]] = {}
2193
2187
  for modification_entry in filter(None, localization_entry.split(";")):
2194
2188
  specified_modification, probability_sequence = modification_entry.split("@")
2195
2189
  _, modification = specified_modification.split(":")
@@ -2247,7 +2241,7 @@ def _create_protein_annotations_from_db(
2247
2241
  protein_db: ProteinDatabase,
2248
2242
  query_function: Callable,
2249
2243
  default_value: Any,
2250
- ) -> list[str]:
2244
+ ) -> list[Any]:
2251
2245
  """Returns a list of multi protein entry annotations.
2252
2246
 
2253
2247
  Used to generate protein annotations for protein entries. For each protein id an
@@ -2274,9 +2268,9 @@ def _create_protein_annotations_from_db(
2274
2268
  if protein_id in protein_db:
2275
2269
  db_entry = protein_db[protein_id]
2276
2270
  query_result = query_function(db_entry, default_value)
2271
+ annotation_values.append(query_result)
2277
2272
  else:
2278
- query_result = default_value
2279
- annotation_values.append(query_result)
2273
+ annotation_values.append(default_value)
2280
2274
  return annotation_values
2281
2275
 
2282
2276
 
@@ -1,3 +1,4 @@
1
- """ Python interface to custome R scripts. """
1
+ """Python interface to custom R scripts."""
2
+
2
3
  from .limma import multi_group_limma, two_group_limma
3
4
  from .rinstaller import r_package_version
@@ -1,4 +1,5 @@
1
- """ Python interface to custome R scripts. """
1
+ """Python interface to custom R scripts."""
2
+
2
3
  import os
3
4
 
4
5
  import pandas as pd
@@ -1,9 +1,9 @@
1
- from rpy2.robjects.packages import importr
2
- import rpy2.robjects.packages as rpackages
3
1
  import rpy2.robjects as robjects
2
+ import rpy2.robjects.packages as rpackages
3
+ from rpy2.robjects.packages import importr
4
4
 
5
5
 
6
- def r_package_version(package_name: str) -> (str, str):
6
+ def r_package_version(package_name: str) -> str:
7
7
  """Returns the version number of an installed R package."""
8
8
  with robjects.conversion.localconverter(robjects.default_converter):
9
9
  utils = importr("utils")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msreport
3
- Version: 0.0.26
3
+ Version: 0.0.28
4
4
  Summary: Post processing and analysis of quantitative proteomics data
5
5
  Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
6
6
  License: Apache-2.0
@@ -19,11 +19,15 @@ Requires-Dist: pandas>=1.4.4
19
19
  Requires-Dist: profasta>=0.0.4
20
20
  Requires-Dist: pyteomics>=4.6.0
21
21
  Requires-Dist: pyyaml>=6.0.0
22
- Requires-Dist: rpy2>=3.5.3
22
+ Requires-Dist: rpy2!=3.5.13,>=3.5.3
23
23
  Requires-Dist: scikit-learn>=1.0.0
24
24
  Requires-Dist: scipy>=1.9.1
25
25
  Requires-Dist: seaborn>=0.12.0
26
26
  Requires-Dist: statsmodels>=0.13.2
27
+ Requires-Dist: typing_extensions>=4
28
+ Provides-Extra: dev
29
+ Requires-Dist: mypy>=1.15.0; extra == "dev"
30
+ Requires-Dist: pytest>=8.3.5; extra == "dev"
27
31
  Dynamic: license-file
28
32
 
29
33
  [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
@@ -117,7 +121,7 @@ command as described above.
117
121
  ### Additional requirements
118
122
 
119
123
  MsReport provides an interface to the R package LIMMA for differential expression
120
- analysis, which requires a local installation of R (R version 3.4 or higher) and the
124
+ analysis, which requires a local installation of R (R version 4.0 or higher) and the
121
125
  system environment variable "R_HOME" to be set to the R home directory. Note that it
122
126
  might be necessary to restart the computer after adding the "R_HOME" variable. The R
123
127
  home directory can also be found from within R by using the command below, and might
@@ -0,0 +1,38 @@
1
+ msreport/__init__.py,sha256=5-d_i-t9A3MV7hC-3z_vcWzaSAJSGY5T6McCBr4UGfc,339
2
+ msreport/analyze.py,sha256=zNs0Vc2ODTfdiX6rSr79jXLJIh-6N11WH-vZpQzKDTE,30889
3
+ msreport/errors.py,sha256=algGlR5iD9Q0U6Q3m25IwZryl9smtlPHsfhAL35PChc,295
4
+ msreport/export.py,sha256=YvY3Nly5JC2CUM-JY1gydU1g2eqnennzToZfQQ5phO0,20156
5
+ msreport/fasta.py,sha256=eXTmA4WGX4dT9wcTw7AdrvybLWG47p7ur48CxIjxjfg,1161
6
+ msreport/impute.py,sha256=bf2Zy8VQNJ0Oh1sKn84Xp9iV5svi_Hp7iHxwRrFBwsI,10327
7
+ msreport/isobar.py,sha256=m6NhLaKBiItIXuBhly_z2wEslxQGFC2f3-e1bzYXB78,6575
8
+ msreport/normalize.py,sha256=K1x3DjL5Rep3t_eDIKIghMr0sAJiROnX6skHnOMPZ_k,20160
9
+ msreport/peptidoform.py,sha256=26USj6WPrMgMIc7LttQ2n6Oq5jo1o7ayUQLR6gsRmZY,12015
10
+ msreport/qtable.py,sha256=0e-TXmuiKBU6W5TL3tz06nNrjtEyT-CI9bvUq8W6qME,26768
11
+ msreport/reader.py,sha256=ja4q8XtOHR_A6RL8ho-c6aGCVu1kzyhvil8ymiPx3PY,104612
12
+ msreport/aggregate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ msreport/aggregate/condense.py,sha256=eIh5A3RUvXrmoFUjRXagiPl0m-ucuRwYD8kDBI7voVs,5862
14
+ msreport/aggregate/pivot.py,sha256=rn8li-FrtOZS4oWA8COk0uV2m71GCEbNu1ALNoMuHOA,5081
15
+ msreport/aggregate/summarize.py,sha256=aYXi_i7MkqjA8k9WWpOgn029TeJ3H5Qo899msDVw89M,12165
16
+ msreport/helper/__init__.py,sha256=UbBHKMcapSXCyNmfQm6rg-2OgS303txkgILtboE05KI,535
17
+ msreport/helper/calc.py,sha256=J4XltEnMrFR9IQlPtrZhyxlSTj15072huHCMA_nqQ6E,4245
18
+ msreport/helper/maxlfq.py,sha256=EP1UjV3IAz4NSpGOQSsWGbuxtGLmtw92dvXUwgBYmF0,14943
19
+ msreport/helper/table.py,sha256=x-Wo8mTENsUxc_gtF-wgOyQa9g7W2fK6tuRiEX7bda0,11430
20
+ msreport/helper/temp.py,sha256=jNulgDATf9sKXEFWMXAhjflciOZPAqlxg_7QZS7IkW8,3736
21
+ msreport/plot/__init__.py,sha256=SnoQORfrjgz9SmqPZ-1J1aeVC5xu-cFfZINP4aYVCmY,1488
22
+ msreport/plot/_partial_plots.py,sha256=tqZTSXEPuruMgVakaGR2tUQl5OrHgo2cROJ0S4cqkR0,5598
23
+ msreport/plot/comparison.py,sha256=J8zWyQrzx7rxDLxeZQkfAlcSmLY3e_7wwPG-cGuWo2M,18564
24
+ msreport/plot/distribution.py,sha256=a2Rw6HxQwGfDwRSy8dwpT7zvEQ968wYHjcVPOdXI3l8,10150
25
+ msreport/plot/multivariate.py,sha256=0xzxggqbIGQYOfgiij93DTRWfG6GvvhqI9u1GNPHarY,13111
26
+ msreport/plot/quality.py,sha256=dIo_dpdexEN_vp35WpUTt626E-QJ2qNbJmjUai_8uck,15861
27
+ msreport/plot/style.py,sha256=67jWf4uA1ub9RJDu4xhuSoXAW0lbLj6SMP4QXQO76Pc,10591
28
+ msreport/plot/style_sheets/msreport-notebook.mplstyle,sha256=SPYO_7vYT8Ha7tQ0KCTLtykiRQ13-_igAm7kyvsZj1I,1266
29
+ msreport/plot/style_sheets/seaborn-whitegrid.mplstyle,sha256=eC8Zboy8R7ybBwbHPKvKbMIHACystN6X6I0lqm7B80U,833
30
+ msreport/rinterface/__init__.py,sha256=g29j2cIrc71qBdF4Zys51feoXlC0dP6YcTIscPTqPdI,146
31
+ msreport/rinterface/limma.py,sha256=fxYRUkkJKI-JpDvivjWj8bUS0ug7RRTMnaf2UOgRsXQ,5421
32
+ msreport/rinterface/rinstaller.py,sha256=AGs6NFMSwTLrzrIJz1E5BE5jFUz8eQBHlpM_MWVChzA,1370
33
+ msreport/rinterface/rscripts/limma.R,sha256=gr_yjMm_YoG45irDhWOo6gkRQSTwj_7uU_p3NBRHPm8,4331
34
+ msreport-0.0.28.dist-info/licenses/LICENSE.txt,sha256=Pd-b5cKP4n2tFDpdx27qJSIq0d1ok0oEcGTlbtL6QMU,11560
35
+ msreport-0.0.28.dist-info/METADATA,sha256=IVyUd3ZATwccffCWbgYYmUmPe8Y4vJvwZC6oMFuBBfw,5497
36
+ msreport-0.0.28.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
37
+ msreport-0.0.28.dist-info/top_level.txt,sha256=Drl8mCckJHFIw-Ovh5AnyjKnqvLJltDOBUr1JAcHAlI,9
38
+ msreport-0.0.28.dist-info/RECORD,,