msreport 0.0.30__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/peptidoform.py CHANGED
@@ -1,5 +1,14 @@
1
+ """Defines the `Peptide` class and associated utilities for handling peptidoforms.
2
+
3
+ This module provides a `Peptide` class for representing modified peptide sequences,
4
+ and their site localization probabilities. It offers methods to access and manipulate
5
+ peptide information, summarize isoform probabilities, and retrieve modification sites.
6
+ Additionally, it includes utility functions for parsing modified sequence strings and
7
+ converting site localization probabilities to and from a standardized string format.
8
+ """
9
+
1
10
  from collections import defaultdict as ddict
2
- from typing import Optional, Union
11
+ from typing import Optional
3
12
 
4
13
  import numpy as np
5
14
 
@@ -10,7 +19,7 @@ class Peptide:
10
19
  def __init__(
11
20
  self,
12
21
  modified_sequence: str,
13
- localization_probabilities: Optional[dict] = None,
22
+ localization_probabilities: Optional[dict[str, dict[int, float]]] = None,
14
23
  protein_position: Optional[int] = None,
15
24
  ):
16
25
  plain_sequence, modifications = parse_modified_sequence(
@@ -28,7 +37,7 @@ class Peptide:
28
37
  self.modification_positions[mod_tag].append(position)
29
38
  self.modified_residues[position] = mod_tag
30
39
 
31
- def make_modified_sequence(self, include: Optional[list] = None) -> str:
40
+ def make_modified_sequence(self, include: Optional[list[str]] = None) -> str:
32
41
  """Returns a modified sequence string.
33
42
 
34
43
  Args:
@@ -55,7 +64,7 @@ class Peptide:
55
64
  return 0
56
65
  return len(self.modification_positions[modification])
57
66
 
58
- def isoform_probability(self, modification: str) -> Union[float, None]:
67
+ def isoform_probability(self, modification: str) -> float | None:
59
68
  """Calculates the isoform probability for a given modification.
60
69
 
61
70
  Returns:
@@ -66,12 +75,13 @@ class Peptide:
66
75
  """
67
76
  probabilities = []
68
77
  for site in self.list_modified_peptide_sites(modification):
69
- probabilities.append(self.get_peptide_site_probability(site))
70
- if None in probabilities:
71
- return None
78
+ probability = self.get_peptide_site_probability(site)
79
+ if probability is None:
80
+ return None
81
+ probabilities.append(probability)
72
82
  return float(np.prod(probabilities))
73
83
 
74
- def get_peptide_site_probability(self, position: int) -> Optional[float]:
84
+ def get_peptide_site_probability(self, position: int) -> float | None:
75
85
  """Return the modification localization probability of the peptide position.
76
86
 
77
87
  Args:
@@ -85,7 +95,7 @@ class Peptide:
85
95
  """
86
96
  return self._get_site_probability(position, is_protein_position=False)
87
97
 
88
- def get_protein_site_probability(self, position: int) -> Optional[float]:
98
+ def get_protein_site_probability(self, position: int) -> float | None:
89
99
  """Return the modification localization probability of the protein position.
90
100
 
91
101
  Args:
@@ -109,7 +119,7 @@ class Peptide:
109
119
 
110
120
  def _get_site_probability(
111
121
  self, position: int, is_protein_position: bool
112
- ) -> Optional[float]:
122
+ ) -> float | None:
113
123
  """Return the modification localization probability of the peptide position.
114
124
 
115
125
  Args:
@@ -224,7 +234,7 @@ def modify_peptide(
224
234
 
225
235
 
226
236
  def make_localization_string(
227
- localization_probabilities: dict, decimal_places: int = 3
237
+ localization_probabilities: dict[str, dict[int, float]], decimal_places: int = 3
228
238
  ) -> str:
229
239
  """Generates a site localization probability string.
230
240
 
msreport/plot/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- """This module provides various plotting functions for visualizing data within a Qtable.
1
+ """Plotting functions for visualizing proteomics data from `Qtable`.
2
2
 
3
3
  The functions in this module generate a wide range of plots, including heatmaps, PCA
4
4
  plots, volcano plots, and histograms, to analyze and compare expression values,
@@ -6,8 +6,8 @@ missingness, contaminants, and other features in proteomics datasets. The plots
6
6
  designed to work with the Qtable class as input, which provides structured access to
7
7
  proteomics data and experimental design information.
8
8
 
9
- The style of the plots can be customized using the `set_active_style` function, which
10
- allows applying style sheets from the msreport library or those available in matplotlib.
9
+ Users can customize plot styles via the `set_active_style` function, which allows
10
+ applying style sheets from the msreport library or those available in matplotlib.
11
11
  """
12
12
 
13
13
  from .comparison import expression_comparison, pvalue_histogram, volcano_ma
msreport/plot/quality.py CHANGED
@@ -314,7 +314,7 @@ def sample_intensities(
314
314
 
315
315
  @with_active_style
316
316
  def sample_correlation(
317
- qtable, exclude_invalid: bool = True, labels: bool = False
317
+ qtable: Qtable, exclude_invalid: bool = True, labels: bool = False
318
318
  ) -> tuple[plt.Figure, list[plt.Axes]]:
319
319
  """Generates a pair-wise correlation matrix of samples 'Expression' values.
320
320
 
msreport/qtable.py CHANGED
@@ -1,14 +1,28 @@
1
- from __future__ import annotations
1
+ """Defines the `Qtable` class, the central container for quantitative proteomics data.
2
+
3
+ The `Qtable` class serves as the standardized data structure for `msreport`,
4
+ storing a main table with quantitative values and associated metadata for its entries;
5
+ it also maintains the name of the unique ID column for the main table. Additionally,
6
+ it stores an experimental design table that links sample names to experimental
7
+ conditions and replicate information.
8
+
9
+ `Qtable` provides convenience methods for creating subtables and accessing design
10
+ related information (e.g., samples per experiment), and instances of `Qtable` can be
11
+ easily saved to disk and loaded back. As the central data container, the `Qtable`
12
+ facilitates seamless integration with the high-level modules `analyze`, `plot` and
13
+ `export`, which all directly operate on `Qtable` instances.
14
+ """
2
15
 
3
16
  import copy
4
17
  import os
5
18
  import warnings
6
19
  from contextlib import contextmanager
7
- from typing import Any, Iterable, Optional
20
+ from typing import Any, Generator, Iterable, Optional
8
21
 
9
22
  import numpy as np
10
23
  import pandas as pd
11
24
  import yaml
25
+ from typing_extensions import Self
12
26
 
13
27
  import msreport.helper as helper
14
28
 
@@ -359,7 +373,7 @@ class Qtable:
359
373
  keep_experiments: Optional[Iterable[str]] = None,
360
374
  exclude_samples: Optional[Iterable[str]] = None,
361
375
  keep_samples: Optional[Iterable[str]] = None,
362
- ):
376
+ ) -> Generator[None, None, None]:
363
377
  """Context manager to temporarily modify the design table.
364
378
 
365
379
  Args:
@@ -422,7 +436,7 @@ class Qtable:
422
436
  self.design.to_csv(filepaths["design"], sep="\t", index=True)
423
437
 
424
438
  @classmethod
425
- def load(cls, directory: str, basename: str) -> Qtable:
439
+ def load(cls, directory: str, basename: str) -> Self:
426
440
  """Load a qtable from disk by reading a data, design, and config file.
427
441
 
428
442
  Loading a qtable will first import the three files generated during saving, then
@@ -470,7 +484,7 @@ class Qtable:
470
484
  )
471
485
  id_column = config_data["Unique ID column"]
472
486
 
473
- qtable = Qtable(data, design, id_column)
487
+ qtable = cls(data, design, id_column)
474
488
  qtable._expression_columns = config_data["Expression columns"]
475
489
  qtable._expression_features = config_data["Expression features"]
476
490
  qtable._expression_sample_mapping = config_data["Expression sample mapping"]
@@ -486,11 +500,11 @@ class Qtable:
486
500
  )
487
501
  self.data.to_csv(path, sep="\t", index=index)
488
502
 
489
- def to_clipboard(self, index: bool = False):
503
+ def to_clipboard(self, index: bool = False) -> None:
490
504
  """Writes the data table to the system clipboard."""
491
505
  self.data.to_clipboard(sep="\t", index=index)
492
506
 
493
- def copy(self) -> Qtable:
507
+ def copy(self) -> Self:
494
508
  """Returns a copy of this Qtable instance."""
495
509
  return self.__copy__()
496
510
 
@@ -579,8 +593,8 @@ class Qtable:
579
593
  self._expression_features = []
580
594
  self._expression_sample_mapping = {}
581
595
 
582
- def __copy__(self) -> Qtable:
583
- new_instance = Qtable(self.data, self.design, self.id_column)
596
+ def __copy__(self) -> Self:
597
+ new_instance = type(self)(self.data, self.design, self.id_column)
584
598
  # Copy all private attributes
585
599
  for attr in dir(self):
586
600
  if (
@@ -609,7 +623,7 @@ def _match_samples_to_tag_columns(
609
623
  samples: Iterable[str],
610
624
  columns: Iterable[str],
611
625
  tag: str,
612
- ) -> dict:
626
+ ) -> dict[str, str]:
613
627
  """Mapping of samples to columns which contain the sample and the tag.
614
628
 
615
629
  Args:
@@ -632,7 +646,7 @@ def _match_samples_to_tag_columns(
632
646
  return mapping
633
647
 
634
648
 
635
- def _get_qtable_export_filepaths(directory: str, name: str):
649
+ def _get_qtable_export_filepaths(directory: str, name: str) -> dict[str, str]:
636
650
  """Returns a dictionary of standard filepaths for loading and saving a qtable."""
637
651
  filenames = {
638
652
  "data": f"{name}.data.tsv",
msreport/reader.py CHANGED
@@ -1,17 +1,18 @@
1
- """Module for reading result tables from various MS analysis tools and converting them
2
- to a standardized format following the MsReport convention.
1
+ """Provides tools for importing and standardizing quantitative proteomics data.
3
2
 
4
- Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
5
- for Spectronaut protein tables are supported when exported with the correct report
6
- scheme.
3
+ This module offers software-specific reader classes to import raw result tables (e.g.,
4
+ proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
5
+ Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
6
+ provides functions for annotating imported data with biological metadata, such as
7
+ protein information (e.g., sequence length, molecular weight) and peptide positions,
8
+ extracted from a ProteinDatabase (FASTA file).
7
9
 
8
- New column names:
10
+ New columns added to imported protein tables:
9
11
  - Representative protein
10
12
  - Leading proteins
11
13
  - Protein reported by software
12
14
 
13
- Unified column names:
14
- - Total peptides
15
+ Standardized column names for quantitative values (if available in the software output):
15
16
  - Spectral count "sample name"
16
17
  - Unique spectral count "sample name"
17
18
  - Total spectral count "sample name"
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
38
39
  class Protein(Protocol):
39
40
  """Abstract protein entry"""
40
41
 
42
+ # identifier: str
41
43
  header: str
42
44
  sequence: str
43
45
  header_fields: dict[str, str]
@@ -46,9 +48,9 @@ class Protein(Protocol):
46
48
  class ProteinDatabase(Protocol):
47
49
  """Abstract protein database"""
48
50
 
49
- def __getitem__(self, protein_id: str) -> Protein: ...
51
+ def __getitem__(self, identifier: str) -> Protein: ...
50
52
 
51
- def __contains__(self, protein_id: str) -> bool: ...
53
+ def __contains__(self, identifier: str) -> bool: ...
52
54
 
53
55
 
54
56
  class ResultReader:
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
497
499
  mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
498
500
  localization_string_column = "Modification localization string"
499
501
 
500
- mod_localization_probabilities = [{} for _ in range(new_df.shape[0])]
502
+ mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
503
+ {} for _ in range(new_df.shape[0])
504
+ ]
501
505
  for probability_column in mod_probability_columns:
502
506
  # FUTURE: Type should be checked and enforced during the import
503
507
  if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
@@ -681,7 +685,15 @@ class FragPipeReader(ResultReader):
681
685
  def import_design(
682
686
  self, filename: Optional[str] = None, sort: bool = False
683
687
  ) -> pd.DataFrame:
684
- """Reads a 'fp-manifest' file and returns a processed design dataframe.
688
+ """Read a 'fp-manifest' file and return a processed design dataframe.
689
+
690
+ The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
691
+ design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
692
+ column is extracted as the filename from the full path. The "Sample" column is
693
+ generated by combining "Experiment" and "Replicate" with an underscore
694
+ (e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
695
+ "Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
696
+ by default.
685
697
 
686
698
  Args:
687
699
  filename: Allows specifying an alternative filename, otherwise the default
@@ -704,12 +716,18 @@ class FragPipeReader(ResultReader):
704
716
  raise FileNotFoundError(
705
717
  f"File '{filepath}' does not exist. Please check the file path."
706
718
  )
707
- fp_manifest = pd.read_csv(filepath, sep="\t", header=None, dtype=str)
719
+ fp_manifest = (
720
+ pd.read_csv(
721
+ filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
722
+ )
723
+ .fillna("")
724
+ .astype(str)
725
+ )
708
726
  fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
709
727
 
710
728
  design = pd.DataFrame(
711
729
  {
712
- "Sample": fp_manifest["Experiment"] + "_" + fp_manifest["Bioreplicate"],
730
+ "Sample": "",
713
731
  "Experiment": fp_manifest["Experiment"],
714
732
  "Replicate": fp_manifest["Bioreplicate"],
715
733
  "Rawfile": fp_manifest["Path"].apply(
@@ -718,6 +736,12 @@ class FragPipeReader(ResultReader):
718
736
  ),
719
737
  }
720
738
  )
739
+ # FragPipe uses "exp" for missing 'Experiment' values
740
+ design.loc[design["Experiment"] == "", "Experiment"] = "exp"
741
+ # FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
742
+ # 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
743
+ design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
744
+ design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
721
745
 
722
746
  if sort:
723
747
  design.sort_values(by=["Experiment", "Replicate"], inplace=True)
@@ -963,7 +987,7 @@ class FragPipeReader(ResultReader):
963
987
  filename: Optional[str] = None,
964
988
  rename_columns: bool = True,
965
989
  rewrite_modifications: bool = True,
966
- ):
990
+ ) -> pd.DataFrame:
967
991
  """Concatenate all "psm.tsv" files and return a processed dataframe.
968
992
 
969
993
  Args:
@@ -1499,6 +1523,7 @@ class SpectronautReader(ResultReader):
1499
1523
  filename: Optional[str] = None,
1500
1524
  filetag: Optional[str] = None,
1501
1525
  rename_columns: bool = True,
1526
+ rewrite_modifications: bool = True,
1502
1527
  ) -> pd.DataFrame:
1503
1528
  """Reads an ion evidence file (long format) and returns a processed dataframe.
1504
1529
 
@@ -1508,8 +1533,15 @@ class SpectronautReader(ResultReader):
1508
1533
  generated by concatenating the "Modified sequence" and "Charge" columns, and if
1509
1534
  present, the "Compensation voltage" column.
1510
1535
 
1511
- (!) Note that the modified sequence and modification localization probabilities
1512
- are currently not processed.
1536
+ "Modified sequence" entries contain modifications within square brackets.
1537
+ "Modifications" entries are strings in the form of "position:modification_tag",
1538
+ multiple modifications are joined by ";". An example for a modified sequence and
1539
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1540
+
1541
+ "Modification localization string" contains localization probabilities in the
1542
+ format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1543
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1544
+ `msreport.peptidoform.make_localization_string` for details.
1513
1545
 
1514
1546
  Args:
1515
1547
  filename: Optional, allows specifying a specific file that will be imported.
@@ -1517,6 +1549,10 @@ class SpectronautReader(ResultReader):
1517
1549
  a substring, instead of specifying a filename.
1518
1550
  rename_columns: If True, columns are renamed according to the MsReport
1519
1551
  convention; default True.
1552
+ rewrite_modifications: If True, the peptide format in "Modified sequence" is
1553
+ changed according to the MsReport convention, and a "Modifications" column
1554
+ is added that contains the amino acid position for all modifications.
1555
+ Requires 'rename_columns' to be true. Default True.
1520
1556
 
1521
1557
  Returns:
1522
1558
  A dataframe containing the processed ion table.
@@ -1544,6 +1580,9 @@ class SpectronautReader(ResultReader):
1544
1580
  df = self._add_protein_entries(df)
1545
1581
  if rename_columns:
1546
1582
  df = self._rename_columns(df, True)
1583
+ if rewrite_modifications and rename_columns:
1584
+ df = self._add_peptide_modification_entries(df)
1585
+ df = self._add_modification_localization_string(df)
1547
1586
  df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
1548
1587
  if "Compensation voltage" in df.columns:
1549
1588
  _cv = df["Compensation voltage"].astype(str)
@@ -1597,6 +1636,70 @@ class SpectronautReader(ResultReader):
1597
1636
  leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
1598
1637
  return leading_protein_entries
1599
1638
 
1639
+ def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
1640
+ """Adds standardized "Modified sequence" and "Modifications" columns.
1641
+
1642
+ "Modified sequence" entries contain modifications within square brackets.
1643
+ "Modifications" entries are strings in the form of "position:modification_text",
1644
+ multiple modifications are joined by ";". An example for a modified sequence and
1645
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1646
+
1647
+ Requires the columns "Peptide sequence" and "Modified sequence" from the
1648
+ software output.
1649
+
1650
+ Args:
1651
+ df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
1652
+
1653
+ Returns:
1654
+ A copy of the input dataframe with updated columns.
1655
+ """
1656
+ # TODO: not tested
1657
+ mod_sequences = df["Modified sequence"].str[1:-1] # Remove surrounding "_"
1658
+ mod_entries = _generate_modification_entries(
1659
+ df["Peptide sequence"], mod_sequences, "[", "]"
1660
+ )
1661
+ new_df = df.copy()
1662
+ new_df["Modified sequence"] = mod_entries["Modified sequence"]
1663
+ new_df["Modifications"] = mod_entries["Modifications"]
1664
+ return new_df
1665
+
1666
+ def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
1667
+ """Adds modification localization string columns.
1668
+
1669
+ Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
1670
+ column, converts them into the standardized modification localization string
1671
+ format used by msreport, and adds a new column "Modification localization string".
1672
+
1673
+ Probabilities are written in the format
1674
+ "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1675
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1676
+ `msreport.peptidoform.make_localization_string` for details.
1677
+
1678
+ Args:
1679
+ df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
1680
+
1681
+ Returns:
1682
+ A copy of the input dataframe with the added column
1683
+ "Modification localization string".
1684
+ """
1685
+ # TODO: not tested
1686
+ new_df = df.copy()
1687
+ localization_strings = []
1688
+ for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
1689
+ if localization_entry == "":
1690
+ localization_strings.append("")
1691
+ continue
1692
+
1693
+ localization_probabilities = extract_spectronaut_localization_probabilities(
1694
+ localization_entry
1695
+ )
1696
+ localization_string = msreport.peptidoform.make_localization_string(
1697
+ localization_probabilities
1698
+ )
1699
+ localization_strings.append(localization_string)
1700
+ new_df["Modification localization string"] = localization_strings
1701
+ return new_df
1702
+
1600
1703
 
1601
1704
  def sort_leading_proteins(
1602
1705
  table: pd.DataFrame,
@@ -1639,7 +1742,7 @@ def sort_leading_proteins(
1639
1742
  db_origins_present = "Leading proteins database origin" in table
1640
1743
 
1641
1744
  if database_order is not None:
1642
- database_encoding = defaultdict(lambda: 999)
1745
+ database_encoding: dict[str, int] = defaultdict(lambda: 999)
1643
1746
  database_encoding.update({db: i for i, db in enumerate(database_order)})
1644
1747
  if penalize_contaminants is not None:
1645
1748
  contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
@@ -1647,7 +1750,7 @@ def sort_leading_proteins(
1647
1750
  for _, row in table.iterrows():
1648
1751
  protein_ids = row["Leading proteins"].split(";")
1649
1752
 
1650
- sorting_info = [[] for _ in protein_ids]
1753
+ sorting_info: list[list] = [[] for _ in protein_ids]
1651
1754
  if special_proteins is not None:
1652
1755
  for i, _id in enumerate(protein_ids):
1653
1756
  sorting_info[i].append(_id not in special_proteins)
@@ -1787,7 +1890,7 @@ def add_protein_site_annotation(
1787
1890
  protein_db: ProteinDatabase,
1788
1891
  protein_column: str = "Representative protein",
1789
1892
  site_column: str = "Protein site",
1790
- ):
1893
+ ) -> pd.DataFrame:
1791
1894
  """Uses a FASTA protein database to add protein site annotation columns.
1792
1895
 
1793
1896
  Adds the columns "Modified residue", which corresponds to the amino acid at the
@@ -1925,6 +2028,61 @@ def add_leading_proteins_annotation(
1925
2028
  return table
1926
2029
 
1927
2030
 
2031
+ def add_protein_site_identifiers(
2032
+ table: pd.DataFrame,
2033
+ protein_db: ProteinDatabase,
2034
+ site_column: str,
2035
+ protein_name_column: str,
2036
+ ):
2037
+ """Adds a "Protein site identifier" column to the 'table'.
2038
+
2039
+ The "Protein site identifier" is generated by concatenating the protein name
2040
+ with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
2041
+ or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
2042
+ the position of the site. If the protein name is not available, the
2043
+ "Representative protein" entry is used instead.
2044
+
2045
+ Args:
2046
+ table: Dataframe to which the protein site identifiers are added.
2047
+ protein_db: A protein database containing entries from one or multiple FASTA
2048
+ files. Protein identifiers in the 'table' column "Representative protein"
2049
+ are used to look up entries in the 'protein_db'.
2050
+ site_column: Column in 'table' that contains protein site positions. Positions
2051
+ are one-indexed, meaning the first amino acid of the protein is position 1.
2052
+ Multiple sites in a single entry should be separated by ";".
2053
+ protein_name_column: Column in 'table' that contains protein names, which will
2054
+ be used to generate the identifier. If no name is available, the accession
2055
+ is used instead.
2056
+
2057
+ Raises:
2058
+ ValueError: If the "Representative protein", 'protein_name_column' or
2059
+ 'site_column' is not found in the 'table'.
2060
+ """
2061
+ if site_column not in table.columns:
2062
+ raise ValueError(f"Column '{site_column}' not found in the table.")
2063
+ if protein_name_column not in table.columns:
2064
+ raise ValueError(f"Column '{protein_name_column}' not found in the table.")
2065
+ if "Representative protein" not in table.columns:
2066
+ raise ValueError("Column 'Representative protein' not found in the table.")
2067
+
2068
+ site_identifiers = []
2069
+ for accession, sites, name in zip(
2070
+ table["Representative protein"],
2071
+ table[site_column].astype(str),
2072
+ table[protein_name_column],
2073
+ ):
2074
+ protein_sequence = protein_db[accession].sequence
2075
+ protein_identifier = name if name else accession
2076
+ aa_sites = []
2077
+ for site in sites.split(";"):
2078
+ aa = protein_sequence[int(site) - 1]
2079
+ aa_sites.append(f"{aa}{site}")
2080
+ aa_site_tag = " / ".join(aa_sites)
2081
+ site_identifier = f"{protein_identifier} - {aa_site_tag}"
2082
+ site_identifiers.append(site_identifier)
2083
+ table["Protein site identifier"] = site_identifiers
2084
+
2085
+
1928
2086
  def add_sequence_coverage(
1929
2087
  protein_table: pd.DataFrame,
1930
2088
  peptide_table: pd.DataFrame,
@@ -2384,7 +2542,9 @@ def _extract_fragpipe_assigned_modifications(
2384
2542
  return modifications
2385
2543
 
2386
2544
 
2387
- def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
2545
+ def extract_maxquant_localization_probabilities(
2546
+ localization_entry: str,
2547
+ ) -> dict[int, float]:
2388
2548
  """Extract localization probabilities from a MaxQuant "Probabilities" entry.
2389
2549
 
2390
2550
  Args:
@@ -2441,6 +2601,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
2441
2601
  return modification_probabilities
2442
2602
 
2443
2603
 
2604
+ def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
2605
+ """Extract localization probabilities from a Spectronaut localization entry.
2606
+
2607
+ Args:
2608
+ localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
2609
+ Spectronaut elution group (EG) output table.
2610
+
2611
+ Returns:
2612
+ A dictionary of modifications containing a dictionary of {position: probability}
2613
+ mappings. Positions are one-indexed, which means that the first amino acid
2614
+ position is 1.
2615
+
2616
+ Example:
2617
+ >>> extract_spectronaut_localization_probabilities(
2618
+ ... "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
2619
+ ... )
2620
+ {'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
2621
+ """
2622
+ modification_probabilities: dict[str, dict[int, float]] = {}
2623
+ localization_entry = localization_entry.strip("_")
2624
+ _, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
2625
+ localization_entry, "[", "]"
2626
+ )
2627
+
2628
+ for site, mod_probability_entry in raw_probability_entries:
2629
+ modification, probability_entry = mod_probability_entry.split(": ")
2630
+ if modification not in modification_probabilities:
2631
+ modification_probabilities[modification] = {}
2632
+ probability = float(probability_entry.replace("%", "")) / 100.0
2633
+ modification_probabilities[modification][site] = probability
2634
+ return modification_probabilities
2635
+
2636
+
2444
2637
  def _extract_protein_ids(entries: list[str]) -> list[str]:
2445
2638
  """Returns a list of protein IDs, extracted from protein entries.
2446
2639
 
@@ -2554,8 +2747,8 @@ def _create_multi_protein_annotations_from_db(
2554
2747
  query_result.append(query_function(db_entry, default_value))
2555
2748
  else:
2556
2749
  query_result.append(default_value)
2557
- query_result = ";".join(map(str, query_result))
2558
- annotation_values.append(query_result)
2750
+ annotation_value = ";".join(map(str, query_result))
2751
+ annotation_values.append(annotation_value)
2559
2752
  return annotation_values
2560
2753
 
2561
2754
 
@@ -1,4 +1,4 @@
1
- """Python interface to custome R scripts."""
1
+ """Python interface to the 'limma.R' script."""
2
2
 
3
3
  import os
4
4
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msreport
3
- Version: 0.0.30
3
+ Version: 0.0.31
4
4
  Summary: Post processing and analysis of quantitative proteomics data
5
5
  Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
6
6
  License-Expression: Apache-2.0
7
7
  Project-URL: homepage, https://github.com/hollenstein/msreport
8
+ Project-URL: documentation, https://hollenstein.github.io/msreport/
8
9
  Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
9
10
  Keywords: mass spectrometry,proteomics,post processing,data analysis
10
11
  Classifier: Development Status :: 4 - Beta
@@ -33,6 +34,13 @@ Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
33
34
  Provides-Extra: dev
34
35
  Requires-Dist: mypy>=1.15.0; extra == "dev"
35
36
  Requires-Dist: pytest>=8.3.5; extra == "dev"
37
+ Provides-Extra: docs
38
+ Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
39
+ Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
40
+ Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
41
+ Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
42
+ Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
43
+ Requires-Dist: ruff>=0.12.2; extra == "docs"
36
44
  Provides-Extra: test
37
45
  Requires-Dist: pytest>=8.3.5; extra == "test"
38
46
  Dynamic: license-file
@@ -64,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
64
72
 
65
73
  The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
66
74
 
75
+ The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
76
+
67
77
  ### Key features of MsReport
68
78
 
69
79
  #### Data Import and Standardization