msreport 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/reader.py CHANGED
@@ -1,17 +1,18 @@
1
- """Module for reading result tables from various MS analysis tools and converting them
2
- to a standardized format following the MsReport convention.
1
+ """Provides tools for importing and standardizing quantitative proteomics data.
3
2
 
4
- Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
5
- for Spectronaut protein tables are supported when exported with the correct report
6
- scheme.
3
+ This module offers software-specific reader classes to import raw result tables (e.g.,
4
+ proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
5
+ Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
6
+ provides functions for annotating imported data with biological metadata, such as
7
+ protein information (e.g., sequence length, molecular weight) and peptide positions,
8
+ extracted from a ProteinDatabase (FASTA file).
7
9
 
8
- New column names:
10
+ New columns added to imported protein tables:
9
11
  - Representative protein
10
12
  - Leading proteins
11
13
  - Protein reported by software
12
14
 
13
- Unified column names:
14
- - Total peptides
15
+ Standardized column names for quantitative values (if available in the software output):
15
16
  - Spectral count "sample name"
16
17
  - Unique spectral count "sample name"
17
18
  - Total spectral count "sample name"
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
38
39
  class Protein(Protocol):
39
40
  """Abstract protein entry"""
40
41
 
42
+ # identifier: str
41
43
  header: str
42
44
  sequence: str
43
45
  header_fields: dict[str, str]
@@ -46,9 +48,9 @@ class Protein(Protocol):
46
48
  class ProteinDatabase(Protocol):
47
49
  """Abstract protein database"""
48
50
 
49
- def __getitem__(self, protein_id: str) -> Protein: ...
51
+ def __getitem__(self, identifier: str) -> Protein: ...
50
52
 
51
- def __contains__(self, protein_id: str) -> bool: ...
53
+ def __contains__(self, identifier: str) -> bool: ...
52
54
 
53
55
 
54
56
  class ResultReader:
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
497
499
  mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
498
500
  localization_string_column = "Modification localization string"
499
501
 
500
- mod_localization_probabilities = [{} for _ in range(new_df.shape[0])]
502
+ mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
503
+ {} for _ in range(new_df.shape[0])
504
+ ]
501
505
  for probability_column in mod_probability_columns:
502
506
  # FUTURE: Type should be checked and enforced during the import
503
507
  if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
@@ -541,6 +545,8 @@ class FragPipeReader(ResultReader):
541
545
  """FragPipe result reader.
542
546
 
543
547
  Methods:
548
+ import_design: Reads a "fragpipe-files.fp-manifest" file and returns a
549
+ processed design dataframe.
544
550
  import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
545
551
  returns a processed dataframe, conforming to the MsReport naming
546
552
  convention.
@@ -583,12 +589,19 @@ class FragPipeReader(ResultReader):
583
589
  "ions": "combined_ion.tsv",
584
590
  "ion_evidence": "ion.tsv",
585
591
  "psm_evidence": "psm.tsv",
592
+ "design": "fragpipe-files.fp-manifest",
586
593
  }
587
594
  isobar_filenames: dict[str, str] = {
588
595
  "proteins": "protein.tsv",
589
596
  "peptides": "peptide.tsv",
590
597
  "ions": "ion.tsv",
591
598
  }
599
+ sil_filenames: dict[str, str] = {
600
+ "proteins": "combined_protein_label_quant.tsv",
601
+ "peptides": "combined_modified_peptide_label_quant.tsv",
602
+ "ions": "combined_ion_label_quant.tsv",
603
+ }
604
+
592
605
  protected_columns: list[str] = []
593
606
  sample_column_tags: list[str] = [
594
607
  "Spectral Count",
@@ -609,6 +622,7 @@ class FragPipeReader(ResultReader):
609
622
  "Modified Sequence": "Modified sequence", # Modified peptide and ion
610
623
  "Start": "Start position", # Peptide and ion
611
624
  "End": "End position", # Peptide and ion
625
+ "Mapped Proteins": "Mapped proteins", # All PSM, ion, and peptide tables
612
626
  "Combined Total Peptides": "Total peptides", # From LFQ
613
627
  "Total Peptides": "Total peptides", # From TMT
614
628
  "Description": "Protein name",
@@ -638,7 +652,11 @@ class FragPipeReader(ResultReader):
638
652
  protein_info_tags: list[str] = []
639
653
 
640
654
  def __init__(
641
- self, directory: str, isobar: bool = False, contaminant_tag: str = "contam_"
655
+ self,
656
+ directory: str,
657
+ isobar: bool = False,
658
+ sil: bool = False,
659
+ contaminant_tag: str = "contam_",
642
660
  ) -> None:
643
661
  """Initializes the FragPipeReader.
644
662
 
@@ -646,16 +664,89 @@ class FragPipeReader(ResultReader):
646
664
  directory: Location of the FragPipe result folder
647
665
  isobar: Set to True if quantification strategy was TMT, iTRAQ or similar;
648
666
  default False.
667
+ sil: Set to True if the FragPipe result files are from a stable isotope
668
+ labeling experiment, such as SILAC; default False.
649
669
  contaminant_tag: Prefix of Protein ID entries to identify contaminants;
650
670
  default "contam_".
651
671
  """
672
+ if sil and isobar:
673
+ raise ValueError("Cannot set both 'isobar' and 'sil' to True.")
652
674
  self._add_data_directory(directory)
653
675
  self._isobar: bool = isobar
676
+ self._sil: bool = sil
654
677
  self._contaminant_tag: str = contaminant_tag
655
- if not isobar:
678
+ if isobar:
679
+ self.filenames = self.isobar_filenames
680
+ elif sil:
681
+ self.filenames = self.sil_filenames
682
+ else:
656
683
  self.filenames = self.default_filenames
684
+
685
+ def import_design(
686
+ self, filename: Optional[str] = None, sort: bool = False
687
+ ) -> pd.DataFrame:
688
+ """Reads a 'fp-manifest' file and returns a processed design dataframe.
689
+
690
+ The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
691
+ design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
692
+ column is extracted as the filename from the full path. The "Sample" column is
693
+ generated by combining "Experiment" and "Replicate" with an underscore
694
+ (e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
695
+ "Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
696
+ by default.
697
+
698
+ Args:
699
+ filename: Allows specifying an alternative filename, otherwise the default
700
+ filename is used.
701
+ sort: If True, the design dataframe is sorted by "Experiment" and
702
+ "Replicate"; default False.
703
+
704
+ Returns:
705
+ A dataframe containing the processed design table with columns:
706
+ "Sample", "Experiment", "Replicate", "Rawfile".
707
+
708
+ Raises:
709
+ FileNotFoundError: If the specified manifest file does not exist.
710
+ """
711
+ if filename is None:
712
+ filepath = os.path.join(self.data_directory, self.filenames["design"])
657
713
  else:
658
- self.filenames = self.isobar_filenames
714
+ filepath = os.path.join(self.data_directory, filename)
715
+ if not os.path.exists(filepath):
716
+ raise FileNotFoundError(
717
+ f"File '{filepath}' does not exist. Please check the file path."
718
+ )
719
+ fp_manifest = (
720
+ pd.read_csv(
721
+ filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
722
+ )
723
+ .fillna("")
724
+ .astype(str)
725
+ )
726
+ fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
727
+
728
+ design = pd.DataFrame(
729
+ {
730
+ "Sample": "",
731
+ "Experiment": fp_manifest["Experiment"],
732
+ "Replicate": fp_manifest["Bioreplicate"],
733
+ "Rawfile": fp_manifest["Path"].apply(
734
+ # Required to handle Windows and Unix style paths on either system
735
+ lambda x: x.replace("\\", "/").split("/")[-1]
736
+ ),
737
+ }
738
+ )
739
+ # FragPipe uses "exp" for missing 'Experiment' values
740
+ design.loc[design["Experiment"] == "", "Experiment"] = "exp"
741
+ # FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
742
+ # 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
743
+ design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
744
+ design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
745
+
746
+ if sort:
747
+ design.sort_values(by=["Experiment", "Replicate"], inplace=True)
748
+ design.reset_index(drop=True, inplace=True)
749
+ return design
659
750
 
660
751
  def import_proteins(
661
752
  self,
@@ -737,6 +828,7 @@ class FragPipeReader(ResultReader):
737
828
  df = self._read_file("peptides" if filename is None else filename)
738
829
  df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
739
830
  df["Representative protein"] = df["Protein reported by software"]
831
+ df["Mapped Proteins"] = self._collect_mapped_proteins(df)
740
832
  # Note that _add_protein_entries would need to be adapted for the peptide table.
741
833
  # df = self._add_protein_entries(df)
742
834
  if rename_columns:
@@ -793,6 +885,8 @@ class FragPipeReader(ResultReader):
793
885
  # 'Indistinguishable Proteins' to the ion table.
794
886
  df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
795
887
  df["Representative protein"] = df["Protein reported by software"]
888
+ df["Mapped Proteins"] = self._collect_mapped_proteins(df)
889
+
796
890
  if rename_columns:
797
891
  df = self._rename_columns(df, prefix_column_tags)
798
892
  if rewrite_modifications and rename_columns:
@@ -879,6 +973,8 @@ class FragPipeReader(ResultReader):
879
973
  # 'Indistinguishable Proteins' to the ion table.
880
974
  df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
881
975
  df["Representative protein"] = df["Protein reported by software"]
976
+ df["Mapped Proteins"] = self._collect_mapped_proteins(df)
977
+
882
978
  if rename_columns:
883
979
  df = self._rename_columns(df, prefix_column_tags)
884
980
  if rewrite_modifications and rename_columns:
@@ -891,7 +987,7 @@ class FragPipeReader(ResultReader):
891
987
  filename: Optional[str] = None,
892
988
  rename_columns: bool = True,
893
989
  rewrite_modifications: bool = True,
894
- ):
990
+ ) -> pd.DataFrame:
895
991
  """Concatenate all "psm.tsv" files and return a processed dataframe.
896
992
 
897
993
  Args:
@@ -928,23 +1024,7 @@ class FragPipeReader(ResultReader):
928
1024
 
929
1025
  df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
930
1026
  df["Representative protein"] = df["Protein reported by software"]
931
- df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")
932
-
933
- # FP only lists additional mapped proteins in the "Mapped Proteins" column
934
- # MsReport reports all matching proteins in the "Mapped proteins" column
935
- mapped_proteins_entries = []
936
- for protein, mapped_protein_fp in zip(
937
- df["Representative protein"], df["Mapped Proteins"], strict=True
938
- ):
939
- if mapped_protein_fp == "":
940
- mapped_proteins = [protein]
941
- else:
942
- additional_mapped_proteins = msreport.reader._extract_protein_ids(
943
- mapped_protein_fp.split(", ")
944
- )
945
- mapped_proteins = [protein] + additional_mapped_proteins
946
- mapped_proteins_entries.append(";".join(mapped_proteins))
947
- df["Mapped proteins"] = mapped_proteins_entries
1027
+ df["Mapped Proteins"] = self._collect_mapped_proteins(df)
948
1028
 
949
1029
  if rename_columns:
950
1030
  df = self._rename_columns(df, prefix_tag=True)
@@ -980,6 +1060,35 @@ class FragPipeReader(ResultReader):
980
1060
  df[key] = protein_entry_table[key]
981
1061
  return df
982
1062
 
1063
+ def _collect_mapped_proteins(self, df: pd.DataFrame) -> list[str]:
1064
+ """Generates a list of mapped proteins entries.
1065
+
1066
+ This method extracts protein IDs from the 'Representative protein' and the
1067
+ 'Mapped Proteins' column and combines them into a single string for each row,
1068
+ where multiple protein IDs are separated by semicolons.
1069
+
1070
+ Args:
1071
+ df: DataFrame containing the 'Representative protein' and 'Mapped Proteins' columns.
1072
+
1073
+ Returns:
1074
+ A list of mapped proteins entries.
1075
+ """
1076
+ mapped_proteins_entries = []
1077
+ for protein, mapped_protein_fp in zip(
1078
+ df["Representative protein"],
1079
+ df["Mapped Proteins"].astype(str).replace("nan", ""),
1080
+ strict=True,
1081
+ ):
1082
+ if mapped_protein_fp == "":
1083
+ mapped_proteins = [protein]
1084
+ else:
1085
+ additional_mapped_proteins = msreport.reader._extract_protein_ids(
1086
+ mapped_protein_fp.split(", ")
1087
+ )
1088
+ mapped_proteins = [protein] + additional_mapped_proteins
1089
+ mapped_proteins_entries.append(";".join(mapped_proteins))
1090
+ return mapped_proteins_entries
1091
+
983
1092
  def _collect_leading_protein_entries(self, df: pd.DataFrame) -> list[list[str]]:
984
1093
  """Generates a list of leading protein entries.
985
1094
 
@@ -995,6 +1104,9 @@ class FragPipeReader(ResultReader):
995
1104
  A list of the same length as the input dataframe. Each position contains a
996
1105
  list of leading protein entries, with a minimum of one entry.
997
1106
  """
1107
+ if self._sil: # No "Indistinguishable Proteins" columns in 'SIL' data
1108
+ return [[p] for p in df["Protein"]]
1109
+
998
1110
  leading_protein_entries = []
999
1111
  for protein_entry, indist_protein_entry in zip(
1000
1112
  df["Protein"], df["Indistinguishable Proteins"].fillna("").astype(str)
@@ -1411,6 +1523,7 @@ class SpectronautReader(ResultReader):
1411
1523
  filename: Optional[str] = None,
1412
1524
  filetag: Optional[str] = None,
1413
1525
  rename_columns: bool = True,
1526
+ rewrite_modifications: bool = True,
1414
1527
  ) -> pd.DataFrame:
1415
1528
  """Reads an ion evidence file (long format) and returns a processed dataframe.
1416
1529
 
@@ -1420,8 +1533,15 @@ class SpectronautReader(ResultReader):
1420
1533
  generated by concatenating the "Modified sequence" and "Charge" columns, and if
1421
1534
  present, the "Compensation voltage" column.
1422
1535
 
1423
- (!) Note that the modified sequence and modification localization probabilities
1424
- are currently not processed.
1536
+ "Modified sequence" entries contain modifications within square brackets.
1537
+ "Modifications" entries are strings in the form of "position:modification_tag",
1538
+ multiple modifications are joined by ";". An example for a modified sequence and
1539
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1540
+
1541
+ "Modification localization string" contains localization probabilities in the
1542
+ format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1543
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1544
+ `msreport.peptidoform.make_localization_string` for details.
1425
1545
 
1426
1546
  Args:
1427
1547
  filename: Optional, allows specifying a specific file that will be imported.
@@ -1429,6 +1549,10 @@ class SpectronautReader(ResultReader):
1429
1549
  a substring, instead of specifying a filename.
1430
1550
  rename_columns: If True, columns are renamed according to the MsReport
1431
1551
  convention; default True.
1552
+ rewrite_modifications: If True, the peptide format in "Modified sequence" is
1553
+ changed according to the MsReport convention, and a "Modifications" column is
1554
+ added that contains the amino acid position for all modifications.
1555
+ Requires 'rename_columns' to be true. Default True.
1432
1556
 
1433
1557
  Returns:
1434
1558
  A dataframe containing the processed ion table.
@@ -1456,6 +1580,9 @@ class SpectronautReader(ResultReader):
1456
1580
  df = self._add_protein_entries(df)
1457
1581
  if rename_columns:
1458
1582
  df = self._rename_columns(df, True)
1583
+ if rewrite_modifications and rename_columns:
1584
+ df = self._add_peptide_modification_entries(df)
1585
+ df = self._add_modification_localization_string(df)
1459
1586
  df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
1460
1587
  if "Compensation voltage" in df.columns:
1461
1588
  _cv = df["Compensation voltage"].astype(str)
@@ -1509,6 +1636,70 @@ class SpectronautReader(ResultReader):
1509
1636
  leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
1510
1637
  return leading_protein_entries
1511
1638
 
1639
+ def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
1640
+ """Adds standardized "Modified sequence" and "Modifications" columns.
1641
+
1642
+ "Modified sequence" entries contain modifications within square brackets.
1643
+ "Modifications" entries are strings in the form of "position:modification_text",
1644
+ multiple modifications are joined by ";". An example for a modified sequence and
1645
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1646
+
1647
+ Requires the columns "Peptide sequence" and "Modified sequence" from the
1648
+ software output.
1649
+
1650
+ Args:
1651
+ df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
1652
+
1653
+ Returns:
1654
+ A copy of the input dataframe with updated columns.
1655
+ """
1656
+ # TODO: not tested
1657
+ mod_sequences = df["Modified sequence"].str[1:-1] # Remove surrounding "_"
1658
+ mod_entries = _generate_modification_entries(
1659
+ df["Peptide sequence"], mod_sequences, "[", "]"
1660
+ )
1661
+ new_df = df.copy()
1662
+ new_df["Modified sequence"] = mod_entries["Modified sequence"]
1663
+ new_df["Modifications"] = mod_entries["Modifications"]
1664
+ return new_df
1665
+
1666
+ def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
1667
+ """Adds modification localization string columns.
1668
+
1669
+ Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
1670
+ column, converts them into the standardized modification localization string
1671
+ format used by msreport, and adds the new column "Modification localization string".
1672
+
1673
+ Probabilities are written in the format
1674
+ "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1675
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1676
+ `msreport.peptidoform.make_localization_string` for details.
1677
+
1678
+ Args:
1679
+ df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
1680
+
1681
+ Returns:
1682
+ A copy of the input dataframe with the added column
1683
+ "Modification localization string".
1684
+ """
1685
+ # TODO: not tested
1686
+ new_df = df.copy()
1687
+ localization_strings = []
1688
+ for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
1689
+ if localization_entry == "":
1690
+ localization_strings.append("")
1691
+ continue
1692
+
1693
+ localization_probabilities = extract_spectronaut_localization_probabilities(
1694
+ localization_entry
1695
+ )
1696
+ localization_string = msreport.peptidoform.make_localization_string(
1697
+ localization_probabilities
1698
+ )
1699
+ localization_strings.append(localization_string)
1700
+ new_df["Modification localization string"] = localization_strings
1701
+ return new_df
1702
+
1512
1703
 
1513
1704
  def sort_leading_proteins(
1514
1705
  table: pd.DataFrame,
@@ -1551,7 +1742,7 @@ def sort_leading_proteins(
1551
1742
  db_origins_present = "Leading proteins database origin" in table
1552
1743
 
1553
1744
  if database_order is not None:
1554
- database_encoding = defaultdict(lambda: 999)
1745
+ database_encoding: dict[str, int] = defaultdict(lambda: 999)
1555
1746
  database_encoding.update({db: i for i, db in enumerate(database_order)})
1556
1747
  if penalize_contaminants is not None:
1557
1748
  contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
@@ -1559,7 +1750,7 @@ def sort_leading_proteins(
1559
1750
  for _, row in table.iterrows():
1560
1751
  protein_ids = row["Leading proteins"].split(";")
1561
1752
 
1562
- sorting_info = [[] for _ in protein_ids]
1753
+ sorting_info: list[list] = [[] for _ in protein_ids]
1563
1754
  if special_proteins is not None:
1564
1755
  for i, _id in enumerate(protein_ids):
1565
1756
  sorting_info[i].append(_id not in special_proteins)
@@ -1699,7 +1890,7 @@ def add_protein_site_annotation(
1699
1890
  protein_db: ProteinDatabase,
1700
1891
  protein_column: str = "Representative protein",
1701
1892
  site_column: str = "Protein site",
1702
- ):
1893
+ ) -> pd.DataFrame:
1703
1894
  """Uses a FASTA protein database to add protein site annotation columns.
1704
1895
 
1705
1896
  Adds the columns "Modified residue", which corresponds to the amino acid at the
@@ -1837,6 +2028,61 @@ def add_leading_proteins_annotation(
1837
2028
  return table
1838
2029
 
1839
2030
 
2031
+ def add_protein_site_identifiers(
2032
+ table: pd.DataFrame,
2033
+ protein_db: ProteinDatabase,
2034
+ site_column: str,
2035
+ protein_name_column: str,
2036
+ ):
2037
+ """Adds a "Protein site identifier" column to the 'table'.
2038
+
2039
+ The "Protein site identifier" is generated by concatenating the protein name
2040
+ with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
2041
+ or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
2042
+ the position of the site. If the protein name is not available, the
2043
+ "Representative protein" entry is used instead.
2044
+
2045
+ Args:
2046
+ table: Dataframe to which the protein site identifiers are added.
2047
+ protein_db: A protein database containing entries from one or multiple FASTA
2048
+ files. Protein identifiers in the 'table' column "Representative protein"
2049
+ are used to look up entries in the 'protein_db'.
2050
+ site_column: Column in 'table' that contains protein site positions. Positions
2051
+ are one-indexed, meaning the first amino acid of the protein is position 1.
2052
+ Multiple sites in a single entry should be separated by ";".
2053
+ protein_name_column: Column in 'table' that contains protein names, which will
2054
+ be used to generate the identifier. If no name is available, the accession
2055
+ is used instead.
2056
+
2057
+ Raises:
2058
+ ValueError: If the "Representative protein", 'protein_name_column' or
2059
+ 'site_column' is not found in the 'table'.
2060
+ """
2061
+ if site_column not in table.columns:
2062
+ raise ValueError(f"Column '{site_column}' not found in the table.")
2063
+ if protein_name_column not in table.columns:
2064
+ raise ValueError(f"Column '{protein_name_column}' not found in the table.")
2065
+ if "Representative protein" not in table.columns:
2066
+ raise ValueError("Column 'Representative protein' not found in the table.")
2067
+
2068
+ site_identifiers = []
2069
+ for accession, sites, name in zip(
2070
+ table["Representative protein"],
2071
+ table[site_column].astype(str),
2072
+ table[protein_name_column],
2073
+ ):
2074
+ protein_sequence = protein_db[accession].sequence
2075
+ protein_identifier = name if name else accession
2076
+ aa_sites = []
2077
+ for site in sites.split(";"):
2078
+ aa = protein_sequence[int(site) - 1]
2079
+ aa_sites.append(f"{aa}{site}")
2080
+ aa_site_tag = " / ".join(aa_sites)
2081
+ site_identifier = f"{protein_identifier} - {aa_site_tag}"
2082
+ site_identifiers.append(site_identifier)
2083
+ table["Protein site identifier"] = site_identifiers
2084
+
2085
+
1840
2086
  def add_sequence_coverage(
1841
2087
  protein_table: pd.DataFrame,
1842
2088
  peptide_table: pd.DataFrame,
@@ -2296,7 +2542,9 @@ def _extract_fragpipe_assigned_modifications(
2296
2542
  return modifications
2297
2543
 
2298
2544
 
2299
- def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
2545
+ def extract_maxquant_localization_probabilities(
2546
+ localization_entry: str,
2547
+ ) -> dict[int, float]:
2300
2548
  """Extract localization probabilites from a MaxQuant "Probabilities" entry.
2301
2549
 
2302
2550
  Args:
@@ -2353,6 +2601,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
2353
2601
  return modification_probabilities
2354
2602
 
2355
2603
 
2604
+ def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
2605
+ """Extract localization probabilities from a Spectronaut localization entry.
2606
+
2607
+ Args:
2608
+ localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
2609
+ Spectronaut elution group (EG) output table.
2610
+
2611
+ Returns:
2612
+ A dictionary of modifications containing a dictionary of {position: probability}
2613
+ mappings. Positions are one-indexed, which means that the first amino acid
2614
+ position is 1.
2615
+
2616
+ Example:
2617
+ >>> extract_spectronaut_localization_probabilities(
2618
+ ... "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
2619
+ ... )
2620
+ {'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
2621
+ """
2622
+ modification_probabilities: dict[str, dict[int, float]] = {}
2623
+ localization_entry = localization_entry.strip("_")
2624
+ _, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
2625
+ localization_entry, "[", "]"
2626
+ )
2627
+
2628
+ for site, mod_probability_entry in raw_probability_entries:
2629
+ modification, probability_entry = mod_probability_entry.split(": ")
2630
+ if modification not in modification_probabilities:
2631
+ modification_probabilities[modification] = {}
2632
+ probability = float(probability_entry.replace("%", "")) / 100.0
2633
+ modification_probabilities[modification][site] = probability
2634
+ return modification_probabilities
2635
+
2636
+
2356
2637
  def _extract_protein_ids(entries: list[str]) -> list[str]:
2357
2638
  """Returns a list of protein IDs, extracted from protein entries.
2358
2639
 
@@ -2466,8 +2747,8 @@ def _create_multi_protein_annotations_from_db(
2466
2747
  query_result.append(query_function(db_entry, default_value))
2467
2748
  else:
2468
2749
  query_result.append(default_value)
2469
- query_result = ";".join(map(str, query_result))
2470
- annotation_values.append(query_result)
2750
+ annotation_value = ";".join(map(str, query_result))
2751
+ annotation_values.append(annotation_value)
2471
2752
  return annotation_values
2472
2753
 
2473
2754
 
@@ -1,4 +1,4 @@
1
- """Python interface to custome R scripts."""
1
+ """Python interface to the 'limma.R' script."""
2
2
 
3
3
  import os
4
4
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msreport
3
- Version: 0.0.29
3
+ Version: 0.0.31
4
4
  Summary: Post processing and analysis of quantitative proteomics data
5
5
  Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
6
6
  License-Expression: Apache-2.0
7
7
  Project-URL: homepage, https://github.com/hollenstein/msreport
8
+ Project-URL: documentation, https://hollenstein.github.io/msreport/
8
9
  Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
9
10
  Keywords: mass spectrometry,proteomics,post processing,data analysis
10
11
  Classifier: Development Status :: 4 - Beta
@@ -29,10 +30,17 @@ Requires-Dist: seaborn>=0.12.0
29
30
  Requires-Dist: statsmodels>=0.13.2
30
31
  Requires-Dist: typing_extensions>=4
31
32
  Provides-Extra: r
32
- Requires-Dist: rpy2!=3.5.13,>=3.5.3; extra == "r"
33
+ Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
33
34
  Provides-Extra: dev
34
35
  Requires-Dist: mypy>=1.15.0; extra == "dev"
35
36
  Requires-Dist: pytest>=8.3.5; extra == "dev"
37
+ Provides-Extra: docs
38
+ Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
39
+ Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
40
+ Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
41
+ Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
42
+ Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
43
+ Requires-Dist: ruff>=0.12.2; extra == "docs"
36
44
  Provides-Extra: test
37
45
  Requires-Dist: pytest>=8.3.5; extra == "test"
38
46
  Dynamic: license-file
@@ -40,6 +48,7 @@ Dynamic: license-file
40
48
  # MsReport
41
49
 
42
50
  [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
51
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15309090.svg)](https://doi.org/10.5281/zenodo.15309090)
43
52
  ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fhollenstein%2Fmsreport%2Fmain%2Fpyproject.toml)
44
53
  [![Run tests](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml/badge.svg)](https://github.com/hollenstein/msreport/actions/workflows/run-tests.yml)
45
54
 
@@ -55,6 +64,7 @@ bottom-up mass spectrometry experiments.
55
64
  - [Additional requirements](#additional-requirements)
56
65
  - [Optional Dependencies](#optional-dependencies)
57
66
  - [Development status](#development-status)
67
+ - [How to cite](#how-to-cite)
58
68
 
59
69
  ## What is MsReport?
60
70
 
@@ -62,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
62
72
 
63
73
  The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
64
74
 
75
+ The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
76
+
65
77
  ### Key features of MsReport
66
78
 
67
79
  #### Data Import and Standardization
@@ -134,3 +146,9 @@ For example, the R home directory might look like this on Windows: `C:\Program F
134
146
  ## Development status
135
147
 
136
148
  MsReport is a stable and reliable library that has been used on a daily basis for over two years in the Mass Spectrometry Facility at the Max Perutz Labs and the Mass Spectrometry Facility of IMP/IMBA/GMI. While the current interface of MsReport is stable, the library is still under active development, with new features being added regularly. Please note that a major rewrite is planned, which may introduce changes to the API in the future.
149
+
150
+ ## How to cite
151
+
152
+ If you use MsReport for your research or publications, please include the following citation and consider giving the project a star on GitHub.
153
+
154
+ > Hollenstein, D. M., & Hartl, M. (2025). hollenstein/msreport: v0.0.29 (0.0.29). Zenodo. https://doi.org/10.5281/zenodo.15309090