msreport 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/reader.py CHANGED
@@ -1,17 +1,18 @@
1
- """Module for reading result tables from various MS analysis tools and converting them
2
- to a standardized format following the MsReport convention.
1
+ """Provides tools for importing and standardizing quantitative proteomics data.
3
2
 
4
- Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
5
- for Spectronaut protein tables are supported when exported with the correct report
6
- scheme.
3
+ This module offers software-specific reader classes to import raw result tables (e.g.,
4
+ proteins, peptides, ions) from various proteomics software (MaxQuant, FragPipe,
5
+ Spectronaut) and convert them into a standardized `msreport` format. Additionally, it
6
+ provides functions for annotating imported data with biological metadata, such as
7
+ protein information (e.g., sequence length, molecular weight) and peptide positions,
8
+ extracted from a ProteinDatabase (FASTA file).
7
9
 
8
- New column names:
10
+ New columns added to imported protein tables:
9
11
  - Representative protein
10
12
  - Leading proteins
11
13
  - Protein reported by software
12
14
 
13
- Unified column names:
14
- - Total peptides
15
+ Standardized column names for quantitative values (if available in the software output):
15
16
  - Spectral count "sample name"
16
17
  - Unique spectral count "sample name"
17
18
  - Total spectral count "sample name"
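For orientation, a minimal usage sketch of the import workflow described above (the results path is hypothetical, and it assumes the FragPipe output directory is passed to the reader constructor):

    # Hypothetical path; a sketch of the documented reader workflow, not a fixed recipe.
    import msreport.reader

    reader = msreport.reader.FragPipeReader("/path/to/fragpipe_results")
    design = reader.import_design()      # sample / experiment / replicate table
    proteins = reader.import_proteins()  # protein table with standardized column names
    print(proteins[["Representative protein", "Leading proteins"]].head())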
@@ -38,6 +39,7 @@ from msreport.helper.temp import extract_window_around_position
38
39
  class Protein(Protocol):
39
40
  """Abstract protein entry"""
40
41
 
42
+ # identifier: str
41
43
  header: str
42
44
  sequence: str
43
45
  header_fields: dict[str, str]
@@ -46,9 +48,9 @@ class Protein(Protocol):
46
48
  class ProteinDatabase(Protocol):
47
49
  """Abstract protein database"""
48
50
 
49
- def __getitem__(self, protein_id: str) -> Protein: ...
51
+ def __getitem__(self, identifier: str) -> Protein: ...
50
52
 
51
- def __contains__(self, protein_id: str) -> bool: ...
53
+ def __contains__(self, identifier: str) -> bool: ...
52
54
 
53
55
 
54
56
  class ResultReader:
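For illustration, a minimal in-memory stand-in that would satisfy the Protein and ProteinDatabase protocols above (entirely hypothetical; in practice the database is built from FASTA files):

    # Hypothetical dict-backed database that structurally matches the protocols.
    from dataclasses import dataclass, field

    @dataclass
    class FakeProtein:
        header: str
        sequence: str
        header_fields: dict[str, str] = field(default_factory=dict)

    class FakeProteinDatabase:
        def __init__(self, proteins: dict[str, FakeProtein]):
            self._proteins = proteins

        def __getitem__(self, identifier: str) -> FakeProtein:
            return self._proteins[identifier]

        def __contains__(self, identifier: str) -> bool:
            return identifier in self._proteins

    db = FakeProteinDatabase(
        {"P12345": FakeProtein(header="sp|P12345|TEST", sequence="MKTSAPEPTIDE")}
    )
    assert "P12345" in db and db["P12345"].sequence.startswith("MKT")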
@@ -497,7 +499,9 @@ class MaxQuantReader(ResultReader):
497
499
  mod_probability_columns = msreport.helper.find_columns(new_df, "Probabilities")
498
500
  localization_string_column = "Modification localization string"
499
501
 
500
- mod_localization_probabilities = [{} for _ in range(new_df.shape[0])]
502
+ mod_localization_probabilities: list[dict[str, dict[int, float]]] = [
503
+ {} for _ in range(new_df.shape[0])
504
+ ]
501
505
  for probability_column in mod_probability_columns:
502
506
  # FUTURE: Type should be checked and enforced during the import
503
507
  if not pd.api.types.is_string_dtype(new_df[probability_column].dtype):
@@ -541,7 +545,12 @@ class FragPipeReader(ResultReader):
541
545
  """FragPipe result reader.
542
546
 
543
547
  Methods:
544
- import_design: Reads a "fragpipe-files.fp-manifest" file and returns a
548
+ import_design: Depending on the quantification strategy, imports either the
549
+ manifest file or the experiment annotation file and returns a processed
550
+ design dataframe.
551
+ import_manifest: Reads a "fragpipe-files.fp-manifest" file and returns a
552
+ processed design dataframe.
553
+ import_experiment_annotation: Reads an "experiment_annotation" file and returns a
545
554
  processed design dataframe.
546
555
  import_proteins: Reads a "combined_protein.tsv" or "protein.tsv" file and
547
556
  returns a processed dataframe, conforming to the MsReport naming
@@ -585,12 +594,8 @@ class FragPipeReader(ResultReader):
585
594
  "ions": "combined_ion.tsv",
586
595
  "ion_evidence": "ion.tsv",
587
596
  "psm_evidence": "psm.tsv",
588
- "design": "fragpipe-files.fp-manifest",
589
- }
590
- isobar_filenames: dict[str, str] = {
591
- "proteins": "protein.tsv",
592
- "peptides": "peptide.tsv",
593
- "ions": "ion.tsv",
597
+ "manifest": "fragpipe-files.fp-manifest",
598
+ "experiment_annotation": "experiment_annotation.tsv",
594
599
  }
595
600
  sil_filenames: dict[str, str] = {
596
601
  "proteins": "combined_protein_label_quant.tsv",
@@ -671,17 +676,38 @@ class FragPipeReader(ResultReader):
671
676
  self._isobar: bool = isobar
672
677
  self._sil: bool = sil
673
678
  self._contaminant_tag: str = contaminant_tag
674
- if isobar:
675
- self.filenames = self.isobar_filenames
676
- elif sil:
677
- self.filenames = self.sil_filenames
679
+
680
+ self.filenames = self.default_filenames.copy()
681
+ if sil:
682
+ self.filenames.update(self.sil_filenames)
683
+
684
+ def import_design(self, sort: bool = False) -> pd.DataFrame:
685
+ """Reads the experimental design file and returns a processed design dataframe.
686
+
687
+ Depending on the quantification strategy (isobaric or label-free/sil), either
688
+ the experiment annotation file or the manifest file is imported.
689
+
690
+ Args:
691
+ sort: If True, the design dataframe is sorted by "Experiment" and
692
+ "Replicate"; default False.
693
+ """
694
+ if self._isobar:
695
+ return self.import_experiment_annotation(sort=sort)
678
696
  else:
679
- self.filenames = self.default_filenames
697
+ return self.import_manifest(sort=sort)
680
698
 
681
- def import_design(
699
+ def import_manifest(
682
700
  self, filename: Optional[str] = None, sort: bool = False
683
701
  ) -> pd.DataFrame:
684
- """Reads a 'fp-manifest' file and returns a processed design dataframe.
702
+ """Read a 'fp-manifest' file and returns a processed design dataframe.
703
+
704
+ The manifest columns "Path", "Experiment", and "Bioreplicate" are mapped to the
705
+ design table columns "Rawfile", "Experiment", and "Replicate". The "Rawfile"
706
+ column is extracted as the filename from the full path. The "Sample" column is
707
+ generated by combining "Experiment" and "Replicate" with an underscore
708
+ (e.g., "Experiment_Replicate"), except when "Replicate" is empty, in which case
709
+ "Sample" is set to "Experiment". If "Experiment" is missing, it is set to "exp"
710
+ by default.
685
711
 
686
712
  Args:
687
713
  filename: Allows specifying an alternative filename, otherwise the default
@@ -697,19 +723,25 @@ class FragPipeReader(ResultReader):
697
723
  FileNotFoundError: If the specified manifest file does not exist.
698
724
  """
699
725
  if filename is None:
700
- filepath = os.path.join(self.data_directory, self.filenames["design"])
726
+ filepath = os.path.join(self.data_directory, self.filenames["manifest"])
701
727
  else:
702
728
  filepath = os.path.join(self.data_directory, filename)
703
729
  if not os.path.exists(filepath):
704
730
  raise FileNotFoundError(
705
731
  f"File '{filepath}' does not exist. Please check the file path."
706
732
  )
707
- fp_manifest = pd.read_csv(filepath, sep="\t", header=None, dtype=str)
733
+ fp_manifest = (
734
+ pd.read_csv(
735
+ filepath, sep="\t", header=None, na_values=[""], keep_default_na=False
736
+ )
737
+ .fillna("")
738
+ .astype(str)
739
+ )
708
740
  fp_manifest.columns = ["Path", "Experiment", "Bioreplicate", "Data type"]
709
741
 
710
742
  design = pd.DataFrame(
711
743
  {
712
- "Sample": fp_manifest["Experiment"] + "_" + fp_manifest["Bioreplicate"],
744
+ "Sample": "",
713
745
  "Experiment": fp_manifest["Experiment"],
714
746
  "Replicate": fp_manifest["Bioreplicate"],
715
747
  "Rawfile": fp_manifest["Path"].apply(
@@ -718,10 +750,73 @@ class FragPipeReader(ResultReader):
718
750
  ),
719
751
  }
720
752
  )
753
+ # FragPipe uses "exp" for missing 'Experiment' values
754
+ design.loc[design["Experiment"] == "", "Experiment"] = "exp"
755
+ # FragPipe combines 'Experiment' + "_" + 'Replicate' into 'Sample', except when
756
+ # 'Replicate' is empty, in which case 'Sample' is set to 'Experiment'.
757
+ design["Sample"] = design["Experiment"] + "_" + design["Replicate"]
758
+ design.loc[design["Replicate"] == "", "Sample"] = design["Experiment"]
759
+
760
+ if sort:
761
+ design.sort_values(by=["Experiment", "Replicate"], inplace=True)
762
+ design.reset_index(drop=True, inplace=True)
763
+ return design
764
+
765
+ def import_experiment_annotation(
766
+ self, filename: Optional[str] = None, sort: bool = False
767
+ ) -> pd.DataFrame:
768
+ """Read a 'experiment_annotation' file and returns a processed design dataframe.
769
+
770
+ The annotation columns "sample", "channel", and "plex" are mapped to the design
771
+ table columns "Sample", "Channel", and "Plex". The "Experiment" and "Replicate"
772
+ columns are extracted from the "Sample" column by splitting at the last
773
+ underscore; if there is no underscore, "Replicate" is set to an empty string.
774
+
775
+ Note that splitting the "Sample" column this way follows the FragPipe
776
+ convention, but FragPipe does not enforce it for the experiment
777
+ annotation file.
778
+
779
+ Args:
780
+ filename: Allows specifying an alternative filename, otherwise the default
781
+ filename is used.
782
+ sort: If True, the design dataframe is sorted by "Experiment" and
783
+ "Replicate"; default False.
784
+
785
+ Returns:
786
+ A dataframe containing the processed design table with columns:
787
+ "Sample", "Experiment", "Replicate", "Channel", and "Plex".
788
+
789
+ Raises:
790
+ FileNotFoundError: If the specified experiment annotation file does not exist.
791
+ """
792
+ if filename is None:
793
+ filepath = os.path.join(
794
+ self.data_directory, self.filenames["experiment_annotation"]
795
+ )
796
+ else:
797
+ filepath = os.path.join(self.data_directory, filename)
798
+ if not os.path.exists(filepath):
799
+ raise FileNotFoundError(
800
+ f"File '{filepath}' does not exist. Please check the file path."
801
+ )
802
+
803
+ annotation = pd.read_csv(filepath, sep="\t")
804
+
805
+ design = pd.DataFrame(
806
+ {
807
+ "Sample": annotation["sample"],
808
+ "Experiment": annotation["sample"].str.rsplit("_", n=1).str[0],
809
+ "Replicate": annotation["sample"].str.rsplit("_", n=1).str[1],
810
+ "Channel": annotation["channel"],
811
+ "Plex": annotation["plex"],
812
+ }
813
+ )
814
+ design["Replicate"] = design["Replicate"].fillna("")
721
815
 
722
816
  if sort:
723
817
  design.sort_values(by=["Experiment", "Replicate"], inplace=True)
724
818
  design.reset_index(drop=True, inplace=True)
819
+
725
820
  return design
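A stand-alone sketch of the sample-splitting convention used by import_experiment_annotation (sample names are made up):

    # Splits 'sample' at the last underscore into Experiment and Replicate;
    # a sample without an underscore yields an empty Replicate.
    import pandas as pd

    samples = pd.Series(["CondA_1", "CondA_2", "Pool"])
    experiment = samples.str.rsplit("_", n=1).str[0]
    replicate = samples.str.rsplit("_", n=1).str[1].fillna("")
    print(list(zip(experiment, replicate)))
    # [('CondA', '1'), ('CondA', '2'), ('Pool', '')]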
726
821
 
727
822
  def import_proteins(
@@ -963,7 +1058,7 @@ class FragPipeReader(ResultReader):
963
1058
  filename: Optional[str] = None,
964
1059
  rename_columns: bool = True,
965
1060
  rewrite_modifications: bool = True,
966
- ):
1061
+ ) -> pd.DataFrame:
967
1062
  """Concatenate all "psm.tsv" files and return a processed dataframe.
968
1063
 
969
1064
  Args:
@@ -1010,6 +1105,7 @@ class FragPipeReader(ResultReader):
1010
1105
  )
1011
1106
  df["Modified sequence"] = mod_entries["Modified sequence"]
1012
1107
  df["Modifications"] = mod_entries["Modifications"]
1108
+ df = self._add_modification_localization_string_to_psm_evidence(df)
1013
1109
  return df
1014
1110
 
1015
1111
  def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -1183,6 +1279,66 @@ class FragPipeReader(ResultReader):
1183
1279
  new_df[new_column] = localization_strings
1184
1280
  return new_df
1185
1281
 
1282
+ def _add_modification_localization_string_to_psm_evidence(
1283
+ self, df: pd.DataFrame
1284
+ ) -> pd.DataFrame:
1285
+ """Adds a modification localization string column to a PSM evidence table.
1286
+
1287
+ Extracts localization probabilities from all columns in the form
1288
+ f"{aa:modification}", converts them into the standardized modification
1289
+ localization string format used by msreport, and adds a new column
1290
+ "Modification localization string".
1291
+
1292
+ Probabilities are written in the format
1293
+ "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1294
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1295
+ `msreport.peptidoform.make_localization_string` for details.
1296
+
1297
+ Args:
1298
+ df: A dataframe containing PSM tables from FragPipe.
1299
+
1300
+ Returns:
1301
+ A copy of the input dataframe with the added
1302
+ "Modification localization string" column.
1303
+ """
1304
+ new_df = df.copy()
1305
+ _search_tag = " Best Localization"
1306
+ mod_localization_columns = [
1307
+ c.strip(_search_tag) for c in new_df.columns if c.endswith(_search_tag)
1308
+ ]
1309
+ if not mod_localization_columns:
1310
+ new_df["Modification localization string"] = ""
1311
+ return new_df
1312
+
1313
+ df[mod_localization_columns] = (
1314
+ df[mod_localization_columns].astype(str).replace("nan", "")
1315
+ )
1316
+ row_mod_probabilities: list[dict[str, dict[int, float]]] = [
1317
+ {} for i in range(df.shape[0])
1318
+ ]
1319
+ for mod_localization_column in mod_localization_columns:
1320
+ modification = mod_localization_column.split(":")[1]
1321
+ for modification_probabilities, probability_sequence in zip(
1322
+ row_mod_probabilities, df[mod_localization_column]
1323
+ ):
1324
+ if not probability_sequence:
1325
+ continue
1326
+ _, probabilities = msreport.peptidoform.parse_modified_sequence(
1327
+ probability_sequence, "(", ")"
1328
+ )
1329
+ modification_probabilities[modification] = {
1330
+ site: float(probability) for site, probability in probabilities
1331
+ }
1332
+
1333
+ localization_strings = []
1334
+ for localization_probabilities in row_mod_probabilities:
1335
+ localization_string = msreport.peptidoform.make_localization_string(
1336
+ localization_probabilities
1337
+ )
1338
+ localization_strings.append(localization_string)
1339
+ new_df["Modification localization string"] = localization_strings
1340
+ return new_df
1341
+
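A stand-alone sketch of the documented localization-string format (modification masses, sites, and probabilities are made up; msreport.peptidoform.make_localization_string is the actual formatter used above):

    # Builds the documented "Mod@Site:Prob,..." string from a nested mapping.
    probabilities = {"15.9949": {11: 1.0}, "79.9663": {3: 0.2, 4: 0.8}}
    parts = []
    for modification, sites in probabilities.items():
        site_part = ",".join(f"{site}:{prob:.3f}" for site, prob in sorted(sites.items()))
        parts.append(f"{modification}@{site_part}")
    print(";".join(parts))  # 15.9949@11:1.000;79.9663@3:0.200,4:0.800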
1186
1342
 
1187
1343
  class SpectronautReader(ResultReader):
1188
1344
  """Spectronaut result reader.
@@ -1499,6 +1655,7 @@ class SpectronautReader(ResultReader):
1499
1655
  filename: Optional[str] = None,
1500
1656
  filetag: Optional[str] = None,
1501
1657
  rename_columns: bool = True,
1658
+ rewrite_modifications: bool = True,
1502
1659
  ) -> pd.DataFrame:
1503
1660
  """Reads an ion evidence file (long format) and returns a processed dataframe.
1504
1661
 
@@ -1508,8 +1665,15 @@ class SpectronautReader(ResultReader):
1508
1665
  generated by concatenating the "Modified sequence" and "Charge" columns, and if
1509
1666
  present, the "Compensation voltage" column.
1510
1667
 
1511
- (!) Note that the modified sequence and modification localization probabilities
1512
- are currently not processed.
1668
+ "Modified sequence" entries contain modifications within square brackets.
1669
+ "Modification" entries are strings in the form of "position:modification_tag",
1670
+ multiple modifications are joined by ";". An example for a modified sequence and
1671
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1672
+
1673
+ "Modification localization string" contains localization probabilities in the
1674
+ format "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1675
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1676
+ `msreport.peptidoform.make_localization_string` for details.
1513
1677
 
1514
1678
  Args:
1515
1679
  filename: Optional, allows specifying a specific file that will be imported.
@@ -1517,6 +1681,10 @@ class SpectronautReader(ResultReader):
1517
1681
  a substring, instead of specifying a filename.
1518
1682
  rename_columns: If True, columns are renamed according to the MsReport
1519
1683
  convention; default True.
1684
+ rewrite_modifications: If True, the peptide format in "Modified sequence" is
1685
+ changed according to the MsReport convention, and a "Modifications" column
1686
+ is added that contains the amino acid positions of all modifications.
1687
+ Requires 'rename_columns' to be True. Default True.
1520
1688
 
1521
1689
  Returns:
1522
1690
  A dataframe containing the processed ion table.
@@ -1544,6 +1712,9 @@ class SpectronautReader(ResultReader):
1544
1712
  df = self._add_protein_entries(df)
1545
1713
  if rename_columns:
1546
1714
  df = self._rename_columns(df, True)
1715
+ if rewrite_modifications and rename_columns:
1716
+ df = self._add_peptide_modification_entries(df)
1717
+ df = self._add_modification_localization_string(df)
1547
1718
  df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
1548
1719
  if "Compensation voltage" in df.columns:
1549
1720
  _cv = df["Compensation voltage"].astype(str)
@@ -1597,6 +1768,70 @@ class SpectronautReader(ResultReader):
1597
1768
  leading_protein_entries = df["PG.ProteinAccessions"].str.split(";").tolist()
1598
1769
  return leading_protein_entries
1599
1770
 
1771
+ def _add_peptide_modification_entries(self, df: pd.DataFrame) -> pd.DataFrame:
1772
+ """Adds standardized "Modified sequence" and "Modifications" columns.
1773
+
1774
+ "Modified sequence" entries contain modifications within square brackets.
1775
+ "Modifications" entries are strings in the form of "position:modification_text",
1776
+ with multiple modifications joined by ";". An example of a modified sequence and
1777
+ a modification entry: "PEPT[Phospho]IDO[Oxidation]", "4:Phospho;7:Oxidation".
1778
+
1779
+ Requires the columns "Peptide sequence" and "Modified sequence" from the
1780
+ software output.
1781
+
1782
+ Args:
1783
+ df: Dataframe containing "Peptide sequence" and "Modified sequence" columns.
1784
+
1785
+ Returns:
1786
+ A copy of the input dataframe with updated columns.
1787
+ """
1788
+ # TODO: not tested
1789
+ mod_sequences = df["Modified sequence"].str[1:-1] # Remove sourrounding "_"
1790
+ mod_entries = _generate_modification_entries(
1791
+ df["Peptide sequence"], mod_sequences, "[", "]"
1792
+ )
1793
+ new_df = df.copy()
1794
+ new_df["Modified sequence"] = mod_entries["Modified sequence"]
1795
+ new_df["Modifications"] = mod_entries["Modifications"]
1796
+ return new_df
1797
+
1798
+ def _add_modification_localization_string(self, df: pd.DataFrame) -> pd.DataFrame:
1799
+ """Adds modification localization string columns.
1800
+
1801
+ Extracts localization probabilities from the "EG.PTMLocalizationProbabilities"
1802
+ column, converts them into the standardized modification localization string
1803
+ format used by msreport, and adds a new column "Modification localization string".
1804
+
1805
+ Probabilities are written in the format
1806
+ "Mod1@Site1:Probability1,Site2:Probability2;Mod2@Site3:Probability3",
1807
+ e.g. "15.9949@11:1.000;79.9663@3:0.200,4:0.800". Refer to
1808
+ `msreport.peptidoform.make_localization_string` for details.
1809
+
1810
+ Args:
1811
+ df: Dataframe containing a "EG.PTMLocalizationProbabilities" column.
1812
+
1813
+ Returns:
1814
+ A copy of the input dataframe with the added column
1815
+ "Modification localization string".
1816
+ """
1817
+ # TODO: not tested
1818
+ new_df = df.copy()
1819
+ localization_strings = []
1820
+ for localization_entry in new_df["EG.PTMLocalizationProbabilities"]:
1821
+ if localization_entry == "":
1822
+ localization_strings.append("")
1823
+ continue
1824
+
1825
+ localization_probabilities = extract_spectronaut_localization_probabilities(
1826
+ localization_entry
1827
+ )
1828
+ localization_string = msreport.peptidoform.make_localization_string(
1829
+ localization_probabilities
1830
+ )
1831
+ localization_strings.append(localization_string)
1832
+ new_df["Modification localization string"] = localization_strings
1833
+ return new_df
1834
+
1600
1835
 
1601
1836
  def sort_leading_proteins(
1602
1837
  table: pd.DataFrame,
@@ -1639,7 +1874,7 @@ def sort_leading_proteins(
1639
1874
  db_origins_present = "Leading proteins database origin" in table
1640
1875
 
1641
1876
  if database_order is not None:
1642
- database_encoding = defaultdict(lambda: 999)
1877
+ database_encoding: dict[str, int] = defaultdict(lambda: 999)
1643
1878
  database_encoding.update({db: i for i, db in enumerate(database_order)})
1644
1879
  if penalize_contaminants is not None:
1645
1880
  contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
@@ -1647,7 +1882,7 @@ def sort_leading_proteins(
1647
1882
  for _, row in table.iterrows():
1648
1883
  protein_ids = row["Leading proteins"].split(";")
1649
1884
 
1650
- sorting_info = [[] for _ in protein_ids]
1885
+ sorting_info: list[list] = [[] for _ in protein_ids]
1651
1886
  if special_proteins is not None:
1652
1887
  for i, _id in enumerate(protein_ids):
1653
1888
  sorting_info[i].append(_id not in special_proteins)
@@ -1787,7 +2022,7 @@ def add_protein_site_annotation(
1787
2022
  protein_db: ProteinDatabase,
1788
2023
  protein_column: str = "Representative protein",
1789
2024
  site_column: str = "Protein site",
1790
- ):
2025
+ ) -> pd.DataFrame:
1791
2026
  """Uses a FASTA protein database to add protein site annotation columns.
1792
2027
 
1793
2028
  Adds the columns "Modified residue", which corresponds to the amino acid at the
@@ -1925,6 +2160,61 @@ def add_leading_proteins_annotation(
1925
2160
  return table
1926
2161
 
1927
2162
 
2163
+ def add_protein_site_identifiers(
2164
+ table: pd.DataFrame,
2165
+ protein_db: ProteinDatabase,
2166
+ site_column: str,
2167
+ protein_name_column: str,
2168
+ ):
2169
+ """Adds a "Protein site identifier" column to the 'table'.
2170
+
2171
+ The "Protein site identifier" is generated by concatenating the protein name
2172
+ with the amino acid and position of the protein site or sites, e.g. "P12345 - S123"
2173
+ or "P12345 - S123 / T125". The amino acid is extracted from the protein sequence at
2174
+ the position of the site. If the protein name is not available, the
2175
+ "Representative protein" entry is used instead.
2176
+
2177
+ Args:
2178
+ table: Dataframe to which the protein site identifiers are added.
2179
+ protein_db: A protein database containing entries from one or multiple FASTA
2180
+ files. Protein identifiers in the 'table' column "Representative protein"
2181
+ are used to look up entries in the 'protein_db'.
2182
+ site_column: Column in 'table' that contains protein site positions. Positions
2183
+ are one-indexed, meaning the first amino acid of the protein is position 1.
2184
+ Multiple sites in a single entry should be separated by ";".
2185
+ protein_name_column: Column in 'table' that contains protein names, which will
2186
+ be used to generate the identifier. If no name is available, the accession
2187
+ is used instead.
2188
+
2189
+ Raises:
2190
+ ValueError: If the "Representative protein", 'protein_name_column' or
2191
+ 'site_column' is not found in the 'table'.
2192
+ """
2193
+ if site_column not in table.columns:
2194
+ raise ValueError(f"Column '{site_column}' not found in the table.")
2195
+ if protein_name_column not in table.columns:
2196
+ raise ValueError(f"Column '{protein_name_column}' not found in the table.")
2197
+ if "Representative protein" not in table.columns:
2198
+ raise ValueError("Column 'Representative protein' not found in the table.")
2199
+
2200
+ site_identifiers = []
2201
+ for accession, sites, name in zip(
2202
+ table["Representative protein"],
2203
+ table[site_column].astype(str),
2204
+ table[protein_name_column],
2205
+ ):
2206
+ protein_sequence = protein_db[accession].sequence
2207
+ protein_identifier = name if name else accession
2208
+ aa_sites = []
2209
+ for site in sites.split(";"):
2210
+ aa = protein_sequence[int(site) - 1]
2211
+ aa_sites.append(f"{aa}{site}")
2212
+ aa_site_tag = " / ".join(aa_sites)
2213
+ site_identifier = f"{protein_identifier} - {aa_site_tag}"
2214
+ site_identifiers.append(site_identifier)
2215
+ table["Protein site identifier"] = site_identifiers
2216
+
2217
+
1928
2218
  def add_sequence_coverage(
1929
2219
  protein_table: pd.DataFrame,
1930
2220
  peptide_table: pd.DataFrame,
@@ -2384,7 +2674,9 @@ def _extract_fragpipe_assigned_modifications(
2384
2674
  return modifications
2385
2675
 
2386
2676
 
2387
- def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
2677
+ def extract_maxquant_localization_probabilities(
2678
+ localization_entry: str,
2679
+ ) -> dict[int, float]:
2388
2680
  """Extract localization probabilites from a MaxQuant "Probabilities" entry.
2389
2681
 
2390
2682
  Args:
@@ -2441,6 +2733,39 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
2441
2733
  return modification_probabilities
2442
2734
 
2443
2735
 
2736
+ def extract_spectronaut_localization_probabilities(localization_entry: str) -> dict:
2737
+ """Extract localization probabilites from a Spectronaut localization entry.
2738
+
2739
+ Args:
2740
+ localization_entry: Entry from the "EG.PTMLocalizationProbabilities" column of a
2741
+ Spectronaut elution group (EG) output table.
2742
+
2743
+ Returns:
2744
+ A dictionary of modifications containing a dictionary of {position: probability}
2745
+ mappings. Positions are one-indexed, which means that the first amino acid
2746
+ position is 1.
2747
+
2748
+ Example:
2749
+ >>> extract_spectronaut_localization_probabilities(
2750
+ ... "_HM[Oxidation (M): 100%]S[Phospho (STY): 45.5%]GS[Phospho (STY): 54.5%]PG_"
2751
+ ... )
2752
+ {'Oxidation (M)': {2: 1.0}, 'Phospho (STY)': {3: 0.455, 5: 0.545}}
2753
+ """
2754
+ modification_probabilities: dict[str, dict[int, float]] = {}
2755
+ localization_entry = localization_entry.strip("_")
2756
+ _, raw_probability_entries = msreport.peptidoform.parse_modified_sequence(
2757
+ localization_entry, "[", "]"
2758
+ )
2759
+
2760
+ for site, mod_probability_entry in raw_probability_entries:
2761
+ modification, probability_entry = mod_probability_entry.split(": ")
2762
+ if modification not in modification_probabilities:
2763
+ modification_probabilities[modification] = {}
2764
+ probability = float(probability_entry.replace("%", "")) / 100.0
2765
+ modification_probabilities[modification][site] = probability
2766
+ return modification_probabilities
2767
+
2768
+
2444
2769
  def _extract_protein_ids(entries: list[str]) -> list[str]:
2445
2770
  """Returns a list of protein IDs, extracted from protein entries.
2446
2771
 
@@ -2554,8 +2879,8 @@ def _create_multi_protein_annotations_from_db(
2554
2879
  query_result.append(query_function(db_entry, default_value))
2555
2880
  else:
2556
2881
  query_result.append(default_value)
2557
- query_result = ";".join(map(str, query_result))
2558
- annotation_values.append(query_result)
2882
+ annotation_value = ";".join(map(str, query_result))
2883
+ annotation_values.append(annotation_value)
2559
2884
  return annotation_values
2560
2885
 
2561
2886
 
@@ -1,4 +1,4 @@
1
- """Python interface to custome R scripts."""
1
+ """Python interface to the 'limma.R' script."""
2
2
 
3
3
  import os
4
4
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msreport
3
- Version: 0.0.30
3
+ Version: 0.0.32
4
4
  Summary: Post processing and analysis of quantitative proteomics data
5
5
  Author-email: "David M. Hollenstein" <hollenstein.david@gmail.com>
6
6
  License-Expression: Apache-2.0
7
7
  Project-URL: homepage, https://github.com/hollenstein/msreport
8
+ Project-URL: documentation, https://hollenstein.github.io/msreport/
8
9
  Project-URL: changelog, https://github.com/hollenstein/msreport/blob/main/CHANGELOG.md
9
10
  Keywords: mass spectrometry,proteomics,post processing,data analysis
10
11
  Classifier: Development Status :: 4 - Beta
@@ -33,6 +34,13 @@ Requires-Dist: rpy2<3.5.13,>=3.5.3; extra == "r"
33
34
  Provides-Extra: dev
34
35
  Requires-Dist: mypy>=1.15.0; extra == "dev"
35
36
  Requires-Dist: pytest>=8.3.5; extra == "dev"
37
+ Provides-Extra: docs
38
+ Requires-Dist: mkdocs-awesome-nav>=3.1.2; extra == "docs"
39
+ Requires-Dist: mkdocs-macros-plugin>=1.3.7; extra == "docs"
40
+ Requires-Dist: mkdocs-material>=9.6.15; extra == "docs"
41
+ Requires-Dist: mkdocs-roamlinks-plugin>=0.3.2; extra == "docs"
42
+ Requires-Dist: mkdocstrings-python>=1.16.12; extra == "docs"
43
+ Requires-Dist: ruff>=0.12.2; extra == "docs"
36
44
  Provides-Extra: test
37
45
  Requires-Dist: pytest>=8.3.5; extra == "test"
38
46
  Dynamic: license-file
@@ -64,6 +72,8 @@ MsReport is a Python library designed to simplify the post-processing and analys
64
72
 
65
73
  The library supports importing protein and peptide-level quantification results from MaxQuant, FragPipe, and Spectronaut, as well as post-translational modification (PTM) data from MaxQuant and FragPipe. MsReport provides tools for data annotation, normalization and transformation, statistical testing, and data visualization.
66
74
 
75
+ The [documentation](https://hollenstein.github.io/msreport/) provides an overview of the library's public API.
76
+
67
77
  ### Key features of MsReport
68
78
 
69
79
  #### Data Import and Standardization
@@ -0,0 +1,38 @@
1
+ msreport/__init__.py,sha256=hmq4---v9oHxQm9gidnxGryrWB8HqPfMPHaPryBS_Oc,339
2
+ msreport/analyze.py,sha256=T6ORhBYP3Qnil0r7qF5CkwS2KHUsedpU5P-0paqUmaA,33838
3
+ msreport/errors.py,sha256=X9yFxMiIOCWQdxuqBGr8L7O3vRV2KElXdX1uHbFcZMk,421
4
+ msreport/export.py,sha256=wXQfaVd5UHlGKyKdrt2UWbhzNf-VyJy2Up5qfrPzO2M,20229
5
+ msreport/fasta.py,sha256=hPz4xlkjeTV-2YCrtWMsQQJSkJSmH1ZzNZBxHI89Nqk,1489
6
+ msreport/impute.py,sha256=q21cFKnpENE4GHUPz-R5FipkvagWjX4fa31qeb8uaxc,10782
7
+ msreport/isobar.py,sha256=nh2Wem1wheqJ6wAJYm8be9FuK21c7T1k7nectJjPw7o,6729
8
+ msreport/normalize.py,sha256=73n344jBQ9u-Ube_wOxF5Svi2ltKMnBKaw8M36hEaQM,23441
9
+ msreport/peptidoform.py,sha256=mJhqoolFL6ZzwnmQkWhgJn8zIBoxv_GdYVSb-6gw37g,12615
10
+ msreport/qtable.py,sha256=RhfGdij7cIVO5JiUC-xSQkd7zV-Q8KmC94daA9JotHc,28203
11
+ msreport/reader.py,sha256=02cst1NRyBoeBaspfM67BM_KsTR9pt1NZQX49J_Wev0,131276
12
+ msreport/aggregate/__init__.py,sha256=Y5HnN9C2PRjWfq4epJAoNqyp4Pv6WQfguAcSYKIhRuw,609
13
+ msreport/aggregate/condense.py,sha256=fspY8osQfjzzehw3v4Up2QSihNiixhQpAiCiwXLIpCQ,6301
14
+ msreport/aggregate/pivot.py,sha256=Myk9QhOmQWge7MvGlFYwdD4u7pdqYaAaFZ0uxZH4d28,5491
15
+ msreport/aggregate/summarize.py,sha256=_KbSuLS3rRxIMpoIXfPyC2--5sACV9NsivbS0BPFr9o,12736
16
+ msreport/helper/__init__.py,sha256=IG4xaP_iIugqBLUpDHMj-SbD2_elL5on_V4whLIQTbM,1003
17
+ msreport/helper/calc.py,sha256=J4XltEnMrFR9IQlPtrZhyxlSTj15072huHCMA_nqQ6E,4245
18
+ msreport/helper/maxlfq.py,sha256=kFm3hRNWntM067EuoSrO_x-i5YNXphBfrvssMA3OM1g,14947
19
+ msreport/helper/table.py,sha256=x-Wo8mTENsUxc_gtF-wgOyQa9g7W2fK6tuRiEX7bda0,11430
20
+ msreport/helper/temp.py,sha256=jNulgDATf9sKXEFWMXAhjflciOZPAqlxg_7QZS7IkW8,3736
21
+ msreport/plot/__init__.py,sha256=p-oLxmZIvfC--xkjB0ka321xddW-lst19PmokJq9lTk,1457
22
+ msreport/plot/_partial_plots.py,sha256=tqZTSXEPuruMgVakaGR2tUQl5OrHgo2cROJ0S4cqkR0,5598
23
+ msreport/plot/comparison.py,sha256=Y2KOuakj-TxqdT2XNt7lnVZwimKSszvFQI-K9Pm80k8,18770
24
+ msreport/plot/distribution.py,sha256=QNFL5vG9p-vqhwEk5WcCSXa2B8u5QgySZlAQIPys0-0,10248
25
+ msreport/plot/multivariate.py,sha256=v79gcb-8s5bZVpaJn13MOmqsNA0ZvrV25JlXmHmp4WA,14046
26
+ msreport/plot/quality.py,sha256=ZZKMkghmVESjA49Qg-iukVFBoDIgI2iWLlFa7vJWX7M,15869
27
+ msreport/plot/style.py,sha256=67jWf4uA1ub9RJDu4xhuSoXAW0lbLj6SMP4QXQO76Pc,10591
28
+ msreport/plot/style_sheets/msreport-notebook.mplstyle,sha256=SPYO_7vYT8Ha7tQ0KCTLtykiRQ13-_igAm7kyvsZj1I,1266
29
+ msreport/plot/style_sheets/seaborn-whitegrid.mplstyle,sha256=eC8Zboy8R7ybBwbHPKvKbMIHACystN6X6I0lqm7B80U,833
30
+ msreport/rinterface/__init__.py,sha256=Zs6STvbDqaVZVPRM6iU0kKjq0TWz_2p2ChvNAveRdTA,616
31
+ msreport/rinterface/limma.py,sha256=P-Fs8HARSXz60rO_vLc--of1hafk_IgGgPaNXnS_aKg,5424
32
+ msreport/rinterface/rinstaller.py,sha256=AGs6NFMSwTLrzrIJz1E5BE5jFUz8eQBHlpM_MWVChzA,1370
33
+ msreport/rinterface/rscripts/limma.R,sha256=gr_yjMm_YoG45irDhWOo6gkRQSTwj_7uU_p3NBRHPm8,4331
34
+ msreport-0.0.32.dist-info/licenses/LICENSE.txt,sha256=Pd-b5cKP4n2tFDpdx27qJSIq0d1ok0oEcGTlbtL6QMU,11560
35
+ msreport-0.0.32.dist-info/METADATA,sha256=_OI-LkqJoperzDBo6KjAir7Xq6jANqyjpqyxUfu9T-4,8998
36
+ msreport-0.0.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
37
+ msreport-0.0.32.dist-info/top_level.txt,sha256=Drl8mCckJHFIw-Ovh5AnyjKnqvLJltDOBUr1JAcHAlI,9
38
+ msreport-0.0.32.dist-info/RECORD,,