msreport 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/reader.py CHANGED
@@ -1,4 +1,4 @@
1
- """ Module for reading result tables from various MS analysis tools and converting them
1
+ """Module for reading result tables from various MS analysis tools and converting them
2
2
  to a standardized format following the MsReport convention.
3
3
 
4
4
  Currently for MaxQuant and FragPipe protein, peptide, and ion tables are supported, and
@@ -20,19 +20,19 @@ Unified column names:
20
20
  - iBAQ intensity "sample name"
21
21
  """
22
22
 
23
- from collections import OrderedDict, defaultdict
24
23
  import os
25
- from typing import Any, Callable, Iterable, Optional, Protocol
26
24
  import pathlib
27
25
  import warnings
26
+ from collections import OrderedDict, defaultdict
27
+ from typing import Any, Callable, Iterable, Optional, Protocol
28
28
 
29
29
  import numpy as np
30
30
  import pandas as pd
31
31
 
32
32
  import msreport.helper as helper
33
- from msreport.helper.temp import extract_window_around_position
34
- from msreport.errors import ProteinsNotInFastaWarning
35
33
  import msreport.peptidoform
34
+ from msreport.errors import ProteinsNotInFastaWarning
35
+ from msreport.helper.temp import extract_window_around_position
36
36
 
37
37
 
38
38
  class Protein(Protocol):
@@ -54,6 +54,8 @@ class ProteinDatabase(Protocol):
54
54
  class ResultReader:
55
55
  """Base Reader class, is by itself not functional."""
56
56
 
57
+ data_directory: str
58
+ filenames: dict[str, str]
57
59
  default_filenames: dict[str, str]
58
60
  protected_columns: list[str]
59
61
  column_mapping: dict[str, str]
@@ -61,8 +63,8 @@ class ResultReader:
61
63
  sample_column_tags: list[str]
62
64
 
63
65
  def __init__(self):
64
- self.data_directory: str = ""
65
- self.filenames: dict[str, str] = {}
66
+ self.data_directory = ""
67
+ self.filenames = {}
66
68
 
67
69
  def _read_file(self, which: str, sep: str = "\t") -> pd.DataFrame:
68
70
  """Read a result table.
@@ -183,18 +185,16 @@ class MaxQuantReader(ResultReader):
183
185
  "MS/MS count",
184
186
  "Sequence coverage",
185
187
  ]
186
- column_mapping: dict[str, str] = dict(
187
- [
188
- ("Peptides", "Total peptides"),
189
- ("Sequence coverage [%]", "Sequence coverage"),
190
- ("MS/MS count", "Spectral count Combined"), # proteinGroups, evidence
191
- ("MS/MS Count", "Spectral count Combined"), # peptides
192
- ("Sequence", "Peptide sequence"), # peptides, evidence
193
- ("Sequence length", "Protein length"),
194
- ("Mol. weight [kDa]", "Molecular weight [kDa]"),
195
- ("Experiment", "Sample"),
196
- ]
197
- )
188
+ column_mapping: dict[str, str] = {
189
+ "Peptides": "Total peptides",
190
+ "Sequence coverage [%]": "Sequence coverage",
191
+ "MS/MS count": "Spectral count Combined", # proteinGroups, evidence
192
+ "MS/MS Count": "Spectral count Combined", # peptides
193
+ "Sequence": "Peptide sequence", # peptides, evidence
194
+ "Sequence length": "Protein length",
195
+ "Mol. weight [kDa]": "Molecular weight [kDa]",
196
+ "Experiment": "Sample",
197
+ }
198
198
  column_tag_mapping: OrderedDict[str, str] = OrderedDict(
199
199
  [("MS/MS count", "Spectral count"), ("iBAQ", "iBAQ intensity")]
200
200
  )
@@ -343,7 +343,9 @@ class MaxQuantReader(ResultReader):
343
343
  Adds new columns to comply with the MsReport convention. "Modified sequence",
344
344
  "Modifications columns", "Modification localization string". "Protein reported
345
345
  by software" and "Representative protein", both contain the first entry from
346
- "Leading razor protein".
346
+ "Leading razor protein". "Ion ID" contains unique entries for each ion, which
347
+ are generated by concatenating the "Modified sequence" and "Charge" columns, and
348
+ if present, the "Compensation voltage" column.
347
349
 
348
350
  "Modified sequence" entries contain modifications within square brackets.
349
351
  "Modification" entries are strings in the form of "position:modification_tag",
@@ -376,15 +378,19 @@ class MaxQuantReader(ResultReader):
376
378
  df["Leading razor protein"]
377
379
  )
378
380
  df["Representative protein"] = df["Protein reported by software"]
381
+
379
382
  if drop_decoy:
380
383
  df = self._drop_decoy(df)
381
384
  if rename_columns:
382
- df = self._rename_columns(
383
- df, True
384
- ) # Actually there are no column tags as the table is in long format
385
+ # Actually there are no column tags as the table is in long format
386
+ df = self._rename_columns(df, prefix_tag=True)
385
387
  if rewrite_modifications and rename_columns:
386
388
  df = self._add_peptide_modification_entries(df)
387
389
  df = self._add_modification_localization_string(df)
390
+ df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
391
+ if "Compensation voltage" in df.columns:
392
+ _cv = df["Compensation voltage"].astype(str)
393
+ df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
388
394
  return df
389
395
 
390
396
  def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -576,6 +582,7 @@ class FragPipeReader(ResultReader):
576
582
  "peptides": "combined_peptide.tsv",
577
583
  "ions": "combined_ion.tsv",
578
584
  "ion_evidence": "ion.tsv",
585
+ "psm_evidence": "psm.tsv",
579
586
  }
580
587
  isobar_filenames: dict[str, str] = {
581
588
  "proteins": "protein.tsv",
@@ -590,20 +597,25 @@ class FragPipeReader(ResultReader):
590
597
  "Intensity",
591
598
  "MaxLFQ Intensity",
592
599
  ]
593
- column_mapping: dict[str, str] = dict(
594
- [
595
- ("Peptide Sequence", "Peptide sequence"), # Peptide and ion
596
- ("Modified Sequence", "Modified sequence"), # Modified peptide and ion
597
- ("Start", "Start position"), # Peptide and ion
598
- ("End", "End position"), # Peptide and ion
599
- ("Combined Total Peptides", "Total peptides"), # From LFQ
600
- ("Total Peptides", "Total peptides"), # From TMT
601
- ("Description", "Protein name"),
602
- ("Protein Length", "Protein length"),
603
- ("Entry Name", "Protein entry name"),
604
- ("Gene", "Gene name"),
605
- ]
606
- )
600
+ column_mapping: dict[str, str] = {
601
+ "Peptide": "Peptide sequence", # PSM
602
+ "Modified Peptide": "Modified sequence", # PSM
603
+ "Protein Start": "Start position", # PSM
604
+ "Protein End": "End position", # PSM
605
+ "Number of Missed Cleavages": "Missed cleavage", # PSM
606
+ "PeptideProphet Probability": "Probability", # PSM
607
+ "Compensation Voltage": "Compensation voltage", # PSM and ion
608
+ "Peptide Sequence": "Peptide sequence", # Peptide and ion
609
+ "Modified Sequence": "Modified sequence", # Modified peptide and ion
610
+ "Start": "Start position", # Peptide and ion
611
+ "End": "End position", # Peptide and ion
612
+ "Combined Total Peptides": "Total peptides", # From LFQ
613
+ "Total Peptides": "Total peptides", # From TMT
614
+ "Description": "Protein name",
615
+ "Protein Length": "Protein length",
616
+ "Entry Name": "Protein entry name",
617
+ "Gene": "Gene name",
618
+ }
607
619
  column_tag_mapping: OrderedDict[str, str] = OrderedDict(
608
620
  [
609
621
  ("MaxLFQ Intensity", "LFQ intensity"),
@@ -743,7 +755,10 @@ class FragPipeReader(ResultReader):
743
755
 
744
756
  Adds new columns to comply with the MsReport convention. "Modified sequence"
745
757
  and "Modifications columns". "Protein reported by software" and "Representative
746
- protein", both contain the first entry from "Leading razor protein".
758
+ protein", both contain the first entry from "Leading razor protein". "Ion ID"
759
+ contains unique entries for each ion, which are generated by concatenating the
760
+ "Modified sequence" and "Charge" columns, and if present, the
761
+ "Compensation voltage" column.
747
762
 
748
763
  "Modified sequence" entries contain modifications within square brackets.
749
764
  "Modification" entries are strings in the form of "position:modification_text",
@@ -783,6 +798,11 @@ class FragPipeReader(ResultReader):
783
798
  if rewrite_modifications and rename_columns:
784
799
  df = self._add_peptide_modification_entries(df)
785
800
  df = self._add_modification_localization_string(df, prefix_column_tags)
801
+ df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
802
+ if "Compensation voltage" in df.columns:
803
+ _cv = df["Compensation voltage"].astype(str)
804
+ df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
805
+
786
806
  return df
787
807
 
788
808
  def import_ion_evidence(
@@ -797,7 +817,9 @@ class FragPipeReader(ResultReader):
797
817
  Adds new columns to comply with the MsReport convention. "Modified sequence",
798
818
  "Modifications", and "Modification localization string" columns. "Protein
799
819
  reported by software" and "Representative protein", both contain the first entry
800
- from "Leading razor protein".
820
+ from "Leading razor protein". "Ion ID" contains unique entries for each ion,
821
+ which are generated by concatenating the "Modified sequence" and "Charge"
822
+ columns, and if present, the "Compensation voltage" column.
801
823
 
802
824
  "Modified sequence" entries contain modifications within square brackets.
803
825
  "Modification" entries are strings in the form of "position:modification_text",
@@ -850,6 +872,9 @@ class FragPipeReader(ResultReader):
850
872
  df = pd.concat(ion_tables, ignore_index=True)
851
873
 
852
874
  # --- Process dataframe --- #
875
+ df["Ion ID"] = df["Modified Sequence"] + "_c" + df["Charge"].astype(str)
876
+ if "Compensation Voltage" in df.columns:
877
+ df["Ion ID"] = df["Ion ID"] + "_cv" + df["Compensation Voltage"].astype(str)
853
878
  # FUTURE: replace this by _add_protein_entries(df, False) if FragPipe adds
854
879
  # 'Indistinguishable Proteins' to the ion table.
855
880
  df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
@@ -861,6 +886,76 @@ class FragPipeReader(ResultReader):
861
886
  df = self._add_modification_localization_string(df, prefix_column_tags)
862
887
  return df
863
888
 
889
def import_psm_evidence(
    self,
    filename: Optional[str] = None,
    rename_columns: bool = True,
    rewrite_modifications: bool = True,
) -> pd.DataFrame:
    """Concatenate all "psm.tsv" files and return a processed dataframe.

    Each direct subdirectory of the data directory that contains a file named
    `filename` contributes one table. The name of that subdirectory is written
    to the "Sample" column of the corresponding rows.

    Adds new columns to comply with the MsReport convention. "Protein reported by
    software" and "Representative protein" are both derived from the "Protein"
    column. "Mapped proteins" contains the representative protein followed by all
    protein ids extracted from the FragPipe "Mapped Proteins" column, joined
    with ";".

    Args:
        filename: Allows specifying an alternative filename, otherwise the default
            filename is used.
        rename_columns: If True, columns are renamed according to the MsReport
            convention; default True.
        rewrite_modifications: If True, the peptide format in "Modified sequence" is
            changed according to the MsReport convention, and a "Modifications"
            column is added that contains the amino acid positions of all
            modifications. Requires 'rename_columns' to be true. Default True.

    Returns:
        A DataFrame containing the processed psm evidence tables.

    Raises:
        ValueError: If no subdirectory of the data directory contains `filename`.
    """
    if filename is None:
        filename = self.default_filenames["psm_evidence"]

    # One psm table is expected per sample subdirectory.
    psm_table_paths = []
    for path in pathlib.Path(self.data_directory).iterdir():
        psm_table_path = path / filename
        if path.is_dir() and psm_table_path.exists():
            psm_table_paths.append(psm_table_path)

    # pd.concat() would fail with an unspecific "No objects to concatenate"
    # ValueError; raise the same exception type with an actionable message.
    if not psm_table_paths:
        raise ValueError(
            f"No '{filename}' files found in the subdirectories of "
            f"'{self.data_directory}'."
        )

    psm_tables = []
    for filepath in psm_table_paths:
        table = pd.read_csv(filepath, sep="\t", low_memory=False)
        # Missing values in text columns are represented as empty strings.
        str_cols = table.select_dtypes(include=["object"]).columns
        table.loc[:, str_cols] = table.loc[:, str_cols].fillna("")

        table["Sample"] = filepath.parent.name
        psm_tables.append(table)
    df = pd.concat(psm_tables, ignore_index=True)

    df["Protein reported by software"] = _extract_protein_ids(df["Protein"])
    df["Representative protein"] = df["Protein reported by software"]
    # A column without any entry is not of dtype object and thus still contains
    # NaN at this point, which astype(str) turns into the string "nan".
    df["Mapped Proteins"] = df["Mapped Proteins"].astype(str).replace("nan", "")

    # FP only lists additional mapped proteins in the "Mapped Proteins" column
    # MsReport reports all matching proteins in the "Mapped proteins" column
    mapped_proteins_entries = []
    for protein, mapped_protein_fp in zip(
        df["Representative protein"], df["Mapped Proteins"], strict=True
    ):
        if mapped_protein_fp == "":
            mapped_proteins = [protein]
        else:
            additional_mapped_proteins = _extract_protein_ids(
                mapped_protein_fp.split(", ")
            )
            mapped_proteins = [protein] + additional_mapped_proteins
        mapped_proteins_entries.append(";".join(mapped_proteins))
    df["Mapped proteins"] = mapped_proteins_entries

    if rename_columns:
        df = self._rename_columns(df, prefix_tag=True)
    if rewrite_modifications and rename_columns:
        # "Peptide sequence" only exists after renaming, hence the dependency on
        # 'rename_columns'.
        mod_entries = _generate_modification_entries_from_assigned_modifications(
            df["Peptide sequence"], df["Assigned Modifications"]
        )
        df["Modified sequence"] = mod_entries["Modified sequence"]
        df["Modifications"] = mod_entries["Modifications"]
    return df
958
+
864
959
  def _add_protein_entries(self, df: pd.DataFrame) -> pd.DataFrame:
865
960
  """Adds standardized protein entry columns to the data frame.
866
961
 
@@ -1038,40 +1133,32 @@ class SpectronautReader(ResultReader):
1038
1133
  "design": "conditionsetup",
1039
1134
  }
1040
1135
  protected_columns: list[str] = []
1041
- column_mapping: dict[str, str] = dict(
1042
- [
1043
- ("R.FileName", "Filename"),
1044
- ("R.Label", "Sample"),
1045
- ("PG.Qvalue", "Protein qvalue"),
1046
- ("PG.Cscore", "Protein cscore"),
1047
- ("PG.NrOfStrippedSequencesIdentified (Experiment-wide)", "Total peptides"),
1048
- ("PG.NrOfPrecursorsIdentified (Experiment-wide)", "Total ions"),
1049
- ("PG.Cscore", "Cscore"),
1050
- ("PEP.StrippedSequence", "Peptide sequence"),
1051
- ("PEP.AllOccurringProteinAccessions", "Mapped proteins"),
1052
- ("EG.ModifiedSequence", "Modified sequence"),
1053
- ("EG.CompensationVoltage", "Compensation voltage"),
1054
- ("EG.Qvalue", "Qvalue"),
1055
- ("EG.ApexRT", "Apex retention time"),
1056
- ("EG.DatapointsPerPeak", "Datapoints per peak"),
1057
- ("EG.FWHM", "FWHM"),
1058
- ("EG.SignalToNoise", "Signal to noise"),
1059
- ("FG.FragmentCount", "Fragment count"),
1060
- ("FG.Charge", "Charge"),
1061
- ("FG.MS1Quantity", "MS1 intensity"),
1062
- ("FG.MS1RawQuantity", "MS1 raw intensity"),
1063
- ("FG.MS2Quantity", "MS2 intensity"),
1064
- ("FG.MS2RawQuantity", "MS2 raw intensity"),
1065
- ("FG.MeasuredMz", "Observed m/z"),
1066
- ("FG.TheoreticalMz", "Theoretical m/z"),
1067
- ("FG.CalibratedMz", "Calibrated m/z"),
1068
- # ("PG.ProteinAccessions", ""),
1069
- # ("EG.HasLocalizationInformation", ""),
1070
- # ("EG.PTMLocalizationProbabilities", ""),
1071
- # ("EG.UsedForProteinGroupQuantity", ""),
1072
- # Modified peptides need to be parsed and rewritten
1073
- ]
1074
- )
1136
+ column_mapping: dict[str, str] = {
1137
+ "R.FileName": "Filename",
1138
+ "R.Label": "Sample",
1139
+ "PG.Qvalue": "Protein qvalue",
1140
+ "PG.Cscore": "Protein cscore",
1141
+ "PG.NrOfStrippedSequencesIdentified (Experiment-wide)": "Total peptides",
1142
+ "PG.NrOfPrecursorsIdentified (Experiment-wide)": "Total ions",
1143
+ "PEP.StrippedSequence": "Peptide sequence",
1144
+ "PEP.AllOccurringProteinAccessions": "Mapped proteins",
1145
+ "EG.ModifiedSequence": "Modified sequence",
1146
+ "EG.CompensationVoltage": "Compensation voltage",
1147
+ "EG.Qvalue": "Qvalue",
1148
+ "EG.ApexRT": "Apex retention time",
1149
+ "EG.DatapointsPerPeak": "Datapoints per peak",
1150
+ "EG.FWHM": "FWHM",
1151
+ "EG.SignalToNoise": "Signal to noise",
1152
+ "FG.FragmentCount": "Fragment count",
1153
+ "FG.Charge": "Charge",
1154
+ "FG.MS1Quantity": "MS1 intensity",
1155
+ "FG.MS1RawQuantity": "MS1 raw intensity",
1156
+ "FG.MS2Quantity": "MS2 intensity",
1157
+ "FG.MS2RawQuantity": "MS2 raw intensity",
1158
+ "FG.MeasuredMz": "Observed m/z",
1159
+ "FG.TheoreticalMz": "Theoretical m/z",
1160
+ "FG.CalibratedMz": "Calibrated m/z",
1161
+ }
1075
1162
  sample_column_tags: list[str] = [
1076
1163
  ".PG.NrOfPrecursorsIdentified",
1077
1164
  ".PG.IBAQ",
@@ -1324,12 +1411,14 @@ class SpectronautReader(ResultReader):
1324
1411
  filename: Optional[str] = None,
1325
1412
  filetag: Optional[str] = None,
1326
1413
  rename_columns: bool = True,
1327
- ) -> None:
1414
+ ) -> pd.DataFrame:
1328
1415
  """Reads an ion evidence file (long format) and returns a processed dataframe.
1329
1416
 
1330
1417
  Adds new columns to comply with the MsReport convention. "Protein reported
1331
1418
  by software" and "Representative protein", both contain the first entry from
1332
- "PG.ProteinAccessions".
1419
+ "PG.ProteinAccessions". "Ion ID" contains unique entries for each ion, which are
1420
+ generated by concatenating the "Modified sequence" and "Charge" columns, and if
1421
+ present, the "Compensation voltage" column.
1333
1422
 
1334
1423
  (!) Note that the modified sequence and modification localization probabilities
1335
1424
  are currently not processed.
@@ -1367,6 +1456,11 @@ class SpectronautReader(ResultReader):
1367
1456
  df = self._add_protein_entries(df)
1368
1457
  if rename_columns:
1369
1458
  df = self._rename_columns(df, True)
1459
+ df["Ion ID"] = df["Modified sequence"] + "_c" + df["Charge"].astype(str)
1460
+ if "Compensation voltage" in df.columns:
1461
+ _cv = df["Compensation voltage"].astype(str)
1462
+ df["Ion ID"] = df["Ion ID"] + "_cv" + _cv
1463
+
1370
1464
  return df
1371
1465
 
1372
1466
  def _tidy_up_sample_columns(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -1462,7 +1556,7 @@ def sort_leading_proteins(
1462
1556
  if penalize_contaminants is not None:
1463
1557
  contaminant_encoding = {"False": 0, "True": 1, False: 0, True: 1}
1464
1558
 
1465
- for idx, row in table.iterrows():
1559
+ for _, row in table.iterrows():
1466
1560
  protein_ids = row["Leading proteins"].split(";")
1467
1561
 
1468
1562
  sorting_info = [[] for _ in protein_ids]
@@ -1559,6 +1653,7 @@ def add_protein_annotation(
1559
1653
  warnings.warn(
1560
1654
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1561
1655
  ProteinsNotInFastaWarning,
1656
+ stacklevel=2,
1562
1657
  )
1563
1658
 
1564
1659
  annotations = {}
@@ -1636,9 +1731,10 @@ def add_protein_site_annotation(
1636
1731
  warnings.warn(
1637
1732
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1638
1733
  ProteinsNotInFastaWarning,
1734
+ stacklevel=2,
1639
1735
  )
1640
1736
 
1641
- annotations = {
1737
+ annotations: dict[str, list[str]] = {
1642
1738
  "Modified residue": [],
1643
1739
  "Sequence window": [],
1644
1740
  }
@@ -1702,6 +1798,7 @@ def add_leading_proteins_annotation(
1702
1798
  warnings.warn(
1703
1799
  f"Some proteins could not be annotated: {repr(proteins_not_in_db)}",
1704
1800
  ProteinsNotInFastaWarning,
1801
+ stacklevel=2,
1705
1802
  )
1706
1803
 
1707
1804
  annotations = {}
@@ -1853,7 +1950,7 @@ def add_peptide_positions(
1853
1950
  find matching entries in the FASTA files.
1854
1951
  """
1855
1952
  # not tested #
1856
- peptide_positions = {"Start position": [], "End position": []}
1953
+ peptide_positions: dict[str, list[int]] = {"Start position": [], "End position": []}
1857
1954
  proteins_not_in_db = []
1858
1955
  for peptide, protein_id in zip(table[peptide_column], table[protein_column]):
1859
1956
  if protein_id in protein_db:
@@ -1875,6 +1972,7 @@ def add_peptide_positions(
1875
1972
  warnings.warn(
1876
1973
  f"Some peptides could not be annotated: {repr(proteins_not_in_db)}",
1877
1974
  ProteinsNotInFastaWarning,
1975
+ stacklevel=2,
1878
1976
  )
1879
1977
 
1880
1978
 
@@ -1894,10 +1992,10 @@ def add_protein_modifications(table: pd.DataFrame):
1894
1992
  for peptide_site, mod in [m.split(":") for m in mod_entry.split(";")]:
1895
1993
  protein_site = int(peptide_site) + start_pos - 1
1896
1994
  protein_mods.append([str(protein_site), mod])
1897
- protein_mods = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
1995
+ protein_mod_string = ";".join([f"{pos}:{mod}" for pos, mod in protein_mods])
1898
1996
  else:
1899
- protein_mods = ""
1900
- protein_modification_entries.append(protein_mods)
1997
+ protein_mod_string = ""
1998
+ protein_modification_entries.append(protein_mod_string)
1901
1999
  table["Protein modifications"] = protein_modification_entries
1902
2000
 
1903
2001
 
@@ -2074,7 +2172,7 @@ def _process_protein_entries(
2074
2172
  A dataframe containing the columns "Protein reported by software",
2075
2173
  "Leading proteins", "Representative protein", and "Potential contaminant".
2076
2174
  """
2077
- new_entries = {
2175
+ new_entries: dict[str, list[str | bool]] = {
2078
2176
  "Protein reported by software": [],
2079
2177
  "Representative protein": [],
2080
2178
  "Potential contaminant": [],
@@ -2147,6 +2245,57 @@ def _generate_modification_entries(
2147
2245
  return entries
2148
2246
 
2149
2247
 
2248
def _generate_modification_entries_from_assigned_modifications(
    sequences: Iterable[str],
    assigned_modifications: Iterable[str],
) -> dict[str, list[str]]:
    """Generate modification entries from FragPipe assigned modification strings.

    Sequences and assigned modification entries are processed pairwise, in order.

    Args:
        sequences: Peptide sequences.
        assigned_modifications: One assigned modifications entry per sequence, in
            the format understood by `_extract_fragpipe_assigned_modifications`.

    Returns:
        A dictionary with the keys "Modified sequence" and "Modifications", each
        holding a list with one entry per processed sequence. "Modifications"
        entries are ";" joined strings in the form of "position:modification".
    """
    result: dict[str, list[str]] = {"Modified sequence": [], "Modifications": []}
    for peptide, assigned_entry in zip(sequences, assigned_modifications):
        site_mods = _extract_fragpipe_assigned_modifications(assigned_entry, peptide)
        result["Modified sequence"].append(helper.modify_peptide(peptide, site_mods))
        result["Modifications"].append(
            ";".join(f"{site}:{tag}" for site, tag in site_mods)
        )
    return result
2268
+
2269
+
2270
+ def _extract_fragpipe_assigned_modifications(
2271
+ modifications_entry: str,
2272
+ sequence: str,
2273
+ ) -> list[tuple[int, str]]:
2274
+ """Extracts modifications from a FragPipe "Modifications" entry.
2275
+
2276
+ Example for a modification entry: "N-term(42.0106),8C(57.0215)"
2277
+
2278
+ Returns:
2279
+ A list of tuples, where each tuple contains the position of the modification and
2280
+ the modification text. The position is one-indexed, meaning that the first amino
2281
+ acid position is 1. N-term and C-term are represented as 0 and len(sequence)
2282
+ respectively.
2283
+ """
2284
+ if modifications_entry == "":
2285
+ return []
2286
+ modifications = []
2287
+ for mod_entry in modifications_entry.split(","):
2288
+ position_entry, modification = mod_entry.split(")")[0].split("(")
2289
+ if position_entry == "N-term":
2290
+ position = 0
2291
+ elif position_entry == "C-term":
2292
+ position = len(sequence)
2293
+ else:
2294
+ position = int(position_entry[:-1])
2295
+ modifications.append((position, modification))
2296
+ return modifications
2297
+
2298
+
2150
2299
  def extract_maxquant_localization_probabilities(localization_entry: str) -> dict:
2151
2300
  """Extract localization probabilites from a MaxQuant "Probabilities" entry.
2152
2301
 
@@ -2189,7 +2338,7 @@ def extract_fragpipe_localization_probabilities(localization_entry: str) -> dict
2189
2338
  ... )
2190
2339
  {'15.9949': {3: 1.0}, '79.9663': {4: 0.334, 6: 0.666}}
2191
2340
  """
2192
- modification_probabilities = {}
2341
+ modification_probabilities: dict[str, dict[int, float]] = {}
2193
2342
  for modification_entry in filter(None, localization_entry.split(";")):
2194
2343
  specified_modification, probability_sequence = modification_entry.split("@")
2195
2344
  _, modification = specified_modification.split(":")
@@ -2247,7 +2396,7 @@ def _create_protein_annotations_from_db(
2247
2396
  protein_db: ProteinDatabase,
2248
2397
  query_function: Callable,
2249
2398
  default_value: Any,
2250
- ) -> list[str]:
2399
+ ) -> list[Any]:
2251
2400
  """Returns a list of multi protein entry annotations.
2252
2401
 
2253
2402
  Used to generate protein annotations for protein entries. For each protein id an
@@ -2274,9 +2423,9 @@ def _create_protein_annotations_from_db(
2274
2423
  if protein_id in protein_db:
2275
2424
  db_entry = protein_db[protein_id]
2276
2425
  query_result = query_function(db_entry, default_value)
2426
+ annotation_values.append(query_result)
2277
2427
  else:
2278
- query_result = default_value
2279
- annotation_values.append(query_result)
2428
+ annotation_values.append(default_value)
2280
2429
  return annotation_values
2281
2430
 
2282
2431
 
@@ -1,3 +1,16 @@
1
- """ Python interface to custome R scripts. """
2
- from .limma import multi_group_limma, two_group_limma
3
- from .rinstaller import r_package_version
1
+ """Python interface to custom R scripts."""
2
+
3
+ from msreport.errors import OptionalDependencyError
4
+
5
+ try:
6
+ from .limma import multi_group_limma, two_group_limma
7
+ from .rinstaller import r_package_version
8
+ except ImportError as err:
9
+ raise OptionalDependencyError(
10
+ "R integration is not available. R must be installed and configured before "
11
+ "installing optional R dependencies using 'pip install msreport[R]'. For "
12
+ "more information, see: https://github.com/hollenstein/msreport"
13
+ ) from err
14
+
15
+
16
+ __all__ = ["multi_group_limma", "two_group_limma", "r_package_version"]
@@ -1,4 +1,5 @@
1
- """ Python interface to custome R scripts. """
1
+ """Python interface to custom R scripts."""
2
+
2
3
  import os
3
4
 
4
5
  import pandas as pd
@@ -1,9 +1,9 @@
1
- from rpy2.robjects.packages import importr
2
- import rpy2.robjects.packages as rpackages
3
1
  import rpy2.robjects as robjects
2
+ import rpy2.robjects.packages as rpackages
3
+ from rpy2.robjects.packages import importr
4
4
 
5
5
 
6
- def r_package_version(package_name: str) -> (str, str):
6
+ def r_package_version(package_name: str) -> str:
7
7
  """Returns the version number of an installed R package."""
8
8
  with robjects.conversion.localconverter(robjects.default_converter):
9
9
  utils = importr("utils")