msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/export.py ADDED
@@ -0,0 +1,526 @@
1
+ """
2
+ Columns that are not yet present in the amica output at the moment:
3
+ Index([
4
+ 'Protein Probability',
5
+ 'Top Peptide Probability',
6
+ 'Total peptides',
7
+ 'Leading proteins',
8
+ 'Protein entry name',
9
+ 'Fasta header',
10
+ 'Protein length',
11
+ 'iBAQ peptides',
12
+ 'Sequence coverage',
13
+ ], dtype='object')
14
+ """
15
+
16
+ from collections import defaultdict as ddict
17
+ import os
18
+ from typing import Iterable, Optional, Protocol
19
+ import warnings
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+
24
+ import msreport.helper as helper
25
+ import msreport.reader
26
+ from msreport.qtable import Qtable
27
+
28
+
29
class Protein(Protocol):
    """Structural interface of a single protein entry.

    Any object that exposes the three attributes below can be used wherever a
    ``Protein`` is expected; no inheritance is required.
    """

    # Header text of the entry (presumably the FASTA header line; confirm
    # against the protein database implementation in use).
    header: str
    # Sequence of the entry; consumers in this module take its length and
    # iterate over its characters.
    sequence: str
    # Mapping of parsed header field names to their string values.
    header_fields: dict[str, str]
35
+
36
+
37
class ProteinDatabase(Protocol):
    """Structural interface of a protein database.

    A conforming object supports item access by protein ID and membership tests
    with the ``in`` operator.
    """

    def __getitem__(self, protein_id: str) -> Protein:
        """Returns the protein entry stored under `protein_id`."""
        ...

    def __contains__(self, protein_id: str) -> bool:
        """Returns whether an entry for `protein_id` is present."""
        ...
43
+
44
+
45
def contaminants_to_clipboard(qtable: Qtable) -> None:
    """Builds a table of potential contaminants and copies it to the clipboard.

    For every sample the table holds "iBAQ rank", "riBAQ", "iBAQ intensity",
    "Intensity", and "Expression" columns, preceded by a set of general protein
    annotation columns. Columns that are not available in the qtable data are
    silently left out. Entries of the "Expression" columns that are flagged in the
    corresponding "Missing" column (i.e. imputed values) are replaced by NaN. The
    rows are sorted by descending "iBAQ intensity total", which is the NaN-ignoring
    sum of the sample iBAQ intensities divided by the number of samples.

    The qtable must provide "iBAQ intensity", "Missing", and "Expression" columns
    for each sample, as well as a "Potential contaminant" column. iBAQ intensities
    can be added with msreport.reader.add_ibaq_intensities(), "Missing" columns
    with msreport.analyze.analyze_missingness().

    Args:
        qtable: A Qtable instance. Requires that column names follow the MsReport
            conventions.
    """
    samples = qtable.get_samples()
    table = qtable.get_data()

    ibaq_columns = [f"iBAQ intensity {s}" for s in samples]
    table["iBAQ intensity total"] = np.nansum(table[ibaq_columns], axis=1) / len(
        samples
    )

    for sample in samples:
        # Discard imputed expression values
        is_missing = table[f"Missing {sample}"]
        table.loc[is_missing, f"Expression {sample}"] = np.nan

        # Rank 1 is assigned to the entry with the highest iBAQ intensity
        sample_ibaq = table[f"iBAQ intensity {sample}"]
        descending_order = np.argsort(sample_ibaq)[::-1]
        ranks = np.empty_like(sample_ibaq, dtype=int)
        ranks[descending_order] = np.arange(1, len(sample_ibaq) + 1)
        table[f"iBAQ rank {sample}"] = ranks

        # Relative iBAQ in percent of the summed sample iBAQ intensity
        table[f"riBAQ {sample}"] = sample_ibaq / sample_ibaq.sum() * 100

    requested_columns = [
        "Representative protein",
        "Protein entry name",
        "Gene name",
        "Fasta header",
        "Protein length",
        "Total peptides",
        "iBAQ peptides",
        "iBAQ intensity total",
    ]
    for tag in ("iBAQ rank", "riBAQ", "iBAQ intensity", "Intensity", "Expression"):
        requested_columns += helper.find_sample_columns(table, tag, samples)
    available_columns = [c for c in requested_columns if c in table.columns]

    is_contaminant = qtable["Potential contaminant"]
    contaminant_table = table.loc[is_contaminant, available_columns]
    contaminant_table = contaminant_table.sort_values(
        "iBAQ intensity total", ascending=False
    )
    contaminant_table.to_clipboard(index=False)
98
+
99
+
100
def to_perseus_matrix(
    qtable: Qtable,
    directory,
    table_name: str = "perseus_matrix.tsv",
) -> None:
    """Writes the qtable data as a Perseus matrix file in tsv format.

    A Perseus matrix has a second header row with a single-letter annotation per
    column. The first entry of that row is prefixed with "#!{Type}", for example
    "#!{Type}E".

    The annotation letters are assigned with the following precedence:
        E = Expression: the expression column of each qtable sample
        C = Categorical: "Potential contaminant", "Valid", and every column whose
            name contains "Events" or "Missing"
        N = Numerical: all remaining columns with a numeric dtype
        T = Text: everything else

    Args:
        qtable: A Qtable instance.
        directory: Output path of the generated files.
        table_name: Optional, filename of the perseus matrix file. Default is
            "perseus_matrix.tsv".
    """
    table = qtable.data

    expression = {qtable.get_expression_column(s) for s in qtable.get_samples()}
    categorical = {"Potential contaminant", "Valid"}
    categorical.update(
        column
        for column in table.columns
        if "Events" in column or "Missing" in column
    )
    numeric = set(table.select_dtypes(include="number").columns.tolist())

    def annotation_of(column) -> str:
        # Order matters: expression wins over categorical, which wins over numeric
        if column in expression:
            return "E"
        if column in categorical:
            return "C"
        if column in numeric:
            return "N"
        return "T"

    annotations = [annotation_of(column) for column in table.columns]
    annotations[0] = "#!{Type}" + annotations[0]
    annotation_row = pd.DataFrame(columns=table.columns, data=[annotations])

    output_path = os.path.join(directory, table_name)
    pd.concat([annotation_row, table]).to_csv(output_path, sep="\t", index=False)
150
+
151
+
152
def to_amica(
    qtable: Qtable,
    directory,
    table_name: str = "amica_table.tsv",
    design_name: str = "amica_design.tsv",
) -> None:
    """Writes an amica protein table and an amica design file for a qtable.

    Both files are written as tab separated text without an index column. Amica
    expects an equal number of columns in each group of intensity columns
    (Intensity, LFQIntensity, ImputedIntensity, iBAQ); for this reason only sample
    columns of samples that are present in the qtable design are exported.

    Args:
        qtable: A Qtable instance.
        directory: Output path of the generated files.
        table_name: Optional, filename of the amica table file. Default is
            "amica_table.tsv".
        design_name: Optional, filename of the amica design file. Default is
            "amica_design.tsv".
    """
    exports = (
        (_amica_table_from, table_name),
        (_amica_design_from, design_name),
    )
    for build_frame, filename in exports:
        frame = build_frame(qtable)
        frame.to_csv(os.path.join(directory, filename), sep="\t", index=False)
179
+
180
+
181
def write_html_coverage_map(
    filepath: str,
    protein_id: str,
    peptide_table: pd.DataFrame,
    protein_db: ProteinDatabase,
    displayed_name: Optional[str] = None,
    coverage_color: str = "#E73C40",
    highlight_positions: Optional[Iterable[int]] = None,
    highlight_color: str = "#1E90FF",
    column_length: int = 10,
    row_length: int = 50,
) -> None:
    """Generates an html file containing a protein coverage map.

    The file is written with UTF-8 encoding, matching the charset declared in the
    generated html page. Calling this function emits a FutureWarning because the
    interface is still experimental.

    Args:
        filepath: The filepath where the generated html file will be saved.
        protein_id: ID of the protein that will be displayed on the html page. Must
            correspond to an entry in the specified `protein_db`.
        peptide_table: Dataframe which contains peptide information required for
            calculation of the protein sequence coverage. Must contain the columns
            "Representative protein", "Start position", and "End position".
        protein_db: A protein database containing entries from one or multiple FASTA
            files.
        displayed_name: Allows specifying a custom displayed name. By default, the
            protein name and protein id are shown; if no protein name distinct from
            the id is available, only the id is shown.
        coverage_color: Hex color code for highlighting amino acids that correspond to
            covered regions from the coverage mask, for example "#FF0000" for red.
        highlight_positions: Optional, allows specifying a list of amino acid positions
            that are highlighted in a different color. Note that positions specified
            here will overwrite the coloring from the coverage mask. Positions are
            one-indexed, which means that the first amino acid position is 1.
        highlight_color: Hex color code for highlighting amino acids specified with the
            'highlight_positions' variable.
        column_length: Number of amino acids after which a space is inserted.
        row_length: Number of amino acids after which a new line is inserted.
    """
    warnings.warn(
        (
            "`write_html_coverage_map` is still experimental, and the interface might "
            "change in a future release."
        ),
        FutureWarning,
    )
    # Get protein information from the protein database
    protein_entry = protein_db[protein_id]
    sequence = protein_entry.sequence
    protein_length = len(sequence)

    if displayed_name is None:
        protein_name = msreport.reader._get_annotation_protein_name(
            protein_entry, default_value=protein_id
        )
        if protein_name == protein_id:
            displayed_name = protein_id
        else:
            displayed_name = f"{protein_name} ({protein_id})"

    # Generate coverage boundaries from a peptide table
    id_column = "Representative protein"
    peptide_group = peptide_table[peptide_table[id_column] == protein_id]
    peptide_positions = list(
        zip(peptide_group["Start position"], peptide_group["End position"])
    )
    coverage_mask = helper.make_coverage_mask(protein_length, peptide_positions)
    boundaries = _find_covered_region_boundaries(coverage_mask)

    # Convert the one-indexed highlight positions to zero-indexed dictionary keys
    highlight_positions = highlight_positions if highlight_positions is not None else ()
    highlights = {pos - 1: highlight_color for pos in highlight_positions}
    html_title = f"Coverage map: {displayed_name}"

    # Generate and save the html page
    sequence_coverage = helper.calculate_sequence_coverage(
        protein_length, peptide_positions, ndigits=1
    )
    html_sequence_map = _generate_html_sequence_map(
        sequence,
        boundaries,
        coverage_color,
        highlights=highlights,
        column_length=column_length,
        row_length=row_length,
    )
    html_text = _generate_html_coverage_map_page(
        html_sequence_map, sequence_coverage, title=html_title
    )
    # The page declares <meta charset="utf-8">, so the file must not be written
    # with the platform dependent default encoding.
    with open(filepath, "w", encoding="utf-8") as openfile:
        openfile.write(html_text)
268
+
269
+
270
def _amica_table_from(qtable: Qtable) -> pd.DataFrame:
    """Converts the qtable data into a dataframe in the amica table format.

    The conversion drops intensity and spectral count columns of samples that are
    not part of the qtable design, log2 transforms intensity columns that are not
    yet in log space (zeros are replaced by NaN first), replaces " vs " in column
    names with "__vs__", encodes the "Valid" and "Potential contaminant" columns
    as "+" or "", and renames columns to the amica naming scheme.

    Args:
        qtable: A Qtable instance. Requires that column names follow the MsReport
            conventions.

    Returns:
        A dataframe containing only columns that could be mapped to the amica
        format, named according to the amica data table format. Note that only
        intensity columns are included from samples that are present in the qtable
        design.
    """
    intensity_tags = ["Intensity", "LFQ intensity", "Expression", "iBAQ intensity"]
    # Insertion order of this mapping defines the column order of the output
    column_mapping = {
        "Representative protein": "Majority.protein.IDs",
        "Gene name": "Gene.names",
        "Valid": "quantified",
        "Potential contaminant": "Potential.contaminant",
    }
    tag_mapping = {
        "Intensity ": "Intensity_",
        "LFQ intensity ": "LFQIntensity_",
        "Expression ": "ImputedIntensity_",
        "iBAQ intensity ": "iBAQ_",
        "Spectral count ": "razorUniqueCount_",
        "Average expression ": "AveExpr_",
        "Ratio [log2] ": "logFC_",
        "P-value ": "P.Value_",
        "Adjusted p-value ": "adj.P.Val_",
    }

    table = qtable.get_data()
    samples = qtable.get_samples()

    # Drop intensity columns from samples that are not present in the design
    for tag in ["Spectral count", *intensity_tags]:
        tagged_columns = helper.find_columns(table, tag)
        design_columns = helper.find_sample_columns(table, tag, samples)
        obsolete_columns = set(tagged_columns).difference(set(design_columns))
        table.drop(obsolete_columns, inplace=True, axis=1)

    # Log transform columns if necessary
    for tag in intensity_tags:
        for column in helper.find_columns(table, tag):
            if helper.intensities_in_logspace(table[column]):
                continue
            table[column] = np.log2(table[column].replace({0: np.nan}))

    # Comparison columns use "__vs__" instead of " vs " as group separator
    comparison_columns = helper.find_columns(table, " vs ")
    table.rename(
        columns={c: c.replace(" vs ", "__vs__") for c in comparison_columns},
        inplace=True,
    )

    # Boolean filter columns are encoded as "+" (True) or "" (False)
    for column in ("Valid", "Potential contaminant"):
        if column not in table.columns:
            continue
        table[column] = ["+" if flag else "" for flag in table[column]]

    for old_tag, new_tag in tag_mapping.items():
        column_mapping.update(
            {c: c.replace(old_tag, new_tag) for c in helper.find_columns(table, old_tag)}
        )
    table.rename(columns=column_mapping, inplace=True)

    selected_columns = [
        name for name in column_mapping.values() if name in table.columns
    ]
    return table[selected_columns]
345
+
346
+
347
def _amica_design_from(qtable: Qtable) -> pd.DataFrame:
    """Creates an experimental design table in the amica format from a qtable.

    Args:
        qtable: A Qtable instance; the table returned by its `get_design()` method
            is expected to contain the columns "Sample" and "Experiment".

    Returns:
        The qtable design with the column "Sample" renamed to "samples" and the
        column "Experiment" renamed to "groups". Other columns are kept unchanged.
    """
    return qtable.get_design().rename(
        columns={"Sample": "samples", "Experiment": "groups"}
    )
360
+
361
+
362
def _generate_html_coverage_map_page(
    html_sequence_map: str, coverage: float, title: str = "Protein coverage map"
) -> str:
    """Generates the code for an html page displaying a protein coverage map.

    Args:
        html_sequence_map: A string containing html code that represents a protein
            coverage map. It is inserted verbatim into a <PRE> element.
        coverage: Sequence coverage in percent.
        title: Title of coverage page, is displayed in the browser tab as well as a
            title on the page itself. The title is inserted verbatim, i.e. without
            html escaping.

    Returns:
        A string containing the html code of the sequence coverage html page, with
        one html line per text line.
    """
    # Every entry must end with a comma; adjacent string literals without a comma
    # are silently concatenated into a single line.
    # fmt: off
    html_lines = (
        '<!-- index.html -->',
        '',
        '<!DOCTYPE html>',
        '<html lang="en">',
        ' <head>',
        ' <meta charset="utf-8">',
        f' <title>{title}</title>',
        ' <style>',
        ' h1 {font-family: "Arial", sans-serif;}',
        ' body {',
        ' font-family: "Lucida Console", "Courier new", monospace;',
        ' font-size: 100%;',
        ' }',
        ' </style>',
        ' </head>',
        ' <body>',
        f' <h1>{title}</h1>',
        f' <p>Sequence coverage: {coverage}%</p>',
        f' <p><PRE>{html_sequence_map}</PRE></p>',
        ' </body>',
        '</html>',
    )
    # fmt: on
    return "\n".join(html_lines)
405
+
406
+
407
def _generate_html_sequence_map(
    sequence: str,
    covered_regions: Iterable[Iterable[int]],
    coverage_color: str,
    highlights: Optional[dict[int, str]] = None,
    column_length: int = 10,
    row_length: int = 50,
) -> str:
    """Generates the html code for a sequence coverage map with colored highlighting.

    The sequence is written in rows of `row_length` amino acids, each row is
    preceded by the one-indexed position of its first amino acid. Rows are
    separated by "<br>" and within a row a space is inserted after each
    `column_length` amino acids.

    Args:
        sequence: Amino acid sequence of a protein
        covered_regions: An iterable of (start, end) pairs, where each pair specifies
            the start and end positions of the continuously covered regions in the
            protein sequence. Note that the positions are zero-indexed and that the
            end position is part of the covered region. Any iterable is accepted,
            including empty ones and generators.
        coverage_color: Hex color code for highlighting amino acids from the covered
            regions.
        highlights: Optional, allows specifying amino acid positions that should be
            highlighted with a specific color. Must be a dictionary with keys being
            zero indexed protein positions and values hex color codes.
        column_length: Number of amino acids after which a space is inserted.
        row_length: Number of amino acids after which a new line is inserted.

    Returns:
        A string containing the html code of the sequence coverage map.
    """
    # Materialize the regions once so that generators and arrays are supported,
    # and use sets for constant time membership tests per sequence position.
    regions = [tuple(region) for region in covered_regions]
    region_starts = {start for start, _ in regions}
    region_stops = {stop for _, stop in regions}
    highlights = highlights if highlights is not None else {}

    index_width = len(str(len(sequence)))
    open_coverage_tag = f'<FONT COLOR="{coverage_color}">'
    close_coverage_tag = "</FONT>"

    def row_index_entry(pos: int) -> str:
        """Returns the html entry showing the one-indexed position of `pos`."""
        row_index = str(pos + 1).rjust(index_width)
        return '<FONT COLOR="#000000">' + row_index + " " + "</FONT>"

    in_covered_region = False
    strings = ['<FONT COLOR="#606060">']  # Set default text color to grey
    strings.append(row_index_entry(0))
    for pos, character in enumerate(sequence):
        if pos in region_starts:
            in_covered_region = True
            strings.append(open_coverage_tag)

        if pos != 0 and pos % row_length == 0:
            # The coverage coloring is interrupted around the line break so that
            # the row index is not rendered in the coverage color.
            if in_covered_region:
                strings.append(close_coverage_tag)
            strings.append("<br>")
            strings.append(row_index_entry(pos))
            if in_covered_region:
                strings.append(open_coverage_tag)
        elif pos != 0 and pos % column_length == 0:
            strings.append(" ")

        if pos in highlights:
            color = highlights[pos]
            strings.append(f'<FONT COLOR="{color}"><u>{character}</u></FONT>')
        else:
            strings.append(character)

        if pos in region_stops:
            in_covered_region = False
            strings.append(close_coverage_tag)
    strings.append("</FONT>")

    return "".join(strings)
490
+
491
+
492
def _find_covered_region_boundaries(
    coverage_mask: Iterable[bool],
) -> list[tuple[int, int]]:
    """Returns a list of boundaries from continuously covered regions in a protein.

    Args:
        coverage_mask: An iterable of boolean values that represents the coverage map of
            a protein sequence. A True value at a specific position indicates that the
            corresponding amino acid was covered by the identified peptides. The mask
            is traversed once, it may be empty and does not need to support indexing.

    Returns:
        A list of tuples, where each tuple specifies the start and end positions of the
        continuously covered regions in the coverage mask. Note that the positions are
        zero-indexed and that the end position is part of the covered region. An empty
        list is returned if no position is covered.

    Examples:
        >>> coverage_mask = [True, True, False, False, True]
        >>> _find_covered_region_boundaries(coverage_mask)
        [(0, 1), (4, 4)]
    """
    boundaries = []
    region_start = None  # Start of the currently open region, None if outside
    position = -1  # Remains -1 only for an empty mask
    for position, is_covered in enumerate(coverage_mask):
        if is_covered and region_start is None:
            region_start = position
        elif not is_covered and region_start is not None:
            boundaries.append((region_start, position - 1))
            region_start = None
    # A region that extends to the last position is closed after the loop
    if region_start is not None:
        boundaries.append((region_start, position))
    return boundaries
msreport/fasta.py ADDED
@@ -0,0 +1,28 @@
1
+ import pathlib
2
+ from typing import Iterable, Union
3
+
4
+
5
+ from profasta.db import ProteinDatabase
6
+
7
+
8
def import_protein_database(
    fasta_path: Union[str, pathlib.Path, Iterable[Union[str, pathlib.Path]]],
    header_parser: str = "uniprot",
) -> ProteinDatabase:
    """Creates a protein database and fills it from one or multiple fasta files.

    The fasta files are added in the given order with `overwrite=True`.

    Args:
        fasta_path: Path to a fasta file, or an iterable of paths. Each path can be
            either a string or a pathlib.Path instance.
        header_parser: Name of the parser that is used for parsing the FASTA
            headers. The specified parser must be registered in the global parser
            registry. The default is "uniprot".

    Returns:
        A protein database containing entries from the parsed fasta files.
    """
    # A single path is wrapped so that it can be handled like a list of paths
    if isinstance(fasta_path, (str, pathlib.Path)):
        fasta_path = [fasta_path]

    protein_db = ProteinDatabase()
    for single_path in fasta_path:
        protein_db.add_fasta(single_path, header_parser=header_parser, overwrite=True)
    return protein_db
@@ -0,0 +1,23 @@
1
+ from .calc import (
2
+ mode,
3
+ calculate_tryptic_ibaq_peptides,
4
+ make_coverage_mask,
5
+ calculate_sequence_coverage,
6
+ calculate_monoisotopic_mass,
7
+ )
8
+ from .table import (
9
+ apply_intensity_cutoff,
10
+ guess_design,
11
+ intensities_in_logspace,
12
+ find_columns,
13
+ find_sample_columns,
14
+ keep_rows_by_partial_match,
15
+ remove_rows_by_partial_match,
16
+ join_tables,
17
+ rename_sample_columns,
18
+ rename_mq_reporter_channels,
19
+ )
20
+ from .temp import (
21
+ extract_modifications,
22
+ modify_peptide,
23
+ )
@@ -0,0 +1,120 @@
1
+ import itertools
2
+ from typing import Iterable
3
+
4
+ import numpy as np
5
+ import scipy.stats
6
+ import scipy.optimize
7
+
8
+ import pyteomics.mass
9
+ import pyteomics.parser
10
+
11
+
12
def mode(values: Iterable) -> float:
    """Estimates the mode of the values with a gaussian kernel-density estimate.

    The maximum of the density estimate is searched in a window of +/- 1.5 around
    the median of the finite values.

    Args:
        values: Sequence of values for which the mode will be estimated, only finite
            values are used for the calculation.

    Returns:
        The estimated mode. If no finite values are present, returns nan. If all
        finite values are identical, this value is returned directly.
    """
    all_values = np.array(values)
    finite_values = all_values[np.isfinite(all_values)]

    if len(finite_values) == 0:
        return np.nan

    distinct_values = np.unique(finite_values)
    if len(distinct_values) == 1:
        # A density estimate cannot be calculated from a single distinct value
        return distinct_values[0]

    center = np.median(finite_values)
    density = scipy.stats.gaussian_kde(finite_values)
    # Maximizing the density corresponds to minimizing the negative density
    result = scipy.optimize.minimize_scalar(
        lambda x: -density(x)[0],
        method="Bounded",
        bounds=(center - 1.5, center + 1.5),
    )
    # Maybe add fallback function if optimize was not successful
    return result.x
38
+
39
+
40
def calculate_tryptic_ibaq_peptides(protein_sequence: str) -> int:
    """Counts the tryptic iBAQ peptides of a protein sequence.

    The sequence is cleaved with the rule "[KR]" without missed cleavages, and
    only digestion products with a length between 7 and 30 amino acids are
    counted. Peptides that occur several times in the protein are counted
    several times.

    Args:
        protein_sequence: Amino acid sequence of a protein.

    Returns:
        Number of tryptic iBAQ peptides for the given protein sequence.
    """
    digestion_products = pyteomics.parser.icleave(
        protein_sequence,
        "[KR]",
        missed_cleavages=0,
        min_length=7,
        max_length=30,
        regex=True,
    )
    # Each digestion product is an (index, sequence) pair; only their number is
    # of interest here.
    return sum(1 for _index, _sequence in digestion_products)
68
+
69
+
70
def calculate_monoisotopic_mass(protein_sequence: str) -> float:
    """Calculates the monoisotopic mass of the protein sequence in Dalton.

    Non-standard amino acid codes are handled in an opinionated way: "B" is
    treated as "N", "Z" is treated as "Q", and "X" is removed from the sequence
    before the mass calculation. "O" and "U" are passed on unchanged and are
    expected to be interpreted by pyteomics as Pyrrolysine and Selenocysteine.

    Args:
        protein_sequence: Amino acid sequence of a protein.

    Returns:
        Monoisotopic mass in Dalton.
    """
    # Maps B -> N and Z -> Q, and deletes X in a single pass
    substitutions = str.maketrans("BZ", "NQ", "X")
    return pyteomics.mass.fast_mass(protein_sequence.translate(substitutions))
85
+
86
+
87
def make_coverage_mask(
    protein_length: int, peptide_positions: Iterable[tuple[int, int]]
) -> np.ndarray:
    """Returns a Boolean array with True for positions present in 'peptide_positions'.

    Args:
        protein_length: The number of amino acids in the protein sequence.
        peptide_positions: Iterable of peptide (start, end) positions. Positions are
            one-indexed and the end position is part of the peptide. Start positions
            smaller than 1 are treated as 1, end positions beyond the protein length
            are cut off at the protein length.

    Returns:
        A 1-dimensional Boolean array with length equal to 'protein_length'.
    """
    coverage_mask = np.zeros(protein_length, dtype="bool")
    for start, end in peptide_positions:
        # Clamp the zero-indexed slice start at 0; a negative start would wrap
        # around and mark positions counted from the end of the protein.
        coverage_mask[max(start - 1, 0) : end] = True
    return coverage_mask
103
+
104
+
105
def calculate_sequence_coverage(
    protein_length: int, peptide_positions: Iterable[tuple[int, int]], ndigits: int = 1
) -> float:
    """Calculates the protein sequence coverage given a list of peptide positions.

    The coverage is the percentage of positions in the protein that are covered by
    at least one peptide; positions covered by several peptides are counted once.

    Args:
        protein_length: The number of amino acids in the protein sequence.
        peptide_positions: Iterable of peptide (start, end) positions, as expected
            by `make_coverage_mask`.
        ndigits: Optional, number of decimal places for rounding the sequence coverage.

    Returns:
        Sequence coverage in percent, with values ranging from 0 to 100.
    """
    coverage_mask = make_coverage_mask(protein_length, peptide_positions)
    covered_fraction = coverage_mask.sum() / protein_length
    return round(covered_fraction * 100, ndigits)