openms-insight 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ """SequenceView component for peptide/protein sequence visualization with fragment matching."""
2
+
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import polars as pl
6
+
7
+ from ..core.base import BaseComponent
8
+ from ..core.registry import register_component
9
+
10
+
11
def _parse_sequence_naive(sequence_str: str) -> Tuple[List[str], List[Optional[float]]]:
    """Extract one-letter residue codes without pyOpenMS.

    Everything inside ``(...)`` or ``[...]`` is treated as a modification
    annotation and skipped. Nesting is tracked, so names that themselves
    contain parentheses (e.g. ``(Label:13C(6)15N(2))``) are skipped as a whole.
    An unterminated annotation swallows the remainder of the string, and a
    stray closing bracket is ignored. Outside annotations only upper-case
    letters are kept; every other character is dropped.

    No mass shifts can be resolved here, so every modification entry is None.
    """
    residues: List[str] = []
    depth = 0  # current nesting level of (...) / [...] annotations
    for char in sequence_str:
        if char in '([':
            depth += 1
        elif char in ')]':
            # Ignore unmatched closers instead of going negative.
            depth = max(0, depth - 1)
        elif depth == 0 and char.isupper():
            residues.append(char)
    return residues, [None] * len(residues)


def parse_openms_sequence(sequence_str: str) -> Tuple[List[str], List[Optional[float]]]:
    """Parse OpenMS sequence format to extract residues and modification mass shifts.

    Converts e.g. 'SHC(Carbamidomethyl)IAEVEK' to:
    - residues: ['S', 'H', 'C', 'I', 'A', 'E', 'V', 'E', 'K']
    - modifications: [None, None, 57.02, None, None, None, None, None, None]

    pyOpenMS is used when it can be imported. If it is missing, or if it fails
    on the given string, a naive parser is used instead; that parser returns
    the residues with all modification entries set to None.

    Args:
        sequence_str: Peptide sequence in OpenMS format with modifications in
            parentheses (or brackets).

    Returns:
        Tuple of (residues list, modifications list where None means
        unmodified). Both lists always have the same length. Mass shifts are
        monoisotopic deltas rounded to two decimals.
    """
    try:
        from pyopenms import AASequence
    except ImportError:
        # pyOpenMS is optional: degrade to residue letters only.
        return _parse_sequence_naive(sequence_str)

    try:
        aa_seq = AASequence.fromString(sequence_str)
        residues = []
        modifications = []

        for i in range(aa_seq.size()):
            residue = aa_seq.getResidue(i)
            residues.append(residue.getOneLetterCode())

            mod = residue.getModification()
            if mod:
                modifications.append(round(mod.getDiffMonoMass(), 2))
            else:
                modifications.append(None)

        return residues, modifications
    except Exception:
        # Best effort: a string pyOpenMS rejects should still render. Reuse
        # the naive parser so annotation text and brackets never end up in
        # the residue list, and leave a trace of the failure.
        import logging

        logging.getLogger(__name__).warning(
            "pyOpenMS could not parse sequence %r; falling back to naive parsing",
            sequence_str,
            exc_info=True,
        )
        return _parse_sequence_naive(sequence_str)
67
+
68
+
69
# Monoisotopic residue masses in Da, keyed by one-letter code. These are the
# masses of the amino acid residues within a chain; water for the intact
# peptide is added separately (see H2O below).
# 'X' and 'Z' are placeholder codes that contribute no mass.
AA_MASSES = {
    'A': 71.037114, 'R': 156.101111, 'N': 114.042927, 'D': 115.026943,
    'C': 103.009185, 'E': 129.042593, 'Q': 128.058578, 'G': 57.021464,
    'H': 137.058912, 'I': 113.084064, 'L': 113.084064, 'K': 128.094963,
    'M': 131.040485, 'F': 147.068414, 'P': 97.052764, 'S': 87.032028,
    'T': 101.047679, 'U': 150.953633, 'W': 186.079313, 'Y': 163.063329,
    'V': 99.068414, 'X': 0.0, 'Z': 0.0,
}

# Ion type mass adjustments
# These are approximate - for precise values, use pyOpenMS
H2O = 18.010565
NH3 = 17.026549
PROTON = 1.007276

# Neutral mass offsets added to the summed residue masses of a fragment
# (N-terminal residues for a/b/c, C-terminal residues for x/y/z).
# No charge carrier (PROTON) is included in any of them.
ION_OFFSETS = {
    'a': -27.994915,  # b minus CO
    'b': 0.0,         # plain sum of residue masses
    'c': NH3,         # b plus NH3
    'x': 43.989829,   # y plus CO minus 2 H, i.e. a net CO2 on the residue sum
    'y': H2O,         # residue sum plus H2O
    'z': 1.991841,    # y minus NH2 (one H heavier than y minus NH3)
}
94
+
95
+
96
def calculate_prefix_mass(sequence: str, position: int) -> float:
    """Sum the residue masses of ``sequence[0]`` through ``sequence[position]``.

    Residues missing from ``AA_MASSES`` count as 0.0. Only residue masses are
    summed; no ion-type offset is applied here.

    Args:
        sequence: Amino acid sequence as one-letter codes.
        position: Index of the last residue to include (inclusive).

    Returns:
        Summed residue mass of the N-terminal fragment.
    """
    total = 0.0
    last_index = position + 1
    for residue_index in range(0, last_index):
        residue = sequence[residue_index]
        total = total + AA_MASSES.get(residue, 0.0)
    return total
102
+
103
+
104
def calculate_suffix_mass(sequence: str, position: int) -> float:
    """Sum the residue masses from ``sequence[position]`` to the last residue.

    Residues missing from ``AA_MASSES`` count as 0.0. Only residue masses are
    summed; no ion-type offset is applied here.

    Args:
        sequence: Amino acid sequence as one-letter codes.
        position: Index of the first residue to include.

    Returns:
        Summed residue mass of the C-terminal fragment.
    """
    total = 0.0
    stop = len(sequence)
    for residue_index in range(position, stop):
        residue = sequence[residue_index]
        total = total + AA_MASSES.get(residue, 0.0)
    return total
110
+
111
+
112
def calculate_fragment_masses(sequence: str) -> Dict[str, List[List[float]]]:
    """
    Calculate theoretical fragment masses for all ion types.

    Prefix and suffix sums are each built once with a running total and then
    shared by the three ion types of that direction, so the work is linear in
    the sequence length. Residues missing from ``AA_MASSES`` count as 0.0.

    Args:
        sequence: Amino acid sequence string

    Returns:
        Dict with keys fragment_masses_a, fragment_masses_b, etc.
        Each value is a list of lists (to support ambiguous modifications)
        with one single-element entry per position. For a/b/c, entry ``i``
        covers the first ``i + 1`` residues; for x/y/z, entry ``i`` covers the
        last ``i + 1`` residues. An empty sequence yields empty lists.
    """
    residue_masses = [AA_MASSES.get(aa, 0.0) for aa in sequence]

    # prefix_sums[i] = mass of residues 0..i (N-terminal side).
    prefix_sums: List[float] = []
    running = 0.0
    for residue_mass in residue_masses:
        running += residue_mass
        prefix_sums.append(running)

    # suffix_sums[i] = mass of the last i + 1 residues (C-terminal side).
    suffix_sums: List[float] = []
    running = 0.0
    for residue_mass in reversed(residue_masses):
        running += residue_mass
        suffix_sums.append(running)

    result: Dict[str, List[List[float]]] = {}

    # Prefix ions (a, b, c) - from N-terminus
    for ion_type in ('a', 'b', 'c'):
        offset = ION_OFFSETS[ion_type]
        result[f'fragment_masses_{ion_type}'] = [[total + offset] for total in prefix_sums]

    # Suffix ions (x, y, z) - from C-terminus
    for ion_type in ('x', 'y', 'z'):
        offset = ION_OFFSETS[ion_type]
        result[f'fragment_masses_{ion_type}'] = [[total + offset] for total in suffix_sums]

    return result
146
+
147
+
148
def calculate_theoretical_mass(sequence: str) -> float:
    """Return the monoisotopic mass of the intact sequence.

    The result is one water (``H2O``) plus the residue mass of every
    character in ``sequence``; characters missing from ``AA_MASSES`` add 0.0.
    """
    # Start from the water that completes the termini, then add residues.
    total = H2O
    for residue in sequence:
        total = total + AA_MASSES.get(residue, 0.0)
    return total
154
+
155
+
156
@register_component("sequence_view")
class SequenceView(BaseComponent):
    """
    Interactive sequence view component for peptide/protein visualization.

    Displays amino acid sequence with fragment ion markers. When provided with
    observed masses from a spectrum, highlights matched theoretical fragments.

    Features:
    - Amino acid grid display with configurable row width
    - Fragment ion markers (a, b, c, x, y, z)
    - Tolerance-based fragment matching
    - Fragment table showing matches
    - Residue cleavage percentage calculation

    Example:
        sequence_view = SequenceView(
            cache_id="peptide_view",
            sequence="PEPTIDEK",
            observed_masses=[147.1, 244.2, 359.3, ...],
            precursor_mass=944.5,
        )
        sequence_view(state_manager=state_manager)
    """

    _component_type: str = "sequence_view"

    def __init__(
        self,
        cache_id: str,
        sequence: str,
        observed_masses: Optional[List[float]] = None,
        peak_ids: Optional[List[int]] = None,
        precursor_mass: Optional[float] = None,
        data: Optional[pl.LazyFrame] = None,
        filters: Optional[Dict[str, str]] = None,
        interactivity: Optional[Dict[str, str]] = None,
        cache_path: str = ".",
        regenerate_cache: bool = False,
        fixed_modifications: Optional[List[str]] = None,
        title: Optional[str] = None,
        height: int = 400,
        deconvolved: bool = True,
        precursor_charge: int = 1,
        _precomputed_sequence_data: Optional[Dict[str, Any]] = None,
        **kwargs
    ):
        """
        Initialize the SequenceView component.

        Args:
            cache_id: Unique identifier for this component's cache.
            sequence: Amino acid sequence string (single-letter codes).
            observed_masses: List of observed peak masses from spectrum.
            peak_ids: List of peak IDs corresponding to observed_masses (for interactivity).
            precursor_mass: Observed precursor mass.
            data: Optional frame forwarded to the base class. When omitted, a
                two-column frame ('peak_id', 'mass') is built from
                observed_masses / peak_ids (or an empty frame with that schema).
            filters: Mapping of identifier names to column names for filtering.
            interactivity: Mapping of identifier names to column names for clicks.
                Example: {'peak': 'peak_id'} sets 'peak' selection to 'peak_id' value on click.
            cache_path: Base path for cache storage.
            regenerate_cache: If True, regenerate cache even if valid.
            fixed_modifications: List of amino acids with fixed modifications (e.g., ['C']).
            title: Optional title displayed above the sequence.
            height: Component height in pixels.
            deconvolved: If True (default), observed_masses are neutral masses.
                If False, observed_masses are m/z values and fragment matching
                considers charge states 1 to precursor_charge.
            precursor_charge: Maximum charge state to consider for fragment matching
                when deconvolved=False. Fragments can have charge 1 to this value.
                Values below 1 are clamped to 1.
            _precomputed_sequence_data: Optional pre-computed sequence data dict.
                If provided, it is sent to the frontend instead of the data
                produced by preprocessing (used when fragment masses are already
                cached externally, e.g., in identification preprocessing).
            **kwargs: Additional configuration options.
        """
        self._sequence_raw = sequence  # Original input, kept untouched
        # NOTE(review): upper() also upper-cases modification names inside
        # parentheses (e.g. '(Carbamidomethyl)') before the string is parsed
        # below - confirm the parser accepts that spelling.
        self._sequence = sequence.upper().replace(' ', '').replace('\n', '')
        self._observed_masses = observed_masses or []
        self._peak_ids = peak_ids  # peak_ids corresponding to observed_masses
        self._precursor_mass = precursor_mass or 0.0
        self._fixed_modifications = fixed_modifications or []
        self._title = title
        self._height = height
        self._deconvolved = deconvolved
        self._precursor_charge = max(1, precursor_charge)
        self._precomputed_sequence_data = _precomputed_sequence_data

        # Parse sequence to extract residues and modifications
        self._parsed_residues, self._parsed_modifications = parse_openms_sequence(self._sequence)

        # Build peaks DataFrame for interactivity validation
        # This allows interactivity={'peak': 'peak_id'} to validate naturally
        # Note: Cache validity is based on sequence (via _get_cache_config), not peaks data
        if data is None:
            if self._observed_masses:
                if self._peak_ids is not None:
                    ids = self._peak_ids
                else:
                    # No explicit ids: number the peaks by position.
                    ids = list(range(len(self._observed_masses)))
                data = pl.LazyFrame({
                    'peak_id': ids,
                    'mass': self._observed_masses,
                })
            else:
                # Empty peaks - use schema so validation still passes
                data = pl.LazyFrame(schema={'peak_id': pl.Int64, 'mass': pl.Float64})

        super().__init__(
            cache_id=cache_id,
            data=data,
            filters=filters,
            interactivity=interactivity,
            cache_path=cache_path,
            regenerate_cache=regenerate_cache,
            **kwargs
        )

    def _get_cache_config(self) -> Dict[str, Any]:
        """Get configuration that affects cache validity.

        Only the normalized sequence and the fixed modifications are included;
        observed masses and precursor mass are deliberately absent.
        """
        return {
            'sequence': self._sequence,
            'fixed_modifications': self._fixed_modifications,
        }

    def _preprocess(self) -> None:
        """
        Preprocess sequence data.

        Calculates theoretical fragment masses for all ion types.
        This is cached so subsequent renders are fast.

        The masses are computed from the plain residues only; the parsed
        modification mass shifts are passed along in 'modifications' but are
        not added to the fragment masses or the theoretical mass here.
        """
        # NOTE(review): this runs the calculation even when
        # _precomputed_sequence_data was supplied; only _prepare_vue_data
        # consults the precomputed dict.

        # Calculate fragment masses using plain residues
        plain_sequence = ''.join(self._parsed_residues)
        fragment_masses = calculate_fragment_masses(plain_sequence)

        # Calculate theoretical mass
        theoretical_mass = calculate_theoretical_mass(plain_sequence)

        # Build sequence data structure
        sequence_data = {
            'sequence': self._parsed_residues,
            'modifications': self._parsed_modifications,  # list of mass shifts per position
            'theoretical_mass': theoretical_mass,
            'fixed_modifications': self._fixed_modifications,
            **fragment_masses,
        }

        self._preprocessed_data['sequence_data'] = sequence_data

    def _get_vue_component_name(self) -> str:
        """Return the Vue component name."""
        return 'SequenceView'

    def _get_data_key(self) -> str:
        """Return the key used to send primary data to Vue."""
        return 'sequenceData'

    def _prepare_vue_data(self, state: Dict[str, Any]) -> Dict[str, Any]:
        """
        Prepare sequence data for Vue component.

        Args:
            state: Current selection state from StateManager (not read here).

        Returns:
            Dict with sequenceData, observedMasses, precursorMass, and _hash,
            plus peakIds when peak ids are set.
        """
        # Use precomputed data if available, otherwise use cached/computed data
        if self._precomputed_sequence_data is not None:
            sequence_data = self._precomputed_sequence_data
        else:
            sequence_data = self._preprocessed_data.get('sequence_data', {})

        # Hash the actual payload values rather than just the peak count, so
        # two spectra with the same number of peaks and the same precursor
        # mass still produce different hashes.
        import hashlib
        digest = hashlib.md5()
        for part in (
            self._sequence,
            self._precursor_mass,
            self._observed_masses,
            self._peak_ids,
        ):
            digest.update(repr(part).encode())
            digest.update(b"\x1f")  # separator so adjacent parts cannot run together
        data_hash = digest.hexdigest()[:8]

        result = {
            'sequenceData': sequence_data,
            'observedMasses': self._observed_masses,
            'precursorMass': self._precursor_mass,
            '_hash': data_hash,
        }

        # Include peak_ids if provided (for interactivity linking)
        if self._peak_ids is not None:
            result['peakIds'] = self._peak_ids

        return result

    def _get_component_args(self) -> Dict[str, Any]:
        """Get component arguments to send to Vue.

        Entries from ``self._config`` are merged last and therefore override
        the keys set here on a name clash.
        """
        args: Dict[str, Any] = {
            'componentType': self._get_vue_component_name(),
            'height': self._height,
            'deconvolved': self._deconvolved,
            'precursorCharge': self._precursor_charge,
        }

        if self._title:
            args['title'] = self._title

        # Pass interactivity mapping to Vue (similar to other components)
        if self._interactivity:
            args['interactivity'] = self._interactivity

        args.update(self._config)
        return args

    def update_observed_masses(
        self,
        observed_masses: List[float],
        precursor_mass: Optional[float] = None,
        peak_ids: Optional[List[int]] = None,
    ) -> 'SequenceView':
        """
        Update the observed masses for fragment matching.

        This allows reusing the same cached sequence data with different
        spectra for matching. The peaks frame passed to the base class in
        ``__init__`` is not rebuilt by this method.

        Args:
            observed_masses: New list of observed peak masses. None is treated
                as an empty list.
            precursor_mass: Optional new precursor mass. The current value is
                kept when omitted.
            peak_ids: Optional peak IDs matching the new observed_masses. When
                omitted, the existing IDs are kept only if their count still
                matches the new masses; otherwise they are dropped because
                they can no longer correspond to the peaks.

        Returns:
            Self for method chaining.
        """
        self._observed_masses = observed_masses if observed_masses is not None else []
        if precursor_mass is not None:
            self._precursor_mass = precursor_mass

        if peak_ids is not None:
            self._peak_ids = peak_ids
        elif (
            self._peak_ids is not None
            and len(self._peak_ids) != len(self._observed_masses)
        ):
            # Stale ids from the previous spectrum would mislabel the new peaks.
            self._peak_ids = None
        return self