openms-insight 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,21 @@
1
1
  """SequenceView component for peptide/protein sequence visualization with fragment matching."""
2
2
 
3
- from typing import Any, Dict, List, Optional, Tuple
3
+ import hashlib
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
4
8
 
5
9
  import polars as pl
6
10
 
7
- from ..core.base import BaseComponent
8
11
  from ..core.registry import register_component
12
+ from ..preprocessing.filtering import optimize_for_transfer
13
+
14
+ # Proton mass for m/z calculations
15
+ PROTON_MASS = 1.007276
16
+
17
+ # Cache version - increment when cache format changes
18
+ CACHE_VERSION = 1
9
19
 
10
20
 
11
21
  def parse_openms_sequence(sequence_str: str) -> Tuple[List[str], List[Optional[float]]]:
@@ -51,9 +61,9 @@ def parse_openms_sequence(sequence_str: str) -> Tuple[List[str], List[Optional[f
51
61
  residues.append(sequence_str[i])
52
62
  modifications.append(None)
53
63
  i += 1
54
- elif sequence_str[i] == '(':
64
+ elif sequence_str[i] == "(":
55
65
  # Skip modification name in parentheses
56
- end = sequence_str.find(')', i)
66
+ end = sequence_str.find(")", i)
57
67
  if end > i:
58
68
  i = end + 1
59
69
  else:
@@ -66,116 +76,281 @@ def parse_openms_sequence(sequence_str: str) -> Tuple[List[str], List[Optional[f
66
76
  return list(sequence_str), [None] * len(sequence_str)
67
77
 
68
78
 
69
- # Amino acid monoisotopic masses
70
- AA_MASSES = {
71
- 'A': 71.037114, 'R': 156.101111, 'N': 114.042927, 'D': 115.026943,
72
- 'C': 103.009185, 'E': 129.042593, 'Q': 128.058578, 'G': 57.021464,
73
- 'H': 137.058912, 'I': 113.084064, 'L': 113.084064, 'K': 128.094963,
74
- 'M': 131.040485, 'F': 147.068414, 'P': 97.052764, 'S': 87.032028,
75
- 'T': 101.047679, 'U': 150.953633, 'W': 186.079313, 'Y': 163.063329,
76
- 'V': 99.068414, 'X': 0, 'Z': 0,
77
- }
78
-
79
- # Ion type mass adjustments
80
- # These are approximate - for precise values, use pyOpenMS
81
- H2O = 18.010565
82
- NH3 = 17.026549
83
- PROTON = 1.007276
84
-
85
- # Ion type offsets (from N-terminus for prefix, C-terminus for suffix)
86
- ION_OFFSETS = {
87
- 'a': -27.994915, # CO loss from b
88
- 'b': 0.0,
89
- 'c': 17.026549, # NH3 addition to b
90
- 'x': 43.989829, # CO + CO addition to y
91
- 'y': 18.010565, # H2O addition (protonated)
92
- 'z': 1.991841, # NH loss from y
93
- }
94
-
79
+ def calculate_fragment_masses_pyopenms(
80
+ sequence_str: str,
81
+ ) -> Dict[str, List[List[float]]]:
82
+ """Calculate theoretical fragment masses using pyOpenMS TheoreticalSpectrumGenerator.
95
83
 
96
- def calculate_prefix_mass(sequence: str, position: int) -> float:
97
- """Calculate mass of N-terminal fragment (positions 0 to position inclusive)."""
98
- mass = 0.0
99
- for i in range(position + 1):
100
- mass += AA_MASSES.get(sequence[i], 0.0)
101
- return mass
84
+ Args:
85
+ sequence_str: Peptide sequence string (can include modifications)
102
86
 
87
+ Returns:
88
+ Dict with fragment_masses_a, fragment_masses_b, etc.
89
+ Each is a list of lists (one per position, supporting multiple masses).
90
+ """
91
+ try:
92
+ from pyopenms import AASequence, MSSpectrum, TheoreticalSpectrumGenerator
103
93
 
104
- def calculate_suffix_mass(sequence: str, position: int) -> float:
105
- """Calculate mass of C-terminal fragment (positions position to end)."""
106
- mass = 0.0
107
- for i in range(position, len(sequence)):
108
- mass += AA_MASSES.get(sequence[i], 0.0)
109
- return mass
94
+ aa_seq = AASequence.fromString(sequence_str)
95
+ n = aa_seq.size()
96
+
97
+ # Configure TheoreticalSpectrumGenerator
98
+ tsg = TheoreticalSpectrumGenerator()
99
+ params = tsg.getParameters()
100
+
101
+ params.setValue("add_a_ions", "true")
102
+ params.setValue("add_b_ions", "true")
103
+ params.setValue("add_c_ions", "true")
104
+ params.setValue("add_x_ions", "true")
105
+ params.setValue("add_y_ions", "true")
106
+ params.setValue("add_z_ions", "true")
107
+ params.setValue("add_first_prefix_ion", "true") # Include b1/a1/c1 ions
108
+ params.setValue("add_metainfo", "true")
109
+
110
+ tsg.setParameters(params)
111
+
112
+ # Generate spectrum for charge 1, then convert to neutral masses
113
+ spec = MSSpectrum()
114
+ tsg.getSpectrum(spec, aa_seq, 1, 1)
115
+
116
+ ion_types = ["a", "b", "c", "x", "y", "z"]
117
+ result = {f"fragment_masses_{ion}": [[] for _ in range(n)] for ion in ion_types}
118
+
119
+ # Get ion names from StringDataArrays
120
+ ion_names = []
121
+ sdas = spec.getStringDataArrays()
122
+ for sda in sdas:
123
+ if sda.getName() == "IonNames":
124
+ for i in range(sda.size()):
125
+ name = sda[i]
126
+ if isinstance(name, bytes):
127
+ name = name.decode("utf-8")
128
+ ion_names.append(name)
129
+ break
130
+
131
+ # Parse peaks and organize by ion type and position
132
+ for i in range(spec.size()):
133
+ peak = spec[i]
134
+ # Convert singly-charged m/z to neutral mass
135
+ mz_charge1 = peak.getMZ()
136
+ neutral_mass = mz_charge1 - PROTON_MASS
137
+ ion_name = ion_names[i] if i < len(ion_names) else ""
138
+
139
+ if not ion_name:
140
+ continue
141
+
142
+ # Parse ion name (e.g., "b3+", "y5++")
143
+ ion_type = None
144
+ ion_number = None
145
+
146
+ for t in ion_types:
147
+ if ion_name.lower().startswith(t):
148
+ ion_type = t
149
+ try:
150
+ num_str = ""
151
+ for c in ion_name[1:]:
152
+ if c.isdigit():
153
+ num_str += c
154
+ else:
155
+ break
156
+ if num_str:
157
+ ion_number = int(num_str)
158
+ except (ValueError, IndexError):
159
+ pass
160
+ break
161
+
162
+ if ion_type and ion_number and 1 <= ion_number <= n:
163
+ idx = ion_number - 1
164
+ key = f"fragment_masses_{ion_type}"
165
+ if idx < len(result[key]):
166
+ result[key][idx].append(neutral_mass)
110
167
 
168
+ return result
111
169
 
112
- def calculate_fragment_masses(sequence: str) -> Dict[str, List[List[float]]]:
113
- """
114
- Calculate theoretical fragment masses for all ion types.
170
+ except ImportError:
171
+ # Fallback to simple calculation without pyOpenMS
172
+ return _calculate_fragment_masses_simple(sequence_str)
173
+ except Exception as e:
174
+ print(f"Error calculating fragments for {sequence_str}: {e}")
175
+ return {f"fragment_masses_{ion}": [] for ion in ["a", "b", "c", "x", "y", "z"]}
176
+
177
+
178
def _calculate_fragment_masses_simple(
    sequence_str: str,
) -> Dict[str, List[List[float]]]:
    """Fallback fragment calculation without pyOpenMS.

    Sums monoisotopic residue masses from a built-in table and adds a fixed
    offset per ion type. Residues that are missing from the table contribute
    0.0 to the sum.

    Args:
        sequence_str: Peptide sequence string. It is split into residues and
            per-residue modification values by ``parse_openms_sequence``.

    Returns:
        Dict with keys ``fragment_masses_a`` ... ``fragment_masses_z``. Each
        value holds one entry per residue position; entry ``i`` is a
        single-element list with the mass of the fragment built from the
        first ``i + 1`` residues (a, b, c) or the last ``i + 1`` residues
        (x, y, z).
    """
    from itertools import accumulate

    # Amino acid monoisotopic masses
    aa_masses = {
        "A": 71.037114,
        "R": 156.101111,
        "N": 114.042927,
        "D": 115.026943,
        "C": 103.009185,
        "E": 129.042593,
        "Q": 128.058578,
        "G": 57.021464,
        "H": 137.058912,
        "I": 113.084064,
        "L": 113.084064,
        "K": 128.094963,
        "M": 131.040485,
        "F": 147.068414,
        "P": 97.052764,
        "S": 87.032028,
        "T": 101.047679,
        "U": 150.953633,
        "W": 186.079313,
        "Y": 163.063329,
        "V": 99.068414,
    }

    # Ion type offsets added to the summed residue masses
    ion_offsets = {
        "a": -27.994915,
        "b": 0.0,
        "c": 17.026549,
        "x": 43.989829,
        "y": 18.010565,
        "z": 1.991841,
    }

    residues, modifications = parse_openms_sequence(sequence_str)

    # Pad so a short modification list can never drop residues in zip().
    shifts = list(modifications) + [None] * (len(residues) - len(modifications))

    # NOTE(review): assumes each non-None modification value is a mass delta
    # (Da) for the residue at the same index - confirm against
    # parse_openms_sequence. Previously these values were discarded, so a
    # modified peptide produced the masses of the unmodified one.
    residue_masses = [
        aa_masses.get(aa, 0.0) + (shift or 0.0)
        for aa, shift in zip(residues, shifts)
    ]

    # prefix_masses[i]: first i + 1 residues; suffix_masses[i]: last i + 1.
    prefix_masses = list(accumulate(residue_masses))
    suffix_masses = list(accumulate(reversed(residue_masses)))

    result: Dict[str, List[List[float]]] = {}

    # Prefix ions (a, b, c)
    for ion_type in ("a", "b", "c"):
        offset = ion_offsets[ion_type]
        result[f"fragment_masses_{ion_type}"] = [[m + offset] for m in prefix_masses]

    # Suffix ions (x, y, z)
    for ion_type in ("x", "y", "z"):
        offset = ion_offsets[ion_type]
        result[f"fragment_masses_{ion_type}"] = [[m + offset] for m in suffix_masses]

    return result
146
255
 
147
256
 
148
- def calculate_theoretical_mass(sequence: str) -> float:
149
- """Calculate monoisotopic mass of full sequence."""
150
- mass = H2O # Add water for full peptide
151
- for aa in sequence:
152
- mass += AA_MASSES.get(aa, 0.0)
153
- return mass
257
# Monoisotopic residue masses used when pyOpenMS is not installed.
# Kept at module level so the table is built once instead of on every call.
_FALLBACK_RESIDUE_MASSES = {
    "A": 71.037114,
    "R": 156.101111,
    "N": 114.042927,
    "D": 115.026943,
    "C": 103.009185,
    "E": 129.042593,
    "Q": 128.058578,
    "G": 57.021464,
    "H": 137.058912,
    "I": 113.084064,
    "L": 113.084064,
    "K": 128.094963,
    "M": 131.040485,
    "F": 147.068414,
    "P": 97.052764,
    "S": 87.032028,
    "T": 101.047679,
    "U": 150.953633,
    "W": 186.079313,
    "Y": 163.063329,
    "V": 99.068414,
}

# Mass of one water molecule, added once for the intact peptide.
_FALLBACK_WATER_MASS = 18.010565


def get_theoretical_mass(sequence_str: str) -> float:
    """Calculate monoisotopic mass of a peptide sequence.

    Uses ``pyopenms.AASequence`` when pyOpenMS can be imported. Otherwise the
    mass is the sum of table residue masses plus one water; residues missing
    from the table contribute 0.0.

    Args:
        sequence_str: Peptide sequence string (can include modifications).

    Returns:
        The monoisotopic mass, or 0.0 if pyOpenMS is installed but fails on
        this sequence.
    """
    try:
        from pyopenms import AASequence

        aa_seq = AASequence.fromString(sequence_str)
        return aa_seq.getMonoWeight()
    except ImportError:
        pass  # pyOpenMS not installed: use the table-based fallback below
    except Exception:
        # Deliberate best-effort: an unparsable sequence yields 0.0.
        return 0.0

    residues, modifications = parse_openms_sequence(sequence_str)

    # Pad so a short modification list can never drop residues in zip().
    shifts = list(modifications) + [None] * (len(residues) - len(modifications))

    # NOTE(review): assumes each non-None modification value is a mass delta
    # (Da) for the residue at the same index - confirm against
    # parse_openms_sequence. Previously these values were ignored, so a
    # modified peptide reported the mass of the unmodified one.
    mass = _FALLBACK_WATER_MASS
    for aa, shift in zip(residues, shifts):
        mass += _FALLBACK_RESIDUE_MASSES.get(aa, 0.0) + (shift or 0.0)
    return mass
297
+
298
+
299
+ # Default annotation configuration
300
+ DEFAULT_ANNOTATION_CONFIG = {
301
+ "ion_types": ["b", "y"],
302
+ "neutral_losses": True,
303
+ "proton_loss_addition": False,
304
+ "tolerance": 20.0,
305
+ "tolerance_ppm": True,
306
+ "colors": {
307
+ "a": "#9B59B6",
308
+ "b": "#E74C3C",
309
+ "c": "#E67E22",
310
+ "x": "#1ABC9C",
311
+ "y": "#3498DB",
312
+ "z": "#2ECC71",
313
+ },
314
+ }
315
+
316
+
317
+ @dataclass
318
+ class SequenceViewResult:
319
+ """Result returned by SequenceView.__call__().
320
+
321
+ Attributes:
322
+ annotations: DataFrame with columns (peak_id, highlight_color, annotation)
323
+ containing fragment annotations computed by Vue. None if not yet available.
324
+ """
325
+
326
+ annotations: Optional[pl.DataFrame] = None
154
327
 
155
328
 
156
329
  @register_component("sequence_view")
157
- class SequenceView(BaseComponent):
330
+ class SequenceView:
158
331
  """
159
332
  Interactive sequence view component for peptide/protein visualization.
160
333
 
161
334
  Displays amino acid sequence with fragment ion markers. When provided with
162
- observed masses from a spectrum, highlights matched theoretical fragments.
335
+ peaks data, performs fragment matching on the Vue side and returns annotations.
163
336
 
164
337
  Features:
165
338
  - Amino acid grid display with configurable row width
166
- - Fragment ion markers (a, b, c, x, y, z)
167
- - Tolerance-based fragment matching
168
- - Fragment table showing matches
169
- - Residue cleavage percentage calculation
339
+ - Fragment ion markers (a, b, c, x, y, z) with configurable colors
340
+ - Tolerance-based fragment matching (done in Vue)
341
+ - Returns annotation dataframe for linked components
342
+ - Supports filtering by spectrum and sequence identifiers
170
343
 
171
344
  Example:
172
345
  sequence_view = SequenceView(
173
346
  cache_id="peptide_view",
174
- sequence="PEPTIDEK",
175
- observed_masses=[147.1, 244.2, 359.3, ...],
176
- precursor_mass=944.5,
347
+ sequence_data=pl.scan_parquet("sequences.parquet"),
348
+ peaks_data=pl.scan_parquet("peaks.parquet"),
349
+ filters={"spectrum": "scan_id", "sequence": "sequence_id"},
350
+ annotation_config={"ion_types": ["b", "y"], "tolerance": 20.0},
177
351
  )
178
- sequence_view(state_manager=state_manager)
352
+ result = sequence_view(key="sv", state_manager=state_manager)
353
+ # result.annotations contains the matched fragment annotations
179
354
  """
180
355
 
181
356
  _component_type: str = "sequence_view"
@@ -183,202 +358,491 @@ class SequenceView(BaseComponent):
def __init__(
    self,
    cache_id: str,
    sequence_data: Optional[Union[pl.LazyFrame, Tuple[str, int], str]] = None,
    sequence_data_path: Optional[str] = None,
    peaks_data: Optional[pl.LazyFrame] = None,
    peaks_data_path: Optional[str] = None,
    filters: Optional[Dict[str, str]] = None,
    interactivity: Optional[Dict[str, str]] = None,
    deconvolved: bool = False,
    annotation_config: Optional[Dict[str, Any]] = None,
    cache_path: str = ".",
    title: Optional[str] = None,
    height: int = 400,
    **kwargs,
):
    """
    Initialize the SequenceView component.

    Without sequence data the component is reconstructed from an existing
    cache; with sequence data a cache is (re)written under
    ``cache_path / cache_id`` and then read back.

    Args:
        cache_id: Unique identifier for this component instance.
        sequence_data: Sequence information in one of these formats:
            - LazyFrame (an eager DataFrame is also accepted) with columns:
              sequence_id (if filtered), sequence, precursor_charge
            - Tuple of (sequence_string, precursor_charge)
            - String with just the sequence (charge defaults to 1)
        sequence_data_path: Path to parquet file with sequence data.
        peaks_data: LazyFrame (an eager DataFrame is also accepted) with
            columns: scan_id (if filtered), peak_id, mass, intensity
        peaks_data_path: Path to parquet file with peaks data.
        filters: Mapping of identifier names to column names for filtering.
            Example: {"spectrum": "scan_id", "sequence": "sequence_id"}
        interactivity: Mapping of identifier names to column names for clicks.
            Example: {"peak": "peak_id"} sets 'peak' selection to clicked peak's ID.
        deconvolved: If False (default), peaks are m/z values and matching considers
            charge states 1 to precursor_charge. If True, peaks are neutral masses.
        annotation_config: Configuration for fragment matching:
            - ion_types: List of ion types to consider (default: ["b", "y"])
            - neutral_losses: Whether to consider -H2O, -NH3 losses (default: True)
            - tolerance: Mass tolerance value (default: 20.0)
            - tolerance_ppm: True for ppm, False for Da (default: True)
            - colors: Dict mapping ion types to hex colors; entries are
              merged over the default colors
        cache_path: Base path for cache storage.
        title: Optional title displayed above the sequence.
        height: Component height in pixels.
        **kwargs: Additional configuration options.

    Raises:
        ValueError: If configuration is passed without sequence data, if no
            cache exists in reconstruction mode, if both a frame and a path
            are given for the same input, or if a sequence tuple does not
            have exactly two items.
        TypeError: If ``sequence_data`` is of an unsupported type.
    """
    import copy

    self._cache_id = cache_id
    self._cache_path = Path(cache_path)
    self._cache_dir = self._cache_path / cache_id

    # Determine if data is provided (creation mode vs reconstruction mode)
    has_sequence_data = sequence_data is not None or sequence_data_path is not None

    # Check if any configuration arguments were provided
    has_config = (
        peaks_data is not None
        or peaks_data_path is not None
        or filters is not None
        or interactivity is not None
        or deconvolved is not False
        or annotation_config is not None
        or title is not None
        or height != 400
        or bool(kwargs)
    )

    if not has_sequence_data:
        # Reconstruction mode - only cache_id and cache_path allowed
        if has_config:
            raise ValueError(
                "Configuration arguments require sequence_data= or sequence_data_path= to be provided. "
                "For reconstruction from cache, use only cache_id and cache_path."
            )
        if not self._cache_exists():
            raise ValueError(
                f"Cache not found at '{self._cache_dir}'. "
                f"Provide sequence_data= or sequence_data_path= to create the cache."
            )
        self._load_from_cache()
        return

    # Creation mode - reject conflicting inputs before touching any state
    if sequence_data is not None and sequence_data_path is not None:
        raise ValueError(
            "Provide either 'sequence_data' or 'sequence_data_path', not both"
        )
    if peaks_data is not None and peaks_data_path is not None:
        raise ValueError(
            "Provide either 'peaks_data' or 'peaks_data_path', not both"
        )

    self._title = title
    self._height = height
    self._deconvolved = deconvolved
    self._config = kwargs
    self._filters = filters or {}
    self._interactivity = interactivity or {}

    # Deep-copy the defaults so the nested "colors" dict is never shared
    # with (or mutated through) the module-level default.
    merged_config = copy.deepcopy(DEFAULT_ANNOTATION_CONFIG)
    if annotation_config:
        overrides = dict(annotation_config)
        color_overrides = overrides.pop("colors", None)
        merged_config.update(overrides)
        if color_overrides:
            # Merge so a partial override keeps the other default colors.
            merged_config["colors"].update(color_overrides)
    self._annotation_config = merged_config

    # Parse sequence data input
    self._source_sequence_data: Optional[pl.LazyFrame] = None
    self._source_static_sequence: Optional[str] = None
    self._source_static_charge: int = 1

    if sequence_data_path is not None:
        self._source_sequence_data = pl.scan_parquet(sequence_data_path)
    elif isinstance(sequence_data, pl.LazyFrame):
        self._source_sequence_data = sequence_data
    elif isinstance(sequence_data, pl.DataFrame):
        self._source_sequence_data = sequence_data.lazy()
    elif isinstance(sequence_data, tuple):
        if len(sequence_data) != 2:
            raise ValueError(
                "sequence_data tuple must be (sequence_string, precursor_charge)"
            )
        self._source_static_sequence = sequence_data[0]
        self._source_static_charge = sequence_data[1]
    elif isinstance(sequence_data, str):
        self._source_static_sequence = sequence_data
        self._source_static_charge = 1
    else:
        # Previously fell through silently and cached an empty sequence.
        raise TypeError(
            "sequence_data must be a polars LazyFrame/DataFrame, a "
            "(sequence, precursor_charge) tuple, or a str; got "
            f"{type(sequence_data).__name__}"
        )

    # Parse peaks data input
    self._source_peaks_data: Optional[pl.LazyFrame] = None
    if peaks_data_path is not None:
        self._source_peaks_data = pl.scan_parquet(peaks_data_path)
    elif isinstance(peaks_data, pl.DataFrame):
        self._source_peaks_data = peaks_data.lazy()
    elif peaks_data is not None:
        self._source_peaks_data = peaks_data

    # Create and save cache
    self._create_cache()

    # Discard source references - only cache is used from now on
    self._source_sequence_data = None
    self._source_static_sequence = None
    self._source_peaks_data = None

    # Load cached LazyFrames for reading
    self._cached_sequences = pl.scan_parquet(
        self._cache_dir / "sequences.parquet"
    )
    peaks_path = self._cache_dir / "peaks.parquet"
    self._cached_peaks = (
        pl.scan_parquet(peaks_path) if peaks_path.exists() else None
    )
+ )
499
+
270
500
  def _get_cache_config(self) -> Dict[str, Any]:
271
- """Get configuration that affects cache validity."""
501
+ """Get all configuration to store in cache."""
272
502
  return {
273
- 'sequence': self._sequence,
274
- 'fixed_modifications': self._fixed_modifications,
503
+ "version": CACHE_VERSION,
504
+ "filters": self._filters,
505
+ "interactivity": self._interactivity,
506
+ "title": self._title,
507
+ "height": self._height,
508
+ "deconvolved": self._deconvolved,
509
+ "annotation_config": self._annotation_config,
275
510
  }
276
511
 
277
- def _preprocess(self) -> None:
278
- """
279
- Preprocess sequence data.
512
+ def _cache_exists(self) -> bool:
513
+ """Check if a valid cache exists that can be loaded."""
514
+ config_file = self._cache_dir / ".cache_config.json"
515
+ sequences_file = self._cache_dir / "sequences.parquet"
516
+
517
+ if not config_file.exists() or not sequences_file.exists():
518
+ return False
519
+
520
+ try:
521
+ with open(config_file, "r") as f:
522
+ cached_config = json.load(f)
523
+ # Just check version matches
524
+ return cached_config.get("version") == CACHE_VERSION
525
+ except Exception:
526
+ return False
527
+
528
+ def _load_from_cache(self) -> None:
529
+ """Load all configuration and data from cache."""
530
+ config_file = self._cache_dir / ".cache_config.json"
531
+
532
+ with open(config_file, "r") as f:
533
+ config = json.load(f)
534
+
535
+ # Restore all configuration
536
+ self._filters = config.get("filters", {})
537
+ self._interactivity = config.get("interactivity", {})
538
+ self._title = config.get("title")
539
+ self._height = config.get("height", 400)
540
+ self._deconvolved = config.get("deconvolved", False)
541
+ self._annotation_config = config.get(
542
+ "annotation_config", {**DEFAULT_ANNOTATION_CONFIG}
543
+ )
544
+ self._config = {}
545
+
546
+ # Load cached LazyFrames
547
+ self._cached_sequences = pl.scan_parquet(self._cache_dir / "sequences.parquet")
548
+ peaks_path = self._cache_dir / "peaks.parquet"
549
+ self._cached_peaks = (
550
+ pl.scan_parquet(peaks_path) if peaks_path.exists() else None
551
+ )
552
+
553
+ def _create_cache(self) -> None:
554
+ """Create cache from source data."""
555
+ # Create cache directory
556
+ self._cache_dir.mkdir(parents=True, exist_ok=True)
557
+
558
+ # Preprocess and write caches
559
+ self._preprocess_sequences()
560
+ self._preprocess_peaks()
561
+
562
+ # Write config
563
+ config_file = self._cache_dir / ".cache_config.json"
564
+ with open(config_file, "w") as f:
565
+ json.dump(self._get_cache_config(), f, indent=2)
566
+
567
def _preprocess_sequences(self) -> None:
    """Preprocess and cache sequence data.

    Writes ``sequences.parquet`` into the cache directory. Frame input is
    reduced to the filter columns it actually contains plus ``sequence``
    and ``precursor_charge``, sorted by the filter columns. Static input
    (string or tuple) becomes a single-row frame.

    Raises:
        ValueError: If frame input has no ``sequence`` column.
    """
    output_path = self._cache_dir / "sequences.parquet"

    if self._source_sequence_data is not None:
        lf = self._source_sequence_data
        available = set(lf.collect_schema().names())

        # A cache without "sequence" can never be read back: every later
        # lookup selects this column. Fail here instead of caching it.
        if "sequence" not in available:
            raise ValueError("sequence_data must contain a 'sequence' column")

        # Same default charge as plain-string input.
        if "precursor_charge" not in available:
            lf = lf.with_columns(pl.lit(1).alias("precursor_charge"))

        filter_cols = [c for c in self._filters.values() if c in available]

        # Filter columns first, then required columns, without duplicates
        cols = list(dict.fromkeys(filter_cols + ["sequence", "precursor_charge"]))
        lf = lf.select(cols)

        # Sort by filter columns for predicate pushdown
        if filter_cols:
            lf = lf.sort(filter_cols)

        df = lf.collect()
    else:
        # Static input (string or tuple) - create single-row DataFrame
        df = pl.DataFrame(
            {
                "sequence": [self._source_static_sequence or ""],
                "precursor_charge": [self._source_static_charge],
            }
        )

    # Optimize types and write
    df = optimize_for_transfer(df)
    df.write_parquet(output_path, compression="zstd")
603
+
604
def _preprocess_peaks(self) -> None:
    """Preprocess and cache peaks data.

    Does nothing when no peaks were supplied. Otherwise writes
    ``peaks.parquet`` into the cache directory, holding the filter columns
    present in the input, ``peak_id``, ``mass`` and, when present,
    ``intensity``, sorted by the filter columns.

    Raises:
        ValueError: If the peaks input lacks ``peak_id`` or ``mass``.
    """
    if self._source_peaks_data is None:
        return  # No peaks to cache

    output_path = self._cache_dir / "peaks.parquet"
    available = set(self._source_peaks_data.collect_schema().names())

    # Both columns are selected on every read; a cache without them would
    # only ever yield an empty peak list, so reject it at creation time.
    required = ["peak_id", "mass"]
    missing = [c for c in required if c not in available]
    if missing:
        raise ValueError(
            f"peaks_data is missing required column(s): {', '.join(missing)}"
        )

    filter_cols = [c for c in self._filters.values() if c in available]
    optional = [c for c in ["intensity"] if c in available]

    # Filter columns first, then required and optional, without duplicates
    cols = list(dict.fromkeys(filter_cols + required + optional))
    lf = self._source_peaks_data.select(cols)

    # Sort by filter columns for predicate pushdown
    if filter_cols:
        lf = lf.sort(filter_cols)

    df = lf.collect()

    # Optimize types and write
    df = optimize_for_transfer(df)
    df.write_parquet(output_path, compression="zstd")
635
+
636
def _get_sequence_for_state(self, state: Dict[str, Any]) -> Tuple[str, int]:
    """Get sequence and charge for current state.

    Filters the cached sequences by every configured filter whose column
    exists in the cache and whose identifier has a non-None value in
    ``state``, then takes the first remaining row.

    Args:
        state: Current selection state, keyed by identifier name.

    Returns:
        Tuple of (sequence_string, precursor_charge). Falls back to
        ``("", 1)`` when no row matches or the lookup fails. A null sequence
        is returned as ``""``; a null charge as 1; the charge is never
        below 1.
    """
    filtered = self._cached_sequences

    # Apply filters for columns that exist in cached data
    available = set(filtered.collect_schema().names())
    for identifier, column in self._filters.items():
        if column not in available:
            continue
        filter_value = state.get(identifier)
        if filter_value is not None:
            filtered = filtered.filter(pl.col(column) == filter_value)

    # Deliberate best-effort: any failure falls back to an empty sequence.
    try:
        df = filtered.select(["sequence", "precursor_charge"]).head(1).collect()
        if df.height > 0:
            sequence, charge = df.row(0)
            # Nulls in the cache must not leak out as None, and a charge
            # below 1 is meaningless for charge-state matching.
            charge = 1 if charge is None else max(1, int(charge))
            return sequence or "", charge
    except Exception:
        pass

    return "", 1
301
663
 
302
- def _get_vue_component_name(self) -> str:
303
- """Return the Vue component name."""
304
- return 'SequenceView'
664
+ def _get_peaks_for_state(self, state: Dict[str, Any]) -> pl.DataFrame:
665
+ """Get filtered peaks data for current state.
305
666
 
306
- def _get_data_key(self) -> str:
307
- """Return the key used to send primary data to Vue."""
308
- return 'sequenceData'
667
+ Reads from cached peaks.parquet with predicate pushdown.
668
+
669
+ Returns:
670
+ DataFrame with columns: peak_id, mass, (intensity if available)
671
+ """
672
+ if self._cached_peaks is None:
673
+ return pl.DataFrame(schema={"peak_id": pl.Int64, "mass": pl.Float64})
674
+
675
+ filtered = self._cached_peaks
676
+
677
+ # Apply filters for columns that exist in cached data
678
+ schema = filtered.collect_schema()
679
+ for identifier, column in self._filters.items():
680
+ if column in schema.names():
681
+ filter_value = state.get(identifier)
682
+ if filter_value is not None:
683
+ filtered = filtered.filter(pl.col(column) == filter_value)
684
+
685
+ # Select available columns
686
+ cols = ["peak_id", "mass"]
687
+ if "intensity" in schema.names():
688
+ cols.append("intensity")
689
+
690
+ try:
691
+ return filtered.select(cols).collect()
692
+ except Exception:
693
+ return pl.DataFrame(schema={"peak_id": pl.Int64, "mass": pl.Float64})
309
694
 
def _prepare_vue_data(self, state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Prepare data for Vue component.

    Looks up the sequence and peaks for the current state, computes the
    theoretical fragment masses and bundles everything with the annotation
    settings.

    Args:
        state: Current selection state from StateManager

    Returns:
        Dict with sequenceData, observedMasses, peakIds, precursorMass,
        annotationConfig, precursorCharge and _hash. ``_hash`` is a short
        digest of the sequence, charge, peak ids, peak masses and annotation
        config. ``precursorMass`` is always 0.0 here.
    """
    # Get sequence for current state
    sequence_str, precursor_charge = self._get_sequence_for_state(state)

    # Parse sequence
    residues, modifications = parse_openms_sequence(sequence_str)

    # Calculate theoretical fragment masses
    fragment_masses = calculate_fragment_masses_pyopenms(sequence_str)

    # Calculate theoretical mass
    theoretical_mass = get_theoretical_mass(sequence_str)

    annotation_config = self._annotation_config

    # Build sequence data structure
    sequence_data = {
        "sequence": residues,
        "modifications": modifications,
        "theoretical_mass": theoretical_mass,
        "fixed_modifications": [],
        # Include settings for Vue initialization
        "fragment_tolerance": annotation_config.get("tolerance"),
        "fragment_tolerance_ppm": annotation_config.get("tolerance_ppm"),
        "neutral_losses": annotation_config.get("neutral_losses"),
        "proton_loss_addition": annotation_config.get("proton_loss_addition"),
        **fragment_masses,
    }

    # Get filtered peaks
    peaks_df = self._get_peaks_for_state(state)

    # Vue expects observedMasses and peakIds as separate arrays
    observed_masses: List[float] = []
    peak_ids: List[int] = []
    precursor_mass: float = 0.0

    if peaks_df.height > 0:
        observed_masses = peaks_df["mass"].to_list()
        peak_ids = peaks_df["peak_id"].to_list()

    # Hash the actual peak content, not just the peak count: two spectra
    # with the same number of peaks for the same sequence would otherwise
    # share a hash and the change would go undetected.
    hash_input = json.dumps(
        [sequence_str, precursor_charge, peak_ids, observed_masses, annotation_config],
        sort_keys=True,
        default=str,
    )
    data_hash = hashlib.md5(hash_input.encode()).hexdigest()[:8]

    return {
        "sequenceData": sequence_data,
        "observedMasses": observed_masses,
        "peakIds": peak_ids,
        "precursorMass": precursor_mass,
        "annotationConfig": annotation_config,
        "precursorCharge": precursor_charge,
        "_hash": data_hash,
    }
343
759
 
760
+ def _get_vue_component_name(self) -> str:
761
+ """Return the Vue component name."""
762
+ return "SequenceView"
763
+
764
+ def _get_data_key(self) -> str:
765
+ """Return the key used to send primary data to Vue."""
766
+ return "sequenceData"
767
+
344
768
  def _get_component_args(self) -> Dict[str, Any]:
345
769
  """Get component arguments to send to Vue."""
346
770
  args: Dict[str, Any] = {
347
- 'componentType': self._get_vue_component_name(),
348
- 'height': self._height,
349
- 'deconvolved': self._deconvolved,
350
- 'precursorCharge': self._precursor_charge,
771
+ "componentType": self._get_vue_component_name(),
772
+ "height": self._height,
773
+ "deconvolved": self._deconvolved,
351
774
  }
352
775
 
353
776
  if self._title:
354
- args['title'] = self._title
777
+ args["title"] = self._title
355
778
 
356
- # Pass interactivity mapping to Vue (similar to other components)
357
779
  if self._interactivity:
358
- args['interactivity'] = self._interactivity
780
+ args["interactivity"] = self._interactivity
359
781
 
360
782
  args.update(self._config)
361
783
  return args
362
784
 
363
- def update_observed_masses(
785
+ @property
786
+ def peaks_data(self) -> Optional[pl.LazyFrame]:
787
+ """Return the cached peaks LazyFrame for linked components."""
788
+ return self._cached_peaks
789
+
790
+ def get_filters_mapping(self) -> Dict[str, str]:
791
+ """Return the filters identifier-to-column mapping."""
792
+ return self._filters.copy()
793
+
794
+ def get_interactivity_mapping(self) -> Dict[str, str]:
795
+ """Return the interactivity identifier-to-column mapping."""
796
+ return self._interactivity.copy()
797
+
798
+ def get_state_dependencies(self) -> List[str]:
799
+ """Return list of state keys that affect this component's data."""
800
+ return list(self._filters.keys())
801
+
802
+ def __call__(
364
803
  self,
365
- observed_masses: List[float],
366
- precursor_mass: Optional[float] = None
367
- ) -> 'SequenceView':
804
+ key: Optional[str] = None,
805
+ state_manager: Optional["StateManager"] = None,
806
+ height: Optional[int] = None,
807
+ ) -> SequenceViewResult:
368
808
  """
369
- Update the observed masses for fragment matching.
370
-
371
- This allows reusing the same cached sequence data with different
372
- spectra for matching.
809
+ Render the component in Streamlit.
373
810
 
374
811
  Args:
375
- observed_masses: New list of observed peak masses.
376
- precursor_mass: Optional new precursor mass.
812
+ key: Optional unique key for the Streamlit component
813
+ state_manager: Optional StateManager for cross-component state.
814
+ If not provided, uses a default shared StateManager.
815
+ height: Optional height in pixels for the component
377
816
 
378
817
  Returns:
379
- Self for method chaining.
818
+ SequenceViewResult with annotations DataFrame (if available)
380
819
  """
381
- self._observed_masses = observed_masses
382
- if precursor_mass is not None:
383
- self._precursor_mass = precursor_mass
384
- return self
820
+ from ..core.state import get_default_state_manager
821
+ from ..rendering.bridge import get_component_annotations, render_component
822
+
823
+ if state_manager is None:
824
+ state_manager = get_default_state_manager()
825
+
826
+ # Use provided height or default
827
+ render_height = height if height is not None else self._height
828
+
829
+ render_component(
830
+ component=self, state_manager=state_manager, key=key, height=render_height
831
+ )
832
+
833
+ # Get annotations from session state (set by Vue)
834
+ annotations = get_component_annotations(key) if key else None
835
+
836
+ return SequenceViewResult(annotations=annotations)
837
+
838
+ def __repr__(self) -> str:
839
+ return (
840
+ f"SequenceView("
841
+ f"cache_id='{self._cache_id}', "
842
+ f"filters={self._filters}, "
843
+ f"interactivity={self._interactivity})"
844
+ )
845
+
846
+
847
+ if TYPE_CHECKING:
848
+ from ..core.state import StateManager