masster 0.4.19__py3-none-any.whl → 0.4.21__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

@@ -90,6 +90,9 @@
90
90
  },
91
91
  "ms2_specs": {
92
92
  "dtype": "pl.Object"
93
+ },
94
+ "ms1_spec": {
95
+ "dtype": "pl.Object"
93
96
  }
94
97
  }
95
98
  },
masster/sample/save.py CHANGED
@@ -230,8 +230,9 @@ def export_mgf(
230
230
  features = features.filter(pl.col("rt") >= rt_start)
231
231
  if rt_end is not None:
232
232
  features = features.filter(pl.col("rt") <= rt_end)
233
- if not include_all_ms1:
234
- features = features.filter(pl.col("ms2_scans").is_not_null())
233
+ # Note: We no longer filter out features without MS2 data here since we want to export
234
+ # MS1 spectra for ALL features with isotope data. The MS2 filtering is done in the
235
+ # second pass where we specifically check for ms2_scans.
235
236
 
236
237
  # Convert to list of dictionaries for faster iteration
237
238
  features_list = features.to_dicts()
@@ -265,16 +266,42 @@ def export_mgf(
265
266
  setattr(spec, attr, getattr(spec, attr)[mask])
266
267
  return spec
267
268
 
268
- def write_ion(f, title, fid, mz, rt, charge, spect):
269
+ def write_ion(f, title, fuid, fid, mz, rt, charge, spect):
269
270
  if spect is None:
270
- return
271
- f.write(f"BEGIN IONS\nTITLE={title}\n")
271
+ return "none"
272
+
273
+ # For MSLEVEL=2 ions, don't write empty spectra
274
+ ms_level = spect.ms_level if spect.ms_level is not None else 1
275
+ if ms_level > 1 and (len(spect.mz) == 0 or len(spect.inty) == 0):
276
+ return "empty_ms2"
277
+
278
+ # Create dynamic title based on MS level
279
+ if ms_level == 1:
280
+ # MS1: uid, rt, mz
281
+ dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}"
282
+ else:
283
+ # MS2: uid, rt, mz, energy
284
+ energy = spect.energy if hasattr(spect, 'energy') else 0
285
+ dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}, energy:{energy}"
286
+
287
+ f.write(f"BEGIN IONS\nTITLE={dynamic_title}\n")
288
+ f.write(f"FEATURE_UID={fuid}\n")
272
289
  f.write(f"FEATURE_ID={fid}\n")
273
290
  f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
291
+
274
292
  if spect.ms_level is None:
275
293
  f.write("MSLEVEL=1\n")
294
+ # Add PRECURSORINTENSITY for MS1 spectra
295
+ if len(spect.inty) > 0:
296
+ precursor_intensity = max(spect.inty)
297
+ f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
276
298
  else:
277
299
  f.write(f"MSLEVEL={spect.ms_level}\n")
300
+ # Add PRECURSORINTENSITY for MS1 spectra
301
+ if spect.ms_level == 1 and len(spect.inty) > 0:
302
+ precursor_intensity = max(spect.inty)
303
+ f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
304
+
278
305
  if spect.ms_level is not None:
279
306
  if spect.ms_level > 1 and hasattr(spect, "energy"):
280
307
  f.write(f"ENERGY={spect.energy}\n")
@@ -285,6 +312,7 @@ def export_mgf(
285
312
  ]
286
313
  f.writelines(peak_lines)
287
314
  f.write("END IONS\n\n")
315
+ return "written"
288
316
 
289
317
  if centroid_algo is None:
290
318
  if hasattr(self.parameters, "centroid_algo"):
@@ -304,6 +332,9 @@ def export_mgf(
304
332
 
305
333
  c = 0
306
334
  skip = 0
335
+ empty_ms2_count = 0
336
+ ms1_spec_used_count = 0
337
+ ms1_fallback_count = 0
307
338
  # check if features is empty
308
339
  if len(features_list) == 0:
309
340
  self.logger.warning("No features found.")
@@ -311,57 +342,80 @@ def export_mgf(
311
342
  filename = os.path.abspath(filename)
312
343
  with open(filename, "w", encoding="utf-8") as f:
313
344
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
345
+
346
+ # First pass: Export MS1 spectra for ALL features with ms1_spec data
347
+ print("Exporting MS1 spectra...")
314
348
  for row in tqdm(
315
349
  features_list,
316
350
  total=len(features_list),
317
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MGF",
351
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS1 spectra",
318
352
  disable=tdqm_disable,
319
353
  ):
320
354
  # Pre-calculate common values
321
355
  feature_uid = row["feature_uid"]
356
+ feature_id = row["feature_id"] if "feature_id" in row else feature_uid
322
357
  mz = row["mz"]
323
358
  rt = row["rt"]
324
359
  rt_str = f"{rt:.2f}"
325
360
  mz_str = f"{mz:.4f}"
326
361
 
327
- # Filtering is now done at DataFrame level, so we can skip these checks
328
- if row["ms2_scans"] is None and not include_all_ms1:
329
- skip = skip + 1
330
- continue
362
+ # Export MS1 spectrum for ALL features with ms1_spec data
363
+ if "ms1_spec" in row and row["ms1_spec"] is not None:
364
+ # Create spectrum from ms1_spec isotope pattern data
365
+ from masster.spectrum import Spectrum
366
+
367
+ iso_data = row["ms1_spec"]
368
+ if len(iso_data) >= 2: # Ensure we have mz and intensity arrays
369
+ ms1_mz = iso_data[0]
370
+ ms1_inty = iso_data[1]
371
+
372
+ # Create a Spectrum object from the isotope data
373
+ spect = Spectrum(
374
+ mz=np.array(ms1_mz),
375
+ inty=np.array(ms1_inty),
376
+ ms_level=1
377
+ )
378
+
379
+ charge = preferred_charge
380
+ if row["charge"] is not None and row["charge"] != 0:
381
+ charge = row["charge"]
331
382
 
332
- # write MS1 spectrum
333
- ms1_scan_uid = self.select_closest_scan(rt=rt)["scan_uid"][0]
334
- spect = self.get_spectrum(
335
- ms1_scan_uid,
336
- centroid=centroid,
337
- deisotope=deisotope,
338
- centroid_algo=centroid_algo,
339
- )
340
-
341
- spect = filter_peaks(spect, inty_min=inty_min)
342
-
343
- if not full_ms1:
344
- # trim spectrum to region around the precursor, it's wide to potentially identify adducts
345
- spect = spect.trim(
346
- mz_min=mz - 50,
347
- mz_max=mz + 50,
348
- )
349
-
350
- charge = preferred_charge
351
- if row["charge"] is not None and row["charge"] != 0:
352
- charge = row["charge"]
353
-
354
- write_ion(
355
- f,
356
- f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
357
- feature_uid,
358
- mz,
359
- rt,
360
- charge,
361
- spect,
362
- )
383
+ write_ion(
384
+ f,
385
+ f"uid:{feature_uid}",
386
+ feature_uid,
387
+ feature_id,
388
+ mz,
389
+ rt,
390
+ charge,
391
+ spect,
392
+ )
393
+ ms1_spec_used_count += 1
394
+ else:
395
+ ms1_fallback_count += 1
396
+ else:
397
+ # No MS1 spectrum exported for features without ms1_spec data
398
+ ms1_fallback_count += 1
399
+
400
+ # Second pass: Export MS2 spectra for features with MS2 data
401
+ print("Exporting MS2 spectra...")
402
+ for row in tqdm(
403
+ features_list,
404
+ total=len(features_list),
405
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS2 spectra",
406
+ disable=tdqm_disable,
407
+ ):
408
+ # Pre-calculate common values
409
+ feature_uid = row["feature_uid"]
410
+ feature_id = row["feature_id"] if "feature_id" in row else feature_uid
411
+ mz = row["mz"]
412
+ rt = row["rt"]
413
+ rt_str = f"{rt:.2f}"
414
+ mz_str = f"{mz:.4f}"
363
415
 
416
+ # Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
364
417
  if row["ms2_scans"] is None:
418
+ skip = skip + 1
365
419
  continue
366
420
  elif use_cache:
367
421
  spect = row["ms2_specs"]
@@ -399,16 +453,20 @@ def export_mgf(
399
453
  current_scan_uid = (
400
454
  scan_uids[i] if i < len(scan_uids) else "unknown"
401
455
  )
402
- write_ion(
456
+ result = write_ion(
403
457
  f,
404
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
458
+ f"uid:{feature_uid}",
405
459
  feature_uid,
460
+ feature_id,
406
461
  mz,
407
462
  rt,
408
463
  charge,
409
464
  s,
410
465
  )
411
- c += 1
466
+ if result == "written":
467
+ c += 1
468
+ elif result == "empty_ms2":
469
+ empty_ms2_count += 1
412
470
  continue # Skip the rest of the processing for this feature
413
471
 
414
472
  # If we reach here, either use_cache=False or no cached spectra were available
@@ -455,16 +513,20 @@ def export_mgf(
455
513
  eic_min=eic_corr_min,
456
514
  q1_max=q1_ratio_max,
457
515
  )
458
- write_ion(
516
+ result = write_ion(
459
517
  f,
460
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
518
+ f"uid:{feature_uid}",
461
519
  feature_uid,
520
+ feature_id,
462
521
  mz,
463
522
  rt,
464
523
  charge,
465
524
  spect,
466
525
  )
467
- c += 1
526
+ if result == "written":
527
+ c += 1
528
+ elif result == "empty_ms2":
529
+ empty_ms2_count += 1
468
530
  else:
469
531
  if selection == "best":
470
532
  ms2_scans = row["ms2_scans"][0]
@@ -482,16 +544,20 @@ def export_mgf(
482
544
  eic_min=eic_corr_min,
483
545
  q1_max=q1_ratio_max,
484
546
  )
485
- write_ion(
547
+ result = write_ion(
486
548
  f,
487
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
549
+ f"uid:{feature_uid}",
488
550
  feature_uid,
551
+ feature_id,
489
552
  mz,
490
553
  rt,
491
554
  charge,
492
555
  spect,
493
556
  )
494
- c += 1
557
+ if result == "written":
558
+ c += 1
559
+ elif result == "empty_ms2":
560
+ empty_ms2_count += 1
495
561
  elif selection == "all":
496
562
  if merge:
497
563
  specs = []
@@ -527,7 +593,7 @@ def export_mgf(
527
593
  )
528
594
  if deisotope:
529
595
  spect = spect.deisotope()
530
- title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
596
+ title = f"uid:{feature_uid}"
531
597
  spect = filter_peaks(
532
598
  spect,
533
599
  inty_min=inty_min,
@@ -535,16 +601,20 @@ def export_mgf(
535
601
  eic_min=eic_corr_min,
536
602
  q1_max=q1_ratio_max,
537
603
  )
538
- write_ion(
604
+ result = write_ion(
539
605
  f,
540
606
  title,
541
607
  feature_uid,
608
+ feature_id,
542
609
  mz,
543
610
  rt,
544
611
  charge,
545
612
  spect,
546
613
  )
547
- c += 1
614
+ if result == "written":
615
+ c += 1
616
+ elif result == "empty_ms2":
617
+ empty_ms2_count += 1
548
618
  else:
549
619
  for ms2_scans in row["ms2_scans"]:
550
620
  spect = self.get_spectrum(
@@ -561,19 +631,27 @@ def export_mgf(
561
631
  eic_min=eic_corr_min,
562
632
  q1_max=q1_ratio_max,
563
633
  )
564
- write_ion(
634
+ result = write_ion(
565
635
  f,
566
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
636
+ f"uid:{feature_uid}",
567
637
  feature_uid,
638
+ feature_id,
568
639
  mz,
569
640
  rt,
570
641
  charge,
571
642
  spect,
572
643
  )
573
- c += 1
574
-
575
- self.logger.info(f"Exported {c} features to {filename}")
576
-
644
+ if result == "written":
645
+ c += 1
646
+ elif result == "empty_ms2":
647
+ empty_ms2_count += 1
648
+
649
+ self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
650
+ if empty_ms2_count > 0:
651
+ self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
652
+ if ms1_fallback_count > 0:
653
+ self.logger.info(f"Skipped MS1 export for {ms1_fallback_count} features without isotope patterns")
654
+
577
655
  # Handle None values in logging
578
656
  inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
579
657
  q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
masster/spectrum.py CHANGED
@@ -197,16 +197,65 @@ class Spectrum:
197
197
  self.bl = None
198
198
 
199
199
  def check_if_centroided(self) -> bool:
200
- if self.mz.size == 0:
201
- return True
202
- mzs = self.mz[self.mz < np.min(self.mz) + 0.4]
203
- if len(mzs) < 20:
204
- if len(mzs) < 3:
205
- return True
206
- min_distance = np.min(np.diff(mzs))
207
- if min_distance > 0.003:
200
+ """
201
+ Fast determination if spectrum data is centroided or profile.
202
+
203
+ Uses optimized statistical approaches with early exits for speed:
204
+ 1. Fast median difference check (most decisive)
205
+ 2. Small gap ratio (profile characteristic)
206
+ 3. Density check (fallback)
207
+
208
+ Returns:
209
+ bool: True if centroided, False if profile
210
+ """
211
+ if self.mz.size < 5:
212
+ return True # Too few points to determine, assume centroided
213
+
214
+ # Fast path: check if mz is already sorted to avoid sorting cost
215
+ if np.all(self.mz[:-1] <= self.mz[1:]):
216
+ sorted_mz = self.mz
217
+ else:
218
+ sorted_mz = np.sort(self.mz)
219
+
220
+ # Calculate differences efficiently
221
+ mz_diffs = np.diff(sorted_mz)
222
+
223
+ # Remove zeros efficiently (keep positive differences)
224
+ mz_diffs = mz_diffs[mz_diffs > 0]
225
+
226
+ if mz_diffs.size == 0:
227
+ return True # All identical m/z values
228
+
229
+ # Fast approach 1: Median difference (most decisive, compute once)
230
+ median_diff = np.median(mz_diffs)
231
+
232
+ # Early exits for clear cases (>90% of cases)
233
+ if median_diff > 0.02:
234
+ return True # Clearly centroided
235
+ elif median_diff < 0.005:
236
+ return False # Clearly profile
237
+
238
+ # Fast approach 2: Small gap ratio (for borderline cases)
239
+ # Use vectorized comparison instead of creating new array
240
+ small_gap_count = np.sum(mz_diffs < 0.005)
241
+ small_gap_ratio = small_gap_count / mz_diffs.size
242
+
243
+ if small_gap_ratio > 0.7:
244
+ return False # High ratio of small gaps = profile
245
+ elif small_gap_ratio < 0.1:
246
+ return True # Low ratio of small gaps = centroided
247
+
248
+ # Fast approach 3: Density check (final fallback)
249
+ mz_range = sorted_mz[-1] - sorted_mz[0]
250
+ if mz_range > 0:
251
+ density = sorted_mz.size / mz_range
252
+ if density > 100: # High density = profile
253
+ return False
254
+ elif density < 10: # Low density = centroided
208
255
  return True
209
- return False
256
+
257
+ # Final fallback: median threshold
258
+ return median_diff > 0.01
210
259
 
211
260
  def reload(self):
212
261
  modname = self.__class__.__module__