masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/sample/save.py CHANGED
@@ -230,8 +230,9 @@ def export_mgf(
230
230
  features = features.filter(pl.col("rt") >= rt_start)
231
231
  if rt_end is not None:
232
232
  features = features.filter(pl.col("rt") <= rt_end)
233
- if not include_all_ms1:
234
- features = features.filter(pl.col("ms2_scans").is_not_null())
233
+ # Note: We no longer filter out features without MS2 data here since we want to export
234
+ # MS1 spectra for ALL features with isotope data. The MS2 filtering is done in the
235
+ # second pass where we specifically check for ms2_scans.
235
236
 
236
237
  # Convert to list of dictionaries for faster iteration
237
238
  features_list = features.to_dicts()
@@ -265,16 +266,42 @@ def export_mgf(
265
266
  setattr(spec, attr, getattr(spec, attr)[mask])
266
267
  return spec
267
268
 
268
- def write_ion(f, title, fid, mz, rt, charge, spect):
269
+ def write_ion(f, title, fuid, fid, mz, rt, charge, spect):
269
270
  if spect is None:
270
- return
271
- f.write(f"BEGIN IONS\nTITLE={title}\n")
271
+ return "none"
272
+
273
+ # For MSLEVEL=2 ions, don't write empty spectra
274
+ ms_level = spect.ms_level if spect.ms_level is not None else 1
275
+ if ms_level > 1 and (len(spect.mz) == 0 or len(spect.inty) == 0):
276
+ return "empty_ms2"
277
+
278
+ # Create dynamic title based on MS level
279
+ if ms_level == 1:
280
+ # MS1: uid, rt, mz
281
+ dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}"
282
+ else:
283
+ # MS2: uid, rt, mz, energy
284
+ energy = spect.energy if hasattr(spect, 'energy') else 0
285
+ dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}, energy:{energy}"
286
+
287
+ f.write(f"BEGIN IONS\nTITLE={dynamic_title}\n")
288
+ f.write(f"FEATURE_UID={fuid}\n")
272
289
  f.write(f"FEATURE_ID={fid}\n")
273
290
  f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
291
+
274
292
  if spect.ms_level is None:
275
293
  f.write("MSLEVEL=1\n")
294
+ # Add PRECURSORINTENSITY for MS1 spectra
295
+ if len(spect.inty) > 0:
296
+ precursor_intensity = max(spect.inty)
297
+ f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
276
298
  else:
277
299
  f.write(f"MSLEVEL={spect.ms_level}\n")
300
+ # Add PRECURSORINTENSITY for MS1 spectra
301
+ if spect.ms_level == 1 and len(spect.inty) > 0:
302
+ precursor_intensity = max(spect.inty)
303
+ f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
304
+
278
305
  if spect.ms_level is not None:
279
306
  if spect.ms_level > 1 and hasattr(spect, "energy"):
280
307
  f.write(f"ENERGY={spect.energy}\n")
@@ -285,6 +312,7 @@ def export_mgf(
285
312
  ]
286
313
  f.writelines(peak_lines)
287
314
  f.write("END IONS\n\n")
315
+ return "written"
288
316
 
289
317
  if centroid_algo is None:
290
318
  if hasattr(self.parameters, "centroid_algo"):
@@ -304,6 +332,9 @@ def export_mgf(
304
332
 
305
333
  c = 0
306
334
  skip = 0
335
+ empty_ms2_count = 0
336
+ ms1_spec_used_count = 0
337
+ ms1_fallback_count = 0
307
338
  # check if features is empty
308
339
  if len(features_list) == 0:
309
340
  self.logger.warning("No features found.")
@@ -311,57 +342,78 @@ def export_mgf(
311
342
  filename = os.path.abspath(filename)
312
343
  with open(filename, "w", encoding="utf-8") as f:
313
344
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
345
+
346
+ # First pass: Export MS1 spectra for ALL features with ms1_spec data
314
347
  for row in tqdm(
315
348
  features_list,
316
349
  total=len(features_list),
317
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MGF",
350
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS1 spectra",
318
351
  disable=tdqm_disable,
319
352
  ):
320
353
  # Pre-calculate common values
321
354
  feature_uid = row["feature_uid"]
355
+ feature_id = row["feature_id"] if "feature_id" in row else feature_uid
322
356
  mz = row["mz"]
323
357
  rt = row["rt"]
324
358
  rt_str = f"{rt:.2f}"
325
359
  mz_str = f"{mz:.4f}"
326
360
 
327
- # Filtering is now done at DataFrame level, so we can skip these checks
328
- if row["ms2_scans"] is None and not include_all_ms1:
329
- skip = skip + 1
330
- continue
361
+ # Export MS1 spectrum for ALL features with ms1_spec data
362
+ if "ms1_spec" in row and row["ms1_spec"] is not None:
363
+ # Create spectrum from ms1_spec isotope pattern data
364
+ from masster.spectrum import Spectrum
365
+
366
+ iso_data = row["ms1_spec"]
367
+ if len(iso_data) >= 2: # Ensure we have mz and intensity arrays
368
+ ms1_mz = iso_data[0]
369
+ ms1_inty = iso_data[1]
370
+
371
+ # Create a Spectrum object from the isotope data
372
+ spect = Spectrum(
373
+ mz=np.array(ms1_mz),
374
+ inty=np.array(ms1_inty),
375
+ ms_level=1
376
+ )
377
+
378
+ charge = preferred_charge
379
+ if row["charge"] is not None and row["charge"] != 0:
380
+ charge = row["charge"]
331
381
 
332
- # write MS1 spectrum
333
- ms1_scan_uid = self.select_closest_scan(rt=rt)["scan_uid"][0]
334
- spect = self.get_spectrum(
335
- ms1_scan_uid,
336
- centroid=centroid,
337
- deisotope=deisotope,
338
- centroid_algo=centroid_algo,
339
- )
340
-
341
- spect = filter_peaks(spect, inty_min=inty_min)
342
-
343
- if not full_ms1:
344
- # trim spectrum to region around the precursor, it's wide to potentially identify adducts
345
- spect = spect.trim(
346
- mz_min=mz - 50,
347
- mz_max=mz + 50,
348
- )
349
-
350
- charge = preferred_charge
351
- if row["charge"] is not None and row["charge"] != 0:
352
- charge = row["charge"]
353
-
354
- write_ion(
355
- f,
356
- f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
357
- feature_uid,
358
- mz,
359
- rt,
360
- charge,
361
- spect,
362
- )
382
+ write_ion(
383
+ f,
384
+ f"uid:{feature_uid}",
385
+ feature_uid,
386
+ feature_id,
387
+ mz,
388
+ rt,
389
+ charge,
390
+ spect,
391
+ )
392
+ ms1_spec_used_count += 1
393
+ else:
394
+ ms1_fallback_count += 1
395
+ else:
396
+ # No MS1 spectrum exported for features without ms1_spec data
397
+ ms1_fallback_count += 1
398
+
399
+ # Second pass: Export MS2 spectra for features with MS2 data
400
+ for row in tqdm(
401
+ features_list,
402
+ total=len(features_list),
403
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS2 spectra",
404
+ disable=tdqm_disable,
405
+ ):
406
+ # Pre-calculate common values
407
+ feature_uid = row["feature_uid"]
408
+ feature_id = row["feature_id"] if "feature_id" in row else feature_uid
409
+ mz = row["mz"]
410
+ rt = row["rt"]
411
+ rt_str = f"{rt:.2f}"
412
+ mz_str = f"{mz:.4f}"
363
413
 
414
+ # Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
364
415
  if row["ms2_scans"] is None:
416
+ skip = skip + 1
365
417
  continue
366
418
  elif use_cache:
367
419
  spect = row["ms2_specs"]
@@ -399,16 +451,20 @@ def export_mgf(
399
451
  current_scan_uid = (
400
452
  scan_uids[i] if i < len(scan_uids) else "unknown"
401
453
  )
402
- write_ion(
454
+ result = write_ion(
403
455
  f,
404
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
456
+ f"uid:{feature_uid}",
405
457
  feature_uid,
458
+ feature_id,
406
459
  mz,
407
460
  rt,
408
461
  charge,
409
462
  s,
410
463
  )
411
- c += 1
464
+ if result == "written":
465
+ c += 1
466
+ elif result == "empty_ms2":
467
+ empty_ms2_count += 1
412
468
  continue # Skip the rest of the processing for this feature
413
469
 
414
470
  # If we reach here, either use_cache=False or no cached spectra were available
@@ -455,16 +511,20 @@ def export_mgf(
455
511
  eic_min=eic_corr_min,
456
512
  q1_max=q1_ratio_max,
457
513
  )
458
- write_ion(
514
+ result = write_ion(
459
515
  f,
460
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
516
+ f"uid:{feature_uid}",
461
517
  feature_uid,
518
+ feature_id,
462
519
  mz,
463
520
  rt,
464
521
  charge,
465
522
  spect,
466
523
  )
467
- c += 1
524
+ if result == "written":
525
+ c += 1
526
+ elif result == "empty_ms2":
527
+ empty_ms2_count += 1
468
528
  else:
469
529
  if selection == "best":
470
530
  ms2_scans = row["ms2_scans"][0]
@@ -482,16 +542,20 @@ def export_mgf(
482
542
  eic_min=eic_corr_min,
483
543
  q1_max=q1_ratio_max,
484
544
  )
485
- write_ion(
545
+ result = write_ion(
486
546
  f,
487
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
547
+ f"uid:{feature_uid}",
488
548
  feature_uid,
549
+ feature_id,
489
550
  mz,
490
551
  rt,
491
552
  charge,
492
553
  spect,
493
554
  )
494
- c += 1
555
+ if result == "written":
556
+ c += 1
557
+ elif result == "empty_ms2":
558
+ empty_ms2_count += 1
495
559
  elif selection == "all":
496
560
  if merge:
497
561
  specs = []
@@ -527,7 +591,7 @@ def export_mgf(
527
591
  )
528
592
  if deisotope:
529
593
  spect = spect.deisotope()
530
- title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
594
+ title = f"uid:{feature_uid}"
531
595
  spect = filter_peaks(
532
596
  spect,
533
597
  inty_min=inty_min,
@@ -535,16 +599,20 @@ def export_mgf(
535
599
  eic_min=eic_corr_min,
536
600
  q1_max=q1_ratio_max,
537
601
  )
538
- write_ion(
602
+ result = write_ion(
539
603
  f,
540
604
  title,
541
605
  feature_uid,
606
+ feature_id,
542
607
  mz,
543
608
  rt,
544
609
  charge,
545
610
  spect,
546
611
  )
547
- c += 1
612
+ if result == "written":
613
+ c += 1
614
+ elif result == "empty_ms2":
615
+ empty_ms2_count += 1
548
616
  else:
549
617
  for ms2_scans in row["ms2_scans"]:
550
618
  spect = self.get_spectrum(
@@ -561,19 +629,27 @@ def export_mgf(
561
629
  eic_min=eic_corr_min,
562
630
  q1_max=q1_ratio_max,
563
631
  )
564
- write_ion(
632
+ result = write_ion(
565
633
  f,
566
- f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
634
+ f"uid:{feature_uid}",
567
635
  feature_uid,
636
+ feature_id,
568
637
  mz,
569
638
  rt,
570
639
  charge,
571
640
  spect,
572
641
  )
573
- c += 1
574
-
575
- self.logger.info(f"Exported {c} features to {filename}")
576
-
642
+ if result == "written":
643
+ c += 1
644
+ elif result == "empty_ms2":
645
+ empty_ms2_count += 1
646
+
647
+ self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
648
+ if empty_ms2_count > 0:
649
+ self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
650
+ if ms1_fallback_count > 0:
651
+ self.logger.info(f"Skipped MS1 export for {ms1_fallback_count} features without isotope patterns")
652
+
577
653
  # Handle None values in logging
578
654
  inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
579
655
  q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
masster/spectrum.py CHANGED
@@ -197,16 +197,65 @@ class Spectrum:
197
197
  self.bl = None
198
198
 
199
199
  def check_if_centroided(self) -> bool:
200
- if self.mz.size == 0:
201
- return True
202
- mzs = self.mz[self.mz < np.min(self.mz) + 0.4]
203
- if len(mzs) < 20:
204
- if len(mzs) < 3:
205
- return True
206
- min_distance = np.min(np.diff(mzs))
207
- if min_distance > 0.003:
200
+ """
201
+ Fast determination if spectrum data is centroided or profile.
202
+
203
+ Uses optimized statistical approaches with early exits for speed:
204
+ 1. Fast median difference check (most decisive)
205
+ 2. Small gap ratio (profile characteristic)
206
+ 3. Density check (fallback)
207
+
208
+ Returns:
209
+ bool: True if centroided, False if profile
210
+ """
211
+ if self.mz.size < 5:
212
+ return True # Too few points to determine, assume centroided
213
+
214
+ # Fast path: check if mz is already sorted to avoid sorting cost
215
+ if np.all(self.mz[:-1] <= self.mz[1:]):
216
+ sorted_mz = self.mz
217
+ else:
218
+ sorted_mz = np.sort(self.mz)
219
+
220
+ # Calculate differences efficiently
221
+ mz_diffs = np.diff(sorted_mz)
222
+
223
+ # Remove zeros efficiently (keep positive differences)
224
+ mz_diffs = mz_diffs[mz_diffs > 0]
225
+
226
+ if mz_diffs.size == 0:
227
+ return True # All identical m/z values
228
+
229
+ # Fast approach 1: Median difference (most decisive, compute once)
230
+ median_diff = np.median(mz_diffs)
231
+
232
+ # Early exits for clear cases (>90% of cases)
233
+ if median_diff > 0.02:
234
+ return True # Clearly centroided
235
+ elif median_diff < 0.005:
236
+ return False # Clearly profile
237
+
238
+ # Fast approach 2: Small gap ratio (for borderline cases)
239
+ # Use vectorized comparison instead of creating new array
240
+ small_gap_count = np.sum(mz_diffs < 0.005)
241
+ small_gap_ratio = small_gap_count / mz_diffs.size
242
+
243
+ if small_gap_ratio > 0.7:
244
+ return False # High ratio of small gaps = profile
245
+ elif small_gap_ratio < 0.1:
246
+ return True # Low ratio of small gaps = centroided
247
+
248
+ # Fast approach 3: Density check (final fallback)
249
+ mz_range = sorted_mz[-1] - sorted_mz[0]
250
+ if mz_range > 0:
251
+ density = sorted_mz.size / mz_range
252
+ if density > 100: # High density = profile
253
+ return False
254
+ elif density < 10: # Low density = centroided
208
255
  return True
209
- return False
256
+
257
+ # Final fallback: median threshold
258
+ return median_diff > 0.01
210
259
 
211
260
  def reload(self):
212
261
  modname = self.__class__.__module__