masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +39 -2
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +430 -1866
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/RECORD +26 -28
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/sample/save.py
CHANGED
|
@@ -230,8 +230,9 @@ def export_mgf(
|
|
|
230
230
|
features = features.filter(pl.col("rt") >= rt_start)
|
|
231
231
|
if rt_end is not None:
|
|
232
232
|
features = features.filter(pl.col("rt") <= rt_end)
|
|
233
|
-
|
|
234
|
-
|
|
233
|
+
# Note: We no longer filter out features without MS2 data here since we want to export
|
|
234
|
+
# MS1 spectra for ALL features with isotope data. The MS2 filtering is done in the
|
|
235
|
+
# second pass where we specifically check for ms2_scans.
|
|
235
236
|
|
|
236
237
|
# Convert to list of dictionaries for faster iteration
|
|
237
238
|
features_list = features.to_dicts()
|
|
@@ -265,16 +266,42 @@ def export_mgf(
|
|
|
265
266
|
setattr(spec, attr, getattr(spec, attr)[mask])
|
|
266
267
|
return spec
|
|
267
268
|
|
|
268
|
-
def write_ion(f, title, fid, mz, rt, charge, spect):
|
|
269
|
+
def write_ion(f, title, fuid, fid, mz, rt, charge, spect):
|
|
269
270
|
if spect is None:
|
|
270
|
-
return
|
|
271
|
-
|
|
271
|
+
return "none"
|
|
272
|
+
|
|
273
|
+
# For MSLEVEL=2 ions, don't write empty spectra
|
|
274
|
+
ms_level = spect.ms_level if spect.ms_level is not None else 1
|
|
275
|
+
if ms_level > 1 and (len(spect.mz) == 0 or len(spect.inty) == 0):
|
|
276
|
+
return "empty_ms2"
|
|
277
|
+
|
|
278
|
+
# Create dynamic title based on MS level
|
|
279
|
+
if ms_level == 1:
|
|
280
|
+
# MS1: uid, rt, mz
|
|
281
|
+
dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}"
|
|
282
|
+
else:
|
|
283
|
+
# MS2: uid, rt, mz, energy
|
|
284
|
+
energy = spect.energy if hasattr(spect, 'energy') else 0
|
|
285
|
+
dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}, energy:{energy}"
|
|
286
|
+
|
|
287
|
+
f.write(f"BEGIN IONS\nTITLE={dynamic_title}\n")
|
|
288
|
+
f.write(f"FEATURE_UID={fuid}\n")
|
|
272
289
|
f.write(f"FEATURE_ID={fid}\n")
|
|
273
290
|
f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
|
|
291
|
+
|
|
274
292
|
if spect.ms_level is None:
|
|
275
293
|
f.write("MSLEVEL=1\n")
|
|
294
|
+
# Add PRECURSORINTENSITY for MS1 spectra
|
|
295
|
+
if len(spect.inty) > 0:
|
|
296
|
+
precursor_intensity = max(spect.inty)
|
|
297
|
+
f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
|
|
276
298
|
else:
|
|
277
299
|
f.write(f"MSLEVEL={spect.ms_level}\n")
|
|
300
|
+
# Explicit MS level: add PRECURSORINTENSITY only when the level is 1
|
|
301
|
+
if spect.ms_level == 1 and len(spect.inty) > 0:
|
|
302
|
+
precursor_intensity = max(spect.inty)
|
|
303
|
+
f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
|
|
304
|
+
|
|
278
305
|
if spect.ms_level is not None:
|
|
279
306
|
if spect.ms_level > 1 and hasattr(spect, "energy"):
|
|
280
307
|
f.write(f"ENERGY={spect.energy}\n")
|
|
@@ -285,6 +312,7 @@ def export_mgf(
|
|
|
285
312
|
]
|
|
286
313
|
f.writelines(peak_lines)
|
|
287
314
|
f.write("END IONS\n\n")
|
|
315
|
+
return "written"
|
|
288
316
|
|
|
289
317
|
if centroid_algo is None:
|
|
290
318
|
if hasattr(self.parameters, "centroid_algo"):
|
|
@@ -304,6 +332,9 @@ def export_mgf(
|
|
|
304
332
|
|
|
305
333
|
c = 0
|
|
306
334
|
skip = 0
|
|
335
|
+
empty_ms2_count = 0
|
|
336
|
+
ms1_spec_used_count = 0
|
|
337
|
+
ms1_fallback_count = 0
|
|
307
338
|
# check if features is empty
|
|
308
339
|
if len(features_list) == 0:
|
|
309
340
|
self.logger.warning("No features found.")
|
|
@@ -311,57 +342,80 @@ def export_mgf(
|
|
|
311
342
|
filename = os.path.abspath(filename)
|
|
312
343
|
with open(filename, "w", encoding="utf-8") as f:
|
|
313
344
|
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
345
|
+
|
|
346
|
+
# First pass: Export MS1 spectra for ALL features with ms1_spec data
|
|
347
|
+
print("Exporting MS1 spectra...")
|
|
314
348
|
for row in tqdm(
|
|
315
349
|
features_list,
|
|
316
350
|
total=len(features_list),
|
|
317
|
-
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export
|
|
351
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS1 spectra",
|
|
318
352
|
disable=tdqm_disable,
|
|
319
353
|
):
|
|
320
354
|
# Pre-calculate common values
|
|
321
355
|
feature_uid = row["feature_uid"]
|
|
356
|
+
feature_id = row["feature_id"] if "feature_id" in row else feature_uid
|
|
322
357
|
mz = row["mz"]
|
|
323
358
|
rt = row["rt"]
|
|
324
359
|
rt_str = f"{rt:.2f}"
|
|
325
360
|
mz_str = f"{mz:.4f}"
|
|
326
361
|
|
|
327
|
-
#
|
|
328
|
-
if row["
|
|
329
|
-
|
|
330
|
-
|
|
362
|
+
# Export MS1 spectrum for ALL features with ms1_spec data
|
|
363
|
+
if "ms1_spec" in row and row["ms1_spec"] is not None:
|
|
364
|
+
# Create spectrum from ms1_spec isotope pattern data
|
|
365
|
+
from masster.spectrum import Spectrum
|
|
366
|
+
|
|
367
|
+
iso_data = row["ms1_spec"]
|
|
368
|
+
if len(iso_data) >= 2: # Ensure we have mz and intensity arrays
|
|
369
|
+
ms1_mz = iso_data[0]
|
|
370
|
+
ms1_inty = iso_data[1]
|
|
371
|
+
|
|
372
|
+
# Create a Spectrum object from the isotope data
|
|
373
|
+
spect = Spectrum(
|
|
374
|
+
mz=np.array(ms1_mz),
|
|
375
|
+
inty=np.array(ms1_inty),
|
|
376
|
+
ms_level=1
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
charge = preferred_charge
|
|
380
|
+
if row["charge"] is not None and row["charge"] != 0:
|
|
381
|
+
charge = row["charge"]
|
|
331
382
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
383
|
+
write_ion(
|
|
384
|
+
f,
|
|
385
|
+
f"uid:{feature_uid}",
|
|
386
|
+
feature_uid,
|
|
387
|
+
feature_id,
|
|
388
|
+
mz,
|
|
389
|
+
rt,
|
|
390
|
+
charge,
|
|
391
|
+
spect,
|
|
392
|
+
)
|
|
393
|
+
ms1_spec_used_count += 1
|
|
394
|
+
else:
|
|
395
|
+
ms1_fallback_count += 1
|
|
396
|
+
else:
|
|
397
|
+
# No MS1 spectrum exported for features without ms1_spec data
|
|
398
|
+
ms1_fallback_count += 1
|
|
399
|
+
|
|
400
|
+
# Second pass: Export MS2 spectra for features with MS2 data
|
|
401
|
+
print("Exporting MS2 spectra...")
|
|
402
|
+
for row in tqdm(
|
|
403
|
+
features_list,
|
|
404
|
+
total=len(features_list),
|
|
405
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS2 spectra",
|
|
406
|
+
disable=tdqm_disable,
|
|
407
|
+
):
|
|
408
|
+
# Pre-calculate common values
|
|
409
|
+
feature_uid = row["feature_uid"]
|
|
410
|
+
feature_id = row["feature_id"] if "feature_id" in row else feature_uid
|
|
411
|
+
mz = row["mz"]
|
|
412
|
+
rt = row["rt"]
|
|
413
|
+
rt_str = f"{rt:.2f}"
|
|
414
|
+
mz_str = f"{mz:.4f}"
|
|
363
415
|
|
|
416
|
+
# Skip features without MS2 data; any MS1 spectra for them were already exported in the first pass above
|
|
364
417
|
if row["ms2_scans"] is None:
|
|
418
|
+
skip = skip + 1
|
|
365
419
|
continue
|
|
366
420
|
elif use_cache:
|
|
367
421
|
spect = row["ms2_specs"]
|
|
@@ -399,16 +453,20 @@ def export_mgf(
|
|
|
399
453
|
current_scan_uid = (
|
|
400
454
|
scan_uids[i] if i < len(scan_uids) else "unknown"
|
|
401
455
|
)
|
|
402
|
-
write_ion(
|
|
456
|
+
result = write_ion(
|
|
403
457
|
f,
|
|
404
|
-
f"
|
|
458
|
+
f"uid:{feature_uid}",
|
|
405
459
|
feature_uid,
|
|
460
|
+
feature_id,
|
|
406
461
|
mz,
|
|
407
462
|
rt,
|
|
408
463
|
charge,
|
|
409
464
|
s,
|
|
410
465
|
)
|
|
411
|
-
|
|
466
|
+
if result == "written":
|
|
467
|
+
c += 1
|
|
468
|
+
elif result == "empty_ms2":
|
|
469
|
+
empty_ms2_count += 1
|
|
412
470
|
continue # Skip the rest of the processing for this feature
|
|
413
471
|
|
|
414
472
|
# If we reach here, either use_cache=False or no cached spectra were available
|
|
@@ -455,16 +513,20 @@ def export_mgf(
|
|
|
455
513
|
eic_min=eic_corr_min,
|
|
456
514
|
q1_max=q1_ratio_max,
|
|
457
515
|
)
|
|
458
|
-
write_ion(
|
|
516
|
+
result = write_ion(
|
|
459
517
|
f,
|
|
460
|
-
f"
|
|
518
|
+
f"uid:{feature_uid}",
|
|
461
519
|
feature_uid,
|
|
520
|
+
feature_id,
|
|
462
521
|
mz,
|
|
463
522
|
rt,
|
|
464
523
|
charge,
|
|
465
524
|
spect,
|
|
466
525
|
)
|
|
467
|
-
|
|
526
|
+
if result == "written":
|
|
527
|
+
c += 1
|
|
528
|
+
elif result == "empty_ms2":
|
|
529
|
+
empty_ms2_count += 1
|
|
468
530
|
else:
|
|
469
531
|
if selection == "best":
|
|
470
532
|
ms2_scans = row["ms2_scans"][0]
|
|
@@ -482,16 +544,20 @@ def export_mgf(
|
|
|
482
544
|
eic_min=eic_corr_min,
|
|
483
545
|
q1_max=q1_ratio_max,
|
|
484
546
|
)
|
|
485
|
-
write_ion(
|
|
547
|
+
result = write_ion(
|
|
486
548
|
f,
|
|
487
|
-
f"
|
|
549
|
+
f"uid:{feature_uid}",
|
|
488
550
|
feature_uid,
|
|
551
|
+
feature_id,
|
|
489
552
|
mz,
|
|
490
553
|
rt,
|
|
491
554
|
charge,
|
|
492
555
|
spect,
|
|
493
556
|
)
|
|
494
|
-
|
|
557
|
+
if result == "written":
|
|
558
|
+
c += 1
|
|
559
|
+
elif result == "empty_ms2":
|
|
560
|
+
empty_ms2_count += 1
|
|
495
561
|
elif selection == "all":
|
|
496
562
|
if merge:
|
|
497
563
|
specs = []
|
|
@@ -527,7 +593,7 @@ def export_mgf(
|
|
|
527
593
|
)
|
|
528
594
|
if deisotope:
|
|
529
595
|
spect = spect.deisotope()
|
|
530
|
-
title = f"
|
|
596
|
+
title = f"uid:{feature_uid}"
|
|
531
597
|
spect = filter_peaks(
|
|
532
598
|
spect,
|
|
533
599
|
inty_min=inty_min,
|
|
@@ -535,16 +601,20 @@ def export_mgf(
|
|
|
535
601
|
eic_min=eic_corr_min,
|
|
536
602
|
q1_max=q1_ratio_max,
|
|
537
603
|
)
|
|
538
|
-
write_ion(
|
|
604
|
+
result = write_ion(
|
|
539
605
|
f,
|
|
540
606
|
title,
|
|
541
607
|
feature_uid,
|
|
608
|
+
feature_id,
|
|
542
609
|
mz,
|
|
543
610
|
rt,
|
|
544
611
|
charge,
|
|
545
612
|
spect,
|
|
546
613
|
)
|
|
547
|
-
|
|
614
|
+
if result == "written":
|
|
615
|
+
c += 1
|
|
616
|
+
elif result == "empty_ms2":
|
|
617
|
+
empty_ms2_count += 1
|
|
548
618
|
else:
|
|
549
619
|
for ms2_scans in row["ms2_scans"]:
|
|
550
620
|
spect = self.get_spectrum(
|
|
@@ -561,19 +631,27 @@ def export_mgf(
|
|
|
561
631
|
eic_min=eic_corr_min,
|
|
562
632
|
q1_max=q1_ratio_max,
|
|
563
633
|
)
|
|
564
|
-
write_ion(
|
|
634
|
+
result = write_ion(
|
|
565
635
|
f,
|
|
566
|
-
f"
|
|
636
|
+
f"uid:{feature_uid}",
|
|
567
637
|
feature_uid,
|
|
638
|
+
feature_id,
|
|
568
639
|
mz,
|
|
569
640
|
rt,
|
|
570
641
|
charge,
|
|
571
642
|
spect,
|
|
572
643
|
)
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
644
|
+
if result == "written":
|
|
645
|
+
c += 1
|
|
646
|
+
elif result == "empty_ms2":
|
|
647
|
+
empty_ms2_count += 1
|
|
648
|
+
|
|
649
|
+
self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
|
|
650
|
+
if empty_ms2_count > 0:
|
|
651
|
+
self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
|
|
652
|
+
if ms1_fallback_count > 0:
|
|
653
|
+
self.logger.info(f"Skipped MS1 export for {ms1_fallback_count} features without isotope patterns")
|
|
654
|
+
|
|
577
655
|
# Handle None values in logging
|
|
578
656
|
inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
|
|
579
657
|
q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
|
masster/spectrum.py
CHANGED
|
@@ -197,16 +197,65 @@ class Spectrum:
|
|
|
197
197
|
self.bl = None
|
|
198
198
|
|
|
199
199
|
def check_if_centroided(self) -> bool:
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
200
|
+
"""
|
|
201
|
+
Fast determination if spectrum data is centroided or profile.
|
|
202
|
+
|
|
203
|
+
Uses optimized statistical approaches with early exits for speed:
|
|
204
|
+
1. Fast median difference check (most decisive)
|
|
205
|
+
2. Small gap ratio (profile characteristic)
|
|
206
|
+
3. Density check (fallback)
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
bool: True if centroided, False if profile
|
|
210
|
+
"""
|
|
211
|
+
if self.mz.size < 5:
|
|
212
|
+
return True # Too few points to determine, assume centroided
|
|
213
|
+
|
|
214
|
+
# Fast path: check if mz is already sorted to avoid sorting cost
|
|
215
|
+
if np.all(self.mz[:-1] <= self.mz[1:]):
|
|
216
|
+
sorted_mz = self.mz
|
|
217
|
+
else:
|
|
218
|
+
sorted_mz = np.sort(self.mz)
|
|
219
|
+
|
|
220
|
+
# Calculate differences efficiently
|
|
221
|
+
mz_diffs = np.diff(sorted_mz)
|
|
222
|
+
|
|
223
|
+
# Remove zeros efficiently (keep positive differences)
|
|
224
|
+
mz_diffs = mz_diffs[mz_diffs > 0]
|
|
225
|
+
|
|
226
|
+
if mz_diffs.size == 0:
|
|
227
|
+
return True # All identical m/z values
|
|
228
|
+
|
|
229
|
+
# Fast approach 1: Median difference (most decisive, compute once)
|
|
230
|
+
median_diff = np.median(mz_diffs)
|
|
231
|
+
|
|
232
|
+
# Early exits for clear cases (>90% of cases)
|
|
233
|
+
if median_diff > 0.02:
|
|
234
|
+
return True # Clearly centroided
|
|
235
|
+
elif median_diff < 0.005:
|
|
236
|
+
return False # Clearly profile
|
|
237
|
+
|
|
238
|
+
# Fast approach 2: Small gap ratio (for borderline cases)
|
|
239
|
+
# Count the small gaps with a vectorized comparison (allocates only a temporary boolean mask)
|
|
240
|
+
small_gap_count = np.sum(mz_diffs < 0.005)
|
|
241
|
+
small_gap_ratio = small_gap_count / mz_diffs.size
|
|
242
|
+
|
|
243
|
+
if small_gap_ratio > 0.7:
|
|
244
|
+
return False # High ratio of small gaps = profile
|
|
245
|
+
elif small_gap_ratio < 0.1:
|
|
246
|
+
return True # Low ratio of small gaps = centroided
|
|
247
|
+
|
|
248
|
+
# Fast approach 3: Density check (final fallback)
|
|
249
|
+
mz_range = sorted_mz[-1] - sorted_mz[0]
|
|
250
|
+
if mz_range > 0:
|
|
251
|
+
density = sorted_mz.size / mz_range
|
|
252
|
+
if density > 100: # High density = profile
|
|
253
|
+
return False
|
|
254
|
+
elif density < 10: # Low density = centroided
|
|
208
255
|
return True
|
|
209
|
-
|
|
256
|
+
|
|
257
|
+
# Final fallback: median threshold
|
|
258
|
+
return median_diff > 0.01
|
|
210
259
|
|
|
211
260
|
def reload(self):
|
|
212
261
|
modname = self.__class__.__module__
|