masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/sample/save.py
CHANGED
|
@@ -230,8 +230,9 @@ def export_mgf(
|
|
|
230
230
|
features = features.filter(pl.col("rt") >= rt_start)
|
|
231
231
|
if rt_end is not None:
|
|
232
232
|
features = features.filter(pl.col("rt") <= rt_end)
|
|
233
|
-
|
|
234
|
-
|
|
233
|
+
# Note: We no longer filter out features without MS2 data here since we want to export
|
|
234
|
+
# MS1 spectra for ALL features with isotope data. The MS2 filtering is done in the
|
|
235
|
+
# second pass where we specifically check for ms2_scans.
|
|
235
236
|
|
|
236
237
|
# Convert to list of dictionaries for faster iteration
|
|
237
238
|
features_list = features.to_dicts()
|
|
@@ -265,16 +266,42 @@ def export_mgf(
|
|
|
265
266
|
setattr(spec, attr, getattr(spec, attr)[mask])
|
|
266
267
|
return spec
|
|
267
268
|
|
|
268
|
-
def write_ion(f, title, fid, mz, rt, charge, spect):
|
|
269
|
+
def write_ion(f, title, fuid, fid, mz, rt, charge, spect):
|
|
269
270
|
if spect is None:
|
|
270
|
-
return
|
|
271
|
-
|
|
271
|
+
return "none"
|
|
272
|
+
|
|
273
|
+
# For MSLEVEL=2 ions, don't write empty spectra
|
|
274
|
+
ms_level = spect.ms_level if spect.ms_level is not None else 1
|
|
275
|
+
if ms_level > 1 and (len(spect.mz) == 0 or len(spect.inty) == 0):
|
|
276
|
+
return "empty_ms2"
|
|
277
|
+
|
|
278
|
+
# Create dynamic title based on MS level
|
|
279
|
+
if ms_level == 1:
|
|
280
|
+
# MS1: uid, rt, mz
|
|
281
|
+
dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}"
|
|
282
|
+
else:
|
|
283
|
+
# MS2: uid, rt, mz, energy
|
|
284
|
+
energy = spect.energy if hasattr(spect, 'energy') else 0
|
|
285
|
+
dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}, energy:{energy}"
|
|
286
|
+
|
|
287
|
+
f.write(f"BEGIN IONS\nTITLE={dynamic_title}\n")
|
|
288
|
+
f.write(f"FEATURE_UID={fuid}\n")
|
|
272
289
|
f.write(f"FEATURE_ID={fid}\n")
|
|
273
290
|
f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
|
|
291
|
+
|
|
274
292
|
if spect.ms_level is None:
|
|
275
293
|
f.write("MSLEVEL=1\n")
|
|
294
|
+
# Add PRECURSORINTENSITY for MS1 spectra
|
|
295
|
+
if len(spect.inty) > 0:
|
|
296
|
+
precursor_intensity = max(spect.inty)
|
|
297
|
+
f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
|
|
276
298
|
else:
|
|
277
299
|
f.write(f"MSLEVEL={spect.ms_level}\n")
|
|
300
|
+
# Add PRECURSORINTENSITY for MS1 spectra
|
|
301
|
+
if spect.ms_level == 1 and len(spect.inty) > 0:
|
|
302
|
+
precursor_intensity = max(spect.inty)
|
|
303
|
+
f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
|
|
304
|
+
|
|
278
305
|
if spect.ms_level is not None:
|
|
279
306
|
if spect.ms_level > 1 and hasattr(spect, "energy"):
|
|
280
307
|
f.write(f"ENERGY={spect.energy}\n")
|
|
@@ -285,6 +312,7 @@ def export_mgf(
|
|
|
285
312
|
]
|
|
286
313
|
f.writelines(peak_lines)
|
|
287
314
|
f.write("END IONS\n\n")
|
|
315
|
+
return "written"
|
|
288
316
|
|
|
289
317
|
if centroid_algo is None:
|
|
290
318
|
if hasattr(self.parameters, "centroid_algo"):
|
|
@@ -304,6 +332,9 @@ def export_mgf(
|
|
|
304
332
|
|
|
305
333
|
c = 0
|
|
306
334
|
skip = 0
|
|
335
|
+
empty_ms2_count = 0
|
|
336
|
+
ms1_spec_used_count = 0
|
|
337
|
+
ms1_fallback_count = 0
|
|
307
338
|
# check if features is empty
|
|
308
339
|
if len(features_list) == 0:
|
|
309
340
|
self.logger.warning("No features found.")
|
|
@@ -311,57 +342,78 @@ def export_mgf(
|
|
|
311
342
|
filename = os.path.abspath(filename)
|
|
312
343
|
with open(filename, "w", encoding="utf-8") as f:
|
|
313
344
|
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
345
|
+
|
|
346
|
+
# First pass: Export MS1 spectra for ALL features with ms1_spec data
|
|
314
347
|
for row in tqdm(
|
|
315
348
|
features_list,
|
|
316
349
|
total=len(features_list),
|
|
317
|
-
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export
|
|
350
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS1 spectra",
|
|
318
351
|
disable=tdqm_disable,
|
|
319
352
|
):
|
|
320
353
|
# Pre-calculate common values
|
|
321
354
|
feature_uid = row["feature_uid"]
|
|
355
|
+
feature_id = row["feature_id"] if "feature_id" in row else feature_uid
|
|
322
356
|
mz = row["mz"]
|
|
323
357
|
rt = row["rt"]
|
|
324
358
|
rt_str = f"{rt:.2f}"
|
|
325
359
|
mz_str = f"{mz:.4f}"
|
|
326
360
|
|
|
327
|
-
#
|
|
328
|
-
if row["
|
|
329
|
-
|
|
330
|
-
|
|
361
|
+
# Export MS1 spectrum for ALL features with ms1_spec data
|
|
362
|
+
if "ms1_spec" in row and row["ms1_spec"] is not None:
|
|
363
|
+
# Create spectrum from ms1_spec isotope pattern data
|
|
364
|
+
from masster.spectrum import Spectrum
|
|
365
|
+
|
|
366
|
+
iso_data = row["ms1_spec"]
|
|
367
|
+
if len(iso_data) >= 2: # Ensure we have mz and intensity arrays
|
|
368
|
+
ms1_mz = iso_data[0]
|
|
369
|
+
ms1_inty = iso_data[1]
|
|
370
|
+
|
|
371
|
+
# Create a Spectrum object from the isotope data
|
|
372
|
+
spect = Spectrum(
|
|
373
|
+
mz=np.array(ms1_mz),
|
|
374
|
+
inty=np.array(ms1_inty),
|
|
375
|
+
ms_level=1
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
charge = preferred_charge
|
|
379
|
+
if row["charge"] is not None and row["charge"] != 0:
|
|
380
|
+
charge = row["charge"]
|
|
331
381
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
382
|
+
write_ion(
|
|
383
|
+
f,
|
|
384
|
+
f"uid:{feature_uid}",
|
|
385
|
+
feature_uid,
|
|
386
|
+
feature_id,
|
|
387
|
+
mz,
|
|
388
|
+
rt,
|
|
389
|
+
charge,
|
|
390
|
+
spect,
|
|
391
|
+
)
|
|
392
|
+
ms1_spec_used_count += 1
|
|
393
|
+
else:
|
|
394
|
+
ms1_fallback_count += 1
|
|
395
|
+
else:
|
|
396
|
+
# No MS1 spectrum exported for features without ms1_spec data
|
|
397
|
+
ms1_fallback_count += 1
|
|
398
|
+
|
|
399
|
+
# Second pass: Export MS2 spectra for features with MS2 data
|
|
400
|
+
for row in tqdm(
|
|
401
|
+
features_list,
|
|
402
|
+
total=len(features_list),
|
|
403
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Export MS2 spectra",
|
|
404
|
+
disable=tdqm_disable,
|
|
405
|
+
):
|
|
406
|
+
# Pre-calculate common values
|
|
407
|
+
feature_uid = row["feature_uid"]
|
|
408
|
+
feature_id = row["feature_id"] if "feature_id" in row else feature_uid
|
|
409
|
+
mz = row["mz"]
|
|
410
|
+
rt = row["rt"]
|
|
411
|
+
rt_str = f"{rt:.2f}"
|
|
412
|
+
mz_str = f"{mz:.4f}"
|
|
363
413
|
|
|
414
|
+
# Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
|
|
364
415
|
if row["ms2_scans"] is None:
|
|
416
|
+
skip = skip + 1
|
|
365
417
|
continue
|
|
366
418
|
elif use_cache:
|
|
367
419
|
spect = row["ms2_specs"]
|
|
@@ -399,16 +451,20 @@ def export_mgf(
|
|
|
399
451
|
current_scan_uid = (
|
|
400
452
|
scan_uids[i] if i < len(scan_uids) else "unknown"
|
|
401
453
|
)
|
|
402
|
-
write_ion(
|
|
454
|
+
result = write_ion(
|
|
403
455
|
f,
|
|
404
|
-
f"
|
|
456
|
+
f"uid:{feature_uid}",
|
|
405
457
|
feature_uid,
|
|
458
|
+
feature_id,
|
|
406
459
|
mz,
|
|
407
460
|
rt,
|
|
408
461
|
charge,
|
|
409
462
|
s,
|
|
410
463
|
)
|
|
411
|
-
|
|
464
|
+
if result == "written":
|
|
465
|
+
c += 1
|
|
466
|
+
elif result == "empty_ms2":
|
|
467
|
+
empty_ms2_count += 1
|
|
412
468
|
continue # Skip the rest of the processing for this feature
|
|
413
469
|
|
|
414
470
|
# If we reach here, either use_cache=False or no cached spectra were available
|
|
@@ -455,16 +511,20 @@ def export_mgf(
|
|
|
455
511
|
eic_min=eic_corr_min,
|
|
456
512
|
q1_max=q1_ratio_max,
|
|
457
513
|
)
|
|
458
|
-
write_ion(
|
|
514
|
+
result = write_ion(
|
|
459
515
|
f,
|
|
460
|
-
f"
|
|
516
|
+
f"uid:{feature_uid}",
|
|
461
517
|
feature_uid,
|
|
518
|
+
feature_id,
|
|
462
519
|
mz,
|
|
463
520
|
rt,
|
|
464
521
|
charge,
|
|
465
522
|
spect,
|
|
466
523
|
)
|
|
467
|
-
|
|
524
|
+
if result == "written":
|
|
525
|
+
c += 1
|
|
526
|
+
elif result == "empty_ms2":
|
|
527
|
+
empty_ms2_count += 1
|
|
468
528
|
else:
|
|
469
529
|
if selection == "best":
|
|
470
530
|
ms2_scans = row["ms2_scans"][0]
|
|
@@ -482,16 +542,20 @@ def export_mgf(
|
|
|
482
542
|
eic_min=eic_corr_min,
|
|
483
543
|
q1_max=q1_ratio_max,
|
|
484
544
|
)
|
|
485
|
-
write_ion(
|
|
545
|
+
result = write_ion(
|
|
486
546
|
f,
|
|
487
|
-
f"
|
|
547
|
+
f"uid:{feature_uid}",
|
|
488
548
|
feature_uid,
|
|
549
|
+
feature_id,
|
|
489
550
|
mz,
|
|
490
551
|
rt,
|
|
491
552
|
charge,
|
|
492
553
|
spect,
|
|
493
554
|
)
|
|
494
|
-
|
|
555
|
+
if result == "written":
|
|
556
|
+
c += 1
|
|
557
|
+
elif result == "empty_ms2":
|
|
558
|
+
empty_ms2_count += 1
|
|
495
559
|
elif selection == "all":
|
|
496
560
|
if merge:
|
|
497
561
|
specs = []
|
|
@@ -527,7 +591,7 @@ def export_mgf(
|
|
|
527
591
|
)
|
|
528
592
|
if deisotope:
|
|
529
593
|
spect = spect.deisotope()
|
|
530
|
-
title = f"
|
|
594
|
+
title = f"uid:{feature_uid}"
|
|
531
595
|
spect = filter_peaks(
|
|
532
596
|
spect,
|
|
533
597
|
inty_min=inty_min,
|
|
@@ -535,16 +599,20 @@ def export_mgf(
|
|
|
535
599
|
eic_min=eic_corr_min,
|
|
536
600
|
q1_max=q1_ratio_max,
|
|
537
601
|
)
|
|
538
|
-
write_ion(
|
|
602
|
+
result = write_ion(
|
|
539
603
|
f,
|
|
540
604
|
title,
|
|
541
605
|
feature_uid,
|
|
606
|
+
feature_id,
|
|
542
607
|
mz,
|
|
543
608
|
rt,
|
|
544
609
|
charge,
|
|
545
610
|
spect,
|
|
546
611
|
)
|
|
547
|
-
|
|
612
|
+
if result == "written":
|
|
613
|
+
c += 1
|
|
614
|
+
elif result == "empty_ms2":
|
|
615
|
+
empty_ms2_count += 1
|
|
548
616
|
else:
|
|
549
617
|
for ms2_scans in row["ms2_scans"]:
|
|
550
618
|
spect = self.get_spectrum(
|
|
@@ -561,19 +629,27 @@ def export_mgf(
|
|
|
561
629
|
eic_min=eic_corr_min,
|
|
562
630
|
q1_max=q1_ratio_max,
|
|
563
631
|
)
|
|
564
|
-
write_ion(
|
|
632
|
+
result = write_ion(
|
|
565
633
|
f,
|
|
566
|
-
f"
|
|
634
|
+
f"uid:{feature_uid}",
|
|
567
635
|
feature_uid,
|
|
636
|
+
feature_id,
|
|
568
637
|
mz,
|
|
569
638
|
rt,
|
|
570
639
|
charge,
|
|
571
640
|
spect,
|
|
572
641
|
)
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
642
|
+
if result == "written":
|
|
643
|
+
c += 1
|
|
644
|
+
elif result == "empty_ms2":
|
|
645
|
+
empty_ms2_count += 1
|
|
646
|
+
|
|
647
|
+
self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
|
|
648
|
+
if empty_ms2_count > 0:
|
|
649
|
+
self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
|
|
650
|
+
if ms1_fallback_count > 0:
|
|
651
|
+
self.logger.info(f"Skipped MS1 export for {ms1_fallback_count} features without isotope patterns")
|
|
652
|
+
|
|
577
653
|
# Handle None values in logging
|
|
578
654
|
inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
|
|
579
655
|
q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
|
masster/spectrum.py
CHANGED
|
@@ -197,16 +197,65 @@ class Spectrum:
|
|
|
197
197
|
self.bl = None
|
|
198
198
|
|
|
199
199
|
def check_if_centroided(self) -> bool:
    """
    Heuristically decide whether the spectrum is centroided or profile data.

    The decision is based on the spacing between neighbouring m/z values
    and is evaluated in stages with early exits:

    1. Median spacing: above 0.02 means centroided, below 0.005 means
       profile.
    2. Fraction of spacings below 0.005: under 0.1 means centroided.
    3. Point density (points per m/z unit): above 100 means profile,
       below 10 means centroided.

    If no stage is decisive, the spectrum is reported as centroided when
    the median spacing exceeds 0.01.

    Spectra with no m/z data, fewer than five points, or only identical
    m/z values are reported as centroided.

    Returns:
        bool: True if centroided, False if profile.
    """
    # Accept None and plain sequences as well as numpy arrays, so that the
    # check never fails with an AttributeError on `.size`.
    if self.mz is None:
        return True
    mz = np.asarray(self.mz)

    if mz.size < 5:
        return True  # Too few points to determine, assume centroided

    # Only pay for a sort when the values are not already ascending.
    if np.all(mz[:-1] <= mz[1:]):
        sorted_mz = mz
    else:
        sorted_mz = np.sort(mz)

    # Spacing between neighbours; duplicates (zero spacing) carry no
    # information and are dropped.
    mz_diffs = np.diff(sorted_mz)
    mz_diffs = mz_diffs[mz_diffs > 0]

    if mz_diffs.size == 0:
        return True  # All identical m/z values

    # Stage 1: median spacing settles the clear-cut cases.
    median_diff = np.median(mz_diffs)
    if median_diff > 0.02:
        return True  # Clearly centroided
    if median_diff < 0.005:
        return False  # Clearly profile

    # Stage 2: share of very small gaps.
    # Here the median spacing is >= 0.005, so at most half of the spacings
    # can be below 0.005. A "mostly small gaps => profile" test can never
    # fire at this point; only the low-ratio test is meaningful.
    small_gap_ratio = np.sum(mz_diffs < 0.005) / mz_diffs.size
    if small_gap_ratio < 0.1:
        return True  # Low ratio of small gaps = centroided

    # Stage 3: overall point density across the covered m/z range.
    mz_range = sorted_mz[-1] - sorted_mz[0]
    if mz_range > 0:
        density = sorted_mz.size / mz_range
        if density > 100:  # High density = profile
            return False
        if density < 10:  # Low density = centroided
            return True

    # Final fallback: median threshold. Wrapped in bool() so callers get a
    # built-in bool rather than a numpy.bool_.
    return bool(median_diff > 0.01)
|
|
210
259
|
|
|
211
260
|
def reload(self):
|
|
212
261
|
modname = self.__class__.__module__
|