masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/export.py CHANGED
@@ -10,9 +10,9 @@ import polars as pl
 
 from tqdm import tqdm
 
-from master.spectrum import combine_peaks
-from master.study.defaults import export_mgf_defaults
-from master._version import get_version
+from masster.spectrum import combine_peaks
+from masster.study.defaults import export_mgf_defaults
+from masster._version import get_version
 
 
 def _get_mgf_df(self, **kwargs):
@@ -107,11 +107,7 @@ def _get_mgf_df(self, **kwargs):
 mask = mask & (spec.inty >= inty_min)
 for attr in spec.__dict__:
 arr = getattr(spec, attr)
-if (
-isinstance(arr, list | np.ndarray)
-and hasattr(arr, "__len__")
-and len(arr) == length
-):
+if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
 setattr(spec, attr, np.array(arr)[mask])
 return spec
 
@@ -121,12 +117,8 @@ def _get_mgf_df(self, **kwargs):
 return None
 
 # Prepare spectrum data
-spectrum_mz = (
-spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
-)
-spectrum_inty = (
-spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
-)
+spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
+spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
 
 # Determine MS level
 ms_level = spect.ms_level if spect.ms_level is not None else 1
@@ -266,11 +258,7 @@ def _get_mgf_df(self, **kwargs):
 
 elif selection == "all":
 if merge:
-specs = [
-row_e["spec"]
-for row_e in cons_ms2.iter_rows(named=True)
-if row_e["spec"] is not None
-]
+specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
 if not specs:
 continue
 spect = combine_peaks(specs)
@@ -422,6 +410,13 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 description (str, optional): Human-readable description.
 **kwargs: Additional metadata or export options.
 """
+
+def safe_str(value, default="null"):
+"""Convert value to string, replacing empty strings with 'null'"""
+if value is None:
+return default
+str_val = str(value)
+return str_val if str_val.strip() != "" else default
 if filename is None:
 filename = "study.mztab"
 if not os.path.isabs(filename):
@@ -435,16 +430,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 top_id_data = None
 try:
 # Import here to avoid circular imports
-from master.study.id import get_id
-
+from masster.study.id import get_id
 id_data = get_id(self)
 if id_data is not None and not id_data.is_empty():
 # Get top scoring identification for each consensus_uid for SML section
-top_id_data = (
-id_data.group_by("consensus_uid")
-.agg(pl.all().sort_by("score", descending=True).first())
-.sort("consensus_uid")
-)
+top_id_data = (id_data
+.group_by("consensus_uid")
+.agg(pl.all().sort_by("score", descending=True).first())
+.sort("consensus_uid"))
 else:
 self.logger.info("No identification data available for mzTab export")
 except Exception as e:
@@ -468,9 +461,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 
 # --- Prepare MTD (metadata) section ---
 mtd_lines = []
-mtd_lines.append(
-f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-)
+mtd_lines.append(f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
 id = self.label if self.label else self.folder
 mtd_lines.append(f"MTD\tmzTab-id\t{id}")
@@ -478,67 +469,58 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 mtd_lines.append("MTD\tcv[1]-label\tMS")
 mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
 mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
-mtd_lines.append(
-"MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
-)
+mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
 mtd_lines.append("")
-mtd_lines.append(
-"MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
-)
-mtd_lines.append(
-"MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
-)
+mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
+mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
 mtd_lines.append(
 "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
 )
-
+
 # Add identification confidence measures if identification data is available
 if id_data is not None:
-mtd_lines.append(
-"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
-)
+mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
 else:
-mtd_lines.append(
-"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
-)
-
+mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
+
 mtd_lines.append("")
 mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
 mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
-mtd_lines.append(
-"MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
-)
+mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
 mtd_lines.append("")
-
+
 # Database information - updated based on identification data
-if (
-id_data is not None
-and hasattr(self, "lib_df")
-and self.lib_df is not None
-and not self.lib_df.is_empty()
-):
+if id_data is not None and hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
 mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
 mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
 mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
-mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
+mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
 else:
-mtd_lines.append('MTD\tdatabase[1]\t[, , "no database", null]')
-mtd_lines.append("MTD\tdatabase[1]-prefix\tnull")
+mtd_lines.append('MTD\tdatabase[1]\t[, , "PubChem", ]')
+mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
 mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
-mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
-
+mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
+
 # Get abundance matrix to determine the number of assays needed
 abundance_matrix = self.get_consensus_matrix()
-
+
 # Get sample columns (excluding consensus_uid)
 sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
 n_assays = len(sample_columns)
-
+
 # Define samples, ms_runs, and assays based on the abundance matrix columns
+# Determine scan polarity based on study polarity
+study_polarity = getattr(self, 'polarity', 'positive')
+if study_polarity in ['negative', 'neg']:
+scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
+else:
+scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
+
 for i, sample_col in enumerate(sample_columns, 1):
 mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
 mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
 mtd_lines.append(f"MTD\tms_run[{i}]-location\tfile://unknown")
+mtd_lines.append(f"MTD\tms_run[{i}]-scan_polarity\t{scan_polarity_cv}")
 mtd_lines.append(f"MTD\tassay[{i}]\tAssay_{i}")
 mtd_lines.append(f"MTD\tassay[{i}]-sample_ref\tsample[{i}]")
 mtd_lines.append(f"MTD\tassay[{i}]-ms_run_ref\tms_run[{i}]")
@@ -575,24 +557,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # round to int - handle both Polars and Pandas DataFrames
 if hasattr(abundance_matrix, "with_columns"):
 # Polars DataFrame
-numeric_cols = [
-col
-for col in abundance_matrix.columns
-if abundance_matrix[col].dtype.is_numeric()
-]
-abundance_matrix = abundance_matrix.with_columns(
-[abundance_matrix[col].round(0) for col in numeric_cols],
-)
+numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
+abundance_matrix = abundance_matrix.with_columns([abundance_matrix[col].round(0) for col in numeric_cols])
 else:
 # Pandas DataFrame
 abundance_matrix = abundance_matrix.round(0)
 
 # Use the n_assays already calculated from abundance matrix columns
 sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
-sml_header += [
-"abundance_study_variable[1]",
-"abundance_variation_study_variable[1]",
-]
+sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
 sml_lines.append("\t".join(sml_header))
 
 # get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
@@ -602,7 +575,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # Use adduct_top if available, otherwise fall back to null
 if "adduct_top" in row and row["adduct_top"] is not None:
 adduct = str(row["adduct_top"])
-
+# Replace ? with H for better mzTab compatibility
+adduct = adduct.replace("?", "H")
+
 adduct_list.append(adduct)
 
 for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
@@ -613,56 +588,63 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
 if id_matches.height > 0:
 id_info = id_matches.row(0, named=True)
-
+
 # Populate identification fields
 database_identifier = "null"
 chemical_formula = "null"
 smiles_val = "null"
-inchi_val = "null"
+inchi_val = "null"
 chemical_name = "null"
 best_id_confidence_measure = "null"
 best_id_confidence_value = "null"
 reliability = "4" # Default: unknown compound
-
+theoretical_neutral_mass = "null" # Only set when we have database identification
+
 if id_info:
 # Use cmpd_uid as database identifier with prefix
 if id_info.get("cmpd_uid") is not None:
 database_identifier = f"cmpd:{id_info['cmpd_uid']}"
-
+
 # Chemical formula
 if id_info.get("formula") is not None and id_info["formula"] != "":
-chemical_formula = str(id_info["formula"])
-
+chemical_formula = safe_str(id_info["formula"])
+
 # SMILES
 if id_info.get("smiles") is not None and id_info["smiles"] != "":
-smiles_val = str(id_info["smiles"])
-
+smiles_val = safe_str(id_info["smiles"])
+
 # InChI
 if id_info.get("inchi") is not None and id_info["inchi"] != "":
-inchi_val = str(id_info["inchi"])
-
+inchi_val = safe_str(id_info["inchi"])
+
 # Chemical name
 if id_info.get("name") is not None and id_info["name"] != "":
-chemical_name = str(id_info["name"])
-
+chemical_name = safe_str(id_info["name"])
+
+# Theoretical neutral mass - only from identification data, not consensus_df
+if id_info.get("neutral_mass") is not None:
+theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
+elif id_info.get("mass") is not None:
+theoretical_neutral_mass = safe_str(id_info["mass"])
+
 # Identification confidence
 if id_info.get("matcher") is not None:
 best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
-
+
 if id_info.get("score") is not None:
-best_id_confidence_value = str(id_info["score"])
-
+best_id_confidence_value = safe_str(id_info["score"])
+
 # Set reliability based on identification quality
 # Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
 if id_info.get("score", 0) >= 0.8:
 reliability = "2a" # High confidence compound match
 elif id_info.get("score", 0) >= 0.5:
-reliability = "2b" # Moderate confidence match
+reliability = "2b" # Moderate confidence match
 elif id_info.get("score", 0) >= 0.2:
-reliability = "3" # Compound class level
+reliability = "3" # Compound class level
 else:
-reliability = "4" # Unknown compound
-
+reliability = "4" # Unknown compound
+
 # Get MGF indexes for this consensus feature
 mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
 
@@ -675,10 +657,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 smiles_val,
 inchi_val,
 chemical_name,
-str(row.get("uri", "null")),
-str(
-row.get("adduct_mass_neutral_top", "null"),
-), # Use calculated neutral mass from adduct analysis
+safe_str(row.get("uri", "null")),
+theoretical_neutral_mass, # Only set when database_identifier is not null
 adduct_list[idx - 1],
 reliability,
 best_id_confidence_measure,
@@ -688,24 +668,29 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # Add abundance values for each assay
 consensus_uid = row["consensus_uid"]
 # Check if consensus_uid exists in the abundance_matrix (Polars)
-filtered_matrix = abundance_matrix.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
 if filtered_matrix.height > 0:
 # Get the first (and should be only) matching row
 abundance_row = filtered_matrix.row(0, named=True)
 # Extract values excluding the consensus_uid column
-abundance_values = [
-abundance_row[col]
-for col in abundance_matrix.columns
-if col != "consensus_uid"
-]
-sml_row += [
-str(val) if val is not None else "null" for val in abundance_values
-]
+abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
+
+# Calculate study variable statistics
+non_null_values = [val for val in abundance_values if val is not None]
+if non_null_values:
+abundance_study_variable = sum(non_null_values) / len(non_null_values)
+abundance_variation_study_variable = (
+sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
+) ** 0.5 if len(non_null_values) > 1 else 0
+else:
+abundance_study_variable = "null"
+abundance_variation_study_variable = "null"
+
+sml_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
 else:
 sml_row += ["null"] * n_assays
-sml_row += ["null", "null"]
+sml_row += ["null", "null"] # Study variable columns
 sml_lines.append("\t".join(sml_row))
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
@@ -717,8 +702,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 smf_header = [
 "SFH",
 "SMF_ID",
-"SOME_ID_REFS",
-"SOME_ID_REF_ambiguity_code",
+"SME_ID_REFS",
+"SME_ID_REF_ambiguity_code",
 "adduct_ion",
 "isotopomer",
 "exp_mass_to_charge",
@@ -728,96 +713,99 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 "retention_time_in_seconds_end",
 ]
 smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
+smf_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
 smf_lines.append("\t".join(smf_header))
 
 # SMF table uses the same consensus features as SML, just different metadata
 for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-# References to SOME entries - each SMF can reference multiple SOME entries for the same consensus_uid
-some_refs = "null"
-some_ambiguity = "null"
+# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+sme_refs = "null"
+sme_ambiguity = "null"
 consensus_uid = row["consensus_uid"]
-
+
 if id_data is not None:
-# Find all SOME entries for this consensus_uid
-some_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
-if some_matches.height > 0:
-# Generate SOME IDs - we'll create a mapping in the SOME section
+# Find all SME entries for this consensus_uid
+sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
+if sme_matches.height > 0:
+# Generate SME IDs - we'll create a mapping in the SME section
 # For now, use a simple approach based on consensus_uid and lib_uid
-some_ids = []
-for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-# Create a unique SOME ID based on consensus_uid and position
-some_id_base = (
-consensus_uid * 1000
-) # Ensure uniqueness across consensus features
-some_id = some_id_base + i + 1
-some_ids.append(str(some_id))
-
-if some_ids:
-some_refs = "|".join(some_ids)
+sme_ids = []
+for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+# Create a unique SME ID based on consensus_uid and position
+sme_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
+sme_id = sme_id_base + i + 1
+sme_ids.append(str(sme_id))
+
+if sme_ids:
+sme_refs = "|".join(sme_ids)
 # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
-if len(some_ids) > 1:
+if len(sme_ids) > 1:
 # Check if all identifications point to the same compound
-unique_cmpds = {
-match["cmpd_uid"]
-for match in some_matches.iter_rows(named=True)
-if match.get("cmpd_uid") is not None
-}
+unique_cmpds = set(match["cmpd_uid"] for match in sme_matches.iter_rows(named=True)
+if match.get("cmpd_uid") is not None)
 if len(unique_cmpds) > 1:
-some_ambiguity = "1" # Ambiguous identification
+sme_ambiguity = "1" # Ambiguous identification
 else:
-some_ambiguity = "2" # Multiple evidence for same molecule
+sme_ambiguity = "2" # Multiple evidence for same molecule
 else:
-some_ambiguity = "null"
-
+sme_ambiguity = "null"
+
 smf_row = [
 "SMF",
 str(idx),
-some_refs,
-some_ambiguity,
+sme_refs,
+sme_ambiguity,
 adduct_list[idx - 1], # adduct_ion
-str(row.get("isotopomer", "null")),
-str(row.get("mz", "null")), # exp_mass_to_charge
-str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
-str(row.get("rt", "null")), # retention_time_in_seconds
-str(row.get("retention_time_in_seconds_start", "null")),
-str(row.get("retention_time_in_seconds_end", "null")),
+safe_str(row.get("isotopomer", "null")),
+safe_str(row.get("mz", "null")), # exp_mass_to_charge
+safe_str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
+safe_str(row.get("rt", "null")), # retention_time_in_seconds
+safe_str(row.get("retention_time_in_seconds_start", "null")),
+safe_str(row.get("retention_time_in_seconds_end", "null")),
 ]
 # Add abundance values for each assay - same as SML (Polars)
 consensus_uid = row["consensus_uid"]
-filtered_matrix = abundance_matrix.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
 if filtered_matrix.height > 0:
 # Get the first (and should be only) matching row
 abundance_row = filtered_matrix.row(0, named=True)
 # Extract values excluding the consensus_uid column
-abundance_values = [
-abundance_row[col]
-for col in abundance_matrix.columns
-if col != "consensus_uid"
-]
-smf_row += [
-str(val) if val is not None else "null" for val in abundance_values
-]
+abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
+smf_row += abundance_strings
+
+# Calculate study variable statistics (same as in SML section)
+non_null_values = [val for val in abundance_values if val is not None]
+if non_null_values:
+abundance_study_variable = sum(non_null_values) / len(non_null_values)
+abundance_variation_study_variable = (
+sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
+) ** 0.5 if len(non_null_values) > 1 else 0
+else:
+abundance_study_variable = "null"
+abundance_variation_study_variable = "null"
+
+smf_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
 else:
 smf_row += ["null"] * n_assays
+smf_row += ["null", "null"] # Study variable columns
 smf_lines.append("\t".join(smf_row))
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
 for line in smf_lines:
 f.write(line + "\n")
 
-# --- SOME (Small Molecule Evidence) table ---
+# --- SME (Small Molecule Evidence) table ---
 if id_data is not None and not id_data.is_empty():
-some_lines = []
-some_header = [
-"SHE",
-"SOME_ID",
+sme_lines = []
+sme_header = [
+"SEH",
+"SME_ID",
 "evidence_input_id",
 "database_identifier",
 "chemical_formula",
 "smiles",
-"inchi",
+"inchi",
 "chemical_name",
 "uri",
 "derivatized_form",
@@ -831,87 +819,81 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 "id_confidence_measure[1]",
 "rank",
 ]
-some_lines.append("\t".join(some_header))
-
-# Create SOME entries for all identification results
-for consensus_uid in (
-self.consensus_df.select("consensus_uid").to_series().unique()
-):
+sme_lines.append("\t".join(sme_header))
+
+# Create SME entries for all identification results
+for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
 # Get consensus feature data for this consensus_uid
-consensus_feature_data = self.consensus_df.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+consensus_feature_data = self.consensus_df.filter(pl.col("consensus_uid") == consensus_uid)
 if consensus_feature_data.height == 0:
 continue
 consensus_row = consensus_feature_data.row(0, named=True)
-
+
 # Get all identification results for this consensus feature
-some_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
-
-if some_matches.height > 0:
+sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
+
+if sme_matches.height > 0:
 # Sort by score descending to maintain rank order
-some_matches = some_matches.sort("score", descending=True)
-
-for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-# Generate unique SOME_ID
-some_id_base = consensus_uid * 1000
-some_id = some_id_base + i + 1
-
+sme_matches = sme_matches.sort("score", descending=True)
+
+for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+# Generate unique SME_ID
+sme_id_base = consensus_uid * 1000
+sme_id = sme_id_base + i + 1
+
 # Create evidence input ID - use consensus feature info
-evidence_id = f"consensus_feature:{consensus_uid}:mz={some_row.get('mz', 0):.4f}:rt={some_row.get('rt', 0):.2f}"
-
+evidence_id = f"consensus_feature:{consensus_uid}:mz={sme_row.get('mz', 0):.4f}:rt={sme_row.get('rt', 0):.2f}"
+
 # Database identifier with prefix
 db_id = "null"
-if some_row.get("cmpd_uid") is not None:
-db_id = f"cmpd:{some_row['cmpd_uid']}"
-
-# Get adduct information
+if sme_row.get("cmpd_uid") is not None:
+db_id = f"cmpd:{sme_row['cmpd_uid']}"
+
+# Get adduct information
 adduct_ion = "null"
-if some_row.get("adduct") is not None and some_row["adduct"] != "":
-adduct_ion = str(some_row["adduct"])
-
-# Spectra reference - reference to the consensus feature
-spectra_ref = f"consensus_feature:{consensus_uid}"
-
+if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
+adduct_ion = safe_str(sme_row["adduct"])
+# Replace ? with H for better mzTab compatibility
+adduct_ion = adduct_ion.replace("?", "H")
+
+# Spectra reference - reference to first ms_run with spectrum index 0
+spectra_ref = "ms_run[1]:spectrum=0"
+
 # Identification method
 id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
-if some_row.get("matcher") is not None:
-id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
-
+if sme_row.get("matcher") is not None:
+id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
+
 # MS level - assume MS1 for now
 ms_level = "[MS, MS:1000511, ms level, 1]"
-
-some_line = [
-"SOME",
-str(some_id),
+
+sme_line = [
+"SME",
+str(sme_id),
 evidence_id,
 db_id,
-str(some_row.get("formula", "null")),
-str(some_row.get("smiles", "null")),
-str(some_row.get("inchi", "null")),
-str(some_row.get("name", "null")),
+safe_str(sme_row.get("formula", "null")),
+safe_str(sme_row.get("smiles", "null")),
+safe_str(sme_row.get("inchi", "null")),
+safe_str(sme_row.get("name", "null")),
 "null", # uri - not available in current data
 "null", # derivatized_form
 adduct_ion,
-str(some_row.get("mz", "null")),
-str(
-consensus_row.get("adduct_charge_top", "1"),
-), # Use consensus feature's top adduct charge
-str(
-some_row.get("mz", "null"),
-), # theoretical m/z same as experimental for now
+safe_str(sme_row.get("mz", "null")),
+safe_str(consensus_row.get("adduct_charge_top", "1")), # Use consensus feature's top adduct charge
+safe_str(sme_row.get("mz", "null")), # theoretical m/z same as experimental for now
 spectra_ref,
 id_method,
 ms_level,
-str(some_row.get("score", "null")),
+safe_str(sme_row.get("score", "null")),
 str(i + 1), # rank within this consensus feature
 ]
-some_lines.append("\t".join(some_line))
-
-# Write SOME table
+sme_lines.append("\t".join(sme_line))
+
+# Write SME table
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
-for line in some_lines:
+for line in sme_lines:
 f.write(line + "\n")
 
 # --- MGF table ---
@@ -945,23 +927,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 spec_len = row["spec_len"] if row["spec_len"] is not None else 0
 
 # Format spectrum data as pipe-separated strings
-spec_mz_str = (
-"|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
-)
-spec_int_str = (
-"|".join([f"{int(inty)}" for inty in spectrum_inty])
-if spectrum_inty
-else ""
-)
+spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
+spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
 
 mgf_row = [
 "COM",
 "MGF",
 str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
 str(row["feature_id"]) if row["feature_id"] is not None else "null",
-f"{row['rtinseconds']:.2f}"
-if row["rtinseconds"] is not None
-else "null",
+f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
 f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
 "null", # prec_int - not available in current data
 str(row["energy"]) if row["energy"] is not None else "null",