masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/export.py
CHANGED
|
@@ -10,9 +10,9 @@ import polars as pl
|
|
|
10
10
|
|
|
11
11
|
from tqdm import tqdm
|
|
12
12
|
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
13
|
+
from masster.spectrum import combine_peaks
|
|
14
|
+
from masster.study.defaults import export_mgf_defaults
|
|
15
|
+
from masster._version import get_version
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def _get_mgf_df(self, **kwargs):
|
|
@@ -107,11 +107,7 @@ def _get_mgf_df(self, **kwargs):
|
|
|
107
107
|
mask = mask & (spec.inty >= inty_min)
|
|
108
108
|
for attr in spec.__dict__:
|
|
109
109
|
arr = getattr(spec, attr)
|
|
110
|
-
if (
|
|
111
|
-
isinstance(arr, list | np.ndarray)
|
|
112
|
-
and hasattr(arr, "__len__")
|
|
113
|
-
and len(arr) == length
|
|
114
|
-
):
|
|
110
|
+
if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
|
|
115
111
|
setattr(spec, attr, np.array(arr)[mask])
|
|
116
112
|
return spec
|
|
117
113
|
|
|
@@ -121,12 +117,8 @@ def _get_mgf_df(self, **kwargs):
|
|
|
121
117
|
return None
|
|
122
118
|
|
|
123
119
|
# Prepare spectrum data
|
|
124
|
-
spectrum_mz = (
|
|
125
|
-
|
|
126
|
-
)
|
|
127
|
-
spectrum_inty = (
|
|
128
|
-
spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
|
|
129
|
-
)
|
|
120
|
+
spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
|
|
121
|
+
spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
|
|
130
122
|
|
|
131
123
|
# Determine MS level
|
|
132
124
|
ms_level = spect.ms_level if spect.ms_level is not None else 1
|
|
@@ -266,11 +258,7 @@ def _get_mgf_df(self, **kwargs):
|
|
|
266
258
|
|
|
267
259
|
elif selection == "all":
|
|
268
260
|
if merge:
|
|
269
|
-
specs = [
|
|
270
|
-
row_e["spec"]
|
|
271
|
-
for row_e in cons_ms2.iter_rows(named=True)
|
|
272
|
-
if row_e["spec"] is not None
|
|
273
|
-
]
|
|
261
|
+
specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
|
|
274
262
|
if not specs:
|
|
275
263
|
continue
|
|
276
264
|
spect = combine_peaks(specs)
|
|
@@ -422,6 +410,13 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
422
410
|
description (str, optional): Human-readable description.
|
|
423
411
|
**kwargs: Additional metadata or export options.
|
|
424
412
|
"""
|
|
413
|
+
|
|
414
|
+
def safe_str(value, default="null"):
|
|
415
|
+
"""Convert value to string, replacing empty strings with 'null'"""
|
|
416
|
+
if value is None:
|
|
417
|
+
return default
|
|
418
|
+
str_val = str(value)
|
|
419
|
+
return str_val if str_val.strip() != "" else default
|
|
425
420
|
if filename is None:
|
|
426
421
|
filename = "study.mztab"
|
|
427
422
|
if not os.path.isabs(filename):
|
|
@@ -435,16 +430,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
435
430
|
top_id_data = None
|
|
436
431
|
try:
|
|
437
432
|
# Import here to avoid circular imports
|
|
438
|
-
from
|
|
439
|
-
|
|
433
|
+
from masster.study.id import get_id
|
|
440
434
|
id_data = get_id(self)
|
|
441
435
|
if id_data is not None and not id_data.is_empty():
|
|
442
436
|
# Get top scoring identification for each consensus_uid for SML section
|
|
443
|
-
top_id_data = (
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
)
|
|
437
|
+
top_id_data = (id_data
|
|
438
|
+
.group_by("consensus_uid")
|
|
439
|
+
.agg(pl.all().sort_by("score", descending=True).first())
|
|
440
|
+
.sort("consensus_uid"))
|
|
448
441
|
else:
|
|
449
442
|
self.logger.info("No identification data available for mzTab export")
|
|
450
443
|
except Exception as e:
|
|
@@ -468,9 +461,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
468
461
|
|
|
469
462
|
# --- Prepare MTD (metadata) section ---
|
|
470
463
|
mtd_lines = []
|
|
471
|
-
mtd_lines.append(
|
|
472
|
-
f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
|
|
473
|
-
)
|
|
464
|
+
mtd_lines.append(f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
474
465
|
mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
|
|
475
466
|
id = self.label if self.label else self.folder
|
|
476
467
|
mtd_lines.append(f"MTD\tmzTab-id\t{id}")
|
|
@@ -478,67 +469,58 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
478
469
|
mtd_lines.append("MTD\tcv[1]-label\tMS")
|
|
479
470
|
mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
|
|
480
471
|
mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
|
|
481
|
-
mtd_lines.append(
|
|
482
|
-
"MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
|
|
483
|
-
)
|
|
472
|
+
mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
|
|
484
473
|
mtd_lines.append("")
|
|
485
|
-
mtd_lines.append(
|
|
486
|
-
|
|
487
|
-
)
|
|
488
|
-
mtd_lines.append(
|
|
489
|
-
"MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
|
|
490
|
-
)
|
|
474
|
+
mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
|
|
475
|
+
mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
|
|
491
476
|
mtd_lines.append(
|
|
492
477
|
"MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
|
|
493
478
|
)
|
|
494
|
-
|
|
479
|
+
|
|
495
480
|
# Add identification confidence measures if identification data is available
|
|
496
481
|
if id_data is not None:
|
|
497
|
-
mtd_lines.append(
|
|
498
|
-
"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
|
|
499
|
-
)
|
|
482
|
+
mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
|
|
500
483
|
else:
|
|
501
|
-
mtd_lines.append(
|
|
502
|
-
|
|
503
|
-
)
|
|
504
|
-
|
|
484
|
+
mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
|
|
485
|
+
|
|
505
486
|
mtd_lines.append("")
|
|
506
487
|
mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
|
|
507
488
|
mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
|
|
508
|
-
mtd_lines.append(
|
|
509
|
-
"MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
|
|
510
|
-
)
|
|
489
|
+
mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
|
|
511
490
|
mtd_lines.append("")
|
|
512
|
-
|
|
491
|
+
|
|
513
492
|
# Database information - updated based on identification data
|
|
514
|
-
if (
|
|
515
|
-
id_data is not None
|
|
516
|
-
and hasattr(self, "lib_df")
|
|
517
|
-
and self.lib_df is not None
|
|
518
|
-
and not self.lib_df.is_empty()
|
|
519
|
-
):
|
|
493
|
+
if id_data is not None and hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
|
|
520
494
|
mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
|
|
521
495
|
mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
|
|
522
496
|
mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
|
|
523
|
-
mtd_lines.append("MTD\tdatabase[1]-uri\
|
|
497
|
+
mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
|
|
524
498
|
else:
|
|
525
|
-
mtd_lines.append('MTD\tdatabase[1]\t[, , "
|
|
526
|
-
mtd_lines.append("MTD\tdatabase[1]-prefix\
|
|
499
|
+
mtd_lines.append('MTD\tdatabase[1]\t[, , "PubChem", ]')
|
|
500
|
+
mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
|
|
527
501
|
mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
|
|
528
|
-
mtd_lines.append("MTD\tdatabase[1]-uri\
|
|
529
|
-
|
|
502
|
+
mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
|
|
503
|
+
|
|
530
504
|
# Get abundance matrix to determine the number of assays needed
|
|
531
505
|
abundance_matrix = self.get_consensus_matrix()
|
|
532
|
-
|
|
506
|
+
|
|
533
507
|
# Get sample columns (excluding consensus_uid)
|
|
534
508
|
sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
|
|
535
509
|
n_assays = len(sample_columns)
|
|
536
|
-
|
|
510
|
+
|
|
537
511
|
# Define samples, ms_runs, and assays based on the abundance matrix columns
|
|
512
|
+
# Determine scan polarity based on study polarity
|
|
513
|
+
study_polarity = getattr(self, 'polarity', 'positive')
|
|
514
|
+
if study_polarity in ['negative', 'neg']:
|
|
515
|
+
scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
|
|
516
|
+
else:
|
|
517
|
+
scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
|
|
518
|
+
|
|
538
519
|
for i, sample_col in enumerate(sample_columns, 1):
|
|
539
520
|
mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
|
|
540
521
|
mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
|
|
541
522
|
mtd_lines.append(f"MTD\tms_run[{i}]-location\tfile://unknown")
|
|
523
|
+
mtd_lines.append(f"MTD\tms_run[{i}]-scan_polarity\t{scan_polarity_cv}")
|
|
542
524
|
mtd_lines.append(f"MTD\tassay[{i}]\tAssay_{i}")
|
|
543
525
|
mtd_lines.append(f"MTD\tassay[{i}]-sample_ref\tsample[{i}]")
|
|
544
526
|
mtd_lines.append(f"MTD\tassay[{i}]-ms_run_ref\tms_run[{i}]")
|
|
@@ -575,24 +557,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
575
557
|
# round to int - handle both Polars and Pandas DataFrames
|
|
576
558
|
if hasattr(abundance_matrix, "with_columns"):
|
|
577
559
|
# Polars DataFrame
|
|
578
|
-
numeric_cols = [
|
|
579
|
-
|
|
580
|
-
for col in abundance_matrix.columns
|
|
581
|
-
if abundance_matrix[col].dtype.is_numeric()
|
|
582
|
-
]
|
|
583
|
-
abundance_matrix = abundance_matrix.with_columns(
|
|
584
|
-
[abundance_matrix[col].round(0) for col in numeric_cols],
|
|
585
|
-
)
|
|
560
|
+
numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
|
|
561
|
+
abundance_matrix = abundance_matrix.with_columns([abundance_matrix[col].round(0) for col in numeric_cols])
|
|
586
562
|
else:
|
|
587
563
|
# Pandas DataFrame
|
|
588
564
|
abundance_matrix = abundance_matrix.round(0)
|
|
589
565
|
|
|
590
566
|
# Use the n_assays already calculated from abundance matrix columns
|
|
591
567
|
sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
|
|
592
|
-
sml_header += [
|
|
593
|
-
"abundance_study_variable[1]",
|
|
594
|
-
"abundance_variation_study_variable[1]",
|
|
595
|
-
]
|
|
568
|
+
sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
|
|
596
569
|
sml_lines.append("\t".join(sml_header))
|
|
597
570
|
|
|
598
571
|
# get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
|
|
@@ -602,7 +575,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
602
575
|
# Use adduct_top if available, otherwise fall back to null
|
|
603
576
|
if "adduct_top" in row and row["adduct_top"] is not None:
|
|
604
577
|
adduct = str(row["adduct_top"])
|
|
605
|
-
|
|
578
|
+
# Replace ? with H for better mzTab compatibility
|
|
579
|
+
adduct = adduct.replace("?", "H")
|
|
580
|
+
|
|
606
581
|
adduct_list.append(adduct)
|
|
607
582
|
|
|
608
583
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
@@ -613,56 +588,63 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
613
588
|
id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
614
589
|
if id_matches.height > 0:
|
|
615
590
|
id_info = id_matches.row(0, named=True)
|
|
616
|
-
|
|
591
|
+
|
|
617
592
|
# Populate identification fields
|
|
618
593
|
database_identifier = "null"
|
|
619
594
|
chemical_formula = "null"
|
|
620
595
|
smiles_val = "null"
|
|
621
|
-
inchi_val = "null"
|
|
596
|
+
inchi_val = "null"
|
|
622
597
|
chemical_name = "null"
|
|
623
598
|
best_id_confidence_measure = "null"
|
|
624
599
|
best_id_confidence_value = "null"
|
|
625
600
|
reliability = "4" # Default: unknown compound
|
|
626
|
-
|
|
601
|
+
theoretical_neutral_mass = "null" # Only set when we have database identification
|
|
602
|
+
|
|
627
603
|
if id_info:
|
|
628
604
|
# Use cmpd_uid as database identifier with prefix
|
|
629
605
|
if id_info.get("cmpd_uid") is not None:
|
|
630
606
|
database_identifier = f"cmpd:{id_info['cmpd_uid']}"
|
|
631
|
-
|
|
607
|
+
|
|
632
608
|
# Chemical formula
|
|
633
609
|
if id_info.get("formula") is not None and id_info["formula"] != "":
|
|
634
|
-
chemical_formula =
|
|
635
|
-
|
|
610
|
+
chemical_formula = safe_str(id_info["formula"])
|
|
611
|
+
|
|
636
612
|
# SMILES
|
|
637
613
|
if id_info.get("smiles") is not None and id_info["smiles"] != "":
|
|
638
|
-
smiles_val =
|
|
639
|
-
|
|
614
|
+
smiles_val = safe_str(id_info["smiles"])
|
|
615
|
+
|
|
640
616
|
# InChI
|
|
641
617
|
if id_info.get("inchi") is not None and id_info["inchi"] != "":
|
|
642
|
-
inchi_val =
|
|
643
|
-
|
|
618
|
+
inchi_val = safe_str(id_info["inchi"])
|
|
619
|
+
|
|
644
620
|
# Chemical name
|
|
645
621
|
if id_info.get("name") is not None and id_info["name"] != "":
|
|
646
|
-
chemical_name =
|
|
647
|
-
|
|
622
|
+
chemical_name = safe_str(id_info["name"])
|
|
623
|
+
|
|
624
|
+
# Theoretical neutral mass - only from identification data, not consensus_df
|
|
625
|
+
if id_info.get("neutral_mass") is not None:
|
|
626
|
+
theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
|
|
627
|
+
elif id_info.get("mass") is not None:
|
|
628
|
+
theoretical_neutral_mass = safe_str(id_info["mass"])
|
|
629
|
+
|
|
648
630
|
# Identification confidence
|
|
649
631
|
if id_info.get("matcher") is not None:
|
|
650
632
|
best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
|
|
651
|
-
|
|
633
|
+
|
|
652
634
|
if id_info.get("score") is not None:
|
|
653
|
-
best_id_confidence_value =
|
|
654
|
-
|
|
635
|
+
best_id_confidence_value = safe_str(id_info["score"])
|
|
636
|
+
|
|
655
637
|
# Set reliability based on identification quality
|
|
656
638
|
# Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
|
|
657
639
|
if id_info.get("score", 0) >= 0.8:
|
|
658
640
|
reliability = "2a" # High confidence compound match
|
|
659
641
|
elif id_info.get("score", 0) >= 0.5:
|
|
660
|
-
reliability = "2b" # Moderate confidence match
|
|
642
|
+
reliability = "2b" # Moderate confidence match
|
|
661
643
|
elif id_info.get("score", 0) >= 0.2:
|
|
662
|
-
reliability = "3"
|
|
644
|
+
reliability = "3" # Compound class level
|
|
663
645
|
else:
|
|
664
|
-
reliability = "4"
|
|
665
|
-
|
|
646
|
+
reliability = "4" # Unknown compound
|
|
647
|
+
|
|
666
648
|
# Get MGF indexes for this consensus feature
|
|
667
649
|
mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
|
|
668
650
|
|
|
@@ -675,10 +657,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
675
657
|
smiles_val,
|
|
676
658
|
inchi_val,
|
|
677
659
|
chemical_name,
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
row.get("adduct_mass_neutral_top", "null"),
|
|
681
|
-
), # Use calculated neutral mass from adduct analysis
|
|
660
|
+
safe_str(row.get("uri", "null")),
|
|
661
|
+
theoretical_neutral_mass, # Only set when database_identifier is not null
|
|
682
662
|
adduct_list[idx - 1],
|
|
683
663
|
reliability,
|
|
684
664
|
best_id_confidence_measure,
|
|
@@ -688,24 +668,29 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
688
668
|
# Add abundance values for each assay
|
|
689
669
|
consensus_uid = row["consensus_uid"]
|
|
690
670
|
# Check if consensus_uid exists in the abundance_matrix (Polars)
|
|
691
|
-
filtered_matrix = abundance_matrix.filter(
|
|
692
|
-
pl.col("consensus_uid") == consensus_uid,
|
|
693
|
-
)
|
|
671
|
+
filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
|
|
694
672
|
if filtered_matrix.height > 0:
|
|
695
673
|
# Get the first (and should be only) matching row
|
|
696
674
|
abundance_row = filtered_matrix.row(0, named=True)
|
|
697
675
|
# Extract values excluding the consensus_uid column
|
|
698
|
-
abundance_values = [
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
]
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
676
|
+
abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
|
|
677
|
+
sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
|
|
678
|
+
|
|
679
|
+
# Calculate study variable statistics
|
|
680
|
+
non_null_values = [val for val in abundance_values if val is not None]
|
|
681
|
+
if non_null_values:
|
|
682
|
+
abundance_study_variable = sum(non_null_values) / len(non_null_values)
|
|
683
|
+
abundance_variation_study_variable = (
|
|
684
|
+
sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
|
|
685
|
+
) ** 0.5 if len(non_null_values) > 1 else 0
|
|
686
|
+
else:
|
|
687
|
+
abundance_study_variable = "null"
|
|
688
|
+
abundance_variation_study_variable = "null"
|
|
689
|
+
|
|
690
|
+
sml_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
|
|
706
691
|
else:
|
|
707
692
|
sml_row += ["null"] * n_assays
|
|
708
|
-
|
|
693
|
+
sml_row += ["null", "null"] # Study variable columns
|
|
709
694
|
sml_lines.append("\t".join(sml_row))
|
|
710
695
|
with open(filename, "a", encoding="utf-8") as f:
|
|
711
696
|
f.write("\n")
|
|
@@ -717,8 +702,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
717
702
|
smf_header = [
|
|
718
703
|
"SFH",
|
|
719
704
|
"SMF_ID",
|
|
720
|
-
"
|
|
721
|
-
"
|
|
705
|
+
"SME_ID_REFS",
|
|
706
|
+
"SME_ID_REF_ambiguity_code",
|
|
722
707
|
"adduct_ion",
|
|
723
708
|
"isotopomer",
|
|
724
709
|
"exp_mass_to_charge",
|
|
@@ -728,96 +713,99 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
728
713
|
"retention_time_in_seconds_end",
|
|
729
714
|
]
|
|
730
715
|
smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
|
|
716
|
+
smf_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
|
|
731
717
|
smf_lines.append("\t".join(smf_header))
|
|
732
718
|
|
|
733
719
|
# SMF table uses the same consensus features as SML, just different metadata
|
|
734
720
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
735
|
-
# References to
|
|
736
|
-
|
|
737
|
-
|
|
721
|
+
# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
|
|
722
|
+
sme_refs = "null"
|
|
723
|
+
sme_ambiguity = "null"
|
|
738
724
|
consensus_uid = row["consensus_uid"]
|
|
739
|
-
|
|
725
|
+
|
|
740
726
|
if id_data is not None:
|
|
741
|
-
# Find all
|
|
742
|
-
|
|
743
|
-
if
|
|
744
|
-
# Generate
|
|
727
|
+
# Find all SME entries for this consensus_uid
|
|
728
|
+
sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
729
|
+
if sme_matches.height > 0:
|
|
730
|
+
# Generate SME IDs - we'll create a mapping in the SME section
|
|
745
731
|
# For now, use a simple approach based on consensus_uid and lib_uid
|
|
746
|
-
|
|
747
|
-
for i,
|
|
748
|
-
# Create a unique
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
)
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
if some_ids:
|
|
756
|
-
some_refs = "|".join(some_ids)
|
|
732
|
+
sme_ids = []
|
|
733
|
+
for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
|
|
734
|
+
# Create a unique SME ID based on consensus_uid and position
|
|
735
|
+
sme_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
|
|
736
|
+
sme_id = sme_id_base + i + 1
|
|
737
|
+
sme_ids.append(str(sme_id))
|
|
738
|
+
|
|
739
|
+
if sme_ids:
|
|
740
|
+
sme_refs = "|".join(sme_ids)
|
|
757
741
|
# Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
|
|
758
|
-
if len(
|
|
742
|
+
if len(sme_ids) > 1:
|
|
759
743
|
# Check if all identifications point to the same compound
|
|
760
|
-
unique_cmpds =
|
|
761
|
-
|
|
762
|
-
for match in some_matches.iter_rows(named=True)
|
|
763
|
-
if match.get("cmpd_uid") is not None
|
|
764
|
-
}
|
|
744
|
+
unique_cmpds = set(match["cmpd_uid"] for match in sme_matches.iter_rows(named=True)
|
|
745
|
+
if match.get("cmpd_uid") is not None)
|
|
765
746
|
if len(unique_cmpds) > 1:
|
|
766
|
-
|
|
747
|
+
sme_ambiguity = "1" # Ambiguous identification
|
|
767
748
|
else:
|
|
768
|
-
|
|
749
|
+
sme_ambiguity = "2" # Multiple evidence for same molecule
|
|
769
750
|
else:
|
|
770
|
-
|
|
771
|
-
|
|
751
|
+
sme_ambiguity = "null"
|
|
752
|
+
|
|
772
753
|
smf_row = [
|
|
773
754
|
"SMF",
|
|
774
755
|
str(idx),
|
|
775
|
-
|
|
776
|
-
|
|
756
|
+
sme_refs,
|
|
757
|
+
sme_ambiguity,
|
|
777
758
|
adduct_list[idx - 1], # adduct_ion
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
759
|
+
safe_str(row.get("isotopomer", "null")),
|
|
760
|
+
safe_str(row.get("mz", "null")), # exp_mass_to_charge
|
|
761
|
+
safe_str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
|
|
762
|
+
safe_str(row.get("rt", "null")), # retention_time_in_seconds
|
|
763
|
+
safe_str(row.get("retention_time_in_seconds_start", "null")),
|
|
764
|
+
safe_str(row.get("retention_time_in_seconds_end", "null")),
|
|
784
765
|
]
|
|
785
766
|
# Add abundance values for each assay - same as SML (Polars)
|
|
786
767
|
consensus_uid = row["consensus_uid"]
|
|
787
|
-
filtered_matrix = abundance_matrix.filter(
|
|
788
|
-
pl.col("consensus_uid") == consensus_uid,
|
|
789
|
-
)
|
|
768
|
+
filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
|
|
790
769
|
if filtered_matrix.height > 0:
|
|
791
770
|
# Get the first (and should be only) matching row
|
|
792
771
|
abundance_row = filtered_matrix.row(0, named=True)
|
|
793
772
|
# Extract values excluding the consensus_uid column
|
|
794
|
-
abundance_values = [
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
773
|
+
abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
|
|
774
|
+
abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
|
|
775
|
+
smf_row += abundance_strings
|
|
776
|
+
|
|
777
|
+
# Calculate study variable statistics (same as in SML section)
|
|
778
|
+
non_null_values = [val for val in abundance_values if val is not None]
|
|
779
|
+
if non_null_values:
|
|
780
|
+
abundance_study_variable = sum(non_null_values) / len(non_null_values)
|
|
781
|
+
abundance_variation_study_variable = (
|
|
782
|
+
sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
|
|
783
|
+
) ** 0.5 if len(non_null_values) > 1 else 0
|
|
784
|
+
else:
|
|
785
|
+
abundance_study_variable = "null"
|
|
786
|
+
abundance_variation_study_variable = "null"
|
|
787
|
+
|
|
788
|
+
smf_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
|
|
802
789
|
else:
|
|
803
790
|
smf_row += ["null"] * n_assays
|
|
791
|
+
smf_row += ["null", "null"] # Study variable columns
|
|
804
792
|
smf_lines.append("\t".join(smf_row))
|
|
805
793
|
with open(filename, "a", encoding="utf-8") as f:
|
|
806
794
|
f.write("\n")
|
|
807
795
|
for line in smf_lines:
|
|
808
796
|
f.write(line + "\n")
|
|
809
797
|
|
|
810
|
-
# ---
|
|
798
|
+
# --- SME (Small Molecule Evidence) table ---
|
|
811
799
|
if id_data is not None and not id_data.is_empty():
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
"
|
|
815
|
-
"
|
|
800
|
+
sme_lines = []
|
|
801
|
+
sme_header = [
|
|
802
|
+
"SEH",
|
|
803
|
+
"SME_ID",
|
|
816
804
|
"evidence_input_id",
|
|
817
805
|
"database_identifier",
|
|
818
806
|
"chemical_formula",
|
|
819
807
|
"smiles",
|
|
820
|
-
"inchi",
|
|
808
|
+
"inchi",
|
|
821
809
|
"chemical_name",
|
|
822
810
|
"uri",
|
|
823
811
|
"derivatized_form",
|
|
@@ -831,87 +819,81 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
831
819
|
"id_confidence_measure[1]",
|
|
832
820
|
"rank",
|
|
833
821
|
]
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
# Create
|
|
837
|
-
for consensus_uid in (
|
|
838
|
-
self.consensus_df.select("consensus_uid").to_series().unique()
|
|
839
|
-
):
|
|
822
|
+
sme_lines.append("\t".join(sme_header))
|
|
823
|
+
|
|
824
|
+
# Create SME entries for all identification results
|
|
825
|
+
for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
|
|
840
826
|
# Get consensus feature data for this consensus_uid
|
|
841
|
-
consensus_feature_data = self.consensus_df.filter(
|
|
842
|
-
pl.col("consensus_uid") == consensus_uid,
|
|
843
|
-
)
|
|
827
|
+
consensus_feature_data = self.consensus_df.filter(pl.col("consensus_uid") == consensus_uid)
|
|
844
828
|
if consensus_feature_data.height == 0:
|
|
845
829
|
continue
|
|
846
830
|
consensus_row = consensus_feature_data.row(0, named=True)
|
|
847
|
-
|
|
831
|
+
|
|
848
832
|
# Get all identification results for this consensus feature
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
if
|
|
833
|
+
sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
834
|
+
|
|
835
|
+
if sme_matches.height > 0:
|
|
852
836
|
# Sort by score descending to maintain rank order
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
for i,
|
|
856
|
-
# Generate unique
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
837
|
+
sme_matches = sme_matches.sort("score", descending=True)
|
|
838
|
+
|
|
839
|
+
for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
|
|
840
|
+
# Generate unique SME_ID
|
|
841
|
+
sme_id_base = consensus_uid * 1000
|
|
842
|
+
sme_id = sme_id_base + i + 1
|
|
843
|
+
|
|
860
844
|
# Create evidence input ID - use consensus feature info
|
|
861
|
-
evidence_id = f"consensus_feature:{consensus_uid}:mz={
|
|
862
|
-
|
|
845
|
+
evidence_id = f"consensus_feature:{consensus_uid}:mz={sme_row.get('mz', 0):.4f}:rt={sme_row.get('rt', 0):.2f}"
|
|
846
|
+
|
|
863
847
|
# Database identifier with prefix
|
|
864
848
|
db_id = "null"
|
|
865
|
-
if
|
|
866
|
-
db_id = f"cmpd:{
|
|
867
|
-
|
|
868
|
-
# Get adduct information
|
|
849
|
+
if sme_row.get("cmpd_uid") is not None:
|
|
850
|
+
db_id = f"cmpd:{sme_row['cmpd_uid']}"
|
|
851
|
+
|
|
852
|
+
# Get adduct information
|
|
869
853
|
adduct_ion = "null"
|
|
870
|
-
if
|
|
871
|
-
adduct_ion =
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
854
|
+
if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
|
|
855
|
+
adduct_ion = safe_str(sme_row["adduct"])
|
|
856
|
+
# Replace ? with H for better mzTab compatibility
|
|
857
|
+
adduct_ion = adduct_ion.replace("?", "H")
|
|
858
|
+
|
|
859
|
+
# Spectra reference - reference to first ms_run with spectrum index 0
|
|
860
|
+
spectra_ref = "ms_run[1]:spectrum=0"
|
|
861
|
+
|
|
876
862
|
# Identification method
|
|
877
863
|
id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
|
|
878
|
-
if
|
|
879
|
-
id_method = f"[MS, MS:1002888, {
|
|
880
|
-
|
|
864
|
+
if sme_row.get("matcher") is not None:
|
|
865
|
+
id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
|
|
866
|
+
|
|
881
867
|
# MS level - assume MS1 for now
|
|
882
868
|
ms_level = "[MS, MS:1000511, ms level, 1]"
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
"
|
|
886
|
-
str(
|
|
869
|
+
|
|
870
|
+
sme_line = [
|
|
871
|
+
"SME",
|
|
872
|
+
str(sme_id),
|
|
887
873
|
evidence_id,
|
|
888
874
|
db_id,
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
875
|
+
safe_str(sme_row.get("formula", "null")),
|
|
876
|
+
safe_str(sme_row.get("smiles", "null")),
|
|
877
|
+
safe_str(sme_row.get("inchi", "null")),
|
|
878
|
+
safe_str(sme_row.get("name", "null")),
|
|
893
879
|
"null", # uri - not available in current data
|
|
894
880
|
"null", # derivatized_form
|
|
895
881
|
adduct_ion,
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
), # Use consensus feature's top adduct charge
|
|
900
|
-
str(
|
|
901
|
-
some_row.get("mz", "null"),
|
|
902
|
-
), # theoretical m/z same as experimental for now
|
|
882
|
+
safe_str(sme_row.get("mz", "null")),
|
|
883
|
+
safe_str(consensus_row.get("adduct_charge_top", "1")), # Use consensus feature's top adduct charge
|
|
884
|
+
safe_str(sme_row.get("mz", "null")), # theoretical m/z same as experimental for now
|
|
903
885
|
spectra_ref,
|
|
904
886
|
id_method,
|
|
905
887
|
ms_level,
|
|
906
|
-
|
|
888
|
+
safe_str(sme_row.get("score", "null")),
|
|
907
889
|
str(i + 1), # rank within this consensus feature
|
|
908
890
|
]
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
# Write
|
|
891
|
+
sme_lines.append("\t".join(sme_line))
|
|
892
|
+
|
|
893
|
+
# Write SME table
|
|
912
894
|
with open(filename, "a", encoding="utf-8") as f:
|
|
913
895
|
f.write("\n")
|
|
914
|
-
for line in
|
|
896
|
+
for line in sme_lines:
|
|
915
897
|
f.write(line + "\n")
|
|
916
898
|
|
|
917
899
|
# --- MGF table ---
|
|
@@ -945,23 +927,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
945
927
|
spec_len = row["spec_len"] if row["spec_len"] is not None else 0
|
|
946
928
|
|
|
947
929
|
# Format spectrum data as pipe-separated strings
|
|
948
|
-
spec_mz_str = (
|
|
949
|
-
|
|
950
|
-
)
|
|
951
|
-
spec_int_str = (
|
|
952
|
-
"|".join([f"{int(inty)}" for inty in spectrum_inty])
|
|
953
|
-
if spectrum_inty
|
|
954
|
-
else ""
|
|
955
|
-
)
|
|
930
|
+
spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
|
|
931
|
+
spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
|
|
956
932
|
|
|
957
933
|
mgf_row = [
|
|
958
934
|
"COM",
|
|
959
935
|
"MGF",
|
|
960
936
|
str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
|
|
961
937
|
str(row["feature_id"]) if row["feature_id"] is not None else "null",
|
|
962
|
-
f"{row['rtinseconds']:.2f}"
|
|
963
|
-
if row["rtinseconds"] is not None
|
|
964
|
-
else "null",
|
|
938
|
+
f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
|
|
965
939
|
f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
|
|
966
940
|
"null", # prec_int - not available in current data
|
|
967
941
|
str(row["energy"]) if row["energy"] is not None else "null",
|