masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +1 -1
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
- masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
- masster/data/libs/ccm.csv +120 -0
- masster/data/libs/urine.csv +4693 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- masster/logger.py +11 -11
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +338 -264
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +561 -282
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +4 -4
- masster/sample/load.py +31 -17
- masster/sample/parameters.py +1 -1
- masster/sample/plot.py +7 -7
- masster/sample/processing.py +117 -87
- masster/sample/sample.py +103 -90
- masster/sample/sample5_schema.json +196 -0
- masster/sample/save.py +35 -12
- masster/spectrum.py +1 -1
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +360 -210
- masster/study/h5.py +560 -158
- masster/study/helpers.py +496 -203
- masster/study/helpers_optimized.py +1 -1
- masster/study/id.py +538 -349
- masster/study/load.py +233 -143
- masster/study/plot.py +71 -71
- masster/study/processing.py +456 -254
- masster/study/save.py +15 -5
- masster/study/study.py +213 -131
- masster/study/study5_schema.json +360 -0
- masster-0.4.5.dist-info/METADATA +131 -0
- masster-0.4.5.dist-info/RECORD +71 -0
- masster-0.4.3.dist-info/METADATA +0 -791
- masster-0.4.3.dist-info/RECORD +0 -56
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/export.py
CHANGED
|
@@ -10,9 +10,9 @@ import polars as pl
|
|
|
10
10
|
|
|
11
11
|
from tqdm import tqdm
|
|
12
12
|
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
13
|
+
from masster.spectrum import combine_peaks
|
|
14
|
+
from masster.study.defaults import export_mgf_defaults
|
|
15
|
+
from masster._version import get_version
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def _get_mgf_df(self, **kwargs):
|
|
@@ -107,7 +107,11 @@ def _get_mgf_df(self, **kwargs):
|
|
|
107
107
|
mask = mask & (spec.inty >= inty_min)
|
|
108
108
|
for attr in spec.__dict__:
|
|
109
109
|
arr = getattr(spec, attr)
|
|
110
|
-
if
|
|
110
|
+
if (
|
|
111
|
+
isinstance(arr, list | np.ndarray)
|
|
112
|
+
and hasattr(arr, "__len__")
|
|
113
|
+
and len(arr) == length
|
|
114
|
+
):
|
|
111
115
|
setattr(spec, attr, np.array(arr)[mask])
|
|
112
116
|
return spec
|
|
113
117
|
|
|
@@ -117,8 +121,12 @@ def _get_mgf_df(self, **kwargs):
|
|
|
117
121
|
return None
|
|
118
122
|
|
|
119
123
|
# Prepare spectrum data
|
|
120
|
-
spectrum_mz =
|
|
121
|
-
|
|
124
|
+
spectrum_mz = (
|
|
125
|
+
spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
|
|
126
|
+
)
|
|
127
|
+
spectrum_inty = (
|
|
128
|
+
spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
|
|
129
|
+
)
|
|
122
130
|
|
|
123
131
|
# Determine MS level
|
|
124
132
|
ms_level = spect.ms_level if spect.ms_level is not None else 1
|
|
@@ -258,7 +266,11 @@ def _get_mgf_df(self, **kwargs):
|
|
|
258
266
|
|
|
259
267
|
elif selection == "all":
|
|
260
268
|
if merge:
|
|
261
|
-
specs = [
|
|
269
|
+
specs = [
|
|
270
|
+
row_e["spec"]
|
|
271
|
+
for row_e in cons_ms2.iter_rows(named=True)
|
|
272
|
+
if row_e["spec"] is not None
|
|
273
|
+
]
|
|
262
274
|
if not specs:
|
|
263
275
|
continue
|
|
264
276
|
spect = combine_peaks(specs)
|
|
@@ -410,13 +422,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
410
422
|
description (str, optional): Human-readable description.
|
|
411
423
|
**kwargs: Additional metadata or export options.
|
|
412
424
|
"""
|
|
413
|
-
|
|
425
|
+
|
|
414
426
|
def safe_str(value, default="null"):
|
|
415
427
|
"""Convert value to string, replacing empty strings with 'null'"""
|
|
416
428
|
if value is None:
|
|
417
429
|
return default
|
|
418
430
|
str_val = str(value)
|
|
419
431
|
return str_val if str_val.strip() != "" else default
|
|
432
|
+
|
|
420
433
|
if filename is None:
|
|
421
434
|
filename = "study.mztab"
|
|
422
435
|
if not os.path.isabs(filename):
|
|
@@ -431,17 +444,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
431
444
|
full_id_data = None
|
|
432
445
|
try:
|
|
433
446
|
# Import here to avoid circular imports
|
|
434
|
-
from
|
|
435
|
-
|
|
447
|
+
from masster.study.id import get_id
|
|
448
|
+
|
|
449
|
+
# Get full enriched identification data for SME section
|
|
436
450
|
full_id_data = get_id(self)
|
|
437
451
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
438
452
|
# Get top scoring identification for each consensus_uid for SML section
|
|
439
|
-
top_id_data = (
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
453
|
+
top_id_data = (
|
|
454
|
+
full_id_data.group_by("consensus_uid")
|
|
455
|
+
.agg(pl.all().sort_by("score", descending=True).first())
|
|
456
|
+
.sort("consensus_uid")
|
|
457
|
+
)
|
|
443
458
|
# Keep raw id_data for backward compatibility (if needed elsewhere)
|
|
444
|
-
id_data =
|
|
459
|
+
id_data = (
|
|
460
|
+
self.id_df
|
|
461
|
+
if hasattr(self, "id_df") and self.id_df is not None
|
|
462
|
+
else None
|
|
463
|
+
)
|
|
445
464
|
else:
|
|
446
465
|
self.logger.info("No identification data available for mzTab export")
|
|
447
466
|
except Exception as e:
|
|
@@ -466,7 +485,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
466
485
|
|
|
467
486
|
# --- Prepare MTD (metadata) section ---
|
|
468
487
|
mtd_lines = []
|
|
469
|
-
mtd_lines.append(
|
|
488
|
+
mtd_lines.append(
|
|
489
|
+
f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
|
|
490
|
+
)
|
|
470
491
|
mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
|
|
471
492
|
id = self.label if self.label else self.folder
|
|
472
493
|
mtd_lines.append(f"MTD\tmzTab-id\t{id}")
|
|
@@ -474,28 +495,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
474
495
|
mtd_lines.append("MTD\tcv[1]-label\tMS")
|
|
475
496
|
mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
|
|
476
497
|
mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
|
|
477
|
-
mtd_lines.append(
|
|
498
|
+
mtd_lines.append(
|
|
499
|
+
"MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
|
|
500
|
+
)
|
|
478
501
|
mtd_lines.append("")
|
|
479
|
-
mtd_lines.append(
|
|
480
|
-
|
|
502
|
+
mtd_lines.append(
|
|
503
|
+
"MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
|
|
504
|
+
)
|
|
505
|
+
mtd_lines.append(
|
|
506
|
+
"MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
|
|
507
|
+
)
|
|
481
508
|
mtd_lines.append(
|
|
482
509
|
"MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
|
|
483
510
|
)
|
|
484
|
-
|
|
511
|
+
|
|
485
512
|
# Add identification confidence measures if identification data is available
|
|
486
513
|
if full_id_data is not None:
|
|
487
|
-
mtd_lines.append(
|
|
514
|
+
mtd_lines.append(
|
|
515
|
+
"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
|
|
516
|
+
)
|
|
488
517
|
else:
|
|
489
|
-
mtd_lines.append(
|
|
490
|
-
|
|
518
|
+
mtd_lines.append(
|
|
519
|
+
"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
|
|
520
|
+
)
|
|
521
|
+
|
|
491
522
|
mtd_lines.append("")
|
|
492
523
|
mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
|
|
493
524
|
mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
|
|
494
|
-
mtd_lines.append(
|
|
525
|
+
mtd_lines.append(
|
|
526
|
+
"MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
|
|
527
|
+
)
|
|
495
528
|
mtd_lines.append("")
|
|
496
|
-
|
|
529
|
+
|
|
497
530
|
# Database information - updated based on identification data
|
|
498
|
-
if
|
|
531
|
+
if (
|
|
532
|
+
full_id_data is not None
|
|
533
|
+
and hasattr(self, "lib_df")
|
|
534
|
+
and self.lib_df is not None
|
|
535
|
+
and not self.lib_df.is_empty()
|
|
536
|
+
):
|
|
499
537
|
mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
|
|
500
538
|
mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
|
|
501
539
|
mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
|
|
@@ -505,22 +543,22 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
505
543
|
mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
|
|
506
544
|
mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
|
|
507
545
|
mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
|
|
508
|
-
|
|
546
|
+
|
|
509
547
|
# Get abundance matrix to determine the number of assays needed
|
|
510
548
|
abundance_matrix = self.get_consensus_matrix()
|
|
511
|
-
|
|
549
|
+
|
|
512
550
|
# Get sample columns (excluding consensus_uid)
|
|
513
551
|
sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
|
|
514
552
|
n_assays = len(sample_columns)
|
|
515
|
-
|
|
553
|
+
|
|
516
554
|
# Define samples, ms_runs, and assays based on the abundance matrix columns
|
|
517
555
|
# Determine scan polarity based on study polarity
|
|
518
|
-
study_polarity = getattr(self,
|
|
519
|
-
if study_polarity in [
|
|
556
|
+
study_polarity = getattr(self, "polarity", "positive")
|
|
557
|
+
if study_polarity in ["negative", "neg"]:
|
|
520
558
|
scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
|
|
521
559
|
else:
|
|
522
560
|
scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
|
|
523
|
-
|
|
561
|
+
|
|
524
562
|
for i, sample_col in enumerate(sample_columns, 1):
|
|
525
563
|
mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
|
|
526
564
|
mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
|
|
@@ -562,15 +600,24 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
562
600
|
# round to int - handle both Polars and Pandas DataFrames
|
|
563
601
|
if hasattr(abundance_matrix, "with_columns"):
|
|
564
602
|
# Polars DataFrame
|
|
565
|
-
numeric_cols = [
|
|
566
|
-
|
|
603
|
+
numeric_cols = [
|
|
604
|
+
col
|
|
605
|
+
for col in abundance_matrix.columns
|
|
606
|
+
if abundance_matrix[col].dtype.is_numeric()
|
|
607
|
+
]
|
|
608
|
+
abundance_matrix = abundance_matrix.with_columns(
|
|
609
|
+
[abundance_matrix[col].round(0) for col in numeric_cols],
|
|
610
|
+
)
|
|
567
611
|
else:
|
|
568
612
|
# Pandas DataFrame
|
|
569
613
|
abundance_matrix = abundance_matrix.round(0)
|
|
570
614
|
|
|
571
615
|
# Use the n_assays already calculated from abundance matrix columns
|
|
572
616
|
sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
|
|
573
|
-
sml_header += [
|
|
617
|
+
sml_header += [
|
|
618
|
+
"abundance_study_variable[1]",
|
|
619
|
+
"abundance_variation_study_variable[1]",
|
|
620
|
+
]
|
|
574
621
|
sml_lines.append("\t".join(sml_header))
|
|
575
622
|
|
|
576
623
|
# get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
|
|
@@ -582,7 +629,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
582
629
|
adduct = str(row["adduct_top"])
|
|
583
630
|
# Replace ? with H for better mzTab compatibility
|
|
584
631
|
adduct = adduct.replace("?", "H")
|
|
585
|
-
|
|
632
|
+
|
|
586
633
|
adduct_list.append(adduct)
|
|
587
634
|
|
|
588
635
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
@@ -593,63 +640,65 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
593
640
|
id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
594
641
|
if id_matches.height > 0:
|
|
595
642
|
id_info = id_matches.row(0, named=True)
|
|
596
|
-
|
|
643
|
+
|
|
597
644
|
# Populate identification fields
|
|
598
645
|
database_identifier = "null"
|
|
599
646
|
chemical_formula = "null"
|
|
600
647
|
smiles_val = "null"
|
|
601
|
-
inchi_val = "null"
|
|
648
|
+
inchi_val = "null"
|
|
602
649
|
chemical_name = "null"
|
|
603
650
|
best_id_confidence_measure = "null"
|
|
604
651
|
best_id_confidence_value = "null"
|
|
605
652
|
reliability = "4" # Default: unknown compound
|
|
606
|
-
theoretical_neutral_mass =
|
|
607
|
-
|
|
653
|
+
theoretical_neutral_mass = (
|
|
654
|
+
"null" # Only set when we have database identification
|
|
655
|
+
)
|
|
656
|
+
|
|
608
657
|
if id_info:
|
|
609
658
|
# Use cmpd_uid as database identifier with prefix
|
|
610
659
|
if id_info.get("cmpd_uid") is not None:
|
|
611
660
|
database_identifier = f"cmpd:{id_info['cmpd_uid']}"
|
|
612
|
-
|
|
661
|
+
|
|
613
662
|
# Chemical formula
|
|
614
663
|
if id_info.get("formula") is not None and id_info["formula"] != "":
|
|
615
664
|
chemical_formula = safe_str(id_info["formula"])
|
|
616
|
-
|
|
665
|
+
|
|
617
666
|
# SMILES
|
|
618
667
|
if id_info.get("smiles") is not None and id_info["smiles"] != "":
|
|
619
668
|
smiles_val = safe_str(id_info["smiles"])
|
|
620
|
-
|
|
669
|
+
|
|
621
670
|
# InChI
|
|
622
671
|
if id_info.get("inchi") is not None and id_info["inchi"] != "":
|
|
623
672
|
inchi_val = safe_str(id_info["inchi"])
|
|
624
|
-
|
|
673
|
+
|
|
625
674
|
# Chemical name
|
|
626
675
|
if id_info.get("name") is not None and id_info["name"] != "":
|
|
627
676
|
chemical_name = safe_str(id_info["name"])
|
|
628
|
-
|
|
677
|
+
|
|
629
678
|
# Theoretical neutral mass - only from identification data, not consensus_df
|
|
630
679
|
if id_info.get("neutral_mass") is not None:
|
|
631
680
|
theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
|
|
632
681
|
elif id_info.get("mass") is not None:
|
|
633
682
|
theoretical_neutral_mass = safe_str(id_info["mass"])
|
|
634
|
-
|
|
683
|
+
|
|
635
684
|
# Identification confidence
|
|
636
685
|
if id_info.get("matcher") is not None:
|
|
637
686
|
best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
|
|
638
|
-
|
|
687
|
+
|
|
639
688
|
if id_info.get("score") is not None:
|
|
640
689
|
best_id_confidence_value = safe_str(id_info["score"])
|
|
641
|
-
|
|
690
|
+
|
|
642
691
|
# Set reliability based on identification quality
|
|
643
692
|
# Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
|
|
644
693
|
if id_info.get("score", 0) >= 0.8:
|
|
645
694
|
reliability = "2a" # High confidence compound match
|
|
646
695
|
elif id_info.get("score", 0) >= 0.5:
|
|
647
|
-
reliability = "2b" # Moderate confidence match
|
|
696
|
+
reliability = "2b" # Moderate confidence match
|
|
648
697
|
elif id_info.get("score", 0) >= 0.2:
|
|
649
|
-
reliability = "3"
|
|
698
|
+
reliability = "3" # Compound class level
|
|
650
699
|
else:
|
|
651
|
-
reliability = "4"
|
|
652
|
-
|
|
700
|
+
reliability = "4" # Unknown compound
|
|
701
|
+
|
|
653
702
|
# Get MGF indexes for this consensus feature
|
|
654
703
|
mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
|
|
655
704
|
|
|
@@ -673,26 +722,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
673
722
|
# Add abundance values for each assay
|
|
674
723
|
consensus_uid = row["consensus_uid"]
|
|
675
724
|
# Check if consensus_uid exists in the abundance_matrix (Polars)
|
|
676
|
-
filtered_matrix = abundance_matrix.filter(
|
|
725
|
+
filtered_matrix = abundance_matrix.filter(
|
|
726
|
+
pl.col("consensus_uid") == consensus_uid,
|
|
727
|
+
)
|
|
677
728
|
if filtered_matrix.height > 0:
|
|
678
729
|
# Get the first (and should be only) matching row
|
|
679
730
|
abundance_row = filtered_matrix.row(0, named=True)
|
|
680
731
|
# Extract values excluding the consensus_uid column
|
|
681
|
-
abundance_values = [
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
732
|
+
abundance_values = [
|
|
733
|
+
abundance_row[col]
|
|
734
|
+
for col in abundance_matrix.columns
|
|
735
|
+
if col != "consensus_uid"
|
|
736
|
+
]
|
|
737
|
+
sml_row += [
|
|
738
|
+
safe_str(val) if val is not None else "null" for val in abundance_values
|
|
739
|
+
]
|
|
740
|
+
|
|
741
|
+
# Calculate study variable statistics
|
|
685
742
|
non_null_values = [val for val in abundance_values if val is not None]
|
|
686
743
|
if non_null_values:
|
|
687
744
|
abundance_study_variable = sum(non_null_values) / len(non_null_values)
|
|
688
745
|
abundance_variation_study_variable = (
|
|
689
|
-
|
|
690
|
-
|
|
746
|
+
(
|
|
747
|
+
sum(
|
|
748
|
+
(x - abundance_study_variable) ** 2 for x in non_null_values
|
|
749
|
+
)
|
|
750
|
+
/ len(non_null_values)
|
|
751
|
+
)
|
|
752
|
+
** 0.5
|
|
753
|
+
if len(non_null_values) > 1
|
|
754
|
+
else 0
|
|
755
|
+
)
|
|
691
756
|
else:
|
|
692
757
|
abundance_study_variable = "null"
|
|
693
758
|
abundance_variation_study_variable = "null"
|
|
694
|
-
|
|
695
|
-
sml_row += [
|
|
759
|
+
|
|
760
|
+
sml_row += [
|
|
761
|
+
safe_str(abundance_study_variable),
|
|
762
|
+
safe_str(abundance_variation_study_variable),
|
|
763
|
+
]
|
|
696
764
|
else:
|
|
697
765
|
sml_row += ["null"] * n_assays
|
|
698
766
|
sml_row += ["null", "null"] # Study variable columns
|
|
@@ -707,8 +775,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
707
775
|
smf_header = [
|
|
708
776
|
"SFH",
|
|
709
777
|
"SMF_ID",
|
|
710
|
-
"
|
|
711
|
-
"
|
|
778
|
+
"SME_ID_REFS",
|
|
779
|
+
"SME_ID_REF_ambiguity_code",
|
|
712
780
|
"adduct_ion",
|
|
713
781
|
"isotopomer",
|
|
714
782
|
"exp_mass_to_charge",
|
|
@@ -718,86 +786,115 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
718
786
|
"retention_time_in_seconds_end",
|
|
719
787
|
]
|
|
720
788
|
smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
|
|
721
|
-
smf_header += [
|
|
789
|
+
smf_header += [
|
|
790
|
+
"abundance_study_variable[1]",
|
|
791
|
+
"abundance_variation_study_variable[1]",
|
|
792
|
+
]
|
|
722
793
|
smf_lines.append("\t".join(smf_header))
|
|
723
794
|
|
|
724
795
|
# SMF table uses the same consensus features as SML, just different metadata
|
|
725
796
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
726
|
-
# References to
|
|
727
|
-
|
|
728
|
-
|
|
797
|
+
# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
|
|
798
|
+
some_refs = "null"
|
|
799
|
+
some_ambiguity = "null"
|
|
729
800
|
consensus_uid = row["consensus_uid"]
|
|
730
|
-
|
|
801
|
+
|
|
731
802
|
if full_id_data is not None:
|
|
732
|
-
# Find all
|
|
733
|
-
|
|
734
|
-
if
|
|
735
|
-
# Generate
|
|
803
|
+
# Find all SME entries for this consensus_uid
|
|
804
|
+
some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
805
|
+
if some_matches.height > 0:
|
|
806
|
+
# Generate SME IDs - we'll create a mapping in the SME section
|
|
736
807
|
# For now, use a simple approach based on consensus_uid and lib_uid
|
|
737
|
-
|
|
738
|
-
for i,
|
|
739
|
-
# Create a unique
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
808
|
+
some_ids = []
|
|
809
|
+
for i, some_row in enumerate(some_matches.iter_rows(named=True)):
|
|
810
|
+
# Create a unique SME ID based on consensus_uid and position
|
|
811
|
+
some_id_base = (
|
|
812
|
+
consensus_uid * 1000
|
|
813
|
+
) # Ensure uniqueness across consensus features
|
|
814
|
+
some_id = some_id_base + i + 1
|
|
815
|
+
some_ids.append(str(some_id))
|
|
816
|
+
|
|
817
|
+
if some_ids:
|
|
818
|
+
some_refs = "|".join(some_ids)
|
|
746
819
|
# Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
|
|
747
|
-
if len(
|
|
820
|
+
if len(some_ids) > 1:
|
|
748
821
|
# Check if all identifications point to the same compound
|
|
749
|
-
unique_cmpds =
|
|
750
|
-
|
|
822
|
+
unique_cmpds = {
|
|
823
|
+
match["cmpd_uid"]
|
|
824
|
+
for match in some_matches.iter_rows(named=True)
|
|
825
|
+
if match.get("cmpd_uid") is not None
|
|
826
|
+
}
|
|
751
827
|
if len(unique_cmpds) > 1:
|
|
752
|
-
|
|
828
|
+
some_ambiguity = "1" # Ambiguous identification
|
|
753
829
|
else:
|
|
754
|
-
|
|
830
|
+
some_ambiguity = "2" # Multiple evidence for same molecule
|
|
755
831
|
else:
|
|
756
|
-
|
|
757
|
-
|
|
832
|
+
some_ambiguity = "null"
|
|
833
|
+
|
|
758
834
|
# Format isotopomer according to mzTab-M specification
|
|
759
835
|
iso_value = row.get("iso_mean", 0)
|
|
760
836
|
if iso_value is not None and round(iso_value) != 0:
|
|
761
|
-
isotopomer = f
|
|
837
|
+
isotopomer = f'[MS,MS:1002957,"isotopomer MS peak","+{round(iso_value)}"]'
|
|
762
838
|
else:
|
|
763
839
|
isotopomer = "null"
|
|
764
|
-
|
|
840
|
+
|
|
765
841
|
smf_row = [
|
|
766
842
|
"SMF",
|
|
767
843
|
str(idx),
|
|
768
|
-
|
|
769
|
-
|
|
844
|
+
some_refs,
|
|
845
|
+
some_ambiguity,
|
|
770
846
|
adduct_list[idx - 1], # adduct_ion
|
|
771
847
|
isotopomer, # isotopomer formatted according to mzTab-M specification
|
|
772
848
|
safe_str(row.get("mz", "null")), # exp_mass_to_charge
|
|
773
|
-
safe_str(
|
|
849
|
+
safe_str(
|
|
850
|
+
row.get("adduct_charge_top", "null"),
|
|
851
|
+
), # Use top-ranked adduct charge
|
|
774
852
|
safe_str(row.get("rt", "null")), # retention_time_in_seconds
|
|
775
853
|
safe_str(row.get("retention_time_in_seconds_start", "null")),
|
|
776
854
|
safe_str(row.get("retention_time_in_seconds_end", "null")),
|
|
777
855
|
]
|
|
778
856
|
# Add abundance values for each assay - same as SML (Polars)
|
|
779
857
|
consensus_uid = row["consensus_uid"]
|
|
780
|
-
filtered_matrix = abundance_matrix.filter(
|
|
858
|
+
filtered_matrix = abundance_matrix.filter(
|
|
859
|
+
pl.col("consensus_uid") == consensus_uid,
|
|
860
|
+
)
|
|
781
861
|
if filtered_matrix.height > 0:
|
|
782
862
|
# Get the first (and should be only) matching row
|
|
783
863
|
abundance_row = filtered_matrix.row(0, named=True)
|
|
784
864
|
# Extract values excluding the consensus_uid column
|
|
785
|
-
abundance_values = [
|
|
786
|
-
|
|
865
|
+
abundance_values = [
|
|
866
|
+
abundance_row[col]
|
|
867
|
+
for col in abundance_matrix.columns
|
|
868
|
+
if col != "consensus_uid"
|
|
869
|
+
]
|
|
870
|
+
abundance_strings = [
|
|
871
|
+
safe_str(val) if val is not None else "null" for val in abundance_values
|
|
872
|
+
]
|
|
787
873
|
smf_row += abundance_strings
|
|
788
|
-
|
|
874
|
+
|
|
789
875
|
# Calculate study variable statistics (same as in SML section)
|
|
790
876
|
non_null_values = [val for val in abundance_values if val is not None]
|
|
791
877
|
if non_null_values:
|
|
792
878
|
abundance_study_variable = sum(non_null_values) / len(non_null_values)
|
|
793
879
|
abundance_variation_study_variable = (
|
|
794
|
-
|
|
795
|
-
|
|
880
|
+
(
|
|
881
|
+
sum(
|
|
882
|
+
(x - abundance_study_variable) ** 2 for x in non_null_values
|
|
883
|
+
)
|
|
884
|
+
/ len(non_null_values)
|
|
885
|
+
)
|
|
886
|
+
** 0.5
|
|
887
|
+
if len(non_null_values) > 1
|
|
888
|
+
else 0
|
|
889
|
+
)
|
|
796
890
|
else:
|
|
797
891
|
abundance_study_variable = "null"
|
|
798
892
|
abundance_variation_study_variable = "null"
|
|
799
|
-
|
|
800
|
-
smf_row += [
|
|
893
|
+
|
|
894
|
+
smf_row += [
|
|
895
|
+
safe_str(abundance_study_variable),
|
|
896
|
+
safe_str(abundance_variation_study_variable),
|
|
897
|
+
]
|
|
801
898
|
else:
|
|
802
899
|
smf_row += ["null"] * n_assays
|
|
803
900
|
smf_row += ["null", "null"] # Study variable columns
|
|
@@ -807,19 +904,21 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
807
904
|
for line in smf_lines:
|
|
808
905
|
f.write(line + "\n")
|
|
809
906
|
|
|
810
|
-
# ---
|
|
907
|
+
# --- SME (Small Molecule Evidence) table ---
|
|
811
908
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
812
|
-
|
|
909
|
+
some_lines = []
|
|
813
910
|
# Add comment about spectra_ref being dummy placeholders
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
911
|
+
some_lines.append(
|
|
912
|
+
"COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
|
|
913
|
+
)
|
|
914
|
+
some_header = [
|
|
915
|
+
"SEH",
|
|
916
|
+
"SME_ID",
|
|
818
917
|
"evidence_input_id",
|
|
819
918
|
"database_identifier",
|
|
820
919
|
"chemical_formula",
|
|
821
920
|
"smiles",
|
|
822
|
-
"inchi",
|
|
921
|
+
"inchi",
|
|
823
922
|
"chemical_name",
|
|
824
923
|
"uri",
|
|
825
924
|
"derivatized_form",
|
|
@@ -833,93 +932,101 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
833
932
|
"id_confidence_measure[1]",
|
|
834
933
|
"rank",
|
|
835
934
|
]
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
# Create
|
|
839
|
-
for consensus_uid in
|
|
935
|
+
some_lines.append("\t".join(some_header))
|
|
936
|
+
|
|
937
|
+
# Create SME entries for all identification results using enriched data
|
|
938
|
+
for consensus_uid in (
|
|
939
|
+
self.consensus_df.select("consensus_uid").to_series().unique()
|
|
940
|
+
):
|
|
840
941
|
# Get consensus feature data for this consensus_uid
|
|
841
|
-
consensus_feature_data = self.consensus_df.filter(
|
|
942
|
+
consensus_feature_data = self.consensus_df.filter(
|
|
943
|
+
pl.col("consensus_uid") == consensus_uid,
|
|
944
|
+
)
|
|
842
945
|
if consensus_feature_data.height == 0:
|
|
843
946
|
continue
|
|
844
947
|
consensus_row = consensus_feature_data.row(0, named=True)
|
|
845
|
-
|
|
948
|
+
|
|
846
949
|
# Get all identification results for this consensus feature from enriched data
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
if
|
|
950
|
+
some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
951
|
+
|
|
952
|
+
if some_matches.height > 0:
|
|
850
953
|
# Sort by score descending to maintain rank order
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
for i,
|
|
854
|
-
# Generate unique
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
954
|
+
some_matches = some_matches.sort("score", descending=True)
|
|
955
|
+
|
|
956
|
+
for i, some_row in enumerate(some_matches.iter_rows(named=True)):
|
|
957
|
+
# Generate unique SME_ID
|
|
958
|
+
some_id_base = consensus_uid * 1000
|
|
959
|
+
some_id = some_id_base + i + 1
|
|
960
|
+
|
|
858
961
|
# Create evidence input ID using consensus_uid:mz:rt format
|
|
859
962
|
consensus_mz = consensus_row.get("mz", 0)
|
|
860
963
|
consensus_rt = consensus_row.get("rt", 0)
|
|
861
964
|
evidence_id = f"consensus_uid={consensus_uid}:mz={consensus_mz:.4f}:rt={consensus_rt:.2f}"
|
|
862
|
-
|
|
965
|
+
|
|
863
966
|
# Database identifier - use db_id if available, otherwise fallback to cmpd_uid
|
|
864
967
|
db_id = "null"
|
|
865
|
-
if
|
|
866
|
-
db_id = safe_str(
|
|
867
|
-
elif
|
|
868
|
-
db_id = f"cmpd:{
|
|
869
|
-
|
|
870
|
-
# Get adduct information
|
|
968
|
+
if some_row.get("db_id") is not None and some_row["db_id"] != "":
|
|
969
|
+
db_id = safe_str(some_row["db_id"])
|
|
970
|
+
elif some_row.get("cmpd_uid") is not None:
|
|
971
|
+
db_id = f"cmpd:{some_row['cmpd_uid']}"
|
|
972
|
+
|
|
973
|
+
# Get adduct information
|
|
871
974
|
adduct_ion = "null"
|
|
872
|
-
if
|
|
873
|
-
adduct_ion = safe_str(
|
|
975
|
+
if some_row.get("adduct") is not None and some_row["adduct"] != "":
|
|
976
|
+
adduct_ion = safe_str(some_row["adduct"])
|
|
874
977
|
# Replace ? with H for better mzTab compatibility
|
|
875
978
|
adduct_ion = adduct_ion.replace("?", "H")
|
|
876
|
-
|
|
979
|
+
|
|
877
980
|
# Spectra reference - reference to first ms_run with spectrum index 0
|
|
878
981
|
spectra_ref = "ms_run[1]:spectrum=0"
|
|
879
|
-
|
|
982
|
+
|
|
880
983
|
# Identification method
|
|
881
984
|
id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
|
|
882
|
-
if
|
|
883
|
-
id_method = f"[MS, MS:1002888, {
|
|
884
|
-
|
|
985
|
+
if some_row.get("matcher") is not None:
|
|
986
|
+
id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
|
|
987
|
+
|
|
885
988
|
# MS level - assume MS1 for now
|
|
886
989
|
ms_level = "[MS, MS:1000511, ms level, 1]"
|
|
887
|
-
|
|
990
|
+
|
|
888
991
|
# Experimental mass-to-charge from consensus feature
|
|
889
992
|
exp_mz = safe_str(consensus_mz)
|
|
890
|
-
|
|
891
|
-
# Theoretical mass-to-charge from lib_df
|
|
993
|
+
|
|
994
|
+
# Theoretical mass-to-charge from lib_df
|
|
892
995
|
theoretical_mz = "null"
|
|
893
|
-
if
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
996
|
+
if (
|
|
997
|
+
some_row.get("mz") is not None
|
|
998
|
+
): # This comes from lib_df via get_id() join
|
|
999
|
+
theoretical_mz = safe_str(some_row["mz"])
|
|
1000
|
+
|
|
1001
|
+
some_line = [
|
|
1002
|
+
"SME",
|
|
1003
|
+
str(some_id),
|
|
899
1004
|
evidence_id,
|
|
900
1005
|
db_id,
|
|
901
|
-
safe_str(
|
|
902
|
-
safe_str(
|
|
903
|
-
safe_str(
|
|
904
|
-
safe_str(
|
|
1006
|
+
safe_str(some_row.get("formula", "null")),
|
|
1007
|
+
safe_str(some_row.get("smiles", "null")),
|
|
1008
|
+
safe_str(some_row.get("inchi", "null")),
|
|
1009
|
+
safe_str(some_row.get("name", "null")),
|
|
905
1010
|
"null", # uri - not available in current data
|
|
906
1011
|
"null", # derivatized_form
|
|
907
1012
|
adduct_ion,
|
|
908
1013
|
exp_mz, # experimental m/z from consensus feature
|
|
909
|
-
safe_str(
|
|
1014
|
+
safe_str(
|
|
1015
|
+
consensus_row.get("adduct_charge_top", "1"),
|
|
1016
|
+
), # Use consensus feature's top adduct charge
|
|
910
1017
|
theoretical_mz, # theoretical m/z from lib_df
|
|
911
1018
|
spectra_ref,
|
|
912
1019
|
id_method,
|
|
913
1020
|
ms_level,
|
|
914
|
-
safe_str(
|
|
1021
|
+
safe_str(some_row.get("score", "null")),
|
|
915
1022
|
str(i + 1), # rank within this consensus feature
|
|
916
1023
|
]
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
# Write
|
|
1024
|
+
some_lines.append("\t".join(some_line))
|
|
1025
|
+
|
|
1026
|
+
# Write SME table
|
|
920
1027
|
with open(filename, "a", encoding="utf-8") as f:
|
|
921
1028
|
f.write("\n")
|
|
922
|
-
for line in
|
|
1029
|
+
for line in some_lines:
|
|
923
1030
|
f.write(line + "\n")
|
|
924
1031
|
|
|
925
1032
|
# --- MGF table ---
|
|
@@ -953,15 +1060,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
953
1060
|
spec_len = row["spec_len"] if row["spec_len"] is not None else 0
|
|
954
1061
|
|
|
955
1062
|
# Format spectrum data as pipe-separated strings
|
|
956
|
-
spec_mz_str =
|
|
957
|
-
|
|
1063
|
+
spec_mz_str = (
|
|
1064
|
+
"|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
|
|
1065
|
+
)
|
|
1066
|
+
spec_int_str = (
|
|
1067
|
+
"|".join([f"{int(inty)}" for inty in spectrum_inty])
|
|
1068
|
+
if spectrum_inty
|
|
1069
|
+
else ""
|
|
1070
|
+
)
|
|
958
1071
|
|
|
959
1072
|
mgf_row = [
|
|
960
1073
|
"COM",
|
|
961
1074
|
"MGF",
|
|
962
1075
|
str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
|
|
963
1076
|
str(row["feature_id"]) if row["feature_id"] is not None else "null",
|
|
964
|
-
f"{row['rtinseconds']:.2f}"
|
|
1077
|
+
f"{row['rtinseconds']:.2f}"
|
|
1078
|
+
if row["rtinseconds"] is not None
|
|
1079
|
+
else "null",
|
|
965
1080
|
f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
|
|
966
1081
|
"null", # prec_int - not available in current data
|
|
967
1082
|
str(row["energy"]) if row["energy"] is not None else "null",
|
|
@@ -986,94 +1101,110 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
986
1101
|
def export_xlsx(self, filename: str = None) -> None:
|
|
987
1102
|
"""
|
|
988
1103
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
989
|
-
|
|
1104
|
+
|
|
990
1105
|
The Excel file contains three worksheets:
|
|
991
1106
|
- consensus_df: Consensus features dataframe
|
|
992
|
-
- matrix: Consensus matrix with samples as columns (get_consensus_matrix)
|
|
1107
|
+
- matrix: Consensus matrix with samples as columns (get_consensus_matrix)
|
|
993
1108
|
- identification: Identification results with library annotations (get_id)
|
|
994
|
-
|
|
1109
|
+
|
|
995
1110
|
Args:
|
|
996
|
-
filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
|
|
1111
|
+
filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
|
|
997
1112
|
in the study folder.
|
|
998
1113
|
"""
|
|
999
1114
|
try:
|
|
1000
1115
|
import openpyxl
|
|
1001
1116
|
except ImportError:
|
|
1002
|
-
self.logger.error(
|
|
1117
|
+
self.logger.error(
|
|
1118
|
+
"openpyxl package is required for Excel export. Install with: pip install openpyxl",
|
|
1119
|
+
)
|
|
1003
1120
|
return
|
|
1004
|
-
|
|
1121
|
+
|
|
1005
1122
|
# Set default filename
|
|
1006
1123
|
if filename is None:
|
|
1007
1124
|
filename = "study.xlsx"
|
|
1008
|
-
|
|
1125
|
+
|
|
1009
1126
|
# Make filename absolute if not already
|
|
1010
1127
|
if not os.path.isabs(filename):
|
|
1011
1128
|
if self.folder is not None:
|
|
1012
1129
|
filename = os.path.join(self.folder, filename)
|
|
1013
1130
|
else:
|
|
1014
1131
|
filename = os.path.join(os.getcwd(), filename)
|
|
1015
|
-
|
|
1016
|
-
self.logger.debug(
|
|
1017
|
-
|
|
1132
|
+
|
|
1133
|
+
self.logger.debug("Exporting study to Excel...")
|
|
1134
|
+
|
|
1018
1135
|
# Prepare data for export in the desired order
|
|
1019
1136
|
from collections import OrderedDict
|
|
1137
|
+
|
|
1020
1138
|
worksheets = OrderedDict()
|
|
1021
|
-
|
|
1139
|
+
|
|
1022
1140
|
# 1. Samples dataframe (first worksheet)
|
|
1023
1141
|
if self.samples_df is not None and not self.samples_df.is_empty():
|
|
1024
1142
|
samples_pandas = self.samples_df.to_pandas()
|
|
1025
|
-
worksheets[
|
|
1143
|
+
worksheets["samples"] = samples_pandas
|
|
1026
1144
|
self.logger.debug(f"Added samples worksheet with {len(samples_pandas)} rows")
|
|
1027
1145
|
else:
|
|
1028
1146
|
self.logger.warning("samples_df is empty or None, skipping worksheet")
|
|
1029
|
-
|
|
1147
|
+
|
|
1030
1148
|
# 2. Consensus dataframe (renamed to 'consensus')
|
|
1031
1149
|
if self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
1032
1150
|
consensus_pandas = self.consensus_df.to_pandas()
|
|
1033
|
-
worksheets[
|
|
1034
|
-
self.logger.debug(
|
|
1151
|
+
worksheets["consensus"] = consensus_pandas
|
|
1152
|
+
self.logger.debug(
|
|
1153
|
+
f"Added consensus worksheet with {len(consensus_pandas)} rows",
|
|
1154
|
+
)
|
|
1035
1155
|
else:
|
|
1036
1156
|
self.logger.warning("consensus_df is empty or None, skipping worksheet")
|
|
1037
|
-
|
|
1157
|
+
|
|
1038
1158
|
# 3. Identification results
|
|
1039
1159
|
try:
|
|
1040
|
-
from
|
|
1160
|
+
from masster.study.id import get_id
|
|
1161
|
+
|
|
1041
1162
|
id_df = get_id(self)
|
|
1042
1163
|
if id_df is not None and not id_df.is_empty():
|
|
1043
1164
|
id_pandas = id_df.to_pandas()
|
|
1044
|
-
worksheets[
|
|
1045
|
-
self.logger.debug(
|
|
1165
|
+
worksheets["identification"] = id_pandas
|
|
1166
|
+
self.logger.debug(
|
|
1167
|
+
f"Added identification worksheet with {len(id_pandas)} rows",
|
|
1168
|
+
)
|
|
1046
1169
|
else:
|
|
1047
|
-
self.logger.warning(
|
|
1170
|
+
self.logger.warning(
|
|
1171
|
+
"get_id() returned empty data, skipping identification worksheet",
|
|
1172
|
+
)
|
|
1048
1173
|
except Exception as e:
|
|
1049
|
-
self.logger.warning(
|
|
1050
|
-
|
|
1174
|
+
self.logger.warning(
|
|
1175
|
+
f"Error getting identification data: {e}. Skipping identification worksheet.",
|
|
1176
|
+
)
|
|
1177
|
+
|
|
1051
1178
|
# 4. Consensus matrix (last worksheet)
|
|
1052
1179
|
try:
|
|
1053
1180
|
matrix_df = self.get_consensus_matrix()
|
|
1054
1181
|
if matrix_df is not None and not matrix_df.is_empty():
|
|
1055
1182
|
matrix_pandas = matrix_df.to_pandas()
|
|
1056
|
-
worksheets[
|
|
1183
|
+
worksheets["matrix"] = matrix_pandas
|
|
1057
1184
|
self.logger.debug(f"Added matrix worksheet with {len(matrix_pandas)} rows")
|
|
1058
1185
|
else:
|
|
1059
|
-
self.logger.warning(
|
|
1186
|
+
self.logger.warning(
|
|
1187
|
+
"get_consensus_matrix() returned empty data, skipping matrix worksheet",
|
|
1188
|
+
)
|
|
1060
1189
|
except Exception as e:
|
|
1061
1190
|
self.logger.error(f"Error getting consensus matrix: {e}")
|
|
1062
|
-
|
|
1191
|
+
|
|
1063
1192
|
# Check if we have any data to export
|
|
1064
1193
|
if not worksheets:
|
|
1065
1194
|
self.logger.error("No data available to export to Excel")
|
|
1066
1195
|
return
|
|
1067
|
-
|
|
1196
|
+
|
|
1068
1197
|
# Write to Excel file
|
|
1069
1198
|
try:
|
|
1070
|
-
with pd.ExcelWriter(filename, engine=
|
|
1199
|
+
with pd.ExcelWriter(filename, engine="openpyxl") as writer:
|
|
1071
1200
|
for sheet_name, data in worksheets.items():
|
|
1072
1201
|
data.to_excel(writer, sheet_name=sheet_name, index=False)
|
|
1073
|
-
self.logger.debug(
|
|
1074
|
-
|
|
1202
|
+
self.logger.debug(
|
|
1203
|
+
f"Written worksheet '{sheet_name}' with shape {data.shape}",
|
|
1204
|
+
)
|
|
1205
|
+
|
|
1075
1206
|
self.logger.info(f"Study exported to {filename}")
|
|
1076
|
-
|
|
1207
|
+
|
|
1077
1208
|
except Exception as e:
|
|
1078
1209
|
self.logger.error(f"Error writing Excel file: {e}")
|
|
1079
1210
|
|
|
@@ -1081,13 +1212,13 @@ def export_xlsx(self, filename: str = None) -> None:
|
|
|
1081
1212
|
def export_parquet(self, basename: str = None) -> None:
|
|
1082
1213
|
"""
|
|
1083
1214
|
Export the study data to multiple Parquet files with different suffixes.
|
|
1084
|
-
|
|
1215
|
+
|
|
1085
1216
|
The export creates separate Parquet files for each dataset:
|
|
1086
1217
|
- <basename>_samples.parquet: Samples dataframe
|
|
1087
1218
|
- <basename>_consensus.parquet: Consensus features dataframe
|
|
1088
1219
|
- <basename>_identification.parquet: Identification results with library annotations
|
|
1089
1220
|
- <basename>_matrix.parquet: Consensus matrix with samples as columns
|
|
1090
|
-
|
|
1221
|
+
|
|
1091
1222
|
Args:
|
|
1092
1223
|
basename (str, optional): Base name for the output files. Defaults to "study"
|
|
1093
1224
|
in the study folder.
|
|
@@ -1095,59 +1226,74 @@ def export_parquet(self, basename: str = None) -> None:
|
|
|
1095
1226
|
# Set default basename
|
|
1096
1227
|
if basename is None:
|
|
1097
1228
|
basename = "study"
|
|
1098
|
-
|
|
1229
|
+
|
|
1099
1230
|
# Make basename absolute path if not already (without extension)
|
|
1100
1231
|
if not os.path.isabs(basename):
|
|
1101
1232
|
if self.folder is not None:
|
|
1102
1233
|
basename = os.path.join(self.folder, basename)
|
|
1103
1234
|
else:
|
|
1104
1235
|
basename = os.path.join(os.getcwd(), basename)
|
|
1105
|
-
|
|
1236
|
+
|
|
1106
1237
|
self.logger.debug(f"Exporting study to Parquet files with basename: {basename}")
|
|
1107
|
-
|
|
1238
|
+
|
|
1108
1239
|
exported_files = []
|
|
1109
|
-
|
|
1240
|
+
|
|
1110
1241
|
# 1. Samples dataframe
|
|
1111
1242
|
if self.samples_df is not None and not self.samples_df.is_empty():
|
|
1112
1243
|
samples_file = f"{basename}_samples.parquet"
|
|
1113
1244
|
try:
|
|
1114
1245
|
self.samples_df.write_parquet(samples_file)
|
|
1115
1246
|
exported_files.append(samples_file)
|
|
1116
|
-
self.logger.debug(
|
|
1247
|
+
self.logger.debug(
|
|
1248
|
+
f"Exported samples to {samples_file} ({self.samples_df.height} rows)",
|
|
1249
|
+
)
|
|
1117
1250
|
except Exception as e:
|
|
1118
1251
|
self.logger.error(f"Error writing samples parquet file: {e}")
|
|
1119
1252
|
else:
|
|
1120
|
-
self.logger.warning(
|
|
1121
|
-
|
|
1253
|
+
self.logger.warning(
|
|
1254
|
+
"samples_df is empty or None, skipping samples parquet file",
|
|
1255
|
+
)
|
|
1256
|
+
|
|
1122
1257
|
# 2. Consensus dataframe
|
|
1123
1258
|
if self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
1124
1259
|
consensus_file = f"{basename}_consensus.parquet"
|
|
1125
1260
|
try:
|
|
1126
1261
|
self.consensus_df.write_parquet(consensus_file)
|
|
1127
1262
|
exported_files.append(consensus_file)
|
|
1128
|
-
self.logger.debug(
|
|
1263
|
+
self.logger.debug(
|
|
1264
|
+
f"Exported consensus to {consensus_file} ({self.consensus_df.height} rows)",
|
|
1265
|
+
)
|
|
1129
1266
|
except Exception as e:
|
|
1130
1267
|
self.logger.error(f"Error writing consensus parquet file: {e}")
|
|
1131
1268
|
else:
|
|
1132
|
-
self.logger.warning(
|
|
1133
|
-
|
|
1269
|
+
self.logger.warning(
|
|
1270
|
+
"consensus_df is empty or None, skipping consensus parquet file",
|
|
1271
|
+
)
|
|
1272
|
+
|
|
1134
1273
|
# 3. Identification results
|
|
1135
1274
|
try:
|
|
1136
|
-
from
|
|
1275
|
+
from masster.study.id import get_id
|
|
1276
|
+
|
|
1137
1277
|
id_df = get_id(self)
|
|
1138
1278
|
if id_df is not None and not id_df.is_empty():
|
|
1139
1279
|
identification_file = f"{basename}_identification.parquet"
|
|
1140
1280
|
try:
|
|
1141
1281
|
id_df.write_parquet(identification_file)
|
|
1142
1282
|
exported_files.append(identification_file)
|
|
1143
|
-
self.logger.debug(
|
|
1283
|
+
self.logger.debug(
|
|
1284
|
+
f"Exported identification to {identification_file} ({id_df.height} rows)",
|
|
1285
|
+
)
|
|
1144
1286
|
except Exception as e:
|
|
1145
1287
|
self.logger.error(f"Error writing identification parquet file: {e}")
|
|
1146
1288
|
else:
|
|
1147
|
-
self.logger.warning(
|
|
1289
|
+
self.logger.warning(
|
|
1290
|
+
"get_id() returned empty data, skipping identification parquet file",
|
|
1291
|
+
)
|
|
1148
1292
|
except Exception as e:
|
|
1149
|
-
self.logger.warning(
|
|
1150
|
-
|
|
1293
|
+
self.logger.warning(
|
|
1294
|
+
f"Error getting identification data: {e}. Skipping identification parquet file.",
|
|
1295
|
+
)
|
|
1296
|
+
|
|
1151
1297
|
# 4. Consensus matrix
|
|
1152
1298
|
try:
|
|
1153
1299
|
matrix_df = self.get_consensus_matrix()
|
|
@@ -1156,14 +1302,18 @@ def export_parquet(self, basename: str = None) -> None:
|
|
|
1156
1302
|
try:
|
|
1157
1303
|
matrix_df.write_parquet(matrix_file)
|
|
1158
1304
|
exported_files.append(matrix_file)
|
|
1159
|
-
self.logger.debug(
|
|
1305
|
+
self.logger.debug(
|
|
1306
|
+
f"Exported matrix to {matrix_file} ({matrix_df.height} rows)",
|
|
1307
|
+
)
|
|
1160
1308
|
except Exception as e:
|
|
1161
1309
|
self.logger.error(f"Error writing matrix parquet file: {e}")
|
|
1162
1310
|
else:
|
|
1163
|
-
self.logger.warning(
|
|
1311
|
+
self.logger.warning(
|
|
1312
|
+
"get_consensus_matrix() returned empty data, skipping matrix parquet file",
|
|
1313
|
+
)
|
|
1164
1314
|
except Exception as e:
|
|
1165
1315
|
self.logger.error(f"Error getting consensus matrix: {e}")
|
|
1166
|
-
|
|
1316
|
+
|
|
1167
1317
|
# Report results
|
|
1168
1318
|
if exported_files:
|
|
1169
1319
|
self.logger.info(f"Study exported to {len(exported_files)} Parquet files:")
|