masster 0.4.4-py3-none-any.whl → 0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

Files changed (39)
  1. masster/__init__.py +8 -8
  2. masster/chromatogram.py +1 -1
  3. masster/data/libs/urine.csv +3 -3
  4. masster/logger.py +11 -11
  5. masster/sample/__init__.py +1 -1
  6. masster/sample/adducts.py +338 -264
  7. masster/sample/defaults/find_adducts_def.py +21 -8
  8. masster/sample/h5.py +561 -282
  9. masster/sample/helpers.py +131 -75
  10. masster/sample/lib.py +4 -4
  11. masster/sample/load.py +31 -17
  12. masster/sample/parameters.py +1 -1
  13. masster/sample/plot.py +7 -7
  14. masster/sample/processing.py +117 -87
  15. masster/sample/sample.py +103 -90
  16. masster/sample/sample5_schema.json +44 -44
  17. masster/sample/save.py +35 -12
  18. masster/spectrum.py +1 -1
  19. masster/study/__init__.py +1 -1
  20. masster/study/defaults/align_def.py +5 -1
  21. masster/study/defaults/identify_def.py +3 -1
  22. masster/study/defaults/study_def.py +58 -25
  23. masster/study/export.py +360 -210
  24. masster/study/h5.py +560 -158
  25. masster/study/helpers.py +496 -203
  26. masster/study/helpers_optimized.py +1 -1
  27. masster/study/id.py +538 -349
  28. masster/study/load.py +233 -143
  29. masster/study/plot.py +71 -71
  30. masster/study/processing.py +456 -254
  31. masster/study/save.py +15 -5
  32. masster/study/study.py +213 -131
  33. masster/study/study5_schema.json +149 -149
  34. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/METADATA +3 -1
  35. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/RECORD +39 -39
  36. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  37. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/export.py CHANGED
@@ -10,9 +10,9 @@ import polars as pl
 
  from tqdm import tqdm
 
- from masster.spectrum import combine_peaks
- from masster.study.defaults import export_mgf_defaults
- from masster._version import get_version
+ from masster.spectrum import combine_peaks
+ from masster.study.defaults import export_mgf_defaults
+ from masster._version import get_version
 
 
  def _get_mgf_df(self, **kwargs):
@@ -107,7 +107,11 @@ def _get_mgf_df(self, **kwargs):
  mask = mask & (spec.inty >= inty_min)
  for attr in spec.__dict__:
  arr = getattr(spec, attr)
- if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
+ if (
+ isinstance(arr, list | np.ndarray)
+ and hasattr(arr, "__len__")
+ and len(arr) == length
+ ):
  setattr(spec, attr, np.array(arr)[mask])
  return spec
 
@@ -117,8 +121,12 @@ def _get_mgf_df(self, **kwargs):
  return None
 
  # Prepare spectrum data
- spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
- spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
+ spectrum_mz = (
+ spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
+ )
+ spectrum_inty = (
+ spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
+ )
 
  # Determine MS level
  ms_level = spect.ms_level if spect.ms_level is not None else 1
@@ -258,7 +266,11 @@ def _get_mgf_df(self, **kwargs):
 
  elif selection == "all":
  if merge:
- specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
+ specs = [
+ row_e["spec"]
+ for row_e in cons_ms2.iter_rows(named=True)
+ if row_e["spec"] is not None
+ ]
  if not specs:
  continue
  spect = combine_peaks(specs)
@@ -410,13 +422,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  description (str, optional): Human-readable description.
  **kwargs: Additional metadata or export options.
  """
-
+
  def safe_str(value, default="null"):
  """Convert value to string, replacing empty strings with 'null'"""
  if value is None:
  return default
  str_val = str(value)
  return str_val if str_val.strip() != "" else default
+
  if filename is None:
  filename = "study.mztab"
  if not os.path.isabs(filename):
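The safe_str helper introduced in this hunk normalizes missing values for the tab-separated mzTab output: None and whitespace-only strings become the literal "null", everything else is stringified. A quick standalone illustration of its behavior (a sketch, not part of the diff):

    assert safe_str(None) == "null"    # None falls back to the default marker
    assert safe_str("   ") == "null"   # whitespace-only strings collapse too
    assert safe_str(3.14) == "3.14"    # other values are simply stringified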
@@ -431,17 +444,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  full_id_data = None
  try:
  # Import here to avoid circular imports
- from masster.study.id import get_id
- # Get full enriched identification data for SME section
+ from masster.study.id import get_id
+
+ # Get full enriched identification data for SME section
  full_id_data = get_id(self)
  if full_id_data is not None and not full_id_data.is_empty():
  # Get top scoring identification for each consensus_uid for SML section
- top_id_data = (full_id_data
- .group_by("consensus_uid")
- .agg(pl.all().sort_by("score", descending=True).first())
- .sort("consensus_uid"))
+ top_id_data = (
+ full_id_data.group_by("consensus_uid")
+ .agg(pl.all().sort_by("score", descending=True).first())
+ .sort("consensus_uid")
+ )
  # Keep raw id_data for backward compatibility (if needed elsewhere)
- id_data = self.id_df if hasattr(self, 'id_df') and self.id_df is not None else None
+ id_data = (
+ self.id_df
+ if hasattr(self, "id_df") and self.id_df is not None
+ else None
+ )
  else:
  self.logger.info("No identification data available for mzTab export")
  except Exception as e:
@@ -466,7 +485,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 
  # --- Prepare MTD (metadata) section ---
  mtd_lines = []
- mtd_lines.append(f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+ mtd_lines.append(
+ f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+ )
  mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
  id = self.label if self.label else self.folder
  mtd_lines.append(f"MTD\tmzTab-id\t{id}")
@@ -474,28 +495,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  mtd_lines.append("MTD\tcv[1]-label\tMS")
  mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
  mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
- mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
+ mtd_lines.append(
+ "MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
+ )
  mtd_lines.append("")
- mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
- mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
+ mtd_lines.append(
+ "MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
+ )
+ mtd_lines.append(
+ "MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
+ )
  mtd_lines.append(
  "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
  )
-
+
  # Add identification confidence measures if identification data is available
  if full_id_data is not None:
- mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
+ mtd_lines.append(
+ "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
+ )
  else:
- mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
-
+ mtd_lines.append(
+ "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
+ )
+
  mtd_lines.append("")
  mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
  mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
- mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
+ mtd_lines.append(
+ "MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
+ )
  mtd_lines.append("")
-
+
  # Database information - updated based on identification data
- if full_id_data is not None and hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
+ if (
+ full_id_data is not None
+ and hasattr(self, "lib_df")
+ and self.lib_df is not None
+ and not self.lib_df.is_empty()
+ ):
  mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
  mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
  mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
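For orientation, the mtd_lines assembled in this hunk serialize to tab-separated mzTab-M metadata rows; an abridged, illustrative rendering (the timestamp and version string are placeholders, and tabs are shown as gaps):

    COM    file generated by MASSter 0.4.5 on 2025-01-01 12:00:00
    MTD    mzTab-version    2.2.0-M
    MTD    cv[1]-label    MS
    MTD    quantification_method    [MS, MS:1001834, LC-MS label-free quantitation analysis, ]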
@@ -505,22 +543,22 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
  mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
  mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
-
+
  # Get abundance matrix to determine the number of assays needed
  abundance_matrix = self.get_consensus_matrix()
-
+
  # Get sample columns (excluding consensus_uid)
  sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
  n_assays = len(sample_columns)
-
+
  # Define samples, ms_runs, and assays based on the abundance matrix columns
  # Determine scan polarity based on study polarity
- study_polarity = getattr(self, 'polarity', 'positive')
- if study_polarity in ['negative', 'neg']:
+ study_polarity = getattr(self, "polarity", "positive")
+ if study_polarity in ["negative", "neg"]:
  scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
  else:
  scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
-
+
  for i, sample_col in enumerate(sample_columns, 1):
  mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
  mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
@@ -562,15 +600,24 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  # round to int - handle both Polars and Pandas DataFrames
  if hasattr(abundance_matrix, "with_columns"):
  # Polars DataFrame
- numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
- abundance_matrix = abundance_matrix.with_columns([abundance_matrix[col].round(0) for col in numeric_cols])
+ numeric_cols = [
+ col
+ for col in abundance_matrix.columns
+ if abundance_matrix[col].dtype.is_numeric()
+ ]
+ abundance_matrix = abundance_matrix.with_columns(
+ [abundance_matrix[col].round(0) for col in numeric_cols],
+ )
  else:
  # Pandas DataFrame
  abundance_matrix = abundance_matrix.round(0)
 
  # Use the n_assays already calculated from abundance matrix columns
  sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
- sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
+ sml_header += [
+ "abundance_study_variable[1]",
+ "abundance_variation_study_variable[1]",
+ ]
  sml_lines.append("\t".join(sml_header))
 
  # get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
@@ -582,7 +629,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  adduct = str(row["adduct_top"])
  # Replace ? with H for better mzTab compatibility
  adduct = adduct.replace("?", "H")
-
+
  adduct_list.append(adduct)
 
  for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
@@ -593,63 +640,65 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
  if id_matches.height > 0:
  id_info = id_matches.row(0, named=True)
-
+
  # Populate identification fields
  database_identifier = "null"
  chemical_formula = "null"
  smiles_val = "null"
- inchi_val = "null"
+ inchi_val = "null"
  chemical_name = "null"
  best_id_confidence_measure = "null"
  best_id_confidence_value = "null"
  reliability = "4" # Default: unknown compound
- theoretical_neutral_mass = "null" # Only set when we have database identification
-
+ theoretical_neutral_mass = (
+ "null" # Only set when we have database identification
+ )
+
  if id_info:
  # Use cmpd_uid as database identifier with prefix
  if id_info.get("cmpd_uid") is not None:
  database_identifier = f"cmpd:{id_info['cmpd_uid']}"
-
+
  # Chemical formula
  if id_info.get("formula") is not None and id_info["formula"] != "":
  chemical_formula = safe_str(id_info["formula"])
-
+
  # SMILES
  if id_info.get("smiles") is not None and id_info["smiles"] != "":
  smiles_val = safe_str(id_info["smiles"])
-
+
  # InChI
  if id_info.get("inchi") is not None and id_info["inchi"] != "":
  inchi_val = safe_str(id_info["inchi"])
-
+
  # Chemical name
  if id_info.get("name") is not None and id_info["name"] != "":
  chemical_name = safe_str(id_info["name"])
-
+
  # Theoretical neutral mass - only from identification data, not consensus_df
  if id_info.get("neutral_mass") is not None:
  theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
  elif id_info.get("mass") is not None:
  theoretical_neutral_mass = safe_str(id_info["mass"])
-
+
  # Identification confidence
  if id_info.get("matcher") is not None:
  best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
-
+
  if id_info.get("score") is not None:
  best_id_confidence_value = safe_str(id_info["score"])
-
+
  # Set reliability based on identification quality
  # Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
  if id_info.get("score", 0) >= 0.8:
  reliability = "2a" # High confidence compound match
  elif id_info.get("score", 0) >= 0.5:
- reliability = "2b" # Moderate confidence match
+ reliability = "2b" # Moderate confidence match
  elif id_info.get("score", 0) >= 0.2:
- reliability = "3" # Compound class level
+ reliability = "3" # Compound class level
  else:
- reliability = "4" # Unknown compound
-
+ reliability = "4" # Unknown compound
+
  # Get MGF indexes for this consensus feature
  mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
 
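The reliability branching in this hunk is effectively a threshold table over the identification score. A standalone sketch of the same mapping (function name hypothetical; thresholds copied from the code above):

    def score_to_reliability(score: float) -> str:
        # mzTab-M hr-ms identification confidence levels
        if score >= 0.8:
            return "2a"  # high-confidence compound match
        if score >= 0.5:
            return "2b"  # moderate-confidence match
        if score >= 0.2:
            return "3"   # compound-class level
        return "4"       # unknown compound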
@@ -673,26 +722,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  # Add abundance values for each assay
  consensus_uid = row["consensus_uid"]
  # Check if consensus_uid exists in the abundance_matrix (Polars)
- filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+ filtered_matrix = abundance_matrix.filter(
+ pl.col("consensus_uid") == consensus_uid,
+ )
  if filtered_matrix.height > 0:
  # Get the first (and should be only) matching row
  abundance_row = filtered_matrix.row(0, named=True)
  # Extract values excluding the consensus_uid column
- abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
- sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
-
- # Calculate study variable statistics
+ abundance_values = [
+ abundance_row[col]
+ for col in abundance_matrix.columns
+ if col != "consensus_uid"
+ ]
+ sml_row += [
+ safe_str(val) if val is not None else "null" for val in abundance_values
+ ]
+
+ # Calculate study variable statistics
  non_null_values = [val for val in abundance_values if val is not None]
  if non_null_values:
  abundance_study_variable = sum(non_null_values) / len(non_null_values)
  abundance_variation_study_variable = (
- sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
- ) ** 0.5 if len(non_null_values) > 1 else 0
+ (
+ sum(
+ (x - abundance_study_variable) ** 2 for x in non_null_values
+ )
+ / len(non_null_values)
+ )
+ ** 0.5
+ if len(non_null_values) > 1
+ else 0
+ )
  else:
  abundance_study_variable = "null"
  abundance_variation_study_variable = "null"
-
- sml_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
+
+ sml_row += [
+ safe_str(abundance_study_variable),
+ safe_str(abundance_variation_study_variable),
+ ]
  else:
  sml_row += ["null"] * n_assays
  sml_row += ["null", "null"] # Study variable columns
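The study-variable statistics in this hunk are the arithmetic mean of the non-null assay values and, when more than one value is present, the population standard deviation (dividing by n, not n - 1). An equivalent standalone sketch (names hypothetical):

    def study_variable_stats(values: list) -> tuple:
        vals = [v for v in values if v is not None]
        if not vals:
            return None, None
        mean = sum(vals) / len(vals)
        # population standard deviation, matching the inline expression above
        sd = (sum((x - mean) ** 2 for x in vals) / len(vals)) ** 0.5 if len(vals) > 1 else 0
        return mean, sd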
@@ -707,8 +775,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  smf_header = [
  "SFH",
  "SMF_ID",
- "SME_ID_REFS",
- "SME_ID_REF_ambiguity_code",
+ "SME_ID_REFS",
+ "SME_ID_REF_ambiguity_code",
  "adduct_ion",
  "isotopomer",
  "exp_mass_to_charge",
@@ -718,86 +786,115 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  "retention_time_in_seconds_end",
  ]
  smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
- smf_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
+ smf_header += [
+ "abundance_study_variable[1]",
+ "abundance_variation_study_variable[1]",
+ ]
  smf_lines.append("\t".join(smf_header))
 
  # SMF table uses the same consensus features as SML, just different metadata
  for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
- # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
- sme_refs = "null"
- sme_ambiguity = "null"
+ # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+ sme_refs = "null"
+ sme_ambiguity = "null"
  consensus_uid = row["consensus_uid"]
-
+
  if full_id_data is not None:
- # Find all SME entries for this consensus_uid
- sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
- if sme_matches.height > 0:
- # Generate SME IDs - we'll create a mapping in the SME section
+ # Find all SME entries for this consensus_uid
+ sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+ if sme_matches.height > 0:
+ # Generate SME IDs - we'll create a mapping in the SME section
  # For now, use a simple approach based on consensus_uid and lib_uid
- sme_ids = []
- for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
- # Create a unique SME ID based on consensus_uid and position
- sme_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
- sme_id = sme_id_base + i + 1
- sme_ids.append(str(sme_id))
-
- if sme_ids:
- sme_refs = "|".join(sme_ids)
+ sme_ids = []
+ for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+ # Create a unique SME ID based on consensus_uid and position
+ sme_id_base = (
+ consensus_uid * 1000
+ ) # Ensure uniqueness across consensus features
+ sme_id = sme_id_base + i + 1
+ sme_ids.append(str(sme_id))
+
+ if sme_ids:
+ sme_refs = "|".join(sme_ids)
  # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
- if len(sme_ids) > 1:
+ if len(sme_ids) > 1:
  # Check if all identifications point to the same compound
- unique_cmpds = set(match["cmpd_uid"] for match in sme_matches.iter_rows(named=True)
- if match.get("cmpd_uid") is not None)
+ unique_cmpds = {
+ match["cmpd_uid"]
+ for match in sme_matches.iter_rows(named=True)
+ if match.get("cmpd_uid") is not None
+ }
  if len(unique_cmpds) > 1:
- sme_ambiguity = "1" # Ambiguous identification
+ sme_ambiguity = "1" # Ambiguous identification
  else:
- sme_ambiguity = "2" # Multiple evidence for same molecule
+ sme_ambiguity = "2" # Multiple evidence for same molecule
  else:
- sme_ambiguity = "null"
-
+ sme_ambiguity = "null"
+
  # Format isotopomer according to mzTab-M specification
  iso_value = row.get("iso_mean", 0)
  if iso_value is not None and round(iso_value) != 0:
- isotopomer = f"[MS,MS:1002957,\"isotopomer MS peak\",\"+{round(iso_value)}\"]"
+ isotopomer = f'[MS,MS:1002957,"isotopomer MS peak","+{round(iso_value)}"]'
  else:
  isotopomer = "null"
-
+
  smf_row = [
  "SMF",
  str(idx),
- sme_refs,
- sme_ambiguity,
+ sme_refs,
+ sme_ambiguity,
  adduct_list[idx - 1], # adduct_ion
  isotopomer, # isotopomer formatted according to mzTab-M specification
  safe_str(row.get("mz", "null")), # exp_mass_to_charge
- safe_str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
+ safe_str(
+ row.get("adduct_charge_top", "null"),
+ ), # Use top-ranked adduct charge
  safe_str(row.get("rt", "null")), # retention_time_in_seconds
  safe_str(row.get("retention_time_in_seconds_start", "null")),
  safe_str(row.get("retention_time_in_seconds_end", "null")),
  ]
  # Add abundance values for each assay - same as SML (Polars)
  consensus_uid = row["consensus_uid"]
- filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+ filtered_matrix = abundance_matrix.filter(
+ pl.col("consensus_uid") == consensus_uid,
+ )
  if filtered_matrix.height > 0:
  # Get the first (and should be only) matching row
  abundance_row = filtered_matrix.row(0, named=True)
  # Extract values excluding the consensus_uid column
- abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
- abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
+ abundance_values = [
+ abundance_row[col]
+ for col in abundance_matrix.columns
+ if col != "consensus_uid"
+ ]
+ abundance_strings = [
+ safe_str(val) if val is not None else "null" for val in abundance_values
+ ]
  smf_row += abundance_strings
-
+
  # Calculate study variable statistics (same as in SML section)
  non_null_values = [val for val in abundance_values if val is not None]
  if non_null_values:
  abundance_study_variable = sum(non_null_values) / len(non_null_values)
  abundance_variation_study_variable = (
- sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
- ) ** 0.5 if len(non_null_values) > 1 else 0
+ (
+ sum(
+ (x - abundance_study_variable) ** 2 for x in non_null_values
+ )
+ / len(non_null_values)
+ )
+ ** 0.5
+ if len(non_null_values) > 1
+ else 0
+ )
  else:
  abundance_study_variable = "null"
  abundance_variation_study_variable = "null"
-
- smf_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
+
+ smf_row += [
+ safe_str(abundance_study_variable),
+ safe_str(abundance_variation_study_variable),
+ ]
  else:
  smf_row += ["null"] * n_assays
  smf_row += ["null", "null"] # Study variable columns
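The SME_ID scheme used here reserves a block of one thousand IDs per consensus feature, so evidence rows stay unique across features. A compact sketch (helper name hypothetical):

    def sme_ids_for(consensus_uid: int, n_matches: int) -> list:
        base = consensus_uid * 1000  # one ID block per consensus feature
        return [str(base + i + 1) for i in range(n_matches)]

    # e.g. consensus_uid 42 with three candidate identifications yields
    # ["42001", "42002", "42003"], joined as "42001|42002|42003" in SME_ID_REFS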
@@ -807,19 +904,21 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  for line in smf_lines:
  f.write(line + "\n")
 
- # --- SME (Small Molecule Evidence) table ---
+ # --- SME (Small Molecule Evidence) table ---
  if full_id_data is not None and not full_id_data.is_empty():
- sme_lines = []
+ sme_lines = []
  # Add comment about spectra_ref being dummy placeholders
- sme_lines.append("COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data")
- sme_header = [
- "SEH",
- "SME_ID",
+ sme_lines.append(
+ "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
+ )
+ sme_header = [
+ "SEH",
+ "SME_ID",
  "evidence_input_id",
  "database_identifier",
  "chemical_formula",
  "smiles",
- "inchi",
+ "inchi",
  "chemical_name",
  "uri",
  "derivatized_form",
@@ -833,93 +932,101 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  "id_confidence_measure[1]",
  "rank",
  ]
- sme_lines.append("\t".join(sme_header))
-
- # Create SME entries for all identification results using enriched data
- for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
+ sme_lines.append("\t".join(sme_header))
+
+ # Create SME entries for all identification results using enriched data
+ for consensus_uid in (
+ self.consensus_df.select("consensus_uid").to_series().unique()
+ ):
  # Get consensus feature data for this consensus_uid
- consensus_feature_data = self.consensus_df.filter(pl.col("consensus_uid") == consensus_uid)
+ consensus_feature_data = self.consensus_df.filter(
+ pl.col("consensus_uid") == consensus_uid,
+ )
  if consensus_feature_data.height == 0:
  continue
  consensus_row = consensus_feature_data.row(0, named=True)
-
+
  # Get all identification results for this consensus feature from enriched data
- sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-
- if sme_matches.height > 0:
+ sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+
+ if sme_matches.height > 0:
  # Sort by score descending to maintain rank order
- sme_matches = sme_matches.sort("score", descending=True)
-
- for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
- # Generate unique SME_ID
- sme_id_base = consensus_uid * 1000
- sme_id = sme_id_base + i + 1
-
+ sme_matches = sme_matches.sort("score", descending=True)
+
+ for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+ # Generate unique SME_ID
+ sme_id_base = consensus_uid * 1000
+ sme_id = sme_id_base + i + 1
+
  # Create evidence input ID using consensus_uid:mz:rt format
  consensus_mz = consensus_row.get("mz", 0)
  consensus_rt = consensus_row.get("rt", 0)
  evidence_id = f"consensus_uid={consensus_uid}:mz={consensus_mz:.4f}:rt={consensus_rt:.2f}"
-
+
  # Database identifier - use db_id if available, otherwise fallback to cmpd_uid
  db_id = "null"
- if sme_row.get("db_id") is not None and sme_row["db_id"] != "":
- db_id = safe_str(sme_row["db_id"])
- elif sme_row.get("cmpd_uid") is not None:
- db_id = f"cmpd:{sme_row['cmpd_uid']}"
-
- # Get adduct information
+ if sme_row.get("db_id") is not None and sme_row["db_id"] != "":
+ db_id = safe_str(sme_row["db_id"])
+ elif sme_row.get("cmpd_uid") is not None:
+ db_id = f"cmpd:{sme_row['cmpd_uid']}"
+
+ # Get adduct information
  adduct_ion = "null"
- if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
- adduct_ion = safe_str(sme_row["adduct"])
+ if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
+ adduct_ion = safe_str(sme_row["adduct"])
  # Replace ? with H for better mzTab compatibility
  adduct_ion = adduct_ion.replace("?", "H")
-
+
  # Spectra reference - reference to first ms_run with spectrum index 0
  spectra_ref = "ms_run[1]:spectrum=0"
-
+
  # Identification method
  id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
- if sme_row.get("matcher") is not None:
- id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
-
+ if sme_row.get("matcher") is not None:
+ id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
+
  # MS level - assume MS1 for now
  ms_level = "[MS, MS:1000511, ms level, 1]"
-
+
  # Experimental mass-to-charge from consensus feature
  exp_mz = safe_str(consensus_mz)
-
- # Theoretical mass-to-charge from lib_df
+
+ # Theoretical mass-to-charge from lib_df
  theoretical_mz = "null"
- if sme_row.get("mz") is not None: # This comes from lib_df via get_id() join
- theoretical_mz = safe_str(sme_row["mz"])
-
- sme_line = [
- "SME",
- str(sme_id),
+ if (
+ sme_row.get("mz") is not None
+ ): # This comes from lib_df via get_id() join
+ theoretical_mz = safe_str(sme_row["mz"])
+
+ sme_line = [
+ "SME",
+ str(sme_id),
  evidence_id,
  db_id,
- safe_str(sme_row.get("formula", "null")),
- safe_str(sme_row.get("smiles", "null")),
- safe_str(sme_row.get("inchi", "null")),
- safe_str(sme_row.get("name", "null")),
+ safe_str(sme_row.get("formula", "null")),
+ safe_str(sme_row.get("smiles", "null")),
+ safe_str(sme_row.get("inchi", "null")),
+ safe_str(sme_row.get("name", "null")),
  "null", # uri - not available in current data
  "null", # derivatized_form
  adduct_ion,
  exp_mz, # experimental m/z from consensus feature
- safe_str(consensus_row.get("adduct_charge_top", "1")), # Use consensus feature's top adduct charge
+ safe_str(
+ consensus_row.get("adduct_charge_top", "1"),
+ ), # Use consensus feature's top adduct charge
  theoretical_mz, # theoretical m/z from lib_df
  spectra_ref,
  id_method,
  ms_level,
- safe_str(sme_row.get("score", "null")),
+ safe_str(sme_row.get("score", "null")),
  str(i + 1), # rank within this consensus feature
  ]
- sme_lines.append("\t".join(sme_line))
-
- # Write SME table
+ sme_lines.append("\t".join(sme_line))
+
+ # Write SME table
  with open(filename, "a", encoding="utf-8") as f:
  f.write("\n")
- for line in sme_lines:
+ for line in sme_lines:
  f.write(line + "\n")
 
  # --- MGF table ---
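The evidence_input_id built in this hunk packs the consensus key, m/z, and retention time into one string; an illustrative evaluation (values invented):

    consensus_uid, mz, rt = 42, 301.1412, 125.3
    evidence_id = f"consensus_uid={consensus_uid}:mz={mz:.4f}:rt={rt:.2f}"
    # -> "consensus_uid=42:mz=301.1412:rt=125.30"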
@@ -953,15 +1060,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  spec_len = row["spec_len"] if row["spec_len"] is not None else 0
 
  # Format spectrum data as pipe-separated strings
- spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
- spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
+ spec_mz_str = (
+ "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
+ )
+ spec_int_str = (
+ "|".join([f"{int(inty)}" for inty in spectrum_inty])
+ if spectrum_inty
+ else ""
+ )
 
  mgf_row = [
  "COM",
  "MGF",
  str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
  str(row["feature_id"]) if row["feature_id"] is not None else "null",
- f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
+ f"{row['rtinseconds']:.2f}"
+ if row["rtinseconds"] is not None
+ else "null",
  f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
  "null", # prec_int - not available in current data
  str(row["energy"]) if row["energy"] is not None else "null",
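The pipe-separated peak encoding in this hunk can be exercised in isolation (values invented):

    spectrum_mz = [100.0123, 200.4567]
    spectrum_inty = [1500.7, 320.2]
    spec_mz_str = "|".join(f"{mz:.4f}" for mz in spectrum_mz)    # "100.0123|200.4567"
    spec_int_str = "|".join(f"{int(i)}" for i in spectrum_inty)  # "1500|320"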
@@ -986,94 +1101,110 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
  def export_xlsx(self, filename: str = None) -> None:
  """
  Export the study data to an Excel workbook with multiple worksheets.
-
+
  The Excel file contains three worksheets:
  - consensus_df: Consensus features dataframe
- - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
+ - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
  - identification: Identification results with library annotations (get_id)
-
+
  Args:
- filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
+ filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
  in the study folder.
  """
  try:
  import openpyxl
  except ImportError:
- self.logger.error("openpyxl package is required for Excel export. Install with: pip install openpyxl")
+ self.logger.error(
+ "openpyxl package is required for Excel export. Install with: pip install openpyxl",
+ )
  return
-
+
  # Set default filename
  if filename is None:
  filename = "study.xlsx"
-
+
  # Make filename absolute if not already
  if not os.path.isabs(filename):
  if self.folder is not None:
  filename = os.path.join(self.folder, filename)
  else:
  filename = os.path.join(os.getcwd(), filename)
-
- self.logger.debug(f"Exporting study to Excel...")
-
+
+ self.logger.debug("Exporting study to Excel...")
+
  # Prepare data for export in the desired order
  from collections import OrderedDict
+
  worksheets = OrderedDict()
-
+
  # 1. Samples dataframe (first worksheet)
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_pandas = self.samples_df.to_pandas()
- worksheets['samples'] = samples_pandas
+ worksheets["samples"] = samples_pandas
  self.logger.debug(f"Added samples worksheet with {len(samples_pandas)} rows")
  else:
  self.logger.warning("samples_df is empty or None, skipping worksheet")
-
+
  # 2. Consensus dataframe (renamed to 'consensus')
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  consensus_pandas = self.consensus_df.to_pandas()
- worksheets['consensus'] = consensus_pandas
- self.logger.debug(f"Added consensus worksheet with {len(consensus_pandas)} rows")
+ worksheets["consensus"] = consensus_pandas
+ self.logger.debug(
+ f"Added consensus worksheet with {len(consensus_pandas)} rows",
+ )
  else:
  self.logger.warning("consensus_df is empty or None, skipping worksheet")
-
+
  # 3. Identification results
  try:
- from masster.study.id import get_id
+ from masster.study.id import get_id
+
  id_df = get_id(self)
  if id_df is not None and not id_df.is_empty():
  id_pandas = id_df.to_pandas()
- worksheets['identification'] = id_pandas
- self.logger.debug(f"Added identification worksheet with {len(id_pandas)} rows")
+ worksheets["identification"] = id_pandas
+ self.logger.debug(
+ f"Added identification worksheet with {len(id_pandas)} rows",
+ )
  else:
- self.logger.warning("get_id() returned empty data, skipping identification worksheet")
+ self.logger.warning(
+ "get_id() returned empty data, skipping identification worksheet",
+ )
  except Exception as e:
- self.logger.warning(f"Error getting identification data: {e}. Skipping identification worksheet.")
-
+ self.logger.warning(
+ f"Error getting identification data: {e}. Skipping identification worksheet.",
+ )
+
  # 4. Consensus matrix (last worksheet)
  try:
  matrix_df = self.get_consensus_matrix()
  if matrix_df is not None and not matrix_df.is_empty():
  matrix_pandas = matrix_df.to_pandas()
- worksheets['matrix'] = matrix_pandas
+ worksheets["matrix"] = matrix_pandas
  self.logger.debug(f"Added matrix worksheet with {len(matrix_pandas)} rows")
  else:
- self.logger.warning("get_consensus_matrix() returned empty data, skipping matrix worksheet")
+ self.logger.warning(
+ "get_consensus_matrix() returned empty data, skipping matrix worksheet",
+ )
  except Exception as e:
  self.logger.error(f"Error getting consensus matrix: {e}")
-
+
  # Check if we have any data to export
  if not worksheets:
  self.logger.error("No data available to export to Excel")
  return
-
+
  # Write to Excel file
  try:
- with pd.ExcelWriter(filename, engine='openpyxl') as writer:
+ with pd.ExcelWriter(filename, engine="openpyxl") as writer:
  for sheet_name, data in worksheets.items():
  data.to_excel(writer, sheet_name=sheet_name, index=False)
- self.logger.debug(f"Written worksheet '{sheet_name}' with shape {data.shape}")
-
+ self.logger.debug(
+ f"Written worksheet '{sheet_name}' with shape {data.shape}",
+ )
+
  self.logger.info(f"Study exported to {filename}")
-
+
  except Exception as e:
  self.logger.error(f"Error writing Excel file: {e}")
 
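A minimal usage sketch for the Excel export shown above (assuming study is an already-loaded masster Study object; openpyxl must be installed):

    study.export_xlsx()                 # writes study.xlsx into the study folder
    study.export_xlsx("results.xlsx")   # or pass an explicit filename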
@@ -1081,13 +1212,13 @@ def export_xlsx(self, filename: str = None) -> None:
  def export_parquet(self, basename: str = None) -> None:
  """
  Export the study data to multiple Parquet files with different suffixes.
-
+
  The export creates separate Parquet files for each dataset:
  - <basename>_samples.parquet: Samples dataframe
  - <basename>_consensus.parquet: Consensus features dataframe
  - <basename>_identification.parquet: Identification results with library annotations
  - <basename>_matrix.parquet: Consensus matrix with samples as columns
-
+
  Args:
  basename (str, optional): Base name for the output files. Defaults to "study"
  in the study folder.
@@ -1095,59 +1226,74 @@ def export_parquet(self, basename: str = None) -> None:
  # Set default basename
  if basename is None:
  basename = "study"
-
+
  # Make basename absolute path if not already (without extension)
  if not os.path.isabs(basename):
  if self.folder is not None:
  basename = os.path.join(self.folder, basename)
  else:
  basename = os.path.join(os.getcwd(), basename)
-
+
  self.logger.debug(f"Exporting study to Parquet files with basename: {basename}")
-
+
  exported_files = []
-
+
  # 1. Samples dataframe
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_file = f"{basename}_samples.parquet"
  try:
  self.samples_df.write_parquet(samples_file)
  exported_files.append(samples_file)
- self.logger.debug(f"Exported samples to {samples_file} ({self.samples_df.height} rows)")
+ self.logger.debug(
+ f"Exported samples to {samples_file} ({self.samples_df.height} rows)",
+ )
  except Exception as e:
  self.logger.error(f"Error writing samples parquet file: {e}")
  else:
- self.logger.warning("samples_df is empty or None, skipping samples parquet file")
-
+ self.logger.warning(
+ "samples_df is empty or None, skipping samples parquet file",
+ )
+
  # 2. Consensus dataframe
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  consensus_file = f"{basename}_consensus.parquet"
  try:
  self.consensus_df.write_parquet(consensus_file)
  exported_files.append(consensus_file)
- self.logger.debug(f"Exported consensus to {consensus_file} ({self.consensus_df.height} rows)")
+ self.logger.debug(
+ f"Exported consensus to {consensus_file} ({self.consensus_df.height} rows)",
+ )
  except Exception as e:
  self.logger.error(f"Error writing consensus parquet file: {e}")
  else:
- self.logger.warning("consensus_df is empty or None, skipping consensus parquet file")
-
+ self.logger.warning(
+ "consensus_df is empty or None, skipping consensus parquet file",
+ )
+
  # 3. Identification results
  try:
- from masster.study.id import get_id
+ from masster.study.id import get_id
+
  id_df = get_id(self)
  if id_df is not None and not id_df.is_empty():
  identification_file = f"{basename}_identification.parquet"
  try:
  id_df.write_parquet(identification_file)
  exported_files.append(identification_file)
- self.logger.debug(f"Exported identification to {identification_file} ({id_df.height} rows)")
+ self.logger.debug(
+ f"Exported identification to {identification_file} ({id_df.height} rows)",
+ )
  except Exception as e:
  self.logger.error(f"Error writing identification parquet file: {e}")
  else:
- self.logger.warning("get_id() returned empty data, skipping identification parquet file")
+ self.logger.warning(
+ "get_id() returned empty data, skipping identification parquet file",
+ )
  except Exception as e:
- self.logger.warning(f"Error getting identification data: {e}. Skipping identification parquet file.")
-
+ self.logger.warning(
+ f"Error getting identification data: {e}. Skipping identification parquet file.",
+ )
+
  # 4. Consensus matrix
  try:
  matrix_df = self.get_consensus_matrix()
@@ -1156,14 +1302,18 @@ def export_parquet(self, basename: str = None) -> None:
  try:
  matrix_df.write_parquet(matrix_file)
  exported_files.append(matrix_file)
- self.logger.debug(f"Exported matrix to {matrix_file} ({matrix_df.height} rows)")
+ self.logger.debug(
+ f"Exported matrix to {matrix_file} ({matrix_df.height} rows)",
+ )
  except Exception as e:
  self.logger.error(f"Error writing matrix parquet file: {e}")
  else:
- self.logger.warning("get_consensus_matrix() returned empty data, skipping matrix parquet file")
+ self.logger.warning(
+ "get_consensus_matrix() returned empty data, skipping matrix parquet file",
+ )
  except Exception as e:
  self.logger.error(f"Error getting consensus matrix: {e}")
-
+
  # Report results
  if exported_files:
  self.logger.info(f"Study exported to {len(exported_files)} Parquet files:")
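A companion usage sketch for the Parquet export, reading one of the outputs back with Polars (paths illustrative; assumes study is a loaded masster Study object):

    study.export_parquet("study")  # writes study_samples.parquet, study_consensus.parquet, ...

    import polars as pl
    consensus = pl.read_parquet("study_consensus.parquet")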