masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

@@ -96,19 +96,15 @@ class study_defaults:
     "adducts": {
         "dtype": "list[str]",
         "description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
-        "default": ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"],
+        "default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
         "examples": {
-            "positive": ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"],
-            "negative": [
-                "H-1:-:0.95",
-                "Cl:-:0.05",
-                "CH2O2:0:0.2",
-                "H-2-O:0:0.2",
-            ],
+            "positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
+            "negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
         },
         "validation_rules": [
-            "Format: element:charge:probability",
-            "Charge must be +, -, or 0 (neutral)",
+            "Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
+            "Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
+            "Charge must be an integer (positive, negative, or 0 for neutral)",
             "Probability must be between 0.0 and 1.0",
             "Sum of all charged adduct probabilities must equal 1.0",
         ],
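The defaults move from the old `H:+:0.8` style to an explicit gain/loss grammar. As a minimal sketch of the new rules (illustrative helper, not masster API):

```python
# Hypothetical parser for the 0.6.x adduct strings shown above; charged-adduct
# probabilities (charge != 0) must sum to 1.0, neutral losses are excluded.
def parse_adduct(spec: str) -> tuple[str, int, float]:
    formula, charge, prob = spec.rsplit(":", 2)
    if not formula.startswith(("+", "-")):
        raise ValueError(f"formula must start with + or -: {spec!r}")
    return formula, int(charge), float(prob)

def check_charged_probabilities(specs: list[str]) -> None:
    total = sum(p for _, c, p in map(parse_adduct, specs) if c != 0)
    if abs(total - 1.0) > 1e-9:
        raise ValueError(f"charged adduct probabilities sum to {total}, expected 1.0")

check_charged_probabilities(["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"])
```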
@@ -128,7 +124,7 @@ class study_defaults:
         """Set polarity-specific defaults for adducts if not explicitly provided."""
         # If adducts is None, set based on polarity
         if self.adducts is None:
-            if self.polarity.lower() in ["positive", "pos"]:
+            if self.polarity.lower() in ["positive", "pos", "+"]:
                 self.adducts = [
                     "+H:1:0.65",
                     "+Na:1:0.15",
@@ -136,7 +132,7 @@ class study_defaults:
                     "+K:1:0.05",
                     "-H2O:0:0.15",
                 ]
-            elif self.polarity.lower() in ["negative", "neg"]:
+            elif self.polarity.lower() in ["negative", "neg", "-"]:
                 self.adducts = [
                     "-H:-1:0.9",
                     "+Cl:-1:0.1",
masster/study/export.py CHANGED
@@ -524,7 +524,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     # Import here to avoid circular imports
     from masster.study.id import get_id
 
-    # Get full enriched identification data for SOME section
+    # Get full enriched identification data for SME section
     full_id_data = get_id(self)
     if full_id_data is not None and not full_id_data.is_empty():
         # Get top scoring identification for each consensus_uid for SML section
@@ -828,8 +828,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     smf_header = [
         "SFH",
         "SMF_ID",
-        "SOME_ID_REFS",
-        "SOME_ID_REF_ambiguity_code",
+        "SME_ID_REFS",
+        "SME_ID_REF_ambiguity_code",
         "adduct_ion",
         "isotopomer",
         "exp_mass_to_charge",
@@ -847,40 +847,40 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
 
     # SMF table uses the same consensus features as SML, just different metadata
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-        # References to SOME entries - each SMF can reference multiple SOME entries for the same consensus_uid
-        some_refs = "null"
-        some_ambiguity = "null"
+        # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+        SME_refs = "null"
+        SME_ambiguity = "null"
         consensus_uid = row["consensus_uid"]
 
         if full_id_data is not None:
-            # Find all SOME entries for this consensus_uid
-            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-            if some_matches.height > 0:
-                # Generate SOME IDs - we'll create a mapping in the SOME section
+            # Find all SME entries for this consensus_uid
+            SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            if SME_matches.height > 0:
+                # Generate SME IDs - we'll create a mapping in the SME section
                 # For now, use a simple approach based on consensus_uid and lib_uid
-                some_ids = []
-                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-                    # Create a unique SOME ID based on consensus_uid and position
-                    some_id_base = consensus_uid * 1000  # Ensure uniqueness across consensus features
-                    some_id = some_id_base + i + 1
-                    some_ids.append(str(some_id))
-
-                if some_ids:
-                    some_refs = "|".join(some_ids)
+                SME_ids = []
+                for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
+                    # Create a unique SME ID based on consensus_uid and position
+                    SME_id_base = consensus_uid * 1000  # Ensure uniqueness across consensus features
+                    SME_id = SME_id_base + i + 1
+                    SME_ids.append(str(SME_id))
+
+                if SME_ids:
+                    SME_refs = "|".join(SME_ids)
                     # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
-                    if len(some_ids) > 1:
+                    if len(SME_ids) > 1:
                         # Check if all identifications point to the same compound
                         unique_cmpds = {
                             match["cmpd_uid"]
-                            for match in some_matches.iter_rows(named=True)
+                            for match in SME_matches.iter_rows(named=True)
                             if match.get("cmpd_uid") is not None
                         }
                         if len(unique_cmpds) > 1:
-                            some_ambiguity = "1"  # Ambiguous identification
+                            SME_ambiguity = "1"  # Ambiguous identification
                         else:
-                            some_ambiguity = "2"  # Multiple evidence for same molecule
+                            SME_ambiguity = "2"  # Multiple evidence for same molecule
                     else:
-                        some_ambiguity = "null"
+                        SME_ambiguity = "null"
 
         # Format isotopomer according to mzTab-M specification
         iso_value = row.get("iso_mean", 0)
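The rename keeps the existing ID block scheme: each consensus feature reserves a block of 1000 SME IDs. A worked example:

```python
# SME_id = consensus_uid * 1000 + (rank within feature); unique as long as a
# feature has fewer than 1000 evidence rows.
consensus_uid = 42
SME_ids = [str(consensus_uid * 1000 + i + 1) for i in range(3)]
print("|".join(SME_ids))  # 42001|42002|42003
```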
@@ -892,8 +892,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         smf_row = [
             "SMF",
             str(idx),
-            some_refs,
-            some_ambiguity,
+            SME_refs,
+            SME_ambiguity,
             adduct_list[idx - 1],  # adduct_ion
             isotopomer,  # isotopomer formatted according to mzTab-M specification
             safe_str(row.get("mz", "null")),  # exp_mass_to_charge
@@ -943,16 +943,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         for line in smf_lines:
             f.write(line + "\n")
 
-    # --- SOME (Small Molecule Evidence) table ---
+    # --- SME (Small Molecule Evidence) table ---
     if full_id_data is not None and not full_id_data.is_empty():
-        some_lines = []
+        SME_lines = []
         # Add comment about spectra_ref being dummy placeholders
-        some_lines.append(
+        SME_lines.append(
             "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
         )
-        some_header = [
-            "SHE",
-            "SOME_ID",
+        SME_header = [
+            "SEH",
+            "SME_ID",
             "evidence_input_id",
             "database_identifier",
             "chemical_formula",
@@ -971,9 +971,9 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
             "id_confidence_measure[1]",
             "rank",
         ]
-        some_lines.append("\t".join(some_header))
+        SME_lines.append("\t".join(SME_header))
 
-        # Create SOME entries for all identification results using enriched data
+        # Create SME entries for all identification results using enriched data
         for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
             # Get consensus feature data for this consensus_uid
             consensus_feature_data = self.consensus_df.filter(
@@ -984,16 +984,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
             consensus_row = consensus_feature_data.row(0, named=True)
 
             # Get all identification results for this consensus feature from enriched data
-            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
 
-            if some_matches.height > 0:
+            if SME_matches.height > 0:
                 # Sort by score descending to maintain rank order
-                some_matches = some_matches.sort("score", descending=True)
+                SME_matches = SME_matches.sort("score", descending=True)
 
-                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-                    # Generate unique SOME_ID
-                    some_id_base = consensus_uid * 1000
-                    some_id = some_id_base + i + 1
+                for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
+                    # Generate unique SME_ID
+                    SME_id_base = consensus_uid * 1000
+                    SME_id = SME_id_base + i + 1
 
                     # Create evidence input ID using consensus_uid:mz:rt format
                     consensus_mz = consensus_row.get("mz", 0)
@@ -1002,15 +1002,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
 
                     # Database identifier - use db_id if available, otherwise fallback to cmpd_uid
                     db_id = "null"
-                    if some_row.get("db_id") is not None and some_row["db_id"] != "":
-                        db_id = safe_str(some_row["db_id"])
-                    elif some_row.get("cmpd_uid") is not None:
-                        db_id = f"cmpd:{some_row['cmpd_uid']}"
+                    if SME_row.get("db_id") is not None and SME_row["db_id"] != "":
+                        db_id = safe_str(SME_row["db_id"])
+                    elif SME_row.get("cmpd_uid") is not None:
+                        db_id = f"cmpd:{SME_row['cmpd_uid']}"
 
                     # Get adduct information
                     adduct_ion = "null"
-                    if some_row.get("adduct") is not None and some_row["adduct"] != "":
-                        adduct_ion = safe_str(some_row["adduct"])
+                    if SME_row.get("adduct") is not None and SME_row["adduct"] != "":
+                        adduct_ion = safe_str(SME_row["adduct"])
                         # Replace ? with H for better mzTab compatibility
                         adduct_ion = adduct_ion.replace("?", "H")
 
@@ -1019,29 +1019,32 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
 
                     # Identification method
                     id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
-                    if some_row.get("matcher") is not None:
-                        id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
+                    if SME_row.get("matcher") is not None:
+                        id_method = f"[MS, MS:1002888, {SME_row['matcher']}, ]"
 
-                    # MS level - assume MS1 for now
-                    ms_level = "[MS, MS:1000511, ms level, 1]"
+                    # MS level - check if ms1 exists in matched
+                    if 'ms1' in SME_row['matcher'].lower():
+                        ms_level = "[MS, MS:1000511, ms level, 1]"
+                    else:
+                        ms_level = "[MS,MS:1000511, ms level, 2]"
 
                     # Experimental mass-to-charge from consensus feature
                     exp_mz = safe_str(consensus_mz)
 
                     # Theoretical mass-to-charge from lib_df
                     theoretical_mz = "null"
-                    if some_row.get("mz") is not None:  # This comes from lib_df via get_id() join
-                        theoretical_mz = safe_str(some_row["mz"])
+                    if SME_row.get("mz") is not None:  # This comes from lib_df via get_id() join
+                        theoretical_mz = safe_str(SME_row["mz"])
 
-                    some_line = [
-                        "SOME",
-                        str(some_id),
+                    SME_line = [
+                        "SME",
+                        str(SME_id),
                         evidence_id,
                         db_id,
-                        safe_str(some_row.get("formula", "null")),
-                        safe_str(some_row.get("smiles", "null")),
-                        safe_str(some_row.get("inchi", "null")),
-                        safe_str(some_row.get("name", "null")),
+                        safe_str(SME_row.get("formula", "null")),
+                        safe_str(SME_row.get("smiles", "null")),
+                        safe_str(SME_row.get("inchi", "null")),
+                        safe_str(SME_row.get("name", "null")),
                         "null",  # uri - not available in current data
                         "null",  # derivatized_form
                         adduct_ion,
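The ms level is now derived from the matcher name instead of being hard-coded to MS1. A minimal restatement with invented matcher names for illustration:

```python
# Any matcher whose name contains "ms1" is reported as ms level 1,
# everything else as ms level 2 (matcher names here are hypothetical).
for matcher in ["ms1_mz_rt", "ms2_spectral"]:
    level = 1 if "ms1" in matcher.lower() else 2
    print(matcher, "->", f"[MS, MS:1000511, ms level, {level}]")
```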
@@ -1053,15 +1056,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
                         spectra_ref,
                         id_method,
                         ms_level,
-                        safe_str(some_row.get("score", "null")),
+                        safe_str(SME_row.get("score", "null")),
                         str(i + 1),  # rank within this consensus feature
                     ]
-                    some_lines.append("\t".join(some_line))
+                    SME_lines.append("\t".join(SME_line))
 
-        # Write SOME table
+        # Write SME table
         with open(filename, "a", encoding="utf-8") as f:
             f.write("\n")
-            for line in some_lines:
+            for line in SME_lines:
                 f.write(line + "\n")
 
     # --- MGF table ---
@@ -1125,7 +1128,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     self.logger.success(f"Exported mzTab-M to {filename}")
 
 
-def export_xlsx(self, filename: str | None = None) -> None:
+def export_excel(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
 
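The Excel export is renamed, which breaks existing callers. Hedged usage sketch (assumes a loaded Study instance `study`):

```python
study.export_excel("study_results.xlsx")  # 0.6.2; was study.export_xlsx(...) in 0.5.x
```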
@@ -1390,3 +1393,151 @@ def export_parquet(self, filename: str | None = None) -> None:
         self.logger.success(f"Study exported to {len(exported_files)} Parquet files.")
     else:
         self.logger.error("No Parquet files were created - no data available to export")
+
+
+def export_slaw(self, filename="features_slaw.csv"):
+    """
+    Export the consensus features DataFrame to a SLAW-formatted CSV file.
+
+    This method exports the consensus features to a CSV format compatible with SLAW,
+    including feature metadata and intensity quantification across all samples. The file
+    contains comprehensive feature information including m/z, RT, annotations, isotopic
+    patterns, MS2 data, and intensity values for each sample.
+
+    Parameters:
+        filename (str): The path to the output CSV file. Defaults to 'features_slaw.csv'.
+
+    Side Effects:
+        Writes the exported data to the specified CSV file and logs the export operation.
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus features found. Cannot export to SLAW format.")
+        return
+
+    # Make filename absolute if not already
+    if not os.path.isabs(filename):
+        if self.folder is not None:
+            filename = os.path.join(self.folder, filename)
+        else:
+            filename = os.path.join(os.getcwd(), filename)
+
+    df = self.consensus_df
+
+    # Get consensus matrix for quantification across samples
+    try:
+        quant_matrix = self.get_consensus_matrix()
+    except Exception as e:
+        self.logger.error(f"Error getting consensus matrix: {e}")
+        return
+
+    # Evaluate the charge column
+    if "charge_mean" in df.columns:
+        charge_series = df.select(
+            pl.when(pl.col("charge_mean") == 0)
+            .then(1 if self.polarity == "positive" else -1)
+            .otherwise(pl.col("charge_mean"))
+            .alias("charge")
+        ).get_column("charge")
+    else:
+        charge_series = pl.Series([1 if self.polarity == "positive" else -1] * len(df))
+
+    # Evaluate the group column (from adduct_group_top)
+    # Features with adduct_group_top == 0 should each get a unique group index
+    if "adduct_group_top" in df.columns:
+        max_adduct_group = df.get_column("adduct_group_top").max()
+        if max_adduct_group is None:
+            max_adduct_group = 0
+
+        group_series = df.select(
+            pl.when(pl.col("adduct_group_top") == 0)
+            .then(max_adduct_group + 1 + pl.int_range(pl.len()).over(pl.col("adduct_group_top") == 0))
+            .otherwise(pl.col("adduct_group_top"))
+            .alias("group")
+        ).get_column("group")
+    else:
+        group_series = pl.Series([None] * len(df))
+
+    # Evaluate the annotation column (adduct + isotope info)
+    if "adduct_top" in df.columns and "iso_mean" in df.columns:
+        annotation_series = df.select(
+            pl.when(pl.col("iso_mean") == 0)
+            .then(pl.col("adduct_top").str.replace(r"\?", "H"))
+            .otherwise(pl.col("adduct_top").str.replace(r"\?", "H") + " +" + pl.col("iso_mean").cast(pl.Int64).cast(pl.Utf8))
+            .alias("annotation")
+        ).get_column("annotation")
+    elif "adduct_top" in df.columns:
+        annotation_series = df.get_column("adduct_top").str.replace(r"\?", "H")
+    else:
+        annotation_series = pl.Series([""] * len(df))
+
+    # Get sample columns from quant_matrix (excluding consensus_uid)
+    sample_columns = [col for col in quant_matrix.columns if col != "consensus_uid"]
+
+    # Create SLAW columns with appropriate mappings from consensus_df
+    slaw_data = {
+        "feature_id": df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
+        "mz": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "rt": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "group": group_series,
+        "annotation": annotation_series,
+        "neutral_mass": df.get_column("adduct_neutral_mass_top") if "adduct_neutral_mass_top" in df.columns else pl.Series([None] * len(df)),
+        "charge": charge_series,
+        "main_id": df.get_column("main_id") if "main_id" in df.columns else df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
+        "ion": df.get_column("adduct_top").str.replace(r"\?", "H") if "adduct_top" in df.columns else pl.Series([""] * len(df)),
+        "iso": df.get_column("iso_mean").cast(pl.Int64) if "iso_mean" in df.columns else pl.Series([0] * len(df)),
+        "clique": df.get_column("clique") if "clique" in df.columns else pl.Series([None] * len(df)),
+        "num_detection": df.get_column("num_detection") if "num_detection" in df.columns else pl.Series([1] * len(df)),
+        "total_detection": df.get_column("total_detection") if "total_detection" in df.columns else pl.Series([1] * len(df)),
+        "mz_mean": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "mz_min": df.get_column("mz_min") if "mz_min" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "mz_max": df.get_column("mz_max") if "mz_max" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "rt_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "height_mean": df.get_column("height_mean") if "height_mean" in df.columns else pl.Series([None] * len(df)),
+        "height_min": df.get_column("height_min") if "height_min" in df.columns else pl.Series([None] * len(df)),
+        "height_max": df.get_column("height_max") if "height_max" in df.columns else pl.Series([None] * len(df)),
+        "intensity_mean": df.get_column("inty_mean") if "inty_mean" in df.columns else pl.Series([None] * len(df)),
+        "intensity_min": df.get_column("inty_min") if "inty_min" in df.columns else pl.Series([None] * len(df)),
+        "intensity_max": df.get_column("inty_max") if "inty_max" in df.columns else pl.Series([None] * len(df)),
+        "SN_mean": df.get_column("sn_mean") if "sn_mean" in df.columns else pl.Series([None] * len(df)),
+        "SN_min": df.get_column("sn_min") if "sn_min" in df.columns else pl.Series([None] * len(df)),
+        "SN_max": df.get_column("sn_max") if "sn_max" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_mean": df.get_column("fwhm_mean") if "fwhm_mean" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_min": df.get_column("fwhm_min") if "fwhm_min" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_max": df.get_column("fwhm_max") if "fwhm_max" in df.columns else pl.Series([None] * len(df)),
+        "ms2_mgf_id": pl.Series([""] * len(df)),  # Not available in study
+        "ms2_num_fused": pl.Series([None] * len(df)),  # Not available in study
+        "ms2_source": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_annot": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_rel": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_abs": pl.Series([""] * len(df)),  # Not available in study
+    }
+
+    # Add quantification columns for each sample
+    for sample_col in sample_columns:
+        quant_column_name = f"quant_{sample_col}"
+        # Join with quant_matrix to get values for this sample
+        sample_values = quant_matrix.join(
+            df.select("consensus_uid"),
+            on="consensus_uid",
+            how="right"
+        ).get_column(sample_col)
+        slaw_data[quant_column_name] = sample_values
+
+    # Create the polars DataFrame
+    slaw_df = pl.DataFrame(slaw_data)
+
+    # Convert to pandas for CSV export
+    pandas_df = slaw_df.to_pandas()
+
+    # Export to CSV with comma separator - only quote when necessary (QUOTE_MINIMAL)
+    try:
+        pandas_df.to_csv(filename, sep=',', index=False, quoting=0)  # quoting=0 means QUOTE_MINIMAL
+        self.logger.success(f"Features exported to {filename} (SLAW format)")
+        self.logger.debug(f"Exported {len(slaw_df)} features with {len(slaw_df.columns)} columns")
+    except PermissionError:
+        self.logger.error(f"Permission denied: Cannot write to {filename}. The file may be open in another program. Please close it and try again.")
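Hedged usage sketch for the new SLAW export (assumes `study` is a Study with consensus features already computed):

```python
study.export_slaw()                       # writes features_slaw.csv into study.folder
study.export_slaw("run42_features.csv")   # relative names resolve against study.folder
```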
masster/study/id.py CHANGED
@@ -24,7 +24,8 @@ def lib_load(
         lib_source: either a CSV/JSON file path (str) or a Lib instance
         polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
             If None, uses study.polarity automatically.
-        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
+        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
+            If None, uses study.parameters.adducts if available.
         iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
 
     Side effects:
@@ -51,6 +52,18 @@ def lib_load(
     else:
         polarity = "positive"  # Default fallback
     study.logger.debug(f"Using study polarity: {polarity}")
+
+    # Use study.parameters.adducts if adducts not explicitly provided
+    # If study.parameters.adducts is also None, lib will use its default adducts for the polarity
+    if adducts is None:
+        if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
+            adducts = study.parameters.adducts
+            if adducts:
+                study.logger.debug(f"Using study.parameters.adducts: {adducts}")
+            else:
+                study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
+        else:
+            study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")
 
     # Handle string input (CSV or JSON file path)
     if isinstance(lib_source, str):
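The effective adduct list is now resolved with a clear precedence. A sketch of that order (illustrative helper, not masster API):

```python
# explicit argument > study.parameters.adducts > lib's polarity defaults (None).
def resolve_adducts(explicit, study_adducts):
    if explicit is not None:
        return explicit
    return study_adducts or None  # falsy/missing -> lib defaults

assert resolve_adducts(["+H:1:1.0"], ["+Na:1:1.0"]) == ["+H:1:1.0"]
assert resolve_adducts(None, ["+Na:1:1.0"]) == ["+Na:1:1.0"]
assert resolve_adducts(None, None) is None
```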
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
     """
     Find library matches using optimized vectorized operations.
 
-    FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
+    Automatically skips RT filtering if library has no RT data for the matched entries.
     """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
 
     initial_match_count = len(matches)
 
-    # Apply RT filter if available - STRICT VERSION (no fallback)
+    # Apply RT filter if requested AND if data is available
+    # Strategy: Handle mixed RT/no-RT entries properly by treating them separately
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-        # First, check if any m/z matches have RT data
+        # Separate entries with and without RT data
         rt_candidates = matches.filter(pl.col("rt").is_not_null())
+        no_rt_entries = matches.filter(pl.col("rt").is_null())
 
         if not rt_candidates.is_empty():
            # Apply RT filtering to candidates with RT data
            rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
 
-            if not rt_matches.is_empty():
+            # Combine RT-filtered matches with entries that have no RT data
+            # Rationale: Entries without RT can't be filtered by RT, so include them
+            if not rt_matches.is_empty() and not no_rt_entries.is_empty():
+                # Both RT matches and no-RT entries exist
+                matches = pl.concat([rt_matches, no_rt_entries])
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
+                    )
+            elif not rt_matches.is_empty():
+                # Only RT matches, no entries without RT
                 matches = rt_matches
                 if logger:
                     logger.debug(
-                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, {len(matches)} after RT filter"
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(matches)} passed RT filter"
+                    )
+            elif not no_rt_entries.is_empty():
+                # No RT matches passed filter, but there are entries without RT
+                matches = no_rt_entries
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
+                        f"using {len(matches)} entries with no RT data"
                    )
            else:
-                # NO FALLBACK - if RT filtering finds no matches, return empty
-                matches = rt_matches  # This is empty
+                # No RT matches and no entries without RT - return empty
+                matches = pl.DataFrame()
                if logger:
                    logger.debug(
                        f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
                    )
        else:
-            # No RT data in library matches - return empty if strict RT filtering requested
+            # All m/z matches have no RT data - keep all m/z matches
            if logger:
                logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches but none have library RT data - no matches returned due to RT filtering"
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
                )
-            matches = pl.DataFrame()  # Return empty DataFrame
+            # matches already contains the m/z-filtered results (which are all no_rt_entries)
 
     # FIX 1: Add stricter m/z validation - prioritize more accurate matches
     if not matches.is_empty():
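The behavior change in a toy example (polars, column names as in the hunk): library entries without RT now survive RT filtering instead of being discarded.

```python
import polars as pl

lib = pl.DataFrame({"name": ["a", "b", "c"],
                    "mz": [100.0, 100.0, 100.0],
                    "rt": [50.0, 500.0, None]})
cons_rt, rt_tol = 55.0, 10.0
rt_ok = lib.filter(pl.col("rt").is_not_null() &
                   ((pl.col("rt") - cons_rt).abs() <= rt_tol))
no_rt = lib.filter(pl.col("rt").is_null())
# 0.5.28 kept only rt_ok (or nothing); 0.6.2 keeps the no-RT entry too:
print(pl.concat([rt_ok, no_rt])["name"].to_list())  # ['a', 'c']
```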
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
     effective_mz_tol = getattr(params, "mz_tol", 0.01)
     effective_rt_tol = getattr(params, "rt_tol", 2.0)
 
+    # Check if library has RT data - if not, disable RT filtering
+    if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
+        if "rt" in study.lib_df.columns:
+            # Check if library has any non-null RT values
+            rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
+            if rt_count == 0:
+                if logger:
+                    logger.info(
+                        f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
+                    )
+                effective_rt_tol = None
+
     if logger:
         logger.debug(
             f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
     if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
         components = [spec] * multiplier
         formatted_name = _format_adduct_name(components)
-        probability_multiplied = float(spec["probability"]) ** multiplier
+        probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0
 
         combinations_list.append(
             {
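The multiplied-adduct weighting changes: combined probabilities are now halved. Worked numbers with the new default proton probability:

```python
p, multiplier = 0.65, 2
old = p ** multiplier             # ≈ 0.4225
new = (p ** multiplier) / 2.0     # ≈ 0.21125
print(old, new)
```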