masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +715 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +26 -5
- masster/sample/sample5_schema.json +99 -1
- masster/sample/save.py +724 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/export.py +216 -65
- masster/study/id.py +59 -12
- masster/study/importers.py +384 -1
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +6 -4
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +13 -14
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/METADATA +17 -18
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/RECORD +30 -29
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/WHEEL +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/entry_points.txt +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -96,19 +96,15 @@ class study_defaults:
|
|
|
96
96
|
"adducts": {
|
|
97
97
|
"dtype": "list[str]",
|
|
98
98
|
"description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
|
|
99
|
-
"default": ["H
|
|
99
|
+
"default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
|
|
100
100
|
"examples": {
|
|
101
|
-
"positive": ["H
|
|
102
|
-
"negative": [
|
|
103
|
-
"H-1:-:0.95",
|
|
104
|
-
"Cl:-:0.05",
|
|
105
|
-
"CH2O2:0:0.2",
|
|
106
|
-
"H-2-O:0:0.2",
|
|
107
|
-
],
|
|
101
|
+
"positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
|
|
102
|
+
"negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
|
|
108
103
|
},
|
|
109
104
|
"validation_rules": [
|
|
110
|
-
"Format:
|
|
111
|
-
"
|
|
105
|
+
"Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
|
|
106
|
+
"Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
|
|
107
|
+
"Charge must be an integer (positive, negative, or 0 for neutral)",
|
|
112
108
|
"Probability must be between 0.0 and 1.0",
|
|
113
109
|
"Sum of all charged adduct probabilities must equal 1.0",
|
|
114
110
|
],
|
|
@@ -128,7 +124,7 @@ class study_defaults:
|
|
|
128
124
|
"""Set polarity-specific defaults for adducts if not explicitly provided."""
|
|
129
125
|
# If adducts is None, set based on polarity
|
|
130
126
|
if self.adducts is None:
|
|
131
|
-
if self.polarity.lower() in ["positive", "pos"]:
|
|
127
|
+
if self.polarity.lower() in ["positive", "pos", "+"]:
|
|
132
128
|
self.adducts = [
|
|
133
129
|
"+H:1:0.65",
|
|
134
130
|
"+Na:1:0.15",
|
|
@@ -136,7 +132,7 @@ class study_defaults:
|
|
|
136
132
|
"+K:1:0.05",
|
|
137
133
|
"-H2O:0:0.15",
|
|
138
134
|
]
|
|
139
|
-
elif self.polarity.lower() in ["negative", "neg"]:
|
|
135
|
+
elif self.polarity.lower() in ["negative", "neg", "-"]:
|
|
140
136
|
self.adducts = [
|
|
141
137
|
"-H:-1:0.9",
|
|
142
138
|
"+Cl:-1:0.1",
|
masster/study/export.py
CHANGED
|
@@ -524,7 +524,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
524
524
|
# Import here to avoid circular imports
|
|
525
525
|
from masster.study.id import get_id
|
|
526
526
|
|
|
527
|
-
# Get full enriched identification data for
|
|
527
|
+
# Get full enriched identification data for SME section
|
|
528
528
|
full_id_data = get_id(self)
|
|
529
529
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
530
530
|
# Get top scoring identification for each consensus_uid for SML section
|
|
@@ -828,8 +828,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
828
828
|
smf_header = [
|
|
829
829
|
"SFH",
|
|
830
830
|
"SMF_ID",
|
|
831
|
-
"
|
|
832
|
-
"
|
|
831
|
+
"SME_ID_REFS",
|
|
832
|
+
"SME_ID_REF_ambiguity_code",
|
|
833
833
|
"adduct_ion",
|
|
834
834
|
"isotopomer",
|
|
835
835
|
"exp_mass_to_charge",
|
|
@@ -847,40 +847,40 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
847
847
|
|
|
848
848
|
# SMF table uses the same consensus features as SML, just different metadata
|
|
849
849
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
850
|
-
# References to
|
|
851
|
-
|
|
852
|
-
|
|
850
|
+
# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
|
|
851
|
+
SME_refs = "null"
|
|
852
|
+
SME_ambiguity = "null"
|
|
853
853
|
consensus_uid = row["consensus_uid"]
|
|
854
854
|
|
|
855
855
|
if full_id_data is not None:
|
|
856
|
-
# Find all
|
|
857
|
-
|
|
858
|
-
if
|
|
859
|
-
# Generate
|
|
856
|
+
# Find all SME entries for this consensus_uid
|
|
857
|
+
SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
858
|
+
if SME_matches.height > 0:
|
|
859
|
+
# Generate SME IDs - we'll create a mapping in the SME section
|
|
860
860
|
# For now, use a simple approach based on consensus_uid and lib_uid
|
|
861
|
-
|
|
862
|
-
for i,
|
|
863
|
-
# Create a unique
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
if
|
|
869
|
-
|
|
861
|
+
SME_ids = []
|
|
862
|
+
for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
|
|
863
|
+
# Create a unique SME ID based on consensus_uid and position
|
|
864
|
+
SME_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
|
|
865
|
+
SME_id = SME_id_base + i + 1
|
|
866
|
+
SME_ids.append(str(SME_id))
|
|
867
|
+
|
|
868
|
+
if SME_ids:
|
|
869
|
+
SME_refs = "|".join(SME_ids)
|
|
870
870
|
# Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
|
|
871
|
-
if len(
|
|
871
|
+
if len(SME_ids) > 1:
|
|
872
872
|
# Check if all identifications point to the same compound
|
|
873
873
|
unique_cmpds = {
|
|
874
874
|
match["cmpd_uid"]
|
|
875
|
-
for match in
|
|
875
|
+
for match in SME_matches.iter_rows(named=True)
|
|
876
876
|
if match.get("cmpd_uid") is not None
|
|
877
877
|
}
|
|
878
878
|
if len(unique_cmpds) > 1:
|
|
879
|
-
|
|
879
|
+
SME_ambiguity = "1" # Ambiguous identification
|
|
880
880
|
else:
|
|
881
|
-
|
|
881
|
+
SME_ambiguity = "2" # Multiple evidence for same molecule
|
|
882
882
|
else:
|
|
883
|
-
|
|
883
|
+
SME_ambiguity = "null"
|
|
884
884
|
|
|
885
885
|
# Format isotopomer according to mzTab-M specification
|
|
886
886
|
iso_value = row.get("iso_mean", 0)
|
|
@@ -892,8 +892,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
892
892
|
smf_row = [
|
|
893
893
|
"SMF",
|
|
894
894
|
str(idx),
|
|
895
|
-
|
|
896
|
-
|
|
895
|
+
SME_refs,
|
|
896
|
+
SME_ambiguity,
|
|
897
897
|
adduct_list[idx - 1], # adduct_ion
|
|
898
898
|
isotopomer, # isotopomer formatted according to mzTab-M specification
|
|
899
899
|
safe_str(row.get("mz", "null")), # exp_mass_to_charge
|
|
@@ -943,16 +943,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
943
943
|
for line in smf_lines:
|
|
944
944
|
f.write(line + "\n")
|
|
945
945
|
|
|
946
|
-
# ---
|
|
946
|
+
# --- SME (Small Molecule Evidence) table ---
|
|
947
947
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
948
|
-
|
|
948
|
+
SME_lines = []
|
|
949
949
|
# Add comment about spectra_ref being dummy placeholders
|
|
950
|
-
|
|
950
|
+
SME_lines.append(
|
|
951
951
|
"COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
|
|
952
952
|
)
|
|
953
|
-
|
|
954
|
-
"
|
|
955
|
-
"
|
|
953
|
+
SME_header = [
|
|
954
|
+
"SEH",
|
|
955
|
+
"SME_ID",
|
|
956
956
|
"evidence_input_id",
|
|
957
957
|
"database_identifier",
|
|
958
958
|
"chemical_formula",
|
|
@@ -971,9 +971,9 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
971
971
|
"id_confidence_measure[1]",
|
|
972
972
|
"rank",
|
|
973
973
|
]
|
|
974
|
-
|
|
974
|
+
SME_lines.append("\t".join(SME_header))
|
|
975
975
|
|
|
976
|
-
# Create
|
|
976
|
+
# Create SME entries for all identification results using enriched data
|
|
977
977
|
for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
|
|
978
978
|
# Get consensus feature data for this consensus_uid
|
|
979
979
|
consensus_feature_data = self.consensus_df.filter(
|
|
@@ -984,16 +984,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
984
984
|
consensus_row = consensus_feature_data.row(0, named=True)
|
|
985
985
|
|
|
986
986
|
# Get all identification results for this consensus feature from enriched data
|
|
987
|
-
|
|
987
|
+
SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
988
988
|
|
|
989
|
-
if
|
|
989
|
+
if SME_matches.height > 0:
|
|
990
990
|
# Sort by score descending to maintain rank order
|
|
991
|
-
|
|
991
|
+
SME_matches = SME_matches.sort("score", descending=True)
|
|
992
992
|
|
|
993
|
-
for i,
|
|
994
|
-
# Generate unique
|
|
995
|
-
|
|
996
|
-
|
|
993
|
+
for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
|
|
994
|
+
# Generate unique SME_ID
|
|
995
|
+
SME_id_base = consensus_uid * 1000
|
|
996
|
+
SME_id = SME_id_base + i + 1
|
|
997
997
|
|
|
998
998
|
# Create evidence input ID using consensus_uid:mz:rt format
|
|
999
999
|
consensus_mz = consensus_row.get("mz", 0)
|
|
@@ -1002,15 +1002,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1002
1002
|
|
|
1003
1003
|
# Database identifier - use db_id if available, otherwise fallback to cmpd_uid
|
|
1004
1004
|
db_id = "null"
|
|
1005
|
-
if
|
|
1006
|
-
db_id = safe_str(
|
|
1007
|
-
elif
|
|
1008
|
-
db_id = f"cmpd:{
|
|
1005
|
+
if SME_row.get("db_id") is not None and SME_row["db_id"] != "":
|
|
1006
|
+
db_id = safe_str(SME_row["db_id"])
|
|
1007
|
+
elif SME_row.get("cmpd_uid") is not None:
|
|
1008
|
+
db_id = f"cmpd:{SME_row['cmpd_uid']}"
|
|
1009
1009
|
|
|
1010
1010
|
# Get adduct information
|
|
1011
1011
|
adduct_ion = "null"
|
|
1012
|
-
if
|
|
1013
|
-
adduct_ion = safe_str(
|
|
1012
|
+
if SME_row.get("adduct") is not None and SME_row["adduct"] != "":
|
|
1013
|
+
adduct_ion = safe_str(SME_row["adduct"])
|
|
1014
1014
|
# Replace ? with H for better mzTab compatibility
|
|
1015
1015
|
adduct_ion = adduct_ion.replace("?", "H")
|
|
1016
1016
|
|
|
@@ -1019,29 +1019,32 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1019
1019
|
|
|
1020
1020
|
# Identification method
|
|
1021
1021
|
id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
|
|
1022
|
-
if
|
|
1023
|
-
id_method = f"[MS, MS:1002888, {
|
|
1022
|
+
if SME_row.get("matcher") is not None:
|
|
1023
|
+
id_method = f"[MS, MS:1002888, {SME_row['matcher']}, ]"
|
|
1024
1024
|
|
|
1025
|
-
# MS level -
|
|
1026
|
-
|
|
1025
|
+
# MS level - check if ms1 exists in matched
|
|
1026
|
+
if 'ms1' in SME_row['matcher'].lower():
|
|
1027
|
+
ms_level = "[MS, MS:1000511, ms level, 1]"
|
|
1028
|
+
else:
|
|
1029
|
+
ms_level = "[MS,MS:1000511, ms level, 2]"
|
|
1027
1030
|
|
|
1028
1031
|
# Experimental mass-to-charge from consensus feature
|
|
1029
1032
|
exp_mz = safe_str(consensus_mz)
|
|
1030
1033
|
|
|
1031
1034
|
# Theoretical mass-to-charge from lib_df
|
|
1032
1035
|
theoretical_mz = "null"
|
|
1033
|
-
if
|
|
1034
|
-
theoretical_mz = safe_str(
|
|
1036
|
+
if SME_row.get("mz") is not None: # This comes from lib_df via get_id() join
|
|
1037
|
+
theoretical_mz = safe_str(SME_row["mz"])
|
|
1035
1038
|
|
|
1036
|
-
|
|
1037
|
-
"
|
|
1038
|
-
str(
|
|
1039
|
+
SME_line = [
|
|
1040
|
+
"SME",
|
|
1041
|
+
str(SME_id),
|
|
1039
1042
|
evidence_id,
|
|
1040
1043
|
db_id,
|
|
1041
|
-
safe_str(
|
|
1042
|
-
safe_str(
|
|
1043
|
-
safe_str(
|
|
1044
|
-
safe_str(
|
|
1044
|
+
safe_str(SME_row.get("formula", "null")),
|
|
1045
|
+
safe_str(SME_row.get("smiles", "null")),
|
|
1046
|
+
safe_str(SME_row.get("inchi", "null")),
|
|
1047
|
+
safe_str(SME_row.get("name", "null")),
|
|
1045
1048
|
"null", # uri - not available in current data
|
|
1046
1049
|
"null", # derivatized_form
|
|
1047
1050
|
adduct_ion,
|
|
@@ -1053,15 +1056,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1053
1056
|
spectra_ref,
|
|
1054
1057
|
id_method,
|
|
1055
1058
|
ms_level,
|
|
1056
|
-
safe_str(
|
|
1059
|
+
safe_str(SME_row.get("score", "null")),
|
|
1057
1060
|
str(i + 1), # rank within this consensus feature
|
|
1058
1061
|
]
|
|
1059
|
-
|
|
1062
|
+
SME_lines.append("\t".join(SME_line))
|
|
1060
1063
|
|
|
1061
|
-
# Write
|
|
1064
|
+
# Write SME table
|
|
1062
1065
|
with open(filename, "a", encoding="utf-8") as f:
|
|
1063
1066
|
f.write("\n")
|
|
1064
|
-
for line in
|
|
1067
|
+
for line in SME_lines:
|
|
1065
1068
|
f.write(line + "\n")
|
|
1066
1069
|
|
|
1067
1070
|
# --- MGF table ---
|
|
@@ -1125,7 +1128,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1125
1128
|
self.logger.success(f"Exported mzTab-M to {filename}")
|
|
1126
1129
|
|
|
1127
1130
|
|
|
1128
|
-
def
|
|
1131
|
+
def export_excel(self, filename: str | None = None) -> None:
|
|
1129
1132
|
"""
|
|
1130
1133
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
1131
1134
|
|
|
@@ -1390,3 +1393,151 @@ def export_parquet(self, filename: str | None = None) -> None:
|
|
|
1390
1393
|
self.logger.success(f"Study exported to {len(exported_files)} Parquet files.")
|
|
1391
1394
|
else:
|
|
1392
1395
|
self.logger.error("No Parquet files were created - no data available to export")
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
def export_slaw(self, filename="features_slaw.csv"):
|
|
1399
|
+
"""
|
|
1400
|
+
Export the consensus features DataFrame to a SLAW-formatted CSV file.
|
|
1401
|
+
|
|
1402
|
+
This method exports the consensus features to a CSV format compatible with SLAW,
|
|
1403
|
+
including feature metadata and intensity quantification across all samples. The file
|
|
1404
|
+
contains comprehensive feature information including m/z, RT, annotations, isotopic
|
|
1405
|
+
patterns, MS2 data, and intensity values for each sample.
|
|
1406
|
+
|
|
1407
|
+
Parameters:
|
|
1408
|
+
filename (str): The path to the output CSV file. Defaults to 'features_slaw.csv'.
|
|
1409
|
+
|
|
1410
|
+
Side Effects:
|
|
1411
|
+
Writes the exported data to the specified CSV file and logs the export operation.
|
|
1412
|
+
"""
|
|
1413
|
+
if self.consensus_df is None:
|
|
1414
|
+
self.logger.warning("No consensus features found. Cannot export to SLAW format.")
|
|
1415
|
+
return
|
|
1416
|
+
|
|
1417
|
+
# Make filename absolute if not already
|
|
1418
|
+
if not os.path.isabs(filename):
|
|
1419
|
+
if self.folder is not None:
|
|
1420
|
+
filename = os.path.join(self.folder, filename)
|
|
1421
|
+
else:
|
|
1422
|
+
filename = os.path.join(os.getcwd(), filename)
|
|
1423
|
+
|
|
1424
|
+
df = self.consensus_df
|
|
1425
|
+
|
|
1426
|
+
# Get consensus matrix for quantification across samples
|
|
1427
|
+
try:
|
|
1428
|
+
quant_matrix = self.get_consensus_matrix()
|
|
1429
|
+
except Exception as e:
|
|
1430
|
+
self.logger.error(f"Error getting consensus matrix: {e}")
|
|
1431
|
+
return
|
|
1432
|
+
|
|
1433
|
+
# Evaluate the charge column
|
|
1434
|
+
if "charge_mean" in df.columns:
|
|
1435
|
+
charge_series = df.select(
|
|
1436
|
+
pl.when(pl.col("charge_mean") == 0)
|
|
1437
|
+
.then(1 if self.polarity == "positive" else -1)
|
|
1438
|
+
.otherwise(pl.col("charge_mean"))
|
|
1439
|
+
.alias("charge")
|
|
1440
|
+
).get_column("charge")
|
|
1441
|
+
else:
|
|
1442
|
+
charge_series = pl.Series([1 if self.polarity == "positive" else -1] * len(df))
|
|
1443
|
+
|
|
1444
|
+
# Evaluate the group column (from adduct_group_top)
|
|
1445
|
+
# Features with adduct_group_top == 0 should each get a unique group index
|
|
1446
|
+
if "adduct_group_top" in df.columns:
|
|
1447
|
+
max_adduct_group = df.get_column("adduct_group_top").max()
|
|
1448
|
+
if max_adduct_group is None:
|
|
1449
|
+
max_adduct_group = 0
|
|
1450
|
+
|
|
1451
|
+
group_series = df.select(
|
|
1452
|
+
pl.when(pl.col("adduct_group_top") == 0)
|
|
1453
|
+
.then(max_adduct_group + 1 + pl.int_range(pl.len()).over(pl.col("adduct_group_top") == 0))
|
|
1454
|
+
.otherwise(pl.col("adduct_group_top"))
|
|
1455
|
+
.alias("group")
|
|
1456
|
+
).get_column("group")
|
|
1457
|
+
else:
|
|
1458
|
+
group_series = pl.Series([None] * len(df))
|
|
1459
|
+
|
|
1460
|
+
# Evaluate the annotation column (adduct + isotope info)
|
|
1461
|
+
if "adduct_top" in df.columns and "iso_mean" in df.columns:
|
|
1462
|
+
annotation_series = df.select(
|
|
1463
|
+
pl.when(pl.col("iso_mean") == 0)
|
|
1464
|
+
.then(pl.col("adduct_top").str.replace(r"\?", "H"))
|
|
1465
|
+
.otherwise(pl.col("adduct_top").str.replace(r"\?", "H") + " +" + pl.col("iso_mean").cast(pl.Int64).cast(pl.Utf8))
|
|
1466
|
+
.alias("annotation")
|
|
1467
|
+
).get_column("annotation")
|
|
1468
|
+
elif "adduct_top" in df.columns:
|
|
1469
|
+
annotation_series = df.get_column("adduct_top").str.replace(r"\?", "H")
|
|
1470
|
+
else:
|
|
1471
|
+
annotation_series = pl.Series([""] * len(df))
|
|
1472
|
+
|
|
1473
|
+
# Get sample columns from quant_matrix (excluding consensus_uid)
|
|
1474
|
+
sample_columns = [col for col in quant_matrix.columns if col != "consensus_uid"]
|
|
1475
|
+
|
|
1476
|
+
# Create SLAW columns with appropriate mappings from consensus_df
|
|
1477
|
+
slaw_data = {
|
|
1478
|
+
"feature_id": df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
|
|
1479
|
+
"mz": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
|
|
1480
|
+
"rt": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1481
|
+
"group": group_series,
|
|
1482
|
+
"annotation": annotation_series,
|
|
1483
|
+
"neutral_mass": df.get_column("adduct_neutral_mass_top") if "adduct_neutral_mass_top" in df.columns else pl.Series([None] * len(df)),
|
|
1484
|
+
"charge": charge_series,
|
|
1485
|
+
"main_id": df.get_column("main_id") if "main_id" in df.columns else df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
|
|
1486
|
+
"ion": df.get_column("adduct_top").str.replace(r"\?", "H") if "adduct_top" in df.columns else pl.Series([""] * len(df)),
|
|
1487
|
+
"iso": df.get_column("iso_mean").cast(pl.Int64) if "iso_mean" in df.columns else pl.Series([0] * len(df)),
|
|
1488
|
+
"clique": df.get_column("clique") if "clique" in df.columns else pl.Series([None] * len(df)),
|
|
1489
|
+
"num_detection": df.get_column("num_detection") if "num_detection" in df.columns else pl.Series([1] * len(df)),
|
|
1490
|
+
"total_detection": df.get_column("total_detection") if "total_detection" in df.columns else pl.Series([1] * len(df)),
|
|
1491
|
+
"mz_mean": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
|
|
1492
|
+
"mz_min": df.get_column("mz_min") if "mz_min" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
|
|
1493
|
+
"mz_max": df.get_column("mz_max") if "mz_max" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
|
|
1494
|
+
"rt_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1495
|
+
"rt_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1496
|
+
"rt_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1497
|
+
"rt_cor_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1498
|
+
"rt_cor_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1499
|
+
"rt_cor_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
|
|
1500
|
+
"height_mean": df.get_column("height_mean") if "height_mean" in df.columns else pl.Series([None] * len(df)),
|
|
1501
|
+
"height_min": df.get_column("height_min") if "height_min" in df.columns else pl.Series([None] * len(df)),
|
|
1502
|
+
"height_max": df.get_column("height_max") if "height_max" in df.columns else pl.Series([None] * len(df)),
|
|
1503
|
+
"intensity_mean": df.get_column("inty_mean") if "inty_mean" in df.columns else pl.Series([None] * len(df)),
|
|
1504
|
+
"intensity_min": df.get_column("inty_min") if "inty_min" in df.columns else pl.Series([None] * len(df)),
|
|
1505
|
+
"intensity_max": df.get_column("inty_max") if "inty_max" in df.columns else pl.Series([None] * len(df)),
|
|
1506
|
+
"SN_mean": df.get_column("sn_mean") if "sn_mean" in df.columns else pl.Series([None] * len(df)),
|
|
1507
|
+
"SN_min": df.get_column("sn_min") if "sn_min" in df.columns else pl.Series([None] * len(df)),
|
|
1508
|
+
"SN_max": df.get_column("sn_max") if "sn_max" in df.columns else pl.Series([None] * len(df)),
|
|
1509
|
+
"peakwidth_mean": df.get_column("fwhm_mean") if "fwhm_mean" in df.columns else pl.Series([None] * len(df)),
|
|
1510
|
+
"peakwidth_min": df.get_column("fwhm_min") if "fwhm_min" in df.columns else pl.Series([None] * len(df)),
|
|
1511
|
+
"peakwidth_max": df.get_column("fwhm_max") if "fwhm_max" in df.columns else pl.Series([None] * len(df)),
|
|
1512
|
+
"ms2_mgf_id": pl.Series([""] * len(df)), # Not available in study
|
|
1513
|
+
"ms2_num_fused": pl.Series([None] * len(df)), # Not available in study
|
|
1514
|
+
"ms2_source": pl.Series([""] * len(df)), # Not available in study
|
|
1515
|
+
"isotopic_pattern_annot": pl.Series([""] * len(df)), # Not available in study
|
|
1516
|
+
"isotopic_pattern_rel": pl.Series([""] * len(df)), # Not available in study
|
|
1517
|
+
"isotopic_pattern_abs": pl.Series([""] * len(df)), # Not available in study
|
|
1518
|
+
}
|
|
1519
|
+
|
|
1520
|
+
# Add quantification columns for each sample
|
|
1521
|
+
for sample_col in sample_columns:
|
|
1522
|
+
quant_column_name = f"quant_{sample_col}"
|
|
1523
|
+
# Join with quant_matrix to get values for this sample
|
|
1524
|
+
sample_values = quant_matrix.join(
|
|
1525
|
+
df.select("consensus_uid"),
|
|
1526
|
+
on="consensus_uid",
|
|
1527
|
+
how="right"
|
|
1528
|
+
).get_column(sample_col)
|
|
1529
|
+
slaw_data[quant_column_name] = sample_values
|
|
1530
|
+
|
|
1531
|
+
# Create the polars DataFrame
|
|
1532
|
+
slaw_df = pl.DataFrame(slaw_data)
|
|
1533
|
+
|
|
1534
|
+
# Convert to pandas for CSV export
|
|
1535
|
+
pandas_df = slaw_df.to_pandas()
|
|
1536
|
+
|
|
1537
|
+
# Export to CSV with comma separator - only quote when necessary (QUOTE_MINIMAL)
|
|
1538
|
+
try:
|
|
1539
|
+
pandas_df.to_csv(filename, sep=',', index=False, quoting=0) # quoting=0 means QUOTE_MINIMAL
|
|
1540
|
+
self.logger.success(f"Features exported to {filename} (SLAW format)")
|
|
1541
|
+
self.logger.debug(f"Exported {len(slaw_df)} features with {len(slaw_df.columns)} columns")
|
|
1542
|
+
except PermissionError:
|
|
1543
|
+
self.logger.error(f"Permission denied: Cannot write to {filename}. The file may be open in another program. Please close it and try again.")
|
masster/study/id.py
CHANGED
|
@@ -24,7 +24,8 @@ def lib_load(
|
|
|
24
24
|
lib_source: either a CSV/JSON file path (str) or a Lib instance
|
|
25
25
|
polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
|
|
26
26
|
If None, uses study.polarity automatically.
|
|
27
|
-
adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
|
|
27
|
+
adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
|
|
28
|
+
If None, uses study.parameters.adducts if available.
|
|
28
29
|
iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
|
|
29
30
|
|
|
30
31
|
Side effects:
|
|
@@ -51,6 +52,18 @@ def lib_load(
|
|
|
51
52
|
else:
|
|
52
53
|
polarity = "positive" # Default fallback
|
|
53
54
|
study.logger.debug(f"Using study polarity: {polarity}")
|
|
55
|
+
|
|
56
|
+
# Use study.parameters.adducts if adducts not explicitly provided
|
|
57
|
+
# If study.parameters.adducts is also None, lib will use its default adducts for the polarity
|
|
58
|
+
if adducts is None:
|
|
59
|
+
if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
|
|
60
|
+
adducts = study.parameters.adducts
|
|
61
|
+
if adducts:
|
|
62
|
+
study.logger.debug(f"Using study.parameters.adducts: {adducts}")
|
|
63
|
+
else:
|
|
64
|
+
study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
|
|
65
|
+
else:
|
|
66
|
+
study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")
|
|
54
67
|
|
|
55
68
|
# Handle string input (CSV or JSON file path)
|
|
56
69
|
if isinstance(lib_source, str):
|
|
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
|
|
|
403
416
|
"""
|
|
404
417
|
Find library matches using optimized vectorized operations.
|
|
405
418
|
|
|
406
|
-
|
|
419
|
+
Automatically skips RT filtering if library has no RT data for the matched entries.
|
|
407
420
|
"""
|
|
408
421
|
# Filter by m/z tolerance using vectorized operations
|
|
409
422
|
matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
|
|
410
423
|
|
|
411
424
|
initial_match_count = len(matches)
|
|
412
425
|
|
|
413
|
-
# Apply RT filter if
|
|
426
|
+
# Apply RT filter if requested AND if data is available
|
|
427
|
+
# Strategy: Handle mixed RT/no-RT entries properly by treating them separately
|
|
414
428
|
if rt_tol is not None and cons_rt is not None and not matches.is_empty():
|
|
415
|
-
#
|
|
429
|
+
# Separate entries with and without RT data
|
|
416
430
|
rt_candidates = matches.filter(pl.col("rt").is_not_null())
|
|
431
|
+
no_rt_entries = matches.filter(pl.col("rt").is_null())
|
|
417
432
|
|
|
418
433
|
if not rt_candidates.is_empty():
|
|
419
434
|
# Apply RT filtering to candidates with RT data
|
|
420
435
|
rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
|
|
421
436
|
|
|
422
|
-
|
|
437
|
+
# Combine RT-filtered matches with entries that have no RT data
|
|
438
|
+
# Rationale: Entries without RT can't be filtered by RT, so include them
|
|
439
|
+
if not rt_matches.is_empty() and not no_rt_entries.is_empty():
|
|
440
|
+
# Both RT matches and no-RT entries exist
|
|
441
|
+
matches = pl.concat([rt_matches, no_rt_entries])
|
|
442
|
+
if logger:
|
|
443
|
+
logger.debug(
|
|
444
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
|
|
445
|
+
f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
|
|
446
|
+
)
|
|
447
|
+
elif not rt_matches.is_empty():
|
|
448
|
+
# Only RT matches, no entries without RT
|
|
423
449
|
matches = rt_matches
|
|
424
450
|
if logger:
|
|
425
451
|
logger.debug(
|
|
426
|
-
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT,
|
|
452
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
|
|
453
|
+
f"{len(matches)} passed RT filter"
|
|
454
|
+
)
|
|
455
|
+
elif not no_rt_entries.is_empty():
|
|
456
|
+
# No RT matches passed filter, but there are entries without RT
|
|
457
|
+
matches = no_rt_entries
|
|
458
|
+
if logger:
|
|
459
|
+
logger.debug(
|
|
460
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
|
|
461
|
+
f"using {len(matches)} entries with no RT data"
|
|
427
462
|
)
|
|
428
463
|
else:
|
|
429
|
-
#
|
|
430
|
-
matches =
|
|
464
|
+
# No RT matches and no entries without RT - return empty
|
|
465
|
+
matches = pl.DataFrame()
|
|
431
466
|
if logger:
|
|
432
467
|
logger.debug(
|
|
433
468
|
f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
|
|
434
469
|
)
|
|
435
470
|
else:
|
|
436
|
-
#
|
|
471
|
+
# All m/z matches have no RT data - keep all m/z matches
|
|
437
472
|
if logger:
|
|
438
473
|
logger.debug(
|
|
439
|
-
f"Consensus {cons_uid}: {initial_match_count} m/z matches
|
|
474
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
|
|
440
475
|
)
|
|
441
|
-
matches
|
|
476
|
+
# matches already contains the m/z-filtered results (which are all no_rt_entries)
|
|
442
477
|
|
|
443
478
|
# FIX 1: Add stricter m/z validation - prioritize more accurate matches
|
|
444
479
|
if not matches.is_empty():
|
|
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
|
|
|
884
919
|
effective_mz_tol = getattr(params, "mz_tol", 0.01)
|
|
885
920
|
effective_rt_tol = getattr(params, "rt_tol", 2.0)
|
|
886
921
|
|
|
922
|
+
# Check if library has RT data - if not, disable RT filtering
|
|
923
|
+
if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
|
|
924
|
+
if "rt" in study.lib_df.columns:
|
|
925
|
+
# Check if library has any non-null RT values
|
|
926
|
+
rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
|
|
927
|
+
if rt_count == 0:
|
|
928
|
+
if logger:
|
|
929
|
+
logger.info(
|
|
930
|
+
f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
|
|
931
|
+
)
|
|
932
|
+
effective_rt_tol = None
|
|
933
|
+
|
|
887
934
|
if logger:
|
|
888
935
|
logger.debug(
|
|
889
936
|
f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
|
|
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
|
|
|
1483
1530
|
if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
|
|
1484
1531
|
components = [spec] * multiplier
|
|
1485
1532
|
formatted_name = _format_adduct_name(components)
|
|
1486
|
-
probability_multiplied = float(spec["probability"]) ** multiplier
|
|
1533
|
+
probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0
|
|
1487
1534
|
|
|
1488
1535
|
combinations_list.append(
|
|
1489
1536
|
{
|