masster: masster-0.5.28-py3-none-any.whl → masster-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +316 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +18 -3
- masster/sample/sample5_schema.json +99 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/export.py +62 -62
- masster/study/id.py +59 -12
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +3 -1
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +11 -12
- {masster-0.5.28.dist-info → masster-0.6.1.dist-info}/METADATA +17 -18
- {masster-0.5.28.dist-info → masster-0.6.1.dist-info}/RECORD +28 -27
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.28.dist-info → masster-0.6.1.dist-info}/WHEEL +0 -0
- {masster-0.5.28.dist-info → masster-0.6.1.dist-info}/entry_points.txt +0 -0
- {masster-0.5.28.dist-info → masster-0.6.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/sample.py
CHANGED
|
@@ -129,6 +129,12 @@ from masster.sample.helpers import get_eic
|
|
|
129
129
|
from masster.sample.helpers import set_source
|
|
130
130
|
from masster.sample.helpers import _recreate_feature_map
|
|
131
131
|
from masster.sample.helpers import _get_feature_map
|
|
132
|
+
from masster.sample.id import lib_load
|
|
133
|
+
from masster.sample.id import identify
|
|
134
|
+
from masster.sample.id import get_id
|
|
135
|
+
from masster.sample.id import id_reset
|
|
136
|
+
from masster.sample.id import lib_reset
|
|
137
|
+
from masster.sample.importers import import_oracle
|
|
132
138
|
from masster.sample.load import chrom_extract
|
|
133
139
|
from masster.sample.load import _index_file
|
|
134
140
|
from masster.sample.load import load
|
|
@@ -259,9 +265,10 @@ class Sample:
|
|
|
259
265
|
# the polars data frame with MS1 level data
|
|
260
266
|
self.ms1_df = pl.DataFrame()
|
|
261
267
|
|
|
262
|
-
#
|
|
263
|
-
self.
|
|
264
|
-
self.
|
|
268
|
+
# identification DataFrames (lib_df and id_df)
|
|
269
|
+
self.lib_df = None # library DataFrame (from masster.lib or CSV/JSON)
|
|
270
|
+
self.id_df = None # identification results DataFrame
|
|
271
|
+
self._lib = None # reference to Lib object if loaded
|
|
265
272
|
self.chrom_df = None
|
|
266
273
|
|
|
267
274
|
if params.filename is not None:
|
|
@@ -292,6 +299,14 @@ class Sample:
|
|
|
292
299
|
update_parameters = update_parameters
|
|
293
300
|
get_parameters_property = get_parameters_property
|
|
294
301
|
set_parameters_property = set_parameters_property
|
|
302
|
+
# Identification methods from id.py
|
|
303
|
+
lib_load = lib_load
|
|
304
|
+
identify = identify
|
|
305
|
+
get_id = get_id
|
|
306
|
+
id_reset = id_reset
|
|
307
|
+
lib_reset = lib_reset
|
|
308
|
+
# Importers from importers.py
|
|
309
|
+
import_oracle = import_oracle
|
|
295
310
|
export_features = export_features
|
|
296
311
|
export_xlsx = export_xlsx
|
|
297
312
|
export_mgf = export_mgf
|
|
masster/sample/sample5_schema.json
CHANGED
|
@@ -93,10 +93,108 @@
|
|
|
93
93
|
},
|
|
94
94
|
"ms1_spec": {
|
|
95
95
|
"dtype": "pl.Object"
|
|
96
|
+
},
|
|
97
|
+
"id_top_name": {
|
|
98
|
+
"dtype": "pl.Utf8"
|
|
99
|
+
},
|
|
100
|
+
"id_top_class": {
|
|
101
|
+
"dtype": "pl.Utf8"
|
|
102
|
+
},
|
|
103
|
+
"id_top_adduct": {
|
|
104
|
+
"dtype": "pl.Utf8"
|
|
105
|
+
},
|
|
106
|
+
"id_top_score": {
|
|
107
|
+
"dtype": "pl.Float64"
|
|
108
|
+
},
|
|
109
|
+
"id_source": {
|
|
110
|
+
"dtype": "pl.Utf8"
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
"lib_df": {
|
|
115
|
+
"columns": {
|
|
116
|
+
"lib_uid": {
|
|
117
|
+
"dtype": "pl.Int64"
|
|
118
|
+
},
|
|
119
|
+
"cmpd_uid": {
|
|
120
|
+
"dtype": "pl.Int64"
|
|
121
|
+
},
|
|
122
|
+
"name": {
|
|
123
|
+
"dtype": "pl.Utf8"
|
|
124
|
+
},
|
|
125
|
+
"shortname": {
|
|
126
|
+
"dtype": "pl.Utf8"
|
|
127
|
+
},
|
|
128
|
+
"class": {
|
|
129
|
+
"dtype": "pl.Utf8"
|
|
130
|
+
},
|
|
131
|
+
"formula": {
|
|
132
|
+
"dtype": "pl.Utf8"
|
|
133
|
+
},
|
|
134
|
+
"iso": {
|
|
135
|
+
"dtype": "pl.Int64"
|
|
136
|
+
},
|
|
137
|
+
"smiles": {
|
|
138
|
+
"dtype": "pl.Utf8"
|
|
139
|
+
},
|
|
140
|
+
"inchi": {
|
|
141
|
+
"dtype": "pl.Utf8"
|
|
142
|
+
},
|
|
143
|
+
"inchikey": {
|
|
144
|
+
"dtype": "pl.Utf8"
|
|
145
|
+
},
|
|
146
|
+
"adduct": {
|
|
147
|
+
"dtype": "pl.Utf8"
|
|
148
|
+
},
|
|
149
|
+
"z": {
|
|
150
|
+
"dtype": "pl.Int64"
|
|
151
|
+
},
|
|
152
|
+
"m": {
|
|
153
|
+
"dtype": "pl.Float64"
|
|
154
|
+
},
|
|
155
|
+
"mz": {
|
|
156
|
+
"dtype": "pl.Float64"
|
|
157
|
+
},
|
|
158
|
+
"rt": {
|
|
159
|
+
"dtype": "pl.Float64"
|
|
160
|
+
},
|
|
161
|
+
"quant_group": {
|
|
162
|
+
"dtype": "pl.Int64"
|
|
163
|
+
},
|
|
164
|
+
"probability": {
|
|
165
|
+
"dtype": "pl.Float64"
|
|
166
|
+
},
|
|
167
|
+
"source_id": {
|
|
168
|
+
"dtype": "pl.Utf8"
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
},
|
|
172
|
+
"id_df": {
|
|
173
|
+
"columns": {
|
|
174
|
+
"feature_uid": {
|
|
175
|
+
"dtype": "pl.Int64"
|
|
176
|
+
},
|
|
177
|
+
"lib_uid": {
|
|
178
|
+
"dtype": "pl.Int64"
|
|
179
|
+
},
|
|
180
|
+
"mz_delta": {
|
|
181
|
+
"dtype": "pl.Float64"
|
|
182
|
+
},
|
|
183
|
+
"rt_delta": {
|
|
184
|
+
"dtype": "pl.Float64"
|
|
185
|
+
},
|
|
186
|
+
"matcher": {
|
|
187
|
+
"dtype": "pl.Utf8"
|
|
188
|
+
},
|
|
189
|
+
"score": {
|
|
190
|
+
"dtype": "pl.Float64"
|
|
191
|
+
},
|
|
192
|
+
"iso": {
|
|
193
|
+
"dtype": "pl.Int64"
|
|
96
194
|
}
|
|
97
195
|
}
|
|
98
196
|
},
|
|
99
|
-
"generated_date": "2025-
|
|
197
|
+
"generated_date": "2025-10-30",
|
|
100
198
|
"ms1_df": {
|
|
101
199
|
"columns": {
|
|
102
200
|
"cycle": {
|
|
masster/study/defaults/study_def.py
CHANGED
|
@@ -96,19 +96,15 @@ class study_defaults:
|
|
|
96
96
|
"adducts": {
|
|
97
97
|
"dtype": "list[str]",
|
|
98
98
|
"description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
|
|
99
|
-
"default": ["H
|
|
99
|
+
"default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
|
|
100
100
|
"examples": {
|
|
101
|
-
"positive": ["H
|
|
102
|
-
"negative": [
|
|
103
|
-
"H-1:-:0.95",
|
|
104
|
-
"Cl:-:0.05",
|
|
105
|
-
"CH2O2:0:0.2",
|
|
106
|
-
"H-2-O:0:0.2",
|
|
107
|
-
],
|
|
101
|
+
"positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
|
|
102
|
+
"negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
|
|
108
103
|
},
|
|
109
104
|
"validation_rules": [
|
|
110
|
-
"Format:
|
|
111
|
-
"
|
|
105
|
+
"Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
|
|
106
|
+
"Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
|
|
107
|
+
"Charge must be an integer (positive, negative, or 0 for neutral)",
|
|
112
108
|
"Probability must be between 0.0 and 1.0",
|
|
113
109
|
"Sum of all charged adduct probabilities must equal 1.0",
|
|
114
110
|
],
|
|
@@ -128,7 +124,7 @@ class study_defaults:
|
|
|
128
124
|
"""Set polarity-specific defaults for adducts if not explicitly provided."""
|
|
129
125
|
# If adducts is None, set based on polarity
|
|
130
126
|
if self.adducts is None:
|
|
131
|
-
if self.polarity.lower() in ["positive", "pos"]:
|
|
127
|
+
if self.polarity.lower() in ["positive", "pos", "+"]:
|
|
132
128
|
self.adducts = [
|
|
133
129
|
"+H:1:0.65",
|
|
134
130
|
"+Na:1:0.15",
|
|
@@ -136,7 +132,7 @@ class study_defaults:
|
|
|
136
132
|
"+K:1:0.05",
|
|
137
133
|
"-H2O:0:0.15",
|
|
138
134
|
]
|
|
139
|
-
elif self.polarity.lower() in ["negative", "neg"]:
|
|
135
|
+
elif self.polarity.lower() in ["negative", "neg", "-"]:
|
|
140
136
|
self.adducts = [
|
|
141
137
|
"-H:-1:0.9",
|
|
142
138
|
"+Cl:-1:0.1",
|
masster/study/export.py
CHANGED
|
@@ -524,7 +524,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
524
524
|
# Import here to avoid circular imports
|
|
525
525
|
from masster.study.id import get_id
|
|
526
526
|
|
|
527
|
-
# Get full enriched identification data for
|
|
527
|
+
# Get full enriched identification data for SME section
|
|
528
528
|
full_id_data = get_id(self)
|
|
529
529
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
530
530
|
# Get top scoring identification for each consensus_uid for SML section
|
|
@@ -828,8 +828,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
828
828
|
smf_header = [
|
|
829
829
|
"SFH",
|
|
830
830
|
"SMF_ID",
|
|
831
|
-
"
|
|
832
|
-
"
|
|
831
|
+
"SME_ID_REFS",
|
|
832
|
+
"SME_ID_REF_ambiguity_code",
|
|
833
833
|
"adduct_ion",
|
|
834
834
|
"isotopomer",
|
|
835
835
|
"exp_mass_to_charge",
|
|
@@ -847,40 +847,40 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
847
847
|
|
|
848
848
|
# SMF table uses the same consensus features as SML, just different metadata
|
|
849
849
|
for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
|
|
850
|
-
# References to
|
|
851
|
-
|
|
852
|
-
|
|
850
|
+
# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
|
|
851
|
+
SME_refs = "null"
|
|
852
|
+
SME_ambiguity = "null"
|
|
853
853
|
consensus_uid = row["consensus_uid"]
|
|
854
854
|
|
|
855
855
|
if full_id_data is not None:
|
|
856
|
-
# Find all
|
|
857
|
-
|
|
858
|
-
if
|
|
859
|
-
# Generate
|
|
856
|
+
# Find all SME entries for this consensus_uid
|
|
857
|
+
SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
858
|
+
if SME_matches.height > 0:
|
|
859
|
+
# Generate SME IDs - we'll create a mapping in the SME section
|
|
860
860
|
# For now, use a simple approach based on consensus_uid and lib_uid
|
|
861
|
-
|
|
862
|
-
for i,
|
|
863
|
-
# Create a unique
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
if
|
|
869
|
-
|
|
861
|
+
SME_ids = []
|
|
862
|
+
for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
|
|
863
|
+
# Create a unique SME ID based on consensus_uid and position
|
|
864
|
+
SME_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
|
|
865
|
+
SME_id = SME_id_base + i + 1
|
|
866
|
+
SME_ids.append(str(SME_id))
|
|
867
|
+
|
|
868
|
+
if SME_ids:
|
|
869
|
+
SME_refs = "|".join(SME_ids)
|
|
870
870
|
# Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
|
|
871
|
-
if len(
|
|
871
|
+
if len(SME_ids) > 1:
|
|
872
872
|
# Check if all identifications point to the same compound
|
|
873
873
|
unique_cmpds = {
|
|
874
874
|
match["cmpd_uid"]
|
|
875
|
-
for match in
|
|
875
|
+
for match in SME_matches.iter_rows(named=True)
|
|
876
876
|
if match.get("cmpd_uid") is not None
|
|
877
877
|
}
|
|
878
878
|
if len(unique_cmpds) > 1:
|
|
879
|
-
|
|
879
|
+
SME_ambiguity = "1" # Ambiguous identification
|
|
880
880
|
else:
|
|
881
|
-
|
|
881
|
+
SME_ambiguity = "2" # Multiple evidence for same molecule
|
|
882
882
|
else:
|
|
883
|
-
|
|
883
|
+
SME_ambiguity = "null"
|
|
884
884
|
|
|
885
885
|
# Format isotopomer according to mzTab-M specification
|
|
886
886
|
iso_value = row.get("iso_mean", 0)
|
|
@@ -892,8 +892,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
892
892
|
smf_row = [
|
|
893
893
|
"SMF",
|
|
894
894
|
str(idx),
|
|
895
|
-
|
|
896
|
-
|
|
895
|
+
SME_refs,
|
|
896
|
+
SME_ambiguity,
|
|
897
897
|
adduct_list[idx - 1], # adduct_ion
|
|
898
898
|
isotopomer, # isotopomer formatted according to mzTab-M specification
|
|
899
899
|
safe_str(row.get("mz", "null")), # exp_mass_to_charge
|
|
@@ -943,16 +943,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
943
943
|
for line in smf_lines:
|
|
944
944
|
f.write(line + "\n")
|
|
945
945
|
|
|
946
|
-
# ---
|
|
946
|
+
# --- SME (Small Molecule Evidence) table ---
|
|
947
947
|
if full_id_data is not None and not full_id_data.is_empty():
|
|
948
|
-
|
|
948
|
+
SME_lines = []
|
|
949
949
|
# Add comment about spectra_ref being dummy placeholders
|
|
950
|
-
|
|
950
|
+
SME_lines.append(
|
|
951
951
|
"COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
|
|
952
952
|
)
|
|
953
|
-
|
|
954
|
-
"
|
|
955
|
-
"
|
|
953
|
+
SME_header = [
|
|
954
|
+
"SEH",
|
|
955
|
+
"SME_ID",
|
|
956
956
|
"evidence_input_id",
|
|
957
957
|
"database_identifier",
|
|
958
958
|
"chemical_formula",
|
|
@@ -971,9 +971,9 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
971
971
|
"id_confidence_measure[1]",
|
|
972
972
|
"rank",
|
|
973
973
|
]
|
|
974
|
-
|
|
974
|
+
SME_lines.append("\t".join(SME_header))
|
|
975
975
|
|
|
976
|
-
# Create
|
|
976
|
+
# Create SME entries for all identification results using enriched data
|
|
977
977
|
for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
|
|
978
978
|
# Get consensus feature data for this consensus_uid
|
|
979
979
|
consensus_feature_data = self.consensus_df.filter(
|
|
@@ -984,16 +984,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
984
984
|
consensus_row = consensus_feature_data.row(0, named=True)
|
|
985
985
|
|
|
986
986
|
# Get all identification results for this consensus feature from enriched data
|
|
987
|
-
|
|
987
|
+
SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
|
|
988
988
|
|
|
989
|
-
if
|
|
989
|
+
if SME_matches.height > 0:
|
|
990
990
|
# Sort by score descending to maintain rank order
|
|
991
|
-
|
|
991
|
+
SME_matches = SME_matches.sort("score", descending=True)
|
|
992
992
|
|
|
993
|
-
for i,
|
|
994
|
-
# Generate unique
|
|
995
|
-
|
|
996
|
-
|
|
993
|
+
for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
|
|
994
|
+
# Generate unique SME_ID
|
|
995
|
+
SME_id_base = consensus_uid * 1000
|
|
996
|
+
SME_id = SME_id_base + i + 1
|
|
997
997
|
|
|
998
998
|
# Create evidence input ID using consensus_uid:mz:rt format
|
|
999
999
|
consensus_mz = consensus_row.get("mz", 0)
|
|
@@ -1002,15 +1002,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1002
1002
|
|
|
1003
1003
|
# Database identifier - use db_id if available, otherwise fallback to cmpd_uid
|
|
1004
1004
|
db_id = "null"
|
|
1005
|
-
if
|
|
1006
|
-
db_id = safe_str(
|
|
1007
|
-
elif
|
|
1008
|
-
db_id = f"cmpd:{
|
|
1005
|
+
if SME_row.get("db_id") is not None and SME_row["db_id"] != "":
|
|
1006
|
+
db_id = safe_str(SME_row["db_id"])
|
|
1007
|
+
elif SME_row.get("cmpd_uid") is not None:
|
|
1008
|
+
db_id = f"cmpd:{SME_row['cmpd_uid']}"
|
|
1009
1009
|
|
|
1010
1010
|
# Get adduct information
|
|
1011
1011
|
adduct_ion = "null"
|
|
1012
|
-
if
|
|
1013
|
-
adduct_ion = safe_str(
|
|
1012
|
+
if SME_row.get("adduct") is not None and SME_row["adduct"] != "":
|
|
1013
|
+
adduct_ion = safe_str(SME_row["adduct"])
|
|
1014
1014
|
# Replace ? with H for better mzTab compatibility
|
|
1015
1015
|
adduct_ion = adduct_ion.replace("?", "H")
|
|
1016
1016
|
|
|
@@ -1019,8 +1019,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1019
1019
|
|
|
1020
1020
|
# Identification method
|
|
1021
1021
|
id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
|
|
1022
|
-
if
|
|
1023
|
-
id_method = f"[MS, MS:1002888, {
|
|
1022
|
+
if SME_row.get("matcher") is not None:
|
|
1023
|
+
id_method = f"[MS, MS:1002888, {SME_row['matcher']}, ]"
|
|
1024
1024
|
|
|
1025
1025
|
# MS level - assume MS1 for now
|
|
1026
1026
|
ms_level = "[MS, MS:1000511, ms level, 1]"
|
|
@@ -1030,18 +1030,18 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1030
1030
|
|
|
1031
1031
|
# Theoretical mass-to-charge from lib_df
|
|
1032
1032
|
theoretical_mz = "null"
|
|
1033
|
-
if
|
|
1034
|
-
theoretical_mz = safe_str(
|
|
1033
|
+
if SME_row.get("mz") is not None: # This comes from lib_df via get_id() join
|
|
1034
|
+
theoretical_mz = safe_str(SME_row["mz"])
|
|
1035
1035
|
|
|
1036
|
-
|
|
1037
|
-
"
|
|
1038
|
-
str(
|
|
1036
|
+
SME_line = [
|
|
1037
|
+
"SME",
|
|
1038
|
+
str(SME_id),
|
|
1039
1039
|
evidence_id,
|
|
1040
1040
|
db_id,
|
|
1041
|
-
safe_str(
|
|
1042
|
-
safe_str(
|
|
1043
|
-
safe_str(
|
|
1044
|
-
safe_str(
|
|
1041
|
+
safe_str(SME_row.get("formula", "null")),
|
|
1042
|
+
safe_str(SME_row.get("smiles", "null")),
|
|
1043
|
+
safe_str(SME_row.get("inchi", "null")),
|
|
1044
|
+
safe_str(SME_row.get("name", "null")),
|
|
1045
1045
|
"null", # uri - not available in current data
|
|
1046
1046
|
"null", # derivatized_form
|
|
1047
1047
|
adduct_ion,
|
|
@@ -1053,15 +1053,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
1053
1053
|
spectra_ref,
|
|
1054
1054
|
id_method,
|
|
1055
1055
|
ms_level,
|
|
1056
|
-
safe_str(
|
|
1056
|
+
safe_str(SME_row.get("score", "null")),
|
|
1057
1057
|
str(i + 1), # rank within this consensus feature
|
|
1058
1058
|
]
|
|
1059
|
-
|
|
1059
|
+
SME_lines.append("\t".join(SME_line))
|
|
1060
1060
|
|
|
1061
|
-
# Write
|
|
1061
|
+
# Write SME table
|
|
1062
1062
|
with open(filename, "a", encoding="utf-8") as f:
|
|
1063
1063
|
f.write("\n")
|
|
1064
|
-
for line in
|
|
1064
|
+
for line in SME_lines:
|
|
1065
1065
|
f.write(line + "\n")
|
|
1066
1066
|
|
|
1067
1067
|
# --- MGF table ---
|
masster/study/id.py
CHANGED
|
@@ -24,7 +24,8 @@ def lib_load(
|
|
|
24
24
|
lib_source: either a CSV/JSON file path (str) or a Lib instance
|
|
25
25
|
polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
|
|
26
26
|
If None, uses study.polarity automatically.
|
|
27
|
-
adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
|
|
27
|
+
adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
|
|
28
|
+
If None, uses study.parameters.adducts if available.
|
|
28
29
|
iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
|
|
29
30
|
|
|
30
31
|
Side effects:
|
|
@@ -51,6 +52,18 @@ def lib_load(
|
|
|
51
52
|
else:
|
|
52
53
|
polarity = "positive" # Default fallback
|
|
53
54
|
study.logger.debug(f"Using study polarity: {polarity}")
|
|
55
|
+
|
|
56
|
+
# Use study.parameters.adducts if adducts not explicitly provided
|
|
57
|
+
# If study.parameters.adducts is also None, lib will use its default adducts for the polarity
|
|
58
|
+
if adducts is None:
|
|
59
|
+
if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
|
|
60
|
+
adducts = study.parameters.adducts
|
|
61
|
+
if adducts:
|
|
62
|
+
study.logger.debug(f"Using study.parameters.adducts: {adducts}")
|
|
63
|
+
else:
|
|
64
|
+
study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
|
|
65
|
+
else:
|
|
66
|
+
study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")
|
|
54
67
|
|
|
55
68
|
# Handle string input (CSV or JSON file path)
|
|
56
69
|
if isinstance(lib_source, str):
|
|
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
|
|
|
403
416
|
"""
|
|
404
417
|
Find library matches using optimized vectorized operations.
|
|
405
418
|
|
|
406
|
-
|
|
419
|
+
Automatically skips RT filtering if library has no RT data for the matched entries.
|
|
407
420
|
"""
|
|
408
421
|
# Filter by m/z tolerance using vectorized operations
|
|
409
422
|
matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
|
|
410
423
|
|
|
411
424
|
initial_match_count = len(matches)
|
|
412
425
|
|
|
413
|
-
# Apply RT filter if
|
|
426
|
+
# Apply RT filter if requested AND if data is available
|
|
427
|
+
# Strategy: Handle mixed RT/no-RT entries properly by treating them separately
|
|
414
428
|
if rt_tol is not None and cons_rt is not None and not matches.is_empty():
|
|
415
|
-
#
|
|
429
|
+
# Separate entries with and without RT data
|
|
416
430
|
rt_candidates = matches.filter(pl.col("rt").is_not_null())
|
|
431
|
+
no_rt_entries = matches.filter(pl.col("rt").is_null())
|
|
417
432
|
|
|
418
433
|
if not rt_candidates.is_empty():
|
|
419
434
|
# Apply RT filtering to candidates with RT data
|
|
420
435
|
rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
|
|
421
436
|
|
|
422
|
-
|
|
437
|
+
# Combine RT-filtered matches with entries that have no RT data
|
|
438
|
+
# Rationale: Entries without RT can't be filtered by RT, so include them
|
|
439
|
+
if not rt_matches.is_empty() and not no_rt_entries.is_empty():
|
|
440
|
+
# Both RT matches and no-RT entries exist
|
|
441
|
+
matches = pl.concat([rt_matches, no_rt_entries])
|
|
442
|
+
if logger:
|
|
443
|
+
logger.debug(
|
|
444
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
|
|
445
|
+
f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
|
|
446
|
+
)
|
|
447
|
+
elif not rt_matches.is_empty():
|
|
448
|
+
# Only RT matches, no entries without RT
|
|
423
449
|
matches = rt_matches
|
|
424
450
|
if logger:
|
|
425
451
|
logger.debug(
|
|
426
|
-
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT,
|
|
452
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
|
|
453
|
+
f"{len(matches)} passed RT filter"
|
|
454
|
+
)
|
|
455
|
+
elif not no_rt_entries.is_empty():
|
|
456
|
+
# No RT matches passed filter, but there are entries without RT
|
|
457
|
+
matches = no_rt_entries
|
|
458
|
+
if logger:
|
|
459
|
+
logger.debug(
|
|
460
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
|
|
461
|
+
f"using {len(matches)} entries with no RT data"
|
|
427
462
|
)
|
|
428
463
|
else:
|
|
429
|
-
#
|
|
430
|
-
matches =
|
|
464
|
+
# No RT matches and no entries without RT - return empty
|
|
465
|
+
matches = pl.DataFrame()
|
|
431
466
|
if logger:
|
|
432
467
|
logger.debug(
|
|
433
468
|
f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
|
|
434
469
|
)
|
|
435
470
|
else:
|
|
436
|
-
#
|
|
471
|
+
# All m/z matches have no RT data - keep all m/z matches
|
|
437
472
|
if logger:
|
|
438
473
|
logger.debug(
|
|
439
|
-
f"Consensus {cons_uid}: {initial_match_count} m/z matches
|
|
474
|
+
f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
|
|
440
475
|
)
|
|
441
|
-
matches
|
|
476
|
+
# matches already contains the m/z-filtered results (which are all no_rt_entries)
|
|
442
477
|
|
|
443
478
|
# FIX 1: Add stricter m/z validation - prioritize more accurate matches
|
|
444
479
|
if not matches.is_empty():
|
|
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
|
|
|
884
919
|
effective_mz_tol = getattr(params, "mz_tol", 0.01)
|
|
885
920
|
effective_rt_tol = getattr(params, "rt_tol", 2.0)
|
|
886
921
|
|
|
922
|
+
# Check if library has RT data - if not, disable RT filtering
|
|
923
|
+
if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
|
|
924
|
+
if "rt" in study.lib_df.columns:
|
|
925
|
+
# Check if library has any non-null RT values
|
|
926
|
+
rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
|
|
927
|
+
if rt_count == 0:
|
|
928
|
+
if logger:
|
|
929
|
+
logger.info(
|
|
930
|
+
f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
|
|
931
|
+
)
|
|
932
|
+
effective_rt_tol = None
|
|
933
|
+
|
|
887
934
|
if logger:
|
|
888
935
|
logger.debug(
|
|
889
936
|
f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
|
|
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
|
|
|
1483
1530
|
if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
|
|
1484
1531
|
components = [spec] * multiplier
|
|
1485
1532
|
formatted_name = _format_adduct_name(components)
|
|
1486
|
-
probability_multiplied = float(spec["probability"]) ** multiplier
|
|
1533
|
+
probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0
|
|
1487
1534
|
|
|
1488
1535
|
combinations_list.append(
|
|
1489
1536
|
{
|
masster/study/load.py
CHANGED
|
@@ -191,17 +191,6 @@ def load(self, filename=None):
|
|
|
191
191
|
|
|
192
192
|
_load_study5(self, filename)
|
|
193
193
|
|
|
194
|
-
# After loading the study, check if we have consensus features before loading consensus XML
|
|
195
|
-
# if (self.consensus_df is not None and not self.consensus_df.is_empty()):
|
|
196
|
-
# consensus_xml_path = filename.replace(".study5", ".consensusXML")
|
|
197
|
-
# if os.path.exists(consensus_xml_path):
|
|
198
|
-
# self._load_consensusXML(filename=consensus_xml_path)
|
|
199
|
-
# self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
|
|
200
|
-
# else:
|
|
201
|
-
# self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
|
|
202
|
-
# else:
|
|
203
|
-
# self.logger.debug("No consensus features found, skipping consensusXML loading")
|
|
204
|
-
|
|
205
194
|
self.filename = filename
|
|
206
195
|
|
|
207
196
|
|