masster-0.2.5-py3-none-any.whl → masster-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/study/export.py CHANGED
@@ -1,287 +1,674 @@
1
- from __future__ import annotations
2
-
3
- import os
4
-
5
- from datetime import datetime
6
-
7
- import numpy as np
8
- import pandas as pd
9
-
10
-
11
- from tqdm import tqdm
12
-
13
- from masster.spectrum import combine_peaks
14
- from masster.study.defaults import export_mgf_defaults
15
-
16
-
17
- def export_mgf(self, **kwargs):
18
- """
19
- Export consensus features as MGF format for database searching.
20
-
21
- Parameters:
22
- **kwargs: Keyword arguments for export parameters. Can include:
23
- - An export_defaults instance to set all parameters at once
24
- - Individual parameter names and values (see export_defaults for details)
25
-
26
- Key Parameters:
27
- filename (str): Output MGF file name (default: "features.mgf").
28
- selection (str): "best" for first scan, "all" for every scan (default: "best").
29
- split_energy (bool): Process MS2 scans by unique energy (default: True).
30
- merge (bool): If selection="all", merge MS2 scans into one spectrum (default: False).
31
- mz_start (float): Minimum m/z for feature selection (default: None).
32
- mz_end (float): Maximum m/z for feature selection (default: None).
33
- rt_start (float): Minimum RT for feature selection (default: None).
34
- rt_end (float): Maximum RT for feature selection (default: None).
35
- centroid (bool): Apply centroiding to spectra (default: True).
36
- inty_min (float): Minimum intensity threshold (default: None).
37
- deisotope (bool): Apply deisotoping to spectra (default: True).
38
- verbose (bool): Enable verbose logging (default: False).
39
- precursor_trim (float): Precursor trimming value (default: -10).
40
- centroid_algo (str): Centroiding algorithm (default: "lmp").
41
- """
42
- # parameters initialization
43
- params = export_mgf_defaults()
44
- for key, value in kwargs.items():
45
- if isinstance(value, export_mgf_defaults):
46
- params = value
47
- self.logger.debug("Using provided export_defaults parameters")
48
- else:
49
- if hasattr(params, key):
50
- if params.set(key, value, validate=True):
51
- self.logger.debug(f"Updated parameter {key} = {value}")
52
- else:
53
- self.logger.warning(
54
- f"Failed to set parameter {key} = {value} (validation failed)",
55
- )
56
- else:
57
- self.logger.debug(f"Unknown parameter {key} ignored")
58
- # end of parameter initialization
59
-
60
- # Store parameters in the Study object
61
- self.store_history(["export_mgf"], params.to_dict())
62
- self.logger.debug("Parameters stored to export_mgf")
63
-
64
- # Get parameter values for use in the method
65
- filename = params.get("filename")
66
- selection = params.get("selection")
67
- split_energy = params.get("split_energy")
68
- merge = params.get("merge")
69
- mz_start = params.get("mz_start")
70
- mz_end = params.get("mz_end")
71
- rt_start = params.get("rt_start")
72
- rt_end = params.get("rt_end")
73
- centroid = params.get("centroid")
74
- inty_min = params.get("inty_min")
75
- deisotope = params.get("deisotope")
76
-
77
- if self.consensus_df is None:
78
- self.logger.error("No consensus map found. Please run find_consensus() first.")
79
- return
80
- if self.consensus_ms2 is None:
81
- self.logger.error("No consensus MS2 data found. Please run link_ms2() first.")
82
- return
83
-
84
- # Convert to pandas for merge operation since the result is used for groupby
85
- consensus_df_pd = self.consensus_df.to_pandas()
86
- consensus_ms2_pd = self.consensus_ms2.to_pandas()
87
-
88
- features = pd.merge(
89
- consensus_df_pd,
90
- consensus_ms2_pd,
91
- how="right",
92
- on="consensus_uid",
93
- )
94
- if len(features) == 0:
95
- self.logger.warning("No features found.")
96
- return
97
-
98
- # Pre-group by consensus_uid for fast access
99
- grouped = features.groupby("consensus_uid")
100
-
101
- def filter_peaks(spec, inty_min=None):
102
- spec = spec.copy()
103
- length = len(spec.mz)
104
- mask = np.ones(length, dtype=bool)
105
- if inty_min is not None and inty_min > 0:
106
- mask = mask & (spec.inty >= inty_min)
107
- for attr in spec.__dict__:
108
- arr = getattr(spec, attr)
109
- if (
110
- isinstance(arr, list | np.ndarray)
111
- and hasattr(arr, "__len__")
112
- and len(arr) == length
113
- ):
114
- setattr(spec, attr, np.array(arr)[mask])
115
- return spec
116
-
117
- def write_ion(f, title, id, uid, mz, rt, charge, spect):
118
- if spect is None:
119
- return
120
- f.write(f"BEGIN IONS\nTITLE={title}\n")
121
- f.write(f"FEATURE_ID={id}\n")
122
- f.write(f"FEATURE_UID={uid}\n")
123
- f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
124
- if spect.ms_level is None:
125
- f.write("MSLEVEL=1\n")
126
- else:
127
- f.write(f"MSLEVEL={spect.ms_level}\n")
128
- if (
129
- spect.ms_level is not None
130
- and spect.ms_level > 1
131
- and hasattr(spect, "energy")
132
- ):
133
- f.write(f"ENERGY={spect.energy}\n")
134
- for mz, inty in zip(spect.mz, spect.inty, strict=False):
135
- f.write(f"{mz:.5f} {inty:.0f}\n")
136
- f.write("END IONS\n\n")
137
-
138
- # Prepare output path
139
- if not os.path.isabs(filename):
140
- if self.default_folder is not None:
141
- filename = os.path.join(self.default_folder, filename)
142
- else:
143
- filename = os.path.join(os.getcwd(), filename)
144
-
145
- skip = 0
146
- self.logger.info(f"Exporting MGF for {len(grouped)} consensus features...")
147
- with open(filename, "w", encoding="utf-8") as f:
148
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
149
- for _consensus_uid, cons_ms2 in tqdm(
150
- grouped,
151
- total=len(grouped),
152
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
153
- disable=tdqm_disable,
154
- ):
155
- # Use the first row for feature-level info
156
- row = cons_ms2.iloc[0]
157
- if mz_start is not None and row["mz"] < mz_start:
158
- continue
159
- if mz_end is not None and row["mz"] > mz_end:
160
- continue
161
- if rt_start is not None and row["rt"] < rt_start:
162
- continue
163
- if rt_end is not None and row["rt"] > rt_end:
164
- continue
165
- if len(cons_ms2) == 0:
166
- skip += 1
167
- continue
168
-
169
- if split_energy:
170
- energies = cons_ms2["energy"].unique()
171
- for e in energies:
172
- cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
173
- if selection == "best":
174
- idx = cons_ms2_e["prec_inty"].idxmax()
175
- cons_ms2_e_row = cons_ms2_e.loc[idx]
176
- spect = cons_ms2_e_row["spec"]
177
- if spect is None:
178
- skip += 1
179
- continue
180
- if centroid:
181
- spect = spect.centroid()
182
- if deisotope:
183
- spect = spect.deisotope()
184
- spect = filter_peaks(spect, inty_min=inty_min)
185
- write_ion(
186
- f,
187
- f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
188
- cons_ms2_e_row["consensus_id"],
189
- cons_ms2_e_row["consensus_uid"],
190
- cons_ms2_e_row["mz"],
191
- cons_ms2_e_row["rt"],
192
- round(cons_ms2_e_row["charge_mean"]),
193
- spect,
194
- )
195
- else:
196
- for row_e in cons_ms2_e.iter_rows(named=True):
197
- spect = row_e["spec"]
198
- if spect is None:
199
- continue
200
- if centroid:
201
- spect = spect.centroid()
202
- if deisotope:
203
- spect = spect.deisotope()
204
- spect = filter_peaks(spect, inty_min=inty_min)
205
- write_ion(
206
- f,
207
- f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
208
- row_e["consensus_id"],
209
- row_e["consensus_uid"],
210
- row_e["mz"],
211
- row_e["rt"],
212
- round(row_e["charge_mean"]),
213
- spect,
214
- )
215
- else:
216
- if selection == "best":
217
- idx = cons_ms2["prec_inty"].idxmax()
218
- cons_ms2_e_row = cons_ms2.loc[idx]
219
- spect = cons_ms2_e_row["spec"]
220
- if spect is None:
221
- continue
222
- if centroid:
223
- spect = spect.centroid()
224
- if deisotope:
225
- spect = spect.deisotope()
226
- spect = filter_peaks(spect, inty_min=inty_min)
227
- write_ion(
228
- f,
229
- f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
230
- cons_ms2_e_row["consensus_id"],
231
- cons_ms2_e_row["consensus_uid"],
232
- cons_ms2_e_row["mz"],
233
- cons_ms2_e_row["rt"],
234
- round(cons_ms2_e_row["charge_mean"]),
235
- spect,
236
- )
237
-
238
- elif selection == "all":
239
- if merge:
240
- specs = [
241
- row_e["spec"]
242
- for row_e in cons_ms2.iter_rows(named=True)
243
- if row_e["spec"] is not None
244
- ]
245
- if not specs:
246
- continue
247
- spect = combine_peaks(specs)
248
- if centroid:
249
- spect = spect.denoise()
250
- spect = spect.centroid()
251
- spect = spect.centroid()
252
- if deisotope:
253
- spect = spect.deisotope()
254
- spect = filter_peaks(spect, inty_min=inty_min)
255
- write_ion(
256
- f,
257
- f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
258
- row["consensus_id"],
259
- row["consensus_uid"],
260
- row["mz"],
261
- row["rt"],
262
- round(row["charge_mean"]),
263
- spect,
264
- )
265
- else:
266
- for row_e in cons_ms2.iter_rows(named=True):
267
- spect = row_e["spec"]
268
- if spect is None:
269
- continue
270
- if centroid:
271
- spect = spect.centroid()
272
- if deisotope:
273
- spect = spect.deisotope()
274
- spect = filter_peaks(spect, inty_min=inty_min)
275
- write_ion(
276
- f,
277
- f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
278
- row_e["consensus_id"],
279
- row_e["consensus_uid"],
280
- row_e["mz"],
281
- row_e["rt"],
282
- round(row_e["charge_mean"]),
283
- spect,
284
- )
285
- self.logger.info(
286
- f"Exported {len(grouped) - skip} features to {filename}. Skipped {skip} features due to missing data.",
287
- )
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from datetime import datetime
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import polars as pl
10
+
11
+ from tqdm import tqdm
12
+
13
+ from masster.spectrum import combine_peaks
14
+ from masster.study.defaults import export_mgf_defaults
15
+ from masster._version import get_version
16
+
17
+
18
+ def _get_mgf_df(self, **kwargs):
19
+ """
20
+ Generate MGF data as a Polars DataFrame.
21
+
22
+ This is the core data generation function used by export_mgf().
23
+
24
+ Parameters:
25
+ **kwargs: Keyword arguments for export parameters. Same as export_mgf()
26
+ except return_data is not relevant here.
27
+
28
+ Returns:
29
+ pl.DataFrame: DataFrame with columns:
30
+ - mgf_index: MGF index
31
+ - title: MGF title string
32
+ - feature_id: Consensus feature ID
33
+ - feature_uid: Consensus feature UID
34
+ - charge: Charge state
35
+ - pepmass: Precursor m/z
36
+ - rtinseconds: Retention time in seconds
37
+ - mslevel: MS level
38
+ - type: Spectrum type (e.g., "MS2")
39
+ - energy: Collision energy (if available)
40
+ - spec_len: Number of peaks in spectrum
41
+ - spec_mz: List of spectrum m/z values
42
+ - spec_int: List of spectrum intensity values
43
+ """
44
+ # parameters initialization
45
+ params = export_mgf_defaults()
46
+ for key, value in kwargs.items():
47
+ if isinstance(value, export_mgf_defaults):
48
+ params = value
49
+ self.logger.debug("Using provided export_defaults parameters")
50
+ else:
51
+ if hasattr(params, key):
52
+ if params.set(key, value, validate=True):
53
+ self.logger.debug(f"Updated parameter {key} = {value}")
54
+ else:
55
+ self.logger.warning(
56
+ f"Failed to set parameter {key} = {value} (validation failed)",
57
+ )
58
+ else:
59
+ self.logger.debug(f"Unknown parameter {key} ignored")
60
+ # end of parameter initialization
61
+
62
+ # Store parameters in the Study object
63
+ self.store_history(["get_mgf"], params.to_dict())
64
+ self.logger.debug("Parameters stored to get_mgf")
65
+
66
+ # Get parameter values for use in the method
67
+ selection = params.get("selection")
68
+ split_energy = params.get("split_energy")
69
+ merge = params.get("merge")
70
+ mz_start = params.get("mz_start")
71
+ mz_end = params.get("mz_end")
72
+ rt_start = params.get("rt_start")
73
+ rt_end = params.get("rt_end")
74
+ centroid = params.get("centroid")
75
+ inty_min = params.get("inty_min")
76
+ deisotope = params.get("deisotope")
77
+
78
+ if self.consensus_df is None:
79
+ self.logger.error("No consensus map found. Please run merge() first.")
80
+ return None
81
+ if self.consensus_ms2 is None:
82
+ self.logger.error("No consensus MS2 data found. Please run link_ms2() first.")
83
+ return None
84
+
85
+ # Convert to pandas for merge operation since the result is used for groupby
86
+ consensus_df_pd = self.consensus_df.to_pandas()
87
+ consensus_ms2_pd = self.consensus_ms2.to_pandas()
88
+
89
+ features = pd.merge(
90
+ consensus_df_pd,
91
+ consensus_ms2_pd,
92
+ how="right",
93
+ on="consensus_uid",
94
+ )
95
+ if len(features) == 0:
96
+ self.logger.warning("No features found.")
97
+ return pl.DataFrame()
98
+
99
+ # Pre-group by consensus_uid for fast access
100
+ grouped = features.groupby("consensus_uid")
101
+
102
+ def filter_peaks(spec, inty_min=None):
103
+ spec = spec.copy()
104
+ length = len(spec.mz)
105
+ mask = np.ones(length, dtype=bool)
106
+ if inty_min is not None and inty_min > 0:
107
+ mask = mask & (spec.inty >= inty_min)
108
+ for attr in spec.__dict__:
109
+ arr = getattr(spec, attr)
110
+ if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
111
+ setattr(spec, attr, np.array(arr)[mask])
112
+ return spec
113
+
114
+ def create_ion_dict(title, id, uid, mz, rt, charge, spect, mgf_id):
115
+ """Create a dictionary representing an ion for the DataFrame."""
116
+ if spect is None:
117
+ return None
118
+
119
+ # Prepare spectrum data
120
+ spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, 'tolist') else list(spect.mz)
121
+ spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, 'tolist') else list(spect.inty)
122
+
123
+ # Determine MS level
124
+ ms_level = spect.ms_level if spect.ms_level is not None else 1
125
+
126
+ # Get energy if available
127
+ energy = getattr(spect, 'energy', None)
128
+
129
+ # Determine spectrum type based on MS level
130
+ spec_type = f"MS{ms_level}" if ms_level > 1 else "MS1"
131
+
132
+ # Calculate spectrum length
133
+ spec_len = len(spectrum_mz)
134
+
135
+ return {
136
+ 'mgf_index': mgf_id,
137
+ 'title': title,
138
+ 'feature_id': id,
139
+ 'feature_uid': uid,
140
+ 'charge': charge,
141
+ 'pepmass': mz,
142
+ 'rtinseconds': rt,
143
+ 'mslevel': ms_level,
144
+ 'type': spec_type,
145
+ 'energy': energy,
146
+ 'spec_len': spec_len,
147
+ 'spec_mz': spectrum_mz,
148
+ 'spec_int': spectrum_inty,
149
+ }
150
+
151
+ # Collect all ion data
152
+ ion_data = []
153
+ skip = 0
154
+ mgf_counter = 0
155
+ self.logger.info(f"Generating MGF data for {len(grouped)} consensus features...")
156
+
157
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
158
+ for _consensus_uid, cons_ms2 in tqdm(
159
+ grouped,
160
+ total=len(grouped),
161
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
162
+ disable=tdqm_disable,
163
+ ):
164
+ # Use the first row for feature-level info
165
+ row = cons_ms2.iloc[0]
166
+ if mz_start is not None and row["mz"] < mz_start:
167
+ continue
168
+ if mz_end is not None and row["mz"] > mz_end:
169
+ continue
170
+ if rt_start is not None and row["rt"] < rt_start:
171
+ continue
172
+ if rt_end is not None and row["rt"] > rt_end:
173
+ continue
174
+ if len(cons_ms2) == 0:
175
+ skip += 1
176
+ continue
177
+
178
+ if split_energy:
179
+ energies = cons_ms2["energy"].unique()
180
+ for e in energies:
181
+ cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
182
+ if selection == "best":
183
+ idx = cons_ms2_e["prec_inty"].idxmax()
184
+ cons_ms2_e_row = cons_ms2_e.loc[idx]
185
+ spect = cons_ms2_e_row["spec"]
186
+ if spect is None:
187
+ skip += 1
188
+ continue
189
+ if centroid:
190
+ spect = spect.centroid()
191
+ if deisotope:
192
+ spect = spect.deisotope()
193
+ spect = filter_peaks(spect, inty_min=inty_min)
194
+ mgf_counter += 1
195
+ ion_dict = create_ion_dict(
196
+ f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
197
+ cons_ms2_e_row["consensus_id"],
198
+ cons_ms2_e_row["consensus_uid"],
199
+ cons_ms2_e_row["mz"],
200
+ cons_ms2_e_row["rt"],
201
+ round(cons_ms2_e_row["charge_mean"]),
202
+ spect,
203
+ mgf_counter,
204
+ )
205
+ if ion_dict is not None:
206
+ ion_data.append(ion_dict)
207
+ else:
208
+ for row_e in cons_ms2_e.iter_rows(named=True):
209
+ spect = row_e["spec"]
210
+ if spect is None:
211
+ continue
212
+ if centroid:
213
+ spect = spect.centroid()
214
+ if deisotope:
215
+ spect = spect.deisotope()
216
+ spect = filter_peaks(spect, inty_min=inty_min)
217
+ mgf_counter += 1
218
+ ion_dict = create_ion_dict(
219
+ f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
220
+ row_e["consensus_id"],
221
+ row_e["consensus_uid"],
222
+ row_e["mz"],
223
+ row_e["rt"],
224
+ round(row_e["charge_mean"]),
225
+ spect,
226
+ mgf_counter,
227
+ )
228
+ if ion_dict is not None:
229
+ ion_data.append(ion_dict)
230
+ else:
231
+ if selection == "best":
232
+ idx = cons_ms2["prec_inty"].idxmax()
233
+ cons_ms2_e_row = cons_ms2.loc[idx]
234
+ spect = cons_ms2_e_row["spec"]
235
+ if spect is None:
236
+ continue
237
+ if centroid:
238
+ spect = spect.centroid()
239
+ if deisotope:
240
+ spect = spect.deisotope()
241
+ spect = filter_peaks(spect, inty_min=inty_min)
242
+ mgf_counter += 1
243
+ ion_dict = create_ion_dict(
244
+ f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
245
+ cons_ms2_e_row["consensus_id"],
246
+ cons_ms2_e_row["consensus_uid"],
247
+ cons_ms2_e_row["mz"],
248
+ cons_ms2_e_row["rt"],
249
+ round(cons_ms2_e_row["charge_mean"]),
250
+ spect,
251
+ mgf_counter,
252
+ )
253
+ if ion_dict is not None:
254
+ ion_data.append(ion_dict)
255
+
256
+ elif selection == "all":
257
+ if merge:
258
+ specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
259
+ if not specs:
260
+ continue
261
+ spect = combine_peaks(specs)
262
+ if centroid:
263
+ spect = spect.denoise()
264
+ spect = spect.centroid()
265
+ if deisotope:
266
+ spect = spect.deisotope()
267
+ spect = filter_peaks(spect, inty_min=inty_min)
268
+ mgf_counter += 1
269
+ ion_dict = create_ion_dict(
270
+ f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
271
+ row["consensus_id"],
272
+ row["consensus_uid"],
273
+ row["mz"],
274
+ row["rt"],
275
+ round(row["charge_mean"]),
276
+ spect,
277
+ mgf_counter,
278
+ )
279
+ if ion_dict is not None:
280
+ ion_data.append(ion_dict)
281
+ else:
282
+ for row_e in cons_ms2.iter_rows(named=True):
283
+ spect = row_e["spec"]
284
+ if spect is None:
285
+ continue
286
+ if centroid:
287
+ spect = spect.centroid()
288
+ if deisotope:
289
+ spect = spect.deisotope()
290
+ spect = filter_peaks(spect, inty_min=inty_min)
291
+ mgf_counter += 1
292
+ ion_dict = create_ion_dict(
293
+ f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
294
+ row_e["consensus_id"],
295
+ row_e["consensus_uid"],
296
+ row_e["mz"],
297
+ row_e["rt"],
298
+ round(row_e["charge_mean"]),
299
+ spect,
300
+ mgf_counter,
301
+ )
302
+ if ion_dict is not None:
303
+ ion_data.append(ion_dict)
304
+
305
+ self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra")
306
+ self.logger.debug(f"Skipped {skip} features due to missing data.")
307
+
308
+ # Convert to Polars DataFrame
309
+ if not ion_data:
310
+ return pl.DataFrame()
311
+
312
+ return pl.DataFrame(ion_data)
313
+
314
+
315
+ def export_mgf(self, **kwargs):
316
+ """
317
+ Export consensus features as MGF format for database searching.
318
+
319
+ Parameters:
320
+ **kwargs: Keyword arguments for export parameters. Can include:
321
+ - An export_defaults instance to set all parameters at once
322
+ - Individual parameter names and values (see export_defaults for details)
323
+
324
+ Key Parameters:
325
+ filename (str): Output MGF file name (default: "features.mgf").
326
+ selection (str): "best" for first scan, "all" for every scan (default: "best").
327
+ split_energy (bool): Process MS2 scans by unique energy (default: True).
328
+ merge (bool): If selection="all", merge MS2 scans into one spectrum (default: False).
329
+ mz_start (float): Minimum m/z for feature selection (default: None).
330
+ mz_end (float): Maximum m/z for feature selection (default: None).
331
+ rt_start (float): Minimum RT for feature selection (default: None).
332
+ rt_end (float): Maximum RT for feature selection (default: None).
333
+ centroid (bool): Apply centroiding to spectra (default: True).
334
+ inty_min (float): Minimum intensity threshold (default: None).
335
+ deisotope (bool): Apply deisotoping to spectra (default: True).
336
+ verbose (bool): Enable verbose logging (default: False).
337
+ precursor_trim (float): Precursor trimming value (default: -10).
338
+ centroid_algo (str): Centroiding algorithm (default: "lmp").
339
+
340
+ Returns:
341
+ None: Writes MGF file to disk.
342
+ """
343
+ # Get mgf data as DataFrame
344
+ mgf_data = self._get_mgf_df(**kwargs)
345
+
346
+ if mgf_data is None or len(mgf_data) == 0:
347
+ self.logger.warning("No MGF data generated.")
348
+ return
349
+
350
+ # Get filename from parameters
351
+ params = export_mgf_defaults()
352
+ for key, value in kwargs.items():
353
+ if isinstance(value, export_mgf_defaults):
354
+ params = value
355
+ else:
356
+ if hasattr(params, key):
357
+ params.set(key, value, validate=True)
358
+
359
+ filename = params.get("filename")
360
+
361
+ # Prepare output path
362
+ if not os.path.isabs(filename):
363
+ if self.folder is not None:
364
+ filename = os.path.join(self.folder, filename)
365
+ else:
366
+ filename = os.path.join(os.getcwd(), filename)
367
+
368
+ # Write MGF file
369
+ with open(filename, "w", encoding="utf-8") as f:
370
+ for row in mgf_data.iter_rows(named=True):
371
+ # Write BEGIN IONS
372
+ f.write("BEGIN IONS\n")
373
+
374
+ # Write metadata
375
+ if row["mgf_index"] is not None:
376
+ f.write(f"INDEX={row['mgf_index']}\n")
377
+ f.write(f"TITLE={row['title']}\n")
378
+ f.write(f"FEATURE_ID={row['feature_id']}\n")
379
+ f.write(f"FEATURE_UID={row['feature_uid']}\n")
380
+ f.write(f"CHARGE={row['charge']}\n")
381
+ f.write(f"PEPMASS={row['pepmass']}\n")
382
+ f.write(f"RTINSECONDS={row['rtinseconds']}\n")
383
+ f.write(f"MSLEVEL={row['mslevel']}\n")
384
+
385
+ if row["energy"] is not None:
386
+ f.write(f"ENERGY={row['energy']}\n")
387
+
388
+ # Write spectrum data
389
+ spectrum_mz = row["spec_mz"]
390
+ spectrum_inty = row["spec_int"]
391
+ for mz_val, inty in zip(spectrum_mz, spectrum_inty, strict=False):
392
+ f.write(f"{mz_val:.5f} {inty:.0f}\n")
393
+
394
+ # Write END IONS
395
+ f.write("END IONS\n\n")
396
+
397
+ self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
398
+
399
+
400
+ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None:
401
+ """
402
+ Export the study as a fully compliant mzTab-M file.
403
+
404
+ Args:
405
+ filename (str, optional): Path to the output mzTab-M file.
406
+ title (str, optional): Human-readable title for the file.
407
+ description (str, optional): Human-readable description.
408
+ **kwargs: Additional metadata or export options.
409
+ """
410
+ if filename is None:
411
+ filename = "study.mztab"
412
+ if not os.path.isabs(filename):
413
+ if self.folder is not None:
414
+ filename = os.path.join(self.folder, filename)
415
+ else:
416
+ filename = os.path.join(os.getcwd(), filename)
417
+
418
+ # get mgf data
419
+ mgf_data = self._get_mgf_df(**kwargs)
420
+ # Create mapping from feature_uid to MGF indexes
421
+ mgf_mapping: dict[str, list[int]] = {}
422
+ if mgf_data is not None and len(mgf_data) > 0:
423
+ for row in mgf_data.iter_rows(named=True):
424
+ feature_uid = row['feature_uid']
425
+ mgf_index = row['mgf_index']
426
+ if feature_uid not in mgf_mapping:
427
+ mgf_mapping[feature_uid] = []
428
+ mgf_mapping[feature_uid].append(mgf_index)
429
+
430
+ # --- Prepare MTD (metadata) section ---
431
+ mtd_lines = []
432
+ mtd_lines.append(f"COM file generated by MASSter on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
433
+ mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
434
+ id = self.label if self.label else self.folder
435
+ mtd_lines.append(f"MTD\tmzTab-id\t{id}")
436
+ mtd_lines.append('')
437
+ mtd_lines.append("MTD\tcv[1]-label\tMS")
438
+ mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
439
+ mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
440
+ mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
441
+ mtd_lines.append('')
442
+ mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
443
+ mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
444
+ mtd_lines.append("MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]")
445
+ mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
446
+ mtd_lines.append('')
447
+ mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
448
+ mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
449
+ mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
450
+ mtd_lines.append('')
451
+ mtd_lines.append("MTD\tdatabase[1]\t[, , \"no database\", null]")
452
+ mtd_lines.append("MTD\tdatabase[1]-prefix\tnull")
453
+ mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
454
+ mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
455
+ #mtd_lines.append('')
456
+ for i, row in enumerate(self.samples_df.iter_rows(named=True), 1):
457
+ mtd_lines.append(f"\nMTD\tsample[{i}]\t{row.get('sample_uid', f'sample_{i}')}")
458
+ mtd_lines.append(f"MTD\tsample[{i}]-description\t{row.get('sample_name', 'unknown')}")
459
+ mtd_lines.append(f"MTD\tms_run[{i}]-location\tfile://{row.get('sample_path', 'unknown')}")
460
+ mtd_lines.append(f"MTD\tassay[{i}]\tAssay_{i}")
461
+ mtd_lines.append(f"MTD\tassay[{i}]-sample_ref\tsample[{i}]")
462
+ mtd_lines.append(f"MTD\tassay[{i}]-ms_run_ref\tms_run[{i}]")
463
+ mtd_lines.append('')
464
+ mtd_lines.append("MTD\tstudy_variable[1]\tundefined")
465
+ mtd_lines.append("MTD\tstudy_variable[1]_refs\tundefined")
466
+ #assay_refs = '|'.join([f"assay[{i}]" for i in range(1, len(self.samples_df)+1)])
467
+ #mtd_lines.append(f"MTD\tstudy_variable[1]-assay_refs\t{assay_refs}")
468
+ #mtd_lines.append("MTD\tstudy_variable[1]-description\tAll assays grouped (default)")
469
+ with open(filename, 'w', encoding='utf-8') as f:
470
+ for line in mtd_lines:
471
+ f.write(line + '\n')
472
+
473
+ # --- SML (Small Molecule) table ---
474
+ sml_lines = []
475
+ sml_header = [
476
+ "SMH",
477
+ "SML_ID",
478
+ "SMF_ID_REFS",
479
+ "database_identifier",
480
+ "chemical_formula",
481
+ "smiles",
482
+ "inchi",
483
+ "chemical_name",
484
+ "uri",
485
+ "theoretical_neutral_mass",
486
+ "adduct_ions",
487
+ "reliability",
488
+ "best_id_confidence_measure",
489
+ "best_id_confidence_value",
490
+ "opt_global_mgf_index",
491
+ ]
492
+
493
+ abundance_matrix = self.get_consensus_matrix()
494
+ # Use the matrix as-is since it already has the correct sample columns
495
+ # The matrix columns are sample names, which is what we want for the assay columns
496
+
497
+ # round to int
498
+ abundance_matrix = abundance_matrix.round(0)
499
+
500
+ # Use actual number of samples from the abundance matrix
501
+ n_assays = len(abundance_matrix.columns)
502
+ sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays+1)]
503
+ sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
504
+ sml_lines.append('\t'.join(sml_header))
505
+
506
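+ # Build one adduct label per consensus feature from consensus_df['adducts']: use 'null' when the value is missing or an empty list; otherwise take the 'adduct' entry of the first element and translate it through `mapping` when a translation exists (falling back to the raw string).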
507
+ adduct_list = []
508
+ mapping = {'H1': '[M+H]+', 'H2': '[M+2H]2+',
509
+ 'Na1': '[M+Na]+', 'Na2': '[M+2Na]2+',
510
+ 'NH4': '[M+NH4]+', 'HCOO': '[M+HCOO]-',
511
+ 'CH3COO': '[M+CH3COO]-', 'H2O': '[M+H2O]+',
512
+ 'HCO2': '[M+HCO2]-', 'H3PO4': '[M+H3PO4]+',
513
+ 'H3O1': '[M+H3O]+', 'K1': '[M+K]+',
514
+ 'H4N1': '[M+NH4]+',
515
+ 'H-1': '[M-H]-', 'Cl1': '[M+Cl]-',
516
+ 'Br1': '[M+Br]-', 'I1': '[M+I]-',
517
+ 'H2O2': '[M+H2O2]+', 'H3O2': '[M+H3O2]+',}
518
+ for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
519
+ adduct = 'null'
520
+ if 'adducts' in row:
521
+ row_adducts = row['adducts']
522
+ if isinstance(row_adducts, list) and row_adducts:
523
+ # Each adduct is a dictionary with 'adduct' key
524
+ first_adduct_dict = row_adducts[0]
525
+ if isinstance(first_adduct_dict, dict) and 'adduct' in first_adduct_dict:
526
+ adduct_str = first_adduct_dict['adduct']
527
+ if adduct_str in mapping:
528
+ adduct = mapping[adduct_str]
529
+ else:
530
+ adduct = adduct_str
531
+
532
+ adduct_list.append(adduct)
533
+
534
+ for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
535
+ # Get MGF indexes for this consensus feature
536
+ mgf_indexes = mgf_mapping.get(row['consensus_uid'], [])
537
+
538
+ sml_row = [
539
+ "SML",
540
+ str(idx),
541
+ str(idx),
542
+ str(row.get('database_identifier', 'null')),
543
+ str(row.get('chemical_formula', 'null')),
544
+ str(row.get('smiles', 'null')),
545
+ str(row.get('inchi', 'null')),
546
+ str(row.get('chemical_name', 'null')),
547
+ str(row.get('uri', 'null')),
548
+ str(row.get('theoretical_neutral_mass', 'null')),
549
+ adduct_list[idx-1],
550
+ str(row.get('reliability', 'null')),
551
+ str(row.get('best_id_confidence_measure', 'null')),
552
+ str(row.get('best_id_confidence_value', 'null')),
553
+ ','.join(map(str, mgf_indexes)) if mgf_indexes else 'null',
554
+ ]
555
+ # Add abundance values for each assay
556
+ consensus_uid = row['consensus_uid']
557
+ if consensus_uid in abundance_matrix.index:
558
+ abundance_values = abundance_matrix.loc[consensus_uid].tolist()
559
+ sml_row += [str(val) if pd.notna(val) else 'null' for val in abundance_values]
560
+ else:
561
+ sml_row += ['null'] * n_assays
562
+ sml_row += ['null', 'null']
563
+ sml_lines.append('\t'.join(sml_row))
564
+ with open(filename, 'a', encoding='utf-8') as f:
565
+ f.write('\n')
566
+ for line in sml_lines:
567
+ f.write(line + '\n')
568
+
569
+ # --- SMF (Small Molecule Feature) table ---
570
+ smf_lines = []
571
+ smf_header = [
572
+ "SFH",
573
+ "SMF_ID",
574
+ "SME_ID_REFS",
575
+ "SME_ID_REF_ambiguity_code",
576
+ "adduct_ion",
577
+ "isotopomer",
578
+ "exp_mass_to_charge",
579
+ "charge",
580
+ "retention_time_in_seconds",
581
+ "retention_time_in_seconds_start",
582
+ "retention_time_in_seconds_end",
583
+ ]
584
+ smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays+1)]
585
+ smf_lines.append('\t'.join(smf_header))
586
+
587
+ # SMF table uses the same consensus features as SML, just different metadata
588
+ for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
589
+ smf_row = [
590
+ "SMF",
591
+ str(idx),
592
+ "null",
593
+ "null",
594
+ adduct_list[idx-1], # adduct_ion
595
+ str(row.get('isotopomer', 'null')),
596
+ str(row.get('mz', 'null')), # exp_mass_to_charge
597
+ str(row.get('charge', 'null')),
598
+ str(row.get('rt', 'null')), # retention_time_in_seconds
599
+ str(row.get('retention_time_in_seconds_start', 'null')),
600
+ str(row.get('retention_time_in_seconds_end', 'null')),
601
+ ]
602
+ # Add abundance values for each assay - same as SML
603
+ consensus_uid = row['consensus_uid']
604
+ if consensus_uid in abundance_matrix.index:
605
+ abundance_values = abundance_matrix.loc[consensus_uid].tolist()
606
+ smf_row += [str(val) if pd.notna(val) else 'null' for val in abundance_values]
607
+ else:
608
+ smf_row += ['null'] * n_assays
609
+ smf_lines.append('\t'.join(smf_row))
610
+ with open(filename, 'a', encoding='utf-8') as f:
611
+ f.write('\n')
612
+ for line in smf_lines:
613
+ f.write(line + '\n')
614
+
615
+ # --- MGF table ---
616
+ if include_mgf and mgf_data is not None and len(mgf_data) > 0:
617
+ mgf_lines = []
618
+ # Header
619
+ mgf_header = [
620
+ "COM",
621
+ "MGH",
622
+ "mgf_id",
623
+ "prec_id",
624
+ "prec_rt",
625
+ "prec_mz",
626
+ "prec_int",
627
+ "energy",
628
+ "level",
629
+ "title",
630
+ "spec_tic",
631
+ "spec_len",
632
+ "spec_mz",
633
+ "spec_int"
634
+ ]
635
+ mgf_lines.append('\t'.join(mgf_header))
636
+
637
+ # Data rows
638
+ for row in mgf_data.iter_rows(named=True):
639
+ # Calculate spectrum TIC (total ion current) from the spectrum data
640
+ spectrum_mz = row["spec_mz"]
641
+ spectrum_inty = row["spec_int"]
642
+ spec_tic = sum(spectrum_inty) if spectrum_inty else 0
643
+ spec_len = row["spec_len"] if row["spec_len"] is not None else 0
644
+
645
+ # Format spectrum data as pipe-separated strings
646
+ spec_mz_str = '|'.join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
647
+ spec_int_str = '|'.join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
648
+
649
+ mgf_row = [
650
+ "COM",
651
+ "MGF",
652
+ str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
653
+ str(row["feature_id"]) if row["feature_id"] is not None else "null",
654
+ f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
655
+ f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
656
+ "null", # prec_int - not available in current data
657
+ str(row["energy"]) if row["energy"] is not None else "null",
658
+ str(row["mslevel"]) if row["mslevel"] is not None else "null",
659
+ str(row["title"]) if row["title"] is not None else "null",
660
+ f"{int(spec_tic)}" if spec_tic > 0 else "null",
661
+ str(spec_len) if spec_len > 0 else "null",
662
+ spec_mz_str if spec_mz_str else "null",
663
+ spec_int_str if spec_int_str else "null"
664
+ ]
665
+ mgf_lines.append('\t'.join(mgf_row))
666
+
667
+ # Write MGF table
668
+ with open(filename, 'a', encoding='utf-8') as f:
669
+ f.write('\n')
670
+ for line in mgf_lines:
671
+ f.write(line + '\n')
672
+
673
+ if include_mgf:
674
+ self.logger.info(f"Exported mzTab-M to {filename}")