masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/study/export.py
CHANGED
|
@@ -78,26 +78,31 @@ def _get_mgf_df(self, **kwargs):
|
|
|
78
78
|
if self.consensus_df is None:
|
|
79
79
|
self.logger.error("No consensus map found. Please run merge() first.")
|
|
80
80
|
return None
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
81
|
+
|
|
82
|
+
# MS2 data is optional - we can generate MS1 data without it
|
|
83
|
+
ms2_available = self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()
|
|
84
|
+
if not ms2_available:
|
|
85
|
+
self.logger.info("No consensus MS2 data found. Generating MS1-only MGF data.")
|
|
86
|
+
|
|
87
|
+
# Convert to pandas for merge operation only if we have MS2 data
|
|
88
|
+
if ms2_available:
|
|
89
|
+
consensus_df_pd = self.consensus_df.to_pandas()
|
|
90
|
+
consensus_ms2_pd = self.consensus_ms2.to_pandas()
|
|
91
|
+
|
|
92
|
+
features = pd.merge(
|
|
93
|
+
consensus_df_pd,
|
|
94
|
+
consensus_ms2_pd,
|
|
95
|
+
how="right",
|
|
96
|
+
on="consensus_uid",
|
|
97
|
+
)
|
|
98
|
+
if len(features) == 0:
|
|
99
|
+
self.logger.warning("No MS2 features found.")
|
|
100
|
+
grouped = {} # Empty groupby result
|
|
101
|
+
else:
|
|
102
|
+
# Pre-group by consensus_uid for fast access
|
|
103
|
+
grouped = features.groupby("consensus_uid")
|
|
104
|
+
else:
|
|
105
|
+
grouped = {} # No MS2 data available
|
|
101
106
|
|
|
102
107
|
def filter_peaks(spec, inty_min=None):
|
|
103
108
|
spec = spec.copy()
|
|
@@ -115,6 +120,12 @@ def _get_mgf_df(self, **kwargs):
|
|
|
115
120
|
setattr(spec, attr, np.array(arr)[mask])
|
|
116
121
|
return spec
|
|
117
122
|
|
|
123
|
+
def safe_charge(charge_value):
    """Convert a charge value to an integer, falling back to 1 when unusable.

    Args:
        charge_value: Charge taken from a consensus or MS2 row. May be
            ``None``, NaN (builtin or NumPy float of any width), another
            non-numeric placeholder, or a real number.

    Returns:
        int: ``charge_value`` rounded to the nearest integer, or ``1`` when
        the value is missing, non-numeric, NaN or infinite.
    """
    try:
        # float() normalises NumPy scalars (np.float32, np.int64, ...) so the
        # check below is not limited to instances of the builtin float type.
        charge = float(charge_value)
    except (TypeError, ValueError):
        # None and any other value float() rejects use the default charge.
        return 1
    if not np.isfinite(charge):
        # NaN cannot be rounded to an int; treat it (and +/-inf) as missing.
        return 1
    return int(round(charge))
|
|
128
|
+
|
|
118
129
|
def create_ion_dict(title, id, uid, mz, rt, charge, spect, mgf_id):
|
|
119
130
|
"""Create a dictionary representing an ion for the DataFrame."""
|
|
120
131
|
if spect is None:
|
|
@@ -160,65 +171,115 @@ def _get_mgf_df(self, **kwargs):
|
|
|
160
171
|
ion_data = []
|
|
161
172
|
skip = 0
|
|
162
173
|
mgf_counter = 0
|
|
163
|
-
self.logger.
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
if mz_start is not None and
|
|
175
|
-
continue
|
|
176
|
-
if mz_end is not None and row["mz"] > mz_end:
|
|
174
|
+
self.logger.debug(f"Generating MGF data for {len(self.consensus_df)} consensus features...")
|
|
175
|
+
|
|
176
|
+
# First, generate MS1 spectra for all consensus features using isotope data
|
|
177
|
+
self.logger.debug("Generating MS1 spectra from isotope data...")
|
|
178
|
+
for row in self.consensus_df.iter_rows(named=True):
|
|
179
|
+
# Apply filtering at individual feature level for MS1 data
|
|
180
|
+
consensus_uid = row["consensus_uid"]
|
|
181
|
+
consensus_mz = row["mz"]
|
|
182
|
+
consensus_rt = row["rt"]
|
|
183
|
+
consensus_inty_mean = row.get("inty_mean", 0)
|
|
184
|
+
|
|
185
|
+
if mz_start is not None and consensus_mz < mz_start:
|
|
177
186
|
continue
|
|
178
|
-
if
|
|
187
|
+
if mz_end is not None and consensus_mz > mz_end:
|
|
179
188
|
continue
|
|
180
|
-
if
|
|
189
|
+
if rt_start is not None and consensus_rt < rt_start:
|
|
181
190
|
continue
|
|
182
|
-
if
|
|
183
|
-
skip += 1
|
|
191
|
+
if rt_end is not None and consensus_rt > rt_end:
|
|
184
192
|
continue
|
|
193
|
+
|
|
194
|
+
# Create MS1 spectrum using isotope data
|
|
195
|
+
iso_data = row.get("iso", None)
|
|
196
|
+
|
|
197
|
+
if iso_data is not None and len(iso_data) > 0:
|
|
198
|
+
# Use isotope data for spectrum
|
|
199
|
+
spectrum_mz = [float(peak[0]) for peak in iso_data]
|
|
200
|
+
spectrum_inty = [float(peak[1]) for peak in iso_data]
|
|
201
|
+
else:
|
|
202
|
+
# Use consensus mz and inty_mean as single peak
|
|
203
|
+
spectrum_mz = [float(consensus_mz)]
|
|
204
|
+
spectrum_inty = [float(consensus_inty_mean)]
|
|
205
|
+
|
|
206
|
+
# Apply intensity minimum filter if specified
|
|
207
|
+
if inty_min is not None and inty_min > 0:
|
|
208
|
+
filtered_pairs = [(mz, inty) for mz, inty in zip(spectrum_mz, spectrum_inty, strict=False) if inty >= inty_min]
|
|
209
|
+
if filtered_pairs:
|
|
210
|
+
spectrum_mz, spectrum_inty = zip(*filtered_pairs, strict=False)
|
|
211
|
+
spectrum_mz = list(spectrum_mz)
|
|
212
|
+
spectrum_inty = list(spectrum_inty)
|
|
213
|
+
else:
|
|
214
|
+
# If all peaks are below threshold, skip this feature
|
|
215
|
+
continue
|
|
216
|
+
|
|
217
|
+
mgf_counter += 1
|
|
218
|
+
|
|
219
|
+
# Create MS1 spectrum object to use with create_ion_dict
|
|
220
|
+
class SimpleSpectrum:
|
|
221
|
+
def __init__(self, mz_list, inty_list):
|
|
222
|
+
self.mz = np.array(mz_list)
|
|
223
|
+
self.inty = np.array(inty_list)
|
|
224
|
+
self.ms_level = 1
|
|
225
|
+
self.energy = None
|
|
226
|
+
|
|
227
|
+
ms1_spectrum = SimpleSpectrum(spectrum_mz, spectrum_inty)
|
|
228
|
+
|
|
229
|
+
# Use create_ion_dict to ensure consistent schema
|
|
230
|
+
ion_dict = create_ion_dict(
|
|
231
|
+
f"uid:{consensus_uid}, rt:{consensus_rt:.2f}, mz:{consensus_mz:.4f}, MS1",
|
|
232
|
+
row["consensus_id"],
|
|
233
|
+
consensus_uid,
|
|
234
|
+
consensus_mz,
|
|
235
|
+
consensus_rt,
|
|
236
|
+
safe_charge(row.get("charge_mean")),
|
|
237
|
+
ms1_spectrum,
|
|
238
|
+
mgf_counter,
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
if ion_dict is not None:
|
|
242
|
+
ion_data.append(ion_dict)
|
|
243
|
+
|
|
244
|
+
self.logger.debug(f"Generated {len(ion_data)} MS1 spectra from isotope data")
|
|
245
|
+
|
|
246
|
+
# Now generate MS2 spectra if available
|
|
247
|
+
if ms2_available and len(grouped) > 0:
|
|
248
|
+
self.logger.debug(f"Processing MS2 data for {len(grouped)} consensus features with MS2...")
|
|
249
|
+
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
250
|
+
for _consensus_uid, cons_ms2 in tqdm(
|
|
251
|
+
grouped,
|
|
252
|
+
total=len(grouped),
|
|
253
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
|
|
254
|
+
disable=tdqm_disable,
|
|
255
|
+
):
|
|
256
|
+
# Use the first row for feature-level info
|
|
257
|
+
row = cons_ms2.iloc[0]
|
|
258
|
+
if mz_start is not None and row["mz"] < mz_start:
|
|
259
|
+
continue
|
|
260
|
+
if mz_end is not None and row["mz"] > mz_end:
|
|
261
|
+
continue
|
|
262
|
+
if rt_start is not None and row["rt"] < rt_start:
|
|
263
|
+
continue
|
|
264
|
+
if rt_end is not None and row["rt"] > rt_end:
|
|
265
|
+
continue
|
|
266
|
+
if len(cons_ms2) == 0:
|
|
267
|
+
skip += 1
|
|
268
|
+
continue
|
|
185
269
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
if spect is None:
|
|
198
|
-
skip += 1
|
|
199
|
-
continue
|
|
200
|
-
if centroid:
|
|
201
|
-
spect = spect.centroid()
|
|
202
|
-
if deisotope:
|
|
203
|
-
spect = spect.deisotope()
|
|
204
|
-
spect = filter_peaks(spect, inty_min=inty_min)
|
|
205
|
-
mgf_counter += 1
|
|
206
|
-
ion_dict = create_ion_dict(
|
|
207
|
-
f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
|
|
208
|
-
cons_ms2_e_row["consensus_id"],
|
|
209
|
-
cons_ms2_e_row["consensus_uid"],
|
|
210
|
-
cons_ms2_e_row["mz"],
|
|
211
|
-
cons_ms2_e_row["rt"],
|
|
212
|
-
round(cons_ms2_e_row["charge_mean"]),
|
|
213
|
-
spect,
|
|
214
|
-
mgf_counter,
|
|
215
|
-
)
|
|
216
|
-
if ion_dict is not None:
|
|
217
|
-
ion_data.append(ion_dict)
|
|
218
|
-
else:
|
|
219
|
-
for row_e in cons_ms2_e.iter_rows(named=True):
|
|
220
|
-
spect = row_e["spec"]
|
|
270
|
+
if split_energy:
|
|
271
|
+
energies = cons_ms2["energy"].unique()
|
|
272
|
+
for e in energies:
|
|
273
|
+
cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
|
|
274
|
+
if selection == "best":
|
|
275
|
+
# Check if the filtered DataFrame is empty
|
|
276
|
+
if len(cons_ms2_e) == 0:
|
|
277
|
+
continue
|
|
278
|
+
idx = cons_ms2_e["prec_inty"].idxmax()
|
|
279
|
+
cons_ms2_e_row = cons_ms2_e.loc[idx]
|
|
280
|
+
spect = cons_ms2_e_row["spec"]
|
|
221
281
|
if spect is None:
|
|
282
|
+
skip += 1
|
|
222
283
|
continue
|
|
223
284
|
if centroid:
|
|
224
285
|
spect = spect.centroid()
|
|
@@ -227,104 +288,129 @@ def _get_mgf_df(self, **kwargs):
|
|
|
227
288
|
spect = filter_peaks(spect, inty_min=inty_min)
|
|
228
289
|
mgf_counter += 1
|
|
229
290
|
ion_dict = create_ion_dict(
|
|
230
|
-
f"uid:{
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
291
|
+
f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
|
|
292
|
+
cons_ms2_e_row["consensus_id"],
|
|
293
|
+
cons_ms2_e_row["consensus_uid"],
|
|
294
|
+
cons_ms2_e_row["mz"],
|
|
295
|
+
cons_ms2_e_row["rt"],
|
|
296
|
+
safe_charge(cons_ms2_e_row["charge_mean"]),
|
|
236
297
|
spect,
|
|
237
298
|
mgf_counter,
|
|
238
299
|
)
|
|
239
300
|
if ion_dict is not None:
|
|
240
301
|
ion_data.append(ion_dict)
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
row_e["spec"]
|
|
271
|
-
for row_e in cons_ms2.iter_rows(named=True)
|
|
272
|
-
if row_e["spec"] is not None
|
|
273
|
-
]
|
|
274
|
-
if not specs:
|
|
302
|
+
else:
|
|
303
|
+
for row_e in cons_ms2_e.iter_rows(named=True):
|
|
304
|
+
spect = row_e["spec"]
|
|
305
|
+
if spect is None:
|
|
306
|
+
continue
|
|
307
|
+
if centroid:
|
|
308
|
+
spect = spect.centroid()
|
|
309
|
+
if deisotope:
|
|
310
|
+
spect = spect.deisotope()
|
|
311
|
+
spect = filter_peaks(spect, inty_min=inty_min)
|
|
312
|
+
mgf_counter += 1
|
|
313
|
+
ion_dict = create_ion_dict(
|
|
314
|
+
f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
|
|
315
|
+
row_e["consensus_id"],
|
|
316
|
+
row_e["consensus_uid"],
|
|
317
|
+
row_e["mz"],
|
|
318
|
+
row_e["rt"],
|
|
319
|
+
safe_charge(row_e["charge_mean"]),
|
|
320
|
+
spect,
|
|
321
|
+
mgf_counter,
|
|
322
|
+
)
|
|
323
|
+
if ion_dict is not None:
|
|
324
|
+
ion_data.append(ion_dict)
|
|
325
|
+
else:
|
|
326
|
+
if selection == "best":
|
|
327
|
+
idx = cons_ms2["prec_inty"].idxmax()
|
|
328
|
+
cons_ms2_e_row = cons_ms2.loc[idx]
|
|
329
|
+
spect = cons_ms2_e_row["spec"]
|
|
330
|
+
if spect is None:
|
|
275
331
|
continue
|
|
276
|
-
spect = combine_peaks(specs)
|
|
277
332
|
if centroid:
|
|
278
|
-
spect = spect.denoise()
|
|
279
333
|
spect = spect.centroid()
|
|
280
334
|
if deisotope:
|
|
281
335
|
spect = spect.deisotope()
|
|
282
336
|
spect = filter_peaks(spect, inty_min=inty_min)
|
|
283
337
|
mgf_counter += 1
|
|
284
338
|
ion_dict = create_ion_dict(
|
|
285
|
-
f"uid:{
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
339
|
+
f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
|
|
340
|
+
cons_ms2_e_row["consensus_id"],
|
|
341
|
+
cons_ms2_e_row["consensus_uid"],
|
|
342
|
+
cons_ms2_e_row["mz"],
|
|
343
|
+
cons_ms2_e_row["rt"],
|
|
344
|
+
safe_charge(cons_ms2_e_row["charge_mean"]),
|
|
291
345
|
spect,
|
|
292
346
|
mgf_counter,
|
|
293
347
|
)
|
|
294
348
|
if ion_dict is not None:
|
|
295
349
|
ion_data.append(ion_dict)
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
350
|
+
|
|
351
|
+
elif selection == "all":
|
|
352
|
+
if merge:
|
|
353
|
+
specs = [
|
|
354
|
+
row_e["spec"]
|
|
355
|
+
for row_e in cons_ms2.iter_rows(named=True)
|
|
356
|
+
if row_e["spec"] is not None
|
|
357
|
+
]
|
|
358
|
+
if not specs:
|
|
300
359
|
continue
|
|
360
|
+
spect = combine_peaks(specs)
|
|
301
361
|
if centroid:
|
|
362
|
+
spect = spect.denoise()
|
|
302
363
|
spect = spect.centroid()
|
|
303
364
|
if deisotope:
|
|
304
365
|
spect = spect.deisotope()
|
|
305
366
|
spect = filter_peaks(spect, inty_min=inty_min)
|
|
306
367
|
mgf_counter += 1
|
|
307
368
|
ion_dict = create_ion_dict(
|
|
308
|
-
f"uid:{
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
369
|
+
f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
|
|
370
|
+
row["consensus_id"],
|
|
371
|
+
row["consensus_uid"],
|
|
372
|
+
row["mz"],
|
|
373
|
+
row["rt"],
|
|
374
|
+
safe_charge(row["charge_mean"]),
|
|
314
375
|
spect,
|
|
315
376
|
mgf_counter,
|
|
316
377
|
)
|
|
317
378
|
if ion_dict is not None:
|
|
318
379
|
ion_data.append(ion_dict)
|
|
380
|
+
else:
|
|
381
|
+
for row_e in cons_ms2.iter_rows(named=True):
|
|
382
|
+
spect = row_e["spec"]
|
|
383
|
+
if spect is None:
|
|
384
|
+
continue
|
|
385
|
+
if centroid:
|
|
386
|
+
spect = spect.centroid()
|
|
387
|
+
if deisotope:
|
|
388
|
+
spect = spect.deisotope()
|
|
389
|
+
spect = filter_peaks(spect, inty_min=inty_min)
|
|
390
|
+
mgf_counter += 1
|
|
391
|
+
ion_dict = create_ion_dict(
|
|
392
|
+
f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
|
|
393
|
+
row_e["consensus_id"],
|
|
394
|
+
row_e["consensus_uid"],
|
|
395
|
+
row_e["mz"],
|
|
396
|
+
row_e["rt"],
|
|
397
|
+
safe_charge(row_e["charge_mean"]),
|
|
398
|
+
spect,
|
|
399
|
+
mgf_counter,
|
|
400
|
+
)
|
|
401
|
+
if ion_dict is not None:
|
|
402
|
+
ion_data.append(ion_dict)
|
|
403
|
+
else:
|
|
404
|
+
self.logger.info("Skipping MS2 data generation - no MS2 data available")
|
|
319
405
|
|
|
320
|
-
self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra")
|
|
321
|
-
self.logger.debug(f"Skipped {skip} features due to missing data.")
|
|
406
|
+
self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra (MS1 + MS2)")
|
|
407
|
+
self.logger.debug(f"Skipped {skip} MS2 features due to missing data.")
|
|
322
408
|
|
|
323
409
|
# Convert to Polars DataFrame
|
|
324
410
|
if not ion_data:
|
|
325
411
|
return pl.DataFrame()
|
|
326
412
|
|
|
327
|
-
return pl.DataFrame(ion_data)
|
|
413
|
+
return pl.DataFrame(ion_data, infer_schema_length=None)
|
|
328
414
|
|
|
329
415
|
|
|
330
416
|
def export_mgf(self, **kwargs):
|
|
@@ -412,7 +498,7 @@ def export_mgf(self, **kwargs):
|
|
|
412
498
|
self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
|
|
413
499
|
|
|
414
500
|
|
|
415
|
-
def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None:
|
|
501
|
+
def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs) -> None:
|
|
416
502
|
"""
|
|
417
503
|
Export the study as a fully compliant mzTab-M file.
|
|
418
504
|
|
|
@@ -1098,7 +1184,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
1098
1184
|
self.logger.info(f"Exported mzTab-M to {filename}")
|
|
1099
1185
|
|
|
1100
1186
|
|
|
1101
|
-
def export_xlsx(self, filename: str = None) -> None:
|
|
1187
|
+
def export_xlsx(self, filename: str | None = None) -> None:
|
|
1102
1188
|
"""
|
|
1103
1189
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
1104
1190
|
|
|
@@ -1209,38 +1295,38 @@ def export_xlsx(self, filename: str = None) -> None:
|
|
|
1209
1295
|
self.logger.error(f"Error writing Excel file: {e}")
|
|
1210
1296
|
|
|
1211
1297
|
|
|
1212
|
-
def export_parquet(self,
|
|
1298
|
+
def export_parquet(self, filename: str | None = None) -> None:
|
|
1213
1299
|
"""
|
|
1214
1300
|
Export the study data to multiple Parquet files with different suffixes.
|
|
1215
1301
|
|
|
1216
1302
|
The export creates separate Parquet files for each dataset:
|
|
1217
|
-
- <
|
|
1218
|
-
- <
|
|
1219
|
-
- <
|
|
1220
|
-
- <
|
|
1303
|
+
- <filename>_samples.parquet: Samples dataframe
|
|
1304
|
+
- <filename>_consensus.parquet: Consensus features dataframe
|
|
1305
|
+
- <filename>_identification.parquet: Identification results with library annotations
|
|
1306
|
+
- <filename>_matrix.parquet: Consensus matrix with samples as columns
|
|
1221
1307
|
|
|
1222
1308
|
Args:
|
|
1223
|
-
|
|
1309
|
+
filename (str, optional): Base name for the output files. Defaults to "study"
|
|
1224
1310
|
in the study folder.
|
|
1225
1311
|
"""
|
|
1226
|
-
# Set default
|
|
1227
|
-
if
|
|
1228
|
-
|
|
1312
|
+
# Set default filename
|
|
1313
|
+
if filename is None:
|
|
1314
|
+
filename = "study"
|
|
1229
1315
|
|
|
1230
|
-
# Make
|
|
1231
|
-
if not os.path.isabs(
|
|
1316
|
+
# Make filename absolute path if not already (without extension)
|
|
1317
|
+
if not os.path.isabs(filename):
|
|
1232
1318
|
if self.folder is not None:
|
|
1233
|
-
|
|
1319
|
+
filename = os.path.join(self.folder, filename)
|
|
1234
1320
|
else:
|
|
1235
|
-
|
|
1321
|
+
filename = os.path.join(os.getcwd(), filename)
|
|
1236
1322
|
|
|
1237
|
-
self.logger.debug(f"Exporting study to Parquet files with
|
|
1323
|
+
self.logger.debug(f"Exporting study to Parquet files with filename: {filename}")
|
|
1238
1324
|
|
|
1239
1325
|
exported_files = []
|
|
1240
1326
|
|
|
1241
1327
|
# 1. Samples dataframe
|
|
1242
1328
|
if self.samples_df is not None and not self.samples_df.is_empty():
|
|
1243
|
-
samples_file = f"{
|
|
1329
|
+
samples_file = f"{filename}_samples.parquet"
|
|
1244
1330
|
try:
|
|
1245
1331
|
self.samples_df.write_parquet(samples_file)
|
|
1246
1332
|
exported_files.append(samples_file)
|
|
@@ -1256,7 +1342,7 @@ def export_parquet(self, basename: str = None) -> None:
|
|
|
1256
1342
|
|
|
1257
1343
|
# 2. Consensus dataframe
|
|
1258
1344
|
if self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
1259
|
-
consensus_file = f"{
|
|
1345
|
+
consensus_file = f"{filename}_consensus.parquet"
|
|
1260
1346
|
try:
|
|
1261
1347
|
self.consensus_df.write_parquet(consensus_file)
|
|
1262
1348
|
exported_files.append(consensus_file)
|
|
@@ -1276,7 +1362,7 @@ def export_parquet(self, basename: str = None) -> None:
|
|
|
1276
1362
|
|
|
1277
1363
|
id_df = get_id(self)
|
|
1278
1364
|
if id_df is not None and not id_df.is_empty():
|
|
1279
|
-
identification_file = f"{
|
|
1365
|
+
identification_file = f"{filename}_identification.parquet"
|
|
1280
1366
|
try:
|
|
1281
1367
|
id_df.write_parquet(identification_file)
|
|
1282
1368
|
exported_files.append(identification_file)
|
|
@@ -1298,7 +1384,7 @@ def export_parquet(self, basename: str = None) -> None:
|
|
|
1298
1384
|
try:
|
|
1299
1385
|
matrix_df = self.get_consensus_matrix()
|
|
1300
1386
|
if matrix_df is not None and not matrix_df.is_empty():
|
|
1301
|
-
matrix_file = f"{
|
|
1387
|
+
matrix_file = f"{filename}_matrix.parquet"
|
|
1302
1388
|
try:
|
|
1303
1389
|
matrix_df.write_parquet(matrix_file)
|
|
1304
1390
|
exported_files.append(matrix_file)
|
masster/study/h5.py
CHANGED
|
@@ -974,7 +974,7 @@ def _load_dataframe_from_group(
|
|
|
974
974
|
|
|
975
975
|
# Second pass: handle missing columns
|
|
976
976
|
for col in missing_columns:
|
|
977
|
-
logger.
|
|
977
|
+
logger.info(f"Column '{col}' not found in {df_name}.")
|
|
978
978
|
# For missing columns, create appropriately sized array with appropriate defaults
|
|
979
979
|
if col in object_columns:
|
|
980
980
|
data[col] = [None] * expected_length
|
|
@@ -2008,3 +2008,67 @@ def _load_study5(self, filename=None):
|
|
|
2008
2008
|
)
|
|
2009
2009
|
|
|
2010
2010
|
self.logger.debug("Study loaded")
|
|
2011
|
+
|
|
2012
|
+
|
|
2013
|
+
def _load_ms1(self, sample_path: str) -> pl.DataFrame:
    """
    Read only the MS1 table of a sample5 HDF5 file.

    The file is opened read-only, every member of its ``ms1`` group is copied
    into a Polars DataFrame, and the known columns are cast to fixed dtypes.
    No other group of the file is accessed.

    Args:
        sample_path (str): Path to the sample5 HDF5 file

    Returns:
        pl.DataFrame: One column per member of the ``ms1`` group. ``cycle`` and
            ``scan_uid`` are cast to Int64, and ``rt``, ``mz`` and ``inty`` to
            Float64, when those columns are present. An empty DataFrame is
            returned when the ``ms1`` group is missing or has no members, or
            when any exception is raised while reading.

    Note:
        Written for isotope detection (find_iso()), which needs the MS1 peaks
        without loading the full sample.
    """
    try:
        # Target dtypes for the columns this loader knows about.
        ms1_dtypes = {
            "cycle": pl.Int64,
            "scan_uid": pl.Int64,
            "rt": pl.Float64,
            "mz": pl.Float64,
            "inty": pl.Float64,
        }

        with h5py.File(sample_path, "r") as h5_file:
            if "ms1" not in h5_file:
                self.logger.debug(f"No MS1 data found in {sample_path}")
                return pl.DataFrame()

            ms1_group = h5_file["ms1"]

            # Materialise every member of the group in memory.
            columns = {name: ms1_group[name][:] for name in ms1_group.keys()}
            if not columns:
                self.logger.debug(f"Empty MS1 data in {sample_path}")
                return pl.DataFrame()

            ms1_df = pl.DataFrame(columns)

            # Cast only the known columns that are actually present.
            casts = [
                pl.col(name).cast(dtype)
                for name, dtype in ms1_dtypes.items()
                if name in ms1_df.columns
            ]
            if casts:
                ms1_df = ms1_df.with_columns(casts)

            self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
            return ms1_df

    except Exception as e:
        # Best-effort loader: any failure is logged and reported as "no data".
        self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
        return pl.DataFrame()
|
masster/study/helpers.py
CHANGED
|
@@ -509,8 +509,9 @@ def get_consensus(self, quant="chrom_area"):
|
|
|
509
509
|
# Convert Polars DataFrame to pandas for this operation since the result is used for export
|
|
510
510
|
df1 = self.consensus_df.to_pandas().copy()
|
|
511
511
|
|
|
512
|
-
#
|
|
513
|
-
|
|
512
|
+
# Keep consensus_id as string (UUID format)
|
|
513
|
+
# Note: consensus_id is now a 16-character UUID string, not an integer
|
|
514
|
+
df1["consensus_id"] = df1["consensus_id"].astype("string")
|
|
514
515
|
# set consensus_uid as index
|
|
515
516
|
df1.set_index("consensus_uid", inplace=True)
|
|
516
517
|
# sort by consensus_id
|
|
@@ -640,7 +641,6 @@ def get_gaps_stats(self, uids=None):
|
|
|
640
641
|
return gaps_stats
|
|
641
642
|
|
|
642
643
|
|
|
643
|
-
# TODO is uid not supposed to be a list anymore?
|
|
644
644
|
def get_consensus_matches(self, uids=None, filled=True):
|
|
645
645
|
"""
|
|
646
646
|
Get feature matches for consensus UIDs with optimized join operation.
|