masster-0.4.20-py3-none-any.whl → masster-0.4.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/export.py CHANGED
@@ -78,26 +78,31 @@ def _get_mgf_df(self, **kwargs):
     if self.consensus_df is None:
         self.logger.error("No consensus map found. Please run merge() first.")
         return None
-    if self.consensus_ms2 is None:
-        self.logger.error("No consensus MS2 data found. Please run link_ms2() first.")
-        return None
-
-    # Convert to pandas for merge operation since the result is used for groupby
-    consensus_df_pd = self.consensus_df.to_pandas()
-    consensus_ms2_pd = self.consensus_ms2.to_pandas()
-
-    features = pd.merge(
-        consensus_df_pd,
-        consensus_ms2_pd,
-        how="right",
-        on="consensus_uid",
-    )
-    if len(features) == 0:
-        self.logger.warning("No features found.")
-        return pl.DataFrame()
-
-    # Pre-group by consensus_uid for fast access
-    grouped = features.groupby("consensus_uid")
+
+    # MS2 data is optional - we can generate MS1 data without it
+    ms2_available = self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()
+    if not ms2_available:
+        self.logger.info("No consensus MS2 data found. Generating MS1-only MGF data.")
+
+    # Convert to pandas for merge operation only if we have MS2 data
+    if ms2_available:
+        consensus_df_pd = self.consensus_df.to_pandas()
+        consensus_ms2_pd = self.consensus_ms2.to_pandas()
+
+        features = pd.merge(
+            consensus_df_pd,
+            consensus_ms2_pd,
+            how="right",
+            on="consensus_uid",
+        )
+        if len(features) == 0:
+            self.logger.warning("No MS2 features found.")
+            grouped = {}  # Empty groupby result
+        else:
+            # Pre-group by consensus_uid for fast access
+            grouped = features.groupby("consensus_uid")
+    else:
+        grouped = {}  # No MS2 data available
 
     def filter_peaks(spec, inty_min=None):
         spec = spec.copy()
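This hunk makes MS2 linking optional instead of a hard requirement: `_get_mgf_df()` previously bailed out with an error when `consensus_ms2` was missing, and now falls back to MS1-only output. A minimal sketch of the guard pattern, assuming only that `consensus_ms2` is a Polars DataFrame or `None` (the toy frames below are illustrative, not masster data):

```python
import polars as pl

def ms2_is_available(consensus_ms2: pl.DataFrame | None) -> bool:
    # Same test as the diff: both a missing and an empty frame disable MS2 export.
    return consensus_ms2 is not None and not consensus_ms2.is_empty()

print(ms2_is_available(None))                                  # False
print(ms2_is_available(pl.DataFrame()))                        # False
print(ms2_is_available(pl.DataFrame({"consensus_uid": [1]})))  # True
```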
@@ -115,6 +120,12 @@ def _get_mgf_df(self, **kwargs):
                 setattr(spec, attr, np.array(arr)[mask])
         return spec
 
+    def safe_charge(charge_value):
+        """Safely convert charge value to integer, handling NaN and None"""
+        if charge_value is None or (isinstance(charge_value, float) and np.isnan(charge_value)):
+            return 1
+        return int(round(charge_value))
+
     def create_ion_dict(title, id, uid, mz, rt, charge, spect, mgf_id):
         """Create a dictionary representing an ion for the DataFrame."""
         if spect is None:
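The new `safe_charge` helper guards the `round(...)` calls that previously ran directly on `charge_mean`: `round(float("nan"))` raises `ValueError`, so features with no assigned charge crashed the export. A quick standalone check of the helper's behavior (copied here for illustration):

```python
import numpy as np

def safe_charge(charge_value):
    """Safely convert charge value to integer, handling NaN and None."""
    if charge_value is None or (isinstance(charge_value, float) and np.isnan(charge_value)):
        return 1
    return int(round(charge_value))

assert safe_charge(None) == 1          # missing charge defaults to 1
assert safe_charge(float("nan")) == 1  # NaN (e.g. an all-null charge_mean) defaults to 1
assert safe_charge(1.6) == 2           # ordinary floats round to the nearest int
```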
@@ -160,65 +171,115 @@ def _get_mgf_df(self, **kwargs):
     ion_data = []
     skip = 0
     mgf_counter = 0
-    self.logger.info(f"Generating MGF data for {len(grouped)} consensus features...")
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for _consensus_uid, cons_ms2 in tqdm(
-        grouped,
-        total=len(grouped),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
-        disable=tdqm_disable,
-    ):
-        # Use the first row for feature-level info
-        row = cons_ms2.iloc[0]
-        if mz_start is not None and row["mz"] < mz_start:
-            continue
-        if mz_end is not None and row["mz"] > mz_end:
+    self.logger.debug(f"Generating MGF data for {len(self.consensus_df)} consensus features...")
+
+    # First, generate MS1 spectra for all consensus features using isotope data
+    self.logger.debug("Generating MS1 spectra from isotope data...")
+    for row in self.consensus_df.iter_rows(named=True):
+        # Apply filtering at individual feature level for MS1 data
+        consensus_uid = row["consensus_uid"]
+        consensus_mz = row["mz"]
+        consensus_rt = row["rt"]
+        consensus_inty_mean = row.get("inty_mean", 0)
+
+        if mz_start is not None and consensus_mz < mz_start:
             continue
-        if rt_start is not None and row["rt"] < rt_start:
+        if mz_end is not None and consensus_mz > mz_end:
             continue
-        if rt_end is not None and row["rt"] > rt_end:
+        if rt_start is not None and consensus_rt < rt_start:
             continue
-        if len(cons_ms2) == 0:
-            skip += 1
+        if rt_end is not None and consensus_rt > rt_end:
             continue
+
+        # Create MS1 spectrum using isotope data
+        iso_data = row.get("iso", None)
+
+        if iso_data is not None and len(iso_data) > 0:
+            # Use isotope data for spectrum
+            spectrum_mz = [float(peak[0]) for peak in iso_data]
+            spectrum_inty = [float(peak[1]) for peak in iso_data]
+        else:
+            # Use consensus mz and inty_mean as single peak
+            spectrum_mz = [float(consensus_mz)]
+            spectrum_inty = [float(consensus_inty_mean)]
+
+        # Apply intensity minimum filter if specified
+        if inty_min is not None and inty_min > 0:
+            filtered_pairs = [(mz, inty) for mz, inty in zip(spectrum_mz, spectrum_inty, strict=False) if inty >= inty_min]
+            if filtered_pairs:
+                spectrum_mz, spectrum_inty = zip(*filtered_pairs, strict=False)
+                spectrum_mz = list(spectrum_mz)
+                spectrum_inty = list(spectrum_inty)
+            else:
+                # If all peaks are below threshold, skip this feature
+                continue
+
+        mgf_counter += 1
+
+        # Create MS1 spectrum object to use with create_ion_dict
+        class SimpleSpectrum:
+            def __init__(self, mz_list, inty_list):
+                self.mz = np.array(mz_list)
+                self.inty = np.array(inty_list)
+                self.ms_level = 1
+                self.energy = None
+
+        ms1_spectrum = SimpleSpectrum(spectrum_mz, spectrum_inty)
+
+        # Use create_ion_dict to ensure consistent schema
+        ion_dict = create_ion_dict(
+            f"uid:{consensus_uid}, rt:{consensus_rt:.2f}, mz:{consensus_mz:.4f}, MS1",
+            row["consensus_id"],
+            consensus_uid,
+            consensus_mz,
+            consensus_rt,
+            safe_charge(row.get("charge_mean")),
+            ms1_spectrum,
+            mgf_counter,
+        )
+
+        if ion_dict is not None:
+            ion_data.append(ion_dict)
+
+    self.logger.debug(f"Generated {len(ion_data)} MS1 spectra from isotope data")
+
+    # Now generate MS2 spectra if available
+    if ms2_available and len(grouped) > 0:
+        self.logger.debug(f"Processing MS2 data for {len(grouped)} consensus features with MS2...")
+        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+        for _consensus_uid, cons_ms2 in tqdm(
+            grouped,
+            total=len(grouped),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
+            disable=tdqm_disable,
+        ):
+            # Use the first row for feature-level info
+            row = cons_ms2.iloc[0]
+            if mz_start is not None and row["mz"] < mz_start:
+                continue
+            if mz_end is not None and row["mz"] > mz_end:
+                continue
+            if rt_start is not None and row["rt"] < rt_start:
+                continue
+            if rt_end is not None and row["rt"] > rt_end:
+                continue
+            if len(cons_ms2) == 0:
+                skip += 1
+                continue
 
-        if split_energy:
-            energies = cons_ms2["energy"].unique()
-            for e in energies:
-                cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
-                if selection == "best":
-                    # Check if the filtered DataFrame is empty
-                    if len(cons_ms2_e) == 0:
-                        continue
-                    idx = cons_ms2_e["prec_inty"].idxmax()
-                    cons_ms2_e_row = cons_ms2_e.loc[idx]
-                    spect = cons_ms2_e_row["spec"]
-                    if spect is None:
-                        skip += 1
-                        continue
-                    if centroid:
-                        spect = spect.centroid()
-                    if deisotope:
-                        spect = spect.deisotope()
-                    spect = filter_peaks(spect, inty_min=inty_min)
-                    mgf_counter += 1
-                    ion_dict = create_ion_dict(
-                        f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
-                        cons_ms2_e_row["consensus_id"],
-                        cons_ms2_e_row["consensus_uid"],
-                        cons_ms2_e_row["mz"],
-                        cons_ms2_e_row["rt"],
-                        round(cons_ms2_e_row["charge_mean"]),
-                        spect,
-                        mgf_counter,
-                    )
-                    if ion_dict is not None:
-                        ion_data.append(ion_dict)
-                else:
-                    for row_e in cons_ms2_e.iter_rows(named=True):
-                        spect = row_e["spec"]
+            if split_energy:
+                energies = cons_ms2["energy"].unique()
+                for e in energies:
+                    cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
+                    if selection == "best":
+                        # Check if the filtered DataFrame is empty
+                        if len(cons_ms2_e) == 0:
+                            continue
+                        idx = cons_ms2_e["prec_inty"].idxmax()
+                        cons_ms2_e_row = cons_ms2_e.loc[idx]
+                        spect = cons_ms2_e_row["spec"]
                         if spect is None:
+                            skip += 1
                             continue
                         if centroid:
                             spect = spect.centroid()
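The new MS1 branch wraps isotope peaks in a small duck-typed object so `create_ion_dict` can consume them exactly like real spectra. One thing worth noting: `SimpleSpectrum` is declared inside the per-feature loop, so the class object is rebuilt on every iteration; hoisting it out keeps the same interface (a sketch below, offered as a suggestion rather than what this release does):

```python
import numpy as np

class SimpleSpectrum:
    """Duck-typed stand-in exposing the attributes create_ion_dict reads."""
    def __init__(self, mz_list, inty_list):
        self.mz = np.array(mz_list)
        self.inty = np.array(inty_list)
        self.ms_level = 1   # MS1 pseudo-spectrum
        self.energy = None  # no collision energy at MS1

# One instance per consensus feature, built from (m/z, intensity) isotope pairs:
ms1_spectrum = SimpleSpectrum([301.1412, 302.1445], [1.0e6, 2.1e5])
```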
@@ -227,104 +288,129 @@ def _get_mgf_df(self, **kwargs):
                         spect = filter_peaks(spect, inty_min=inty_min)
                         mgf_counter += 1
                         ion_dict = create_ion_dict(
-                            f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
-                            row_e["consensus_id"],
-                            row_e["consensus_uid"],
-                            row_e["mz"],
-                            row_e["rt"],
-                            round(row_e["charge_mean"]),
+                            f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
+                            cons_ms2_e_row["consensus_id"],
+                            cons_ms2_e_row["consensus_uid"],
+                            cons_ms2_e_row["mz"],
+                            cons_ms2_e_row["rt"],
+                            safe_charge(cons_ms2_e_row["charge_mean"]),
                             spect,
                             mgf_counter,
                         )
                         if ion_dict is not None:
                             ion_data.append(ion_dict)
-        else:
-            if selection == "best":
-                idx = cons_ms2["prec_inty"].idxmax()
-                cons_ms2_e_row = cons_ms2.loc[idx]
-                spect = cons_ms2_e_row["spec"]
-                if spect is None:
-                    continue
-                if centroid:
-                    spect = spect.centroid()
-                if deisotope:
-                    spect = spect.deisotope()
-                spect = filter_peaks(spect, inty_min=inty_min)
-                mgf_counter += 1
-                ion_dict = create_ion_dict(
-                    f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
-                    cons_ms2_e_row["consensus_id"],
-                    cons_ms2_e_row["consensus_uid"],
-                    cons_ms2_e_row["mz"],
-                    cons_ms2_e_row["rt"],
-                    round(cons_ms2_e_row["charge_mean"]),
-                    spect,
-                    mgf_counter,
-                )
-                if ion_dict is not None:
-                    ion_data.append(ion_dict)
-
-            elif selection == "all":
-                if merge:
-                    specs = [
-                        row_e["spec"]
-                        for row_e in cons_ms2.iter_rows(named=True)
-                        if row_e["spec"] is not None
-                    ]
-                    if not specs:
+                    else:
+                        for row_e in cons_ms2_e.iter_rows(named=True):
+                            spect = row_e["spec"]
+                            if spect is None:
+                                continue
+                            if centroid:
+                                spect = spect.centroid()
+                            if deisotope:
+                                spect = spect.deisotope()
+                            spect = filter_peaks(spect, inty_min=inty_min)
+                            mgf_counter += 1
+                            ion_dict = create_ion_dict(
+                                f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
+                                row_e["consensus_id"],
+                                row_e["consensus_uid"],
+                                row_e["mz"],
+                                row_e["rt"],
+                                safe_charge(row_e["charge_mean"]),
+                                spect,
+                                mgf_counter,
+                            )
+                            if ion_dict is not None:
+                                ion_data.append(ion_dict)
+            else:
+                if selection == "best":
+                    idx = cons_ms2["prec_inty"].idxmax()
+                    cons_ms2_e_row = cons_ms2.loc[idx]
+                    spect = cons_ms2_e_row["spec"]
+                    if spect is None:
                         continue
-                    spect = combine_peaks(specs)
                     if centroid:
-                        spect = spect.denoise()
                         spect = spect.centroid()
                     if deisotope:
                         spect = spect.deisotope()
                     spect = filter_peaks(spect, inty_min=inty_min)
                     mgf_counter += 1
                     ion_dict = create_ion_dict(
-                        f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
-                        row["consensus_id"],
-                        row["consensus_uid"],
-                        row["mz"],
-                        row["rt"],
-                        round(row["charge_mean"]),
+                        f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
+                        cons_ms2_e_row["consensus_id"],
+                        cons_ms2_e_row["consensus_uid"],
+                        cons_ms2_e_row["mz"],
+                        cons_ms2_e_row["rt"],
+                        safe_charge(cons_ms2_e_row["charge_mean"]),
                         spect,
                         mgf_counter,
                     )
                     if ion_dict is not None:
                         ion_data.append(ion_dict)
-            else:
-                for row_e in cons_ms2.iter_rows(named=True):
-                    spect = row_e["spec"]
-                    if spect is None:
+
+                elif selection == "all":
+                    if merge:
+                        specs = [
+                            row_e["spec"]
+                            for row_e in cons_ms2.iter_rows(named=True)
+                            if row_e["spec"] is not None
+                        ]
+                        if not specs:
                             continue
+                        spect = combine_peaks(specs)
                         if centroid:
+                            spect = spect.denoise()
                             spect = spect.centroid()
                         if deisotope:
                             spect = spect.deisotope()
                         spect = filter_peaks(spect, inty_min=inty_min)
                         mgf_counter += 1
                         ion_dict = create_ion_dict(
-                        f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
-                        row_e["consensus_id"],
-                        row_e["consensus_uid"],
-                        row_e["mz"],
-                        row_e["rt"],
-                        round(row_e["charge_mean"]),
+                            f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
+                            row["consensus_id"],
+                            row["consensus_uid"],
+                            row["mz"],
+                            row["rt"],
+                            safe_charge(row["charge_mean"]),
                             spect,
                             mgf_counter,
                         )
                         if ion_dict is not None:
                             ion_data.append(ion_dict)
+                    else:
+                        for row_e in cons_ms2.iter_rows(named=True):
+                            spect = row_e["spec"]
+                            if spect is None:
+                                continue
+                            if centroid:
+                                spect = spect.centroid()
+                            if deisotope:
+                                spect = spect.deisotope()
+                            spect = filter_peaks(spect, inty_min=inty_min)
+                            mgf_counter += 1
+                            ion_dict = create_ion_dict(
+                                f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
+                                row_e["consensus_id"],
+                                row_e["consensus_uid"],
+                                row_e["mz"],
+                                row_e["rt"],
+                                safe_charge(row_e["charge_mean"]),
+                                spect,
+                                mgf_counter,
+                            )
+                            if ion_dict is not None:
+                                ion_data.append(ion_dict)
+    else:
+        self.logger.info("Skipping MS2 data generation - no MS2 data available")
 
-    self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra")
-    self.logger.debug(f"Skipped {skip} features due to missing data.")
+    self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra (MS1 + MS2)")
+    self.logger.debug(f"Skipped {skip} MS2 features due to missing data.")
 
     # Convert to Polars DataFrame
     if not ion_data:
         return pl.DataFrame()
 
-    return pl.DataFrame(ion_data)
+    return pl.DataFrame(ion_data, infer_schema_length=None)
 
 
 def export_mgf(self, **kwargs):
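The changed return statement is easy to miss but significant: `infer_schema_length=None` makes Polars scan every element of `ion_data` when inferring column dtypes rather than only the default first 100 entries. Since MS1 entries now precede MS2 entries and leave fields such as the collision energy as `None`, the default window can infer a Null dtype and then fail or mis-type the column when a later MS2 row supplies a float. A toy reproduction of the idea (not the real ion schema):

```python
import polars as pl

rows = [{"uid": i, "energy": None} for i in range(150)]  # MS1-style rows first
rows.append({"uid": 150, "energy": 35.0})                # MS2-style row past row 100

# With the default infer_schema_length=100, "energy" may be inferred as Null and
# the constructor can then choke on the float at row 150. Scanning all rows
# yields a Float64 column containing nulls.
df = pl.DataFrame(rows, infer_schema_length=None)
print(df.schema)  # {'uid': Int64, 'energy': Float64}
```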
@@ -412,7 +498,7 @@ def export_mgf(self, **kwargs):
     self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
 
 
-def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None:
+def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs) -> None:
     """
     Export the study as a fully compliant mzTab-M file.
 
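This signature change (repeated below for `export_xlsx` and `export_parquet`) replaces the implicit-Optional annotation `filename: str = None`, which strict type checkers flag because `None` is not a `str`, with the explicit PEP 604 union. The two spellings in this illustrative stub are equivalent; the `|` form needs Python 3.10+ unless `from __future__ import annotations` is active:

```python
from typing import Optional

def export_old(filename: Optional[str] = None) -> None: ...  # pre-3.10 spelling
def export_new(filename: str | None = None) -> None: ...     # PEP 604, as in this diff
```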
@@ -1098,7 +1184,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     self.logger.info(f"Exported mzTab-M to {filename}")
 
 
-def export_xlsx(self, filename: str = None) -> None:
+def export_xlsx(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
 
@@ -1209,38 +1295,38 @@ def export_xlsx(self, filename: str = None) -> None:
         self.logger.error(f"Error writing Excel file: {e}")
 
 
-def export_parquet(self, basename: str = None) -> None:
+def export_parquet(self, filename: str | None = None) -> None:
     """
     Export the study data to multiple Parquet files with different suffixes.
 
     The export creates separate Parquet files for each dataset:
-    - <basename>_samples.parquet: Samples dataframe
-    - <basename>_consensus.parquet: Consensus features dataframe
-    - <basename>_identification.parquet: Identification results with library annotations
-    - <basename>_matrix.parquet: Consensus matrix with samples as columns
+    - <filename>_samples.parquet: Samples dataframe
+    - <filename>_consensus.parquet: Consensus features dataframe
+    - <filename>_identification.parquet: Identification results with library annotations
+    - <filename>_matrix.parquet: Consensus matrix with samples as columns
 
     Args:
-        basename (str, optional): Base name for the output files. Defaults to "study"
+        filename (str, optional): Base name for the output files. Defaults to "study"
             in the study folder.
     """
-    # Set default basename
-    if basename is None:
-        basename = "study"
+    # Set default filename
+    if filename is None:
+        filename = "study"
 
-    # Make basename absolute path if not already (without extension)
-    if not os.path.isabs(basename):
+    # Make filename absolute path if not already (without extension)
+    if not os.path.isabs(filename):
         if self.folder is not None:
-            basename = os.path.join(self.folder, basename)
+            filename = os.path.join(self.folder, filename)
         else:
-            basename = os.path.join(os.getcwd(), basename)
+            filename = os.path.join(os.getcwd(), filename)
 
-    self.logger.debug(f"Exporting study to Parquet files with basename: {basename}")
+    self.logger.debug(f"Exporting study to Parquet files with filename: {filename}")
 
     exported_files = []
 
     # 1. Samples dataframe
     if self.samples_df is not None and not self.samples_df.is_empty():
-        samples_file = f"{basename}_samples.parquet"
+        samples_file = f"{filename}_samples.parquet"
         try:
             self.samples_df.write_parquet(samples_file)
             exported_files.append(samples_file)
@@ -1256,7 +1342,7 @@ def export_parquet(self, basename: str = None) -> None:
 
     # 2. Consensus dataframe
     if self.consensus_df is not None and not self.consensus_df.is_empty():
-        consensus_file = f"{basename}_consensus.parquet"
+        consensus_file = f"{filename}_consensus.parquet"
         try:
             self.consensus_df.write_parquet(consensus_file)
             exported_files.append(consensus_file)
1276
1362
 
1277
1363
  id_df = get_id(self)
1278
1364
  if id_df is not None and not id_df.is_empty():
1279
- identification_file = f"{basename}_identification.parquet"
1365
+ identification_file = f"{filename}_identification.parquet"
1280
1366
  try:
1281
1367
  id_df.write_parquet(identification_file)
1282
1368
  exported_files.append(identification_file)
@@ -1298,7 +1384,7 @@ def export_parquet(self, basename: str = None) -> None:
     try:
         matrix_df = self.get_consensus_matrix()
         if matrix_df is not None and not matrix_df.is_empty():
-            matrix_file = f"{basename}_matrix.parquet"
+            matrix_file = f"{filename}_matrix.parquet"
             try:
                 matrix_df.write_parquet(matrix_file)
                 exported_files.append(matrix_file)
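Beyond the rename from `basename` to `filename`, behavior is unchanged: the argument is still a base name that receives a per-dataset suffix. A hypothetical call (`study` stands for a loaded Study object) and the files it would produce under the rules in the docstring:

```python
# Hypothetical usage, assuming study.folder == "/data/run":
study.export_parquet("batch_01")
# -> /data/run/batch_01_samples.parquet
# -> /data/run/batch_01_consensus.parquet
# -> /data/run/batch_01_identification.parquet
# -> /data/run/batch_01_matrix.parquet
```

Keeping the parameter named `filename` is arguably a misnomer since it is a stem rather than a full file name, but it makes the signature consistent with `export_mztab` and `export_xlsx`.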
masster/study/h5.py CHANGED
@@ -974,7 +974,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.warning(f"Column '{col}' not found in {df_name}.")
+        logger.info(f"Column '{col}' not found in {df_name}.")
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
@@ -2008,3 +2008,67 @@ def _load_study5(self, filename=None):
     )
 
     self.logger.debug("Study loaded")
+
+
+def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+    """
+    Optimized method to load only MS1 data from a sample5 file for isotope detection.
+
+    This method efficiently loads only the ms1_df from a sample5 HDF5 file without
+    loading other potentially large datasets like features_df, scans_df, etc.
+
+    Args:
+        sample_path (str): Path to the sample5 HDF5 file
+
+    Returns:
+        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+            Returns empty DataFrame if no MS1 data found or file cannot be read
+
+    Note:
+        Used by find_iso() for efficient isotope pattern detection without full sample loading
+    """
+    try:
+        with h5py.File(sample_path, "r") as f:
+            # Check if ms1 group exists
+            if "ms1" not in f:
+                self.logger.debug(f"No MS1 data found in {sample_path}")
+                return pl.DataFrame()
+
+            ms1_group = f["ms1"]
+
+            # Load MS1 data efficiently
+            ms1_data = {}
+            for col in ms1_group.keys():
+                ms1_data[col] = ms1_group[col][:]
+
+            if not ms1_data:
+                self.logger.debug(f"Empty MS1 data in {sample_path}")
+                return pl.DataFrame()
+
+            # Create DataFrame with proper schema
+            ms1_df = pl.DataFrame(ms1_data)
+
+            # Apply expected schema for MS1 data
+            expected_schema = {
+                "cycle": pl.Int64,
+                "scan_uid": pl.Int64,
+                "rt": pl.Float64,
+                "mz": pl.Float64,
+                "inty": pl.Float64,
+            }
+
+            # Cast columns to expected types if they exist
+            cast_expressions = []
+            for col, dtype in expected_schema.items():
+                if col in ms1_df.columns:
+                    cast_expressions.append(pl.col(col).cast(dtype))
+
+            if cast_expressions:
+                ms1_df = ms1_df.with_columns(cast_expressions)
+
+            self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
+            return ms1_df
+
+    except Exception as e:
+        self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+        return pl.DataFrame()
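`_load_ms1` is the new fast path for `find_iso()`: it opens the HDF5 file, reads only the column datasets under the `ms1` group, and never touches the larger feature or scan tables. A standalone sketch of the same access pattern, assuming a sample5 file whose `ms1` group stores one 1-D dataset per column:

```python
import h5py
import polars as pl

def peek_ms1(path: str) -> pl.DataFrame:
    """Read just the ms1 group of a sample5 file into a Polars frame."""
    with h5py.File(path, "r") as f:
        if "ms1" not in f:
            return pl.DataFrame()
        # Each key under /ms1 is one column stored as a 1-D dataset.
        return pl.DataFrame({col: f["ms1"][col][:] for col in f["ms1"].keys()})

# df = peek_ms1("sample.sample5")  # expected columns: cycle, scan_uid, rt, mz, inty
```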
masster/study/helpers.py CHANGED
@@ -509,8 +509,9 @@ def get_consensus(self, quant="chrom_area"):
     # Convert Polars DataFrame to pandas for this operation since the result is used for export
     df1 = self.consensus_df.to_pandas().copy()
 
-    # set consensus_id as uint64
-    df1["consensus_id"] = df1["consensus_id"].astype("uint64")
+    # Keep consensus_id as string (UUID format)
+    # Note: consensus_id is now a 16-character UUID string, not an integer
+    df1["consensus_id"] = df1["consensus_id"].astype("string")
     # set consensus_id as index
     df1.set_index("consensus_uid", inplace=True)
     # sort by consensus_id
@@ -640,7 +641,6 @@ def get_gaps_stats(self, uids=None):
     return gaps_stats
 
 
-# TODO is uid not supposed to be a list anymore?
 def get_consensus_matches(self, uids=None, filled=True):
     """
     Get feature matches for consensus UIDs with optimized join operation.
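Returning to the `consensus_id` cast in the first hunk above: the dtype change follows from the id format change. A 16-character UUID-style `consensus_id` cannot be coerced to `uint64`, so the old cast would raise, while `astype("string")` selects pandas' nullable StringDtype rather than generic `object`. A quick illustration with toy ids:

```python
import pandas as pd

df1 = pd.DataFrame({"consensus_id": ["a3f9c2d8e1b04f67", "0c2d8e1b04f67a3f"]})
df1["consensus_id"] = df1["consensus_id"].astype("string")  # nullable StringDtype
print(df1["consensus_id"].dtype)  # string

# The previous cast fails on non-numeric ids:
# df1["consensus_id"].astype("uint64")  # -> ValueError
```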