masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of masster might be problematic.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
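Several of the files listed above rework the study-level loading and gap-filling API: in masster/study/load.py, add_folder() becomes add(), fill_chrom_single() becomes a kwargs-based fill_single(), and a new masster/study/defaults/fill_def.py supplies fill_defaults. As a rough orientation only, the sketch below shows how the 0.3.0 functions defined in load.py might be called; it assumes they are bound as methods of the Study class in masster/study/study.py (their `self` parameter suggests this) and that Study() can be constructed without arguments, neither of which is shown in this diff.

```python
# Hypothetical usage sketch based on the function signatures in the 0.3.0
# masster/study/load.py diff below; the Study import path and constructor
# arguments are assumptions, not taken from this diff.
from masster.study.study import Study  # assumed entry point

study = Study()                                  # assumed no-arg constructor
study.add(folder="./raw_data", max_files=10)     # 0.3.0: scans .sample5/.wiff/.raw/.mzML in priority order
study.add_sample("extra_run.wiff")               # single file; "qc"/"blank" in the name sets sample_type
# ... alignment / consensus building happens elsewhere (e.g. masster/study/processing.py) ...
study.fill_single(mz_tol=0.010, rt_tol=10.0)     # kwargs are validated against fill_defaults, then gap filling runs
```

The diff of masster/study/load.py itself follows.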
masster/study/load.py CHANGED
@@ -1,1078 +1,1201 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import concurrent.futures
5
- from datetime import datetime
6
-
7
- import numpy as np
8
- import polars as pl
9
- import pyopenms as oms
10
-
11
- from tqdm import tqdm
12
-
13
- from masster.chromatogram import Chromatogram
14
- from masster.study.defaults import fill_chrom_defaults
15
- from masster.sample.sample import Sample
16
- from masster.spectrum import Spectrum
17
-
18
-
19
- # Pre-import heavy modules to avoid repeated loading in add_sample()
20
- try:
21
- import alpharaw.sciex
22
-
23
- ALPHARAW_AVAILABLE = True
24
- except ImportError:
25
- ALPHARAW_AVAILABLE = False
26
-
27
- try:
28
- import pythonnet
29
-
30
- PYTHONNET_AVAILABLE = True
31
- except ImportError:
32
- PYTHONNET_AVAILABLE = False
33
-
34
- import glob
35
-
36
-
37
- def add_folder(
38
- self,
39
- folder=None,
40
- reset=False,
41
- adducts=None,
42
- max_files=None,
43
- ):
44
- if folder is None:
45
- if self.default_folder is not None:
46
- folder = self.default_folder
47
- else:
48
- folder = os.getcwd()
49
-
50
- files = []
51
-
52
- if not any(char in folder for char in ["*", "?", "[", "]"]):
53
- folder = os.path.join(folder, "**", "*.sample5")
54
-
55
- self.logger.debug(f"Adding files from: {folder}")
56
- files = glob.glob(folder, recursive=True)
57
-
58
- not_zero = False
59
-
60
- bname = []
61
- counter = 1
62
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
63
- if len(files) > 0:
64
- for i, file in enumerate(
65
- tqdm(
66
- files,
67
- total=len(files),
68
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add sample5 files",
69
- disable=tdqm_disable,
70
- ),
71
- ):
72
- # check if file is already in the study
73
- if max_files is not None and counter > max_files:
74
- break
75
- self.logger.debug(f"Add file {i + 1}/{len(files)}: {file}")
76
- self.add_sample(file, reset=reset, adducts=adducts)
77
- bname.append(os.path.basename(file))
78
- bname.append(os.path.basename(file).replace(".sample5", ".wiff"))
79
- not_zero = True
80
- counter += 1
81
-
82
- if max_files is not None and counter < max_files:
83
- files = glob.glob(folder.replace('.sample5','*.wiff'), recursive=True)
84
-
85
- if len(files) > 0:
86
- # iterate over all files
87
- for i, file in enumerate(
88
- tqdm(
89
- files,
90
- total=len(files),
91
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add wiff files",
92
- disable=tdqm_disable,
93
- ),
94
- ):
95
- # check if file is already in the study
96
- if os.path.basename(file) in bname:
97
- continue
98
- if max_files is not None and counter > max_files:
99
- break
100
- self.logger.debug(f"Add file {i + 1}/{len(files)}: {file}")
101
- self.add_sample(file, reset=reset, adducts=adducts)
102
- not_zero = True
103
-
104
- if max_files is not None and counter > max_files:
105
- self.logger.info(
106
- f"Reached maximum number of files to add: {max_files}. Stopping further additions.",
107
- )
108
-
109
- if not not_zero:
110
- self.logger.error(
111
- f"No files found in {folder}. Please check the folder path or file patterns.",
112
- )
113
-
114
-
115
-
116
- # TODO type is not used
117
- def add_sample(self, file, type=None, reset=False, adducts=None):
118
- self.logger.debug(f"Adding: {file}")
119
- sample_name = (
120
- os.path.basename(file)
121
- .replace(".mzpkl", "")
122
- .replace(".wiff", "")
123
- .replace(".h5", "")
124
- .replace(".sample5", "")
125
- )
126
- # check is sample_name is already in the samples_df
127
- if sample_name in self.samples_df["sample_name"].to_list():
128
- self.logger.warning(
129
- f"Sample {sample_name} already exists in the study. Skipping.",
130
- )
131
- return
132
-
133
- # check if file exists
134
- if not os.path.exists(file):
135
- self.logger.error(f"File {file} does not exist.")
136
- return
137
-
138
- if not file.endswith((".sample5", ".wiff", ".mzML")):
139
- self.logger.error(f"File {file} is not a valid sample5 file.")
140
- return
141
-
142
- # try:
143
- if file.endswith((".sample5")):
144
- ddaobj = Sample()
145
- ddaobj.logger_update(level='WARNING', label=os.path.basename(file))
146
- ddaobj.load(file)
147
- elif file.endswith(".wiff"):
148
- ddaobj = Sample()
149
- ddaobj.logger_update(level='WARNING', label=os.path.basename(file))
150
- ddaobj.load(file)
151
- if ddaobj.features_df is None and not reset:
152
- self.logger.warning(
153
- f"File {file} will be newly processed.",
154
- )
155
- ddaobj.features = None
156
-
157
- if ddaobj.features is None or reset:
158
- ddaobj.find_features()
159
- ddaobj.find_adducts(adducts=adducts)
160
- ddaobj.filter_features(coherence=0.3, prominence_scaled=1.0)
161
- ddaobj.find_ms2()
162
-
163
- self.features_maps.append(ddaobj.features)
164
-
165
- sample_type = "sample" if type is None else type
166
- if "qc" in sample_name.lower():
167
- sample_type = "qc"
168
- if "blank" in sample_name.lower():
169
- sample_type = "blank"
170
- map_id_value = str(ddaobj.features.getUniqueId())
171
-
172
- new_sample = pl.DataFrame(
173
- {
174
- "sample_uid": [int(len(self.samples_df) + 1)],
175
- "sample_name": [sample_name],
176
- "sample_path": [file],
177
- "sample_type": [sample_type],
178
- "size": [int(ddaobj.features.size())],
179
- "map_id": [map_id_value],
180
- },
181
- schema={
182
- "sample_uid": pl.Int64,
183
- "sample_name": pl.Utf8,
184
- "sample_path": pl.Utf8,
185
- "sample_type": pl.Utf8,
186
- "size": pl.Int64,
187
- "map_id": pl.Utf8,
188
- },
189
- )
190
- # save ddaobj to default_folder if it is set
191
- if self.default_folder is not None:
192
- if not os.path.exists(self.default_folder):
193
- os.makedirs(self.default_folder)
194
- basename = os.path.basename(file)
195
- sample_name = os.path.splitext(basename)[0]
196
- ddaobj.save(os.path.join(self.default_folder, sample_name + ".sample5"))
197
- self.samples_df = pl.concat([self.samples_df, new_sample])
198
-
199
- # Optimized DataFrame operations - chain operations instead of multiple clones
200
- columns_to_add = [
201
- pl.lit(len(self.samples_df)).alias("sample_uid"),
202
- pl.lit(False).alias("filled"),
203
- pl.lit(-1.0).alias("chrom_area"),
204
- ]
205
-
206
- # Only add rt_original if it doesn't exist
207
- if "rt_original" not in ddaobj.features_df.columns:
208
- columns_to_add.append(pl.col("rt").alias("rt_original"))
209
-
210
- f_df = ddaobj.features_df.with_columns(columns_to_add)
211
-
212
- if self.features_df.is_empty():
213
- # Create new features_df with feature_uid column
214
- self.features_df = f_df.with_columns(
215
- pl.int_range(pl.len()).add(1).alias("feature_uid"),
216
- ).select(
217
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
218
- )
219
- else:
220
- offset = (
221
- self.features_df["feature_uid"].max() + 1
222
- if not self.features_df.is_empty()
223
- else 1
224
- )
225
- # Chain operations and add to existing DataFrame
226
- f_df = f_df.with_columns(
227
- pl.int_range(pl.len()).add(offset).alias("feature_uid"),
228
- ).select(
229
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
230
- )
231
- self.features_df = pl.concat([self.features_df, f_df])
232
- self.logger.debug(
233
- f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
234
- )
235
-
236
-
237
- def load(self, filename=None):
238
- """
239
- Load a study from an HDF5 file.
240
-
241
- Args:
242
- study: The study object to load into
243
- filename (str, optional): The path to the HDF5 file to load the study from.
244
- """
245
-
246
- # Handle default filename
247
- if filename is None:
248
- if self.default_folder is not None:
249
- # search for *.study5 in default_folder
250
- study5_files = glob.glob(os.path.join(self.default_folder, "*.study5"))
251
- if study5_files:
252
- filename = study5_files[-1]
253
- else:
254
- self.logger.error("No .study5 files found in default_folder")
255
- return
256
- else:
257
- self.logger.error("Either filename or default_folder must be provided")
258
- return
259
-
260
- self.logger.info(f"Loading study from {filename}")
261
- self._load_study5(filename)
262
- # After loading the study, check if consensus XML exists and load it
263
- consensus_xml_path = filename.replace(".study5", ".consensusXML")
264
- if os.path.exists(consensus_xml_path):
265
- self._load_consensusXML(filename=consensus_xml_path)
266
- # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
267
- else:
268
- self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
269
-
270
-
271
- def fill_chrom_single(
272
- self,
273
- uids=None,
274
- mz_tol: float = 0.010,
275
- rt_tol: float = 10.0,
276
- min_samples_rel: float = 0.0,
277
- min_samples_abs: int = 2,
278
- ):
279
- """Fill missing chromatograms by extracting from raw data.
280
-
281
- Simplified version that loads one sample at a time without preloading or batching.
282
-
283
- Args:
284
- uids: Consensus UIDs to process (default: all)
285
- mz_tol: m/z tolerance for extraction (default: 0.010 Da)
286
- rt_tol: RT tolerance for extraction (default: 10.0 seconds)
287
- min_samples_rel: Relative minimum sample threshold (default: 0.0)
288
- min_samples_abs: Absolute minimum sample threshold (default: 2)
289
- """
290
- uids = self._get_consensus_uids(uids)
291
-
292
- self.logger.info("Gap filling...")
293
- self.logger.debug(
294
- f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}",
295
- )
296
-
297
- # Apply minimum sample filters
298
- min_number_rel = 1
299
- min_number_abs = 1
300
- if isinstance(min_samples_rel, float) and min_samples_rel > 0:
301
- min_number_rel = int(min_samples_rel * len(self.samples_df))
302
- if isinstance(min_samples_abs, int) and min_samples_abs > 0:
303
- min_number_abs = int(min_samples_abs)
304
- min_number = max(min_number_rel, min_number_abs)
305
- self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
306
-
307
- if min_number > 0:
308
- original_count = len(uids)
309
- uids = self.consensus_df.filter(
310
- (pl.col("number_samples") >= min_number)
311
- & (pl.col("consensus_uid").is_in(uids)),
312
- )["consensus_uid"].to_list()
313
- self.logger.debug(
314
- f"Features to fill: {original_count} -> {len(uids)}",
315
- )
316
- self.logger.debug("Identifying missing features...")
317
- # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
318
- missing_combinations = self._get_missing_consensus_sample_combinations(uids)
319
- if not missing_combinations:
320
- self.logger.info("No missing features found to fill.")
321
- return
322
-
323
- # Build lookup dictionaries
324
- self.logger.debug("Building lookup dictionaries...")
325
- consensus_info = {}
326
- consensus_subset = self.consensus_df.select([
327
- "consensus_uid",
328
- "rt_start_mean",
329
- "rt_end_mean",
330
- "mz",
331
- "rt",
332
- ]).filter(pl.col("consensus_uid").is_in(uids))
333
-
334
- for row in consensus_subset.iter_rows(named=True):
335
- consensus_info[row["consensus_uid"]] = {
336
- "rt_start_mean": row["rt_start_mean"],
337
- "rt_end_mean": row["rt_end_mean"],
338
- "mz": row["mz"],
339
- "rt": row["rt"],
340
- }
341
-
342
- # Process each sample individually
343
- # Group missing combinations by sample for efficient processing
344
- missing_by_sample = {}
345
- for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
346
- if sample_name not in missing_by_sample:
347
- missing_by_sample[sample_name] = {
348
- "sample_uid": sample_uid,
349
- "sample_path": sample_path,
350
- "missing_consensus_uids": [],
351
- }
352
- missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
353
-
354
- new_features: list[dict] = []
355
- new_mapping: list[dict] = []
356
- counter = 0
357
-
358
- self.logger.debug(
359
- f"Missing features: {len(missing_combinations)} in {len(missing_by_sample)} samples...",
360
- )
361
-
362
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
363
-
364
- for sample_name, sample_info in tqdm(
365
- missing_by_sample.items(),
366
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}File",
367
- disable=tdqm_disable,
368
- ):
369
- # Load this sample
370
- sample_uid = sample_info["sample_uid"]
371
- sample_path = sample_info["sample_path"]
372
- missing_consensus_uids = sample_info["missing_consensus_uids"]
373
-
374
- try:
375
- # self.logger.debug(f"Loading sample: {sample_path}")
376
- file = Sample()
377
- file.logger_update("WARNING")
378
- file.load(sample_path)
379
- except Exception as e:
380
- self.logger.warning(f"Failed to load sample {sample_name}: {e}")
381
- continue
382
-
383
- self.logger.debug(
384
- f"Sample {sample_name}: Processing {len(missing_consensus_uids)} missing features",
385
- )
386
-
387
- # Process each missing feature
388
- for consensus_uid in missing_consensus_uids:
389
- cons = consensus_info[consensus_uid]
390
- mz = cons["mz"]
391
- rt = cons["rt"]
392
- rt_start_mean = cons["rt_start_mean"]
393
- rt_end_mean = cons["rt_end_mean"]
394
-
395
- # Filter MS1 data for this feature
396
- if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
397
- d = file.ms1_df.filter(
398
- (pl.col("mz") >= mz - mz_tol)
399
- & (pl.col("mz") <= mz + mz_tol)
400
- & (pl.col("rt") >= rt_start_mean - rt_tol)
401
- & (pl.col("rt") <= rt_end_mean + rt_tol),
402
- )
403
- else:
404
- d = pl.DataFrame()
405
-
406
- # Create chromatogram
407
- if d.is_empty():
408
- self.logger.debug(
409
- f"Feature {consensus_uid}: No MS1 data found, creating empty chromatogram",
410
- )
411
- eic = Chromatogram(
412
- rt=np.array([rt_start_mean, rt_end_mean]),
413
- inty=np.array([0.0, 0.0]),
414
- label=f"EIC mz={mz:.4f}",
415
- file=sample_path,
416
- mz=mz,
417
- mz_tol=mz_tol,
418
- feature_start=rt_start_mean,
419
- feature_end=rt_end_mean,
420
- feature_apex=rt,
421
- )
422
- max_inty = 0.0
423
- area = 0.0
424
- else:
425
- self.logger.debug(
426
- f"Feature {consensus_uid}: Found {len(d)} MS1 points, creating EIC",
427
- )
428
- eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
429
-
430
- if len(eic_rt) > 4:
431
- eic = Chromatogram(
432
- eic_rt["rt"].to_numpy(),
433
- eic_rt["inty"].to_numpy(),
434
- label=f"EIC mz={mz:.4f}",
435
- file=sample_path,
436
- mz=mz,
437
- mz_tol=mz_tol,
438
- feature_start=rt_start_mean,
439
- feature_end=rt_end_mean,
440
- feature_apex=rt,
441
- ).find_peaks()
442
- max_inty = np.max(eic.inty)
443
- area = eic.feature_area
444
- else:
445
- eic = Chromatogram(
446
- eic_rt["rt"].to_numpy(),
447
- eic_rt["inty"].to_numpy(),
448
- label=f"EIC mz={mz:.4f}",
449
- file=sample_path,
450
- mz=mz,
451
- mz_tol=mz_tol,
452
- feature_start=rt_start_mean,
453
- feature_end=rt_end_mean,
454
- feature_apex=rt,
455
- )
456
- max_inty = 0.0
457
- area = 0.0
458
-
459
- # Generate feature UID
460
- feature_uid = (
461
- self.features_df["feature_uid"].max() + len(new_features) + 1
462
- if not self.features_df.is_empty()
463
- else len(new_features) + 1
464
- )
465
-
466
- # Create new feature entry
467
- new_feature = {
468
- "sample_uid": sample_uid,
469
- "feature_uid": feature_uid,
470
- "feature_id": None,
471
- "mz": mz,
472
- "rt": rt,
473
- "rt_original": None,
474
- "rt_start": rt_start_mean,
475
- "rt_end": rt_end_mean,
476
- "rt_delta": rt_end_mean - rt_start_mean,
477
- "mz_start": None,
478
- "mz_end": None,
479
- "inty": max_inty,
480
- "quality": None,
481
- "charge": None,
482
- "iso": None,
483
- "iso_of": None,
484
- "adduct": None,
485
- "adduct_mass": None,
486
- "adduct_group": None,
487
- "chrom": eic,
488
- "chrom_coherence": None,
489
- "chrom_prominence": None,
490
- "chrom_prominence_scaled": None,
491
- "chrom_height_scaled": None,
492
- "ms2_scans": None,
493
- "ms2_specs": None,
494
- "filled": True,
495
- "chrom_area": area,
496
- }
497
-
498
- new_features.append(new_feature)
499
- new_mapping.append({
500
- "consensus_uid": consensus_uid,
501
- "sample_uid": sample_uid,
502
- "feature_uid": feature_uid,
503
- })
504
- counter += 1
505
-
506
- # Add new features to DataFrames
507
- self.logger.debug(f"Adding {len(new_features)} new features to DataFrame...")
508
- if new_features:
509
- # Create properly formatted rows
510
- rows_to_add = []
511
- for feature_dict in new_features:
512
- new_row = {}
513
- for col in self.features_df.columns:
514
- if col in feature_dict:
515
- new_row[col] = feature_dict[col]
516
- else:
517
- new_row[col] = None
518
- rows_to_add.append(new_row)
519
-
520
- # Create and add new DataFrame
521
- new_df = pl.from_dicts(rows_to_add)
522
-
523
- # Cast columns to match existing schema
524
- cast_exprs = []
525
- for col in self.features_df.columns:
526
- existing_dtype = self.features_df[col].dtype
527
- cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
528
-
529
- new_df = new_df.with_columns(cast_exprs)
530
- self.features_df = self.features_df.vstack(new_df)
531
-
532
- # Add consensus mapping
533
- new_mapping_df = pl.DataFrame(new_mapping)
534
- self.consensus_mapping_df = pl.concat(
535
- [self.consensus_mapping_df, new_mapping_df],
536
- how="diagonal",
537
- )
538
-
539
- self.logger.info(f"Filled {counter} chromatograms from raw data.")
540
-
541
-
542
- def _process_sample_for_parallel_fill(
543
- self,
544
- sample_info,
545
- consensus_info,
546
- uids,
547
- mz_tol,
548
- rt_tol,
549
- missing_combinations_df,
550
- features_df_max_uid,
551
- ):
552
- """Process a single sample for parallel gap filling."""
553
- sample_uid = sample_info["sample_uid"]
554
- sample_path = sample_info["sample_path"]
555
-
556
- new_features: list[dict] = []
557
- new_mapping: list[dict] = []
558
- counter = 0
559
-
560
- try:
561
- # Load this sample
562
- file = Sample()
563
- file.logger_update(level="WARNING")
564
- file.load(sample_path)
565
- except Exception:
566
- # Skip this sample if loading fails
567
- return new_features, new_mapping, counter
568
-
569
- # Find missing features for this sample from precomputed combinations
570
- sample_missing = missing_combinations_df.filter(
571
- pl.col("sample_uid") == sample_uid,
572
- )["consensus_uid"].to_list()
573
-
574
- if not sample_missing:
575
- return new_features, new_mapping, counter
576
-
577
- # Process each missing feature
578
- for consensus_uid in sample_missing:
579
- cons = consensus_info[consensus_uid]
580
- mz = cons["mz"]
581
- rt = cons["rt"]
582
- rt_start_mean = cons["rt_start_mean"]
583
- rt_end_mean = cons["rt_end_mean"]
584
-
585
- # Filter MS1 data for this feature
586
- if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
587
- d = file.ms1_df.filter(
588
- (pl.col("mz") >= mz - mz_tol)
589
- & (pl.col("mz") <= mz + mz_tol)
590
- & (pl.col("rt") >= rt_start_mean - rt_tol)
591
- & (pl.col("rt") <= rt_end_mean + rt_tol),
592
- )
593
- else:
594
- d = pl.DataFrame()
595
-
596
- # Create chromatogram
597
- if d.is_empty():
598
- eic = Chromatogram(
599
- rt=np.array([rt_start_mean, rt_end_mean]),
600
- inty=np.array([0.0, 0.0]),
601
- label=f"EIC mz={mz:.4f}",
602
- file=sample_path,
603
- mz=mz,
604
- mz_tol=mz_tol,
605
- feature_start=rt_start_mean,
606
- feature_end=rt_end_mean,
607
- feature_apex=rt,
608
- )
609
- max_inty = 0.0
610
- area = 0.0
611
- else:
612
- eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
613
-
614
- if len(eic_rt) > 4:
615
- eic = Chromatogram(
616
- eic_rt["rt"].to_numpy(),
617
- eic_rt["inty"].to_numpy(),
618
- label=f"EIC mz={mz:.4f}",
619
- file=sample_path,
620
- mz=mz,
621
- mz_tol=mz_tol,
622
- feature_start=rt_start_mean,
623
- feature_end=rt_end_mean,
624
- feature_apex=rt,
625
- ).find_peaks()
626
- max_inty = np.max(eic.inty)
627
- area = eic.feature_area
628
- else:
629
- eic = Chromatogram(
630
- eic_rt["rt"].to_numpy(),
631
- eic_rt["inty"].to_numpy(),
632
- label=f"EIC mz={mz:.4f}",
633
- file=sample_path,
634
- mz=mz,
635
- mz_tol=mz_tol,
636
- feature_start=rt_start_mean,
637
- feature_end=rt_end_mean,
638
- feature_apex=rt,
639
- )
640
- max_inty = 0.0
641
- area = 0.0
642
-
643
- # Generate feature UID (will be adjusted later to ensure global uniqueness)
644
- feature_uid = features_df_max_uid + len(new_features) + 1
645
-
646
- # Create new feature entry
647
- new_feature = {
648
- "sample_uid": sample_uid,
649
- "feature_uid": feature_uid,
650
- "feature_id": None,
651
- "mz": mz,
652
- "rt": rt,
653
- "rt_original": None,
654
- "rt_start": rt_start_mean,
655
- "rt_end": rt_end_mean,
656
- "rt_delta": rt_end_mean - rt_start_mean,
657
- "mz_start": None,
658
- "mz_end": None,
659
- "inty": max_inty,
660
- "quality": None,
661
- "charge": None,
662
- "iso": None,
663
- "iso_of": None,
664
- "adduct": None,
665
- "adduct_mass": None,
666
- "adduct_group": None,
667
- "chrom": eic,
668
- "filled": True,
669
- "chrom_area": area,
670
- "chrom_coherence": None,
671
- "chrom_prominence": None,
672
- "chrom_prominence_scaled": None,
673
- "chrom_height_scaled": None,
674
- "ms2_scans": None,
675
- "ms2_specs": None,
676
- }
677
-
678
- new_features.append(new_feature)
679
- new_mapping.append({
680
- "consensus_uid": consensus_uid,
681
- "sample_uid": sample_uid,
682
- "feature_uid": feature_uid,
683
- })
684
- counter += 1
685
-
686
- return new_features, new_mapping, counter
687
-
688
-
689
- def fill_chrom(
690
- self,
691
- uids=None,
692
- mz_tol: float = 0.010,
693
- rt_tol: float = 10.0,
694
- min_samples_rel: float = 0.0,
695
- min_samples_abs: int = 2,
696
- num_workers=4,
697
- ):
698
- """Fill missing chromatograms by extracting from raw data using parallel processing.
699
-
700
- Args:
701
- uids: Consensus UIDs to process (default: all)
702
- mz_tol: m/z tolerance for extraction (default: 0.010 Da)
703
- rt_tol: RT tolerance for extraction (default: 10.0 seconds)
704
- min_samples_rel: Relative minimum sample threshold (default: 0.0)
705
- min_samples_abs: Absolute minimum sample threshold (default: 2)
706
- num_workers: Number of parallel workers (default: 4)
707
- """
708
- uids = self._get_consensus_uids(uids)
709
-
710
- self.logger.info(f"Gap filling with {num_workers} workers...")
711
- self.logger.debug(
712
- f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
713
- )
714
-
715
- # Apply minimum sample filters
716
- min_number_rel = 1
717
- min_number_abs = 1
718
- if isinstance(min_samples_rel, float) and min_samples_rel > 0:
719
- min_number_rel = int(min_samples_rel * len(self.samples_df))
720
- if isinstance(min_samples_abs, int) and min_samples_abs > 0:
721
- min_number_abs = int(min_samples_abs)
722
- min_number = max(min_number_rel, min_number_abs)
723
-
724
- self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
725
-
726
- if min_number > 0:
727
- original_count = len(uids)
728
- uids = self.consensus_df.filter(
729
- (pl.col("number_samples") >= min_number)
730
- & (pl.col("consensus_uid").is_in(uids)),
731
- )["consensus_uid"].to_list()
732
- self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
733
-
734
- # Get missing consensus/sample combinations using the optimized method
735
- self.logger.debug("Identifying missing features...")
736
- missing_combinations = self._get_missing_consensus_sample_combinations(uids)
737
-
738
- if not missing_combinations or len(missing_combinations) == 0:
739
- self.logger.info("No missing features found to fill.")
740
- return
741
-
742
- # Convert to DataFrame for easier processing
743
- missing_combinations_df = pl.DataFrame(
744
- missing_combinations,
745
- schema={
746
- "consensus_uid": pl.Int64,
747
- "sample_uid": pl.Int64,
748
- "sample_name": pl.Utf8,
749
- "sample_path": pl.Utf8,
750
- },
751
- orient="row",
752
- )
753
-
754
- # Build lookup dictionaries
755
- self.logger.debug("Building lookup dictionaries...")
756
- consensus_info = {}
757
- consensus_subset = self.consensus_df.select([
758
- "consensus_uid",
759
- "rt_start_mean",
760
- "rt_end_mean",
761
- "mz",
762
- "rt",
763
- ]).filter(pl.col("consensus_uid").is_in(uids))
764
-
765
- for row in consensus_subset.iter_rows(named=True):
766
- consensus_info[row["consensus_uid"]] = {
767
- "rt_start_mean": row["rt_start_mean"],
768
- "rt_end_mean": row["rt_end_mean"],
769
- "mz": row["mz"],
770
- "rt": row["rt"],
771
- }
772
-
773
- # Get sample info for all samples that need processing
774
- samples_to_process = []
775
- unique_sample_uids = missing_combinations_df["sample_uid"].unique().to_list()
776
-
777
- for row in self.samples_df.filter(
778
- pl.col("sample_uid").is_in(unique_sample_uids),
779
- ).iter_rows(named=True):
780
- samples_to_process.append({
781
- "sample_name": row["sample_name"],
782
- "sample_uid": row["sample_uid"],
783
- "sample_path": row["sample_path"],
784
- })
785
-
786
- total_missing = len(missing_combinations_df)
787
- total_samples = len(samples_to_process)
788
-
789
- self.logger.info(
790
- f"Gap filling for {total_missing} missing features from {total_samples} samples using {num_workers} workers...",
791
- )
792
-
793
- # Calculate current max feature_uid to avoid conflicts
794
- features_df_max_uid = (
795
- self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
796
- )
797
-
798
- # Process samples in parallel
799
- all_new_features: list[dict] = []
800
- all_new_mapping: list[dict] = []
801
- total_counter = 0
802
-
803
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
804
-
805
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
806
- # Submit all samples for processing
807
- future_to_sample = {}
808
- for sample_info in samples_to_process:
809
- future = executor.submit(
810
- self._process_sample_for_parallel_fill,
811
- sample_info,
812
- consensus_info,
813
- uids,
814
- mz_tol,
815
- rt_tol,
816
- missing_combinations_df,
817
- features_df_max_uid,
818
- )
819
- future_to_sample[future] = sample_info
820
-
821
- # Collect results with progress bar
822
- with tqdm(
823
- total=len(samples_to_process),
824
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Processing samples",
825
- disable=tdqm_disable,
826
- ) as pbar:
827
- for future in concurrent.futures.as_completed(future_to_sample):
828
- try:
829
- new_features, new_mapping, counter = future.result()
830
-
831
- # Adjust feature UIDs to ensure global uniqueness
832
- uid_offset = features_df_max_uid + len(all_new_features)
833
- for i, feature in enumerate(new_features):
834
- feature["feature_uid"] = uid_offset + i + 1
835
- for i, mapping in enumerate(new_mapping):
836
- mapping["feature_uid"] = uid_offset + i + 1
837
-
838
- all_new_features.extend(new_features)
839
- all_new_mapping.extend(new_mapping)
840
- total_counter += counter
841
-
842
- except Exception as e:
843
- sample_info = future_to_sample[future]
844
- self.logger.warning(
845
- f"Sample {sample_info['sample_name']} failed: {e}",
846
- )
847
-
848
- pbar.update(1)
849
-
850
- # Add new features to DataFrames
851
- self.logger.debug(f"Adding {len(all_new_features)} new features to DataFrame...")
852
- if all_new_features:
853
- # Create properly formatted rows
854
- rows_to_add = []
855
- for feature_dict in all_new_features:
856
- new_row = {}
857
- for col in self.features_df.columns:
858
- if col in feature_dict:
859
- new_row[col] = feature_dict[col]
860
- else:
861
- new_row[col] = None
862
- rows_to_add.append(new_row)
863
-
864
- # Create and add new DataFrame
865
- new_df = pl.from_dicts(rows_to_add)
866
-
867
- # Cast columns to match existing schema
868
- cast_exprs = []
869
- for col in self.features_df.columns:
870
- existing_dtype = self.features_df[col].dtype
871
- cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
872
-
873
- new_df = new_df.with_columns(cast_exprs)
874
- self.features_df = self.features_df.vstack(new_df)
875
-
876
- # Add consensus mapping
877
- new_mapping_df = pl.DataFrame(all_new_mapping)
878
- self.consensus_mapping_df = pl.concat(
879
- [self.consensus_mapping_df, new_mapping_df],
880
- how="diagonal",
881
- )
882
-
883
- self.logger.info(
884
- f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
885
- )
886
-
887
-
888
- def _get_missing_consensus_sample_combinations(self, uids):
889
- """
890
- Efficiently identify which consensus_uid/sample combinations are missing.
891
- Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
892
- """
893
- # Get all consensus UIDs we're interested in
894
- consensus_uids_set = set(uids)
895
-
896
- # Get all sample UIDs and create lookup
897
- all_sample_info = {}
898
- for row in self.samples_df.select([
899
- "sample_uid",
900
- "sample_name",
901
- "sample_path",
902
- ]).iter_rows(named=True):
903
- all_sample_info[row["sample_uid"]] = {
904
- "sample_name": row["sample_name"],
905
- "sample_path": row["sample_path"],
906
- }
907
-
908
- # Get existing consensus/sample combinations from consensus_mapping_df
909
- existing_combinations = set()
910
- consensus_mapping_filtered = self.consensus_mapping_df.filter(
911
- pl.col("consensus_uid").is_in(list(consensus_uids_set)),
912
- )
913
-
914
- # Join with features_df to get sample_uid information
915
- existing_features = consensus_mapping_filtered.join(
916
- self.features_df.select(["feature_uid", "sample_uid"]),
917
- on="feature_uid",
918
- how="inner",
919
- )
920
-
921
- for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
922
- existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
923
-
924
- # Find missing combinations
925
- missing_combinations = []
926
- for consensus_uid in consensus_uids_set:
927
- for sample_uid, sample_info in all_sample_info.items():
928
- if (consensus_uid, sample_uid) not in existing_combinations:
929
- missing_combinations.append((
930
- consensus_uid,
931
- sample_uid,
932
- sample_info["sample_name"],
933
- sample_info["sample_path"],
934
- ))
935
-
936
- return missing_combinations
937
-
938
-
939
- def sanitize(self):
940
- """
941
- Sanitize features DataFrame to ensure all complex objects are properly typed.
942
- Convert serialized objects back to their proper types (Chromatogram, Spectrum).
943
- """
944
- if self.features_df is None or self.features_df.is_empty():
945
- return
946
-
947
- self.logger.debug(
948
- "Sanitizing features DataFrame to ensure all complex objects are properly typed.",
949
- )
950
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
951
-
952
- # Check if we have object columns that need sanitization
953
- has_chrom = "chrom" in self.features_df.columns
954
- has_ms2_specs = "ms2_specs" in self.features_df.columns
955
-
956
- if not has_chrom and not has_ms2_specs:
957
- self.logger.debug("No object columns found that need sanitization.")
958
- return
959
-
960
- # Convert to list of dictionaries for easier manipulation
961
- rows_data = []
962
-
963
- for row_dict in tqdm(
964
- self.features_df.iter_rows(named=True),
965
- total=len(self.features_df),
966
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO |{self.log_label}Sanitize features",
967
- disable=tdqm_disable,
968
- ):
969
- row_data = dict(row_dict)
970
-
971
- # Sanitize chrom column
972
- if has_chrom and row_data["chrom"] is not None:
973
- if not isinstance(row_data["chrom"], Chromatogram):
974
- try:
975
- # Create new Chromatogram and populate from dict if needed
976
- new_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
977
- if hasattr(row_data["chrom"], "__dict__"):
978
- new_chrom.from_dict(row_data["chrom"].__dict__)
979
- else:
980
- # If it's already a dict
981
- new_chrom.from_dict(row_data["chrom"])
982
- row_data["chrom"] = new_chrom
983
- except Exception as e:
984
- self.logger.warning(f"Failed to sanitize chrom object: {e}")
985
- row_data["chrom"] = None
986
-
987
- # Sanitize ms2_specs column
988
- if has_ms2_specs and row_data["ms2_specs"] is not None:
989
- if isinstance(row_data["ms2_specs"], list):
990
- sanitized_specs = []
991
- for ms2_specs in row_data["ms2_specs"]:
992
- if not isinstance(ms2_specs, Spectrum):
993
- try:
994
- new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
995
- if hasattr(ms2_specs, "__dict__"):
996
- new_ms2_specs.from_dict(ms2_specs.__dict__)
997
- else:
998
- new_ms2_specs.from_dict(ms2_specs)
999
- sanitized_specs.append(new_ms2_specs)
1000
- except Exception as e:
1001
- self.logger.warning(
1002
- f"Failed to sanitize ms2_specs object: {e}",
1003
- )
1004
- sanitized_specs.append(None)
1005
- else:
1006
- sanitized_specs.append(ms2_specs)
1007
- row_data["ms2_specs"] = sanitized_specs
1008
- elif not isinstance(row_data["ms2_specs"], Spectrum):
1009
- try:
1010
- new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1011
- if hasattr(row_data["ms2_specs"], "__dict__"):
1012
- new_ms2_specs.from_dict(row_data["ms2_specs"].__dict__)
1013
- else:
1014
- new_ms2_specs.from_dict(row_data["ms2_specs"])
1015
- row_data["ms2_specs"] = new_ms2_specs
1016
- except Exception as e:
1017
- self.logger.warning(f"Failed to sanitize ms2_specs object: {e}")
1018
- row_data["ms2_specs"] = None
1019
-
1020
- rows_data.append(row_data)
1021
-
1022
- # Recreate the DataFrame with sanitized data
1023
- try:
1024
- self.features_df = pl.DataFrame(rows_data)
1025
- self.logger.success("Features DataFrame sanitization completed successfully.")
1026
- except Exception as e:
1027
- self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
1028
-
1029
-
1030
- def load_features(self):
1031
- # iterate over all samples in samples_df
1032
-
1033
- self.features_maps = []
1034
- self.logger.debug("Loading features from featureXML files.")
1035
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1036
- for _index, row_dict in tqdm(
1037
- enumerate(self.samples_df.iter_rows(named=True)),
1038
- total=len(self.samples_df),
1039
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
1040
- disable=tdqm_disable,
1041
- ):
1042
- if self.default_folder is not None:
1043
- filename = os.path.join(
1044
- self.default_folder,
1045
- row_dict["sample_name"] + ".featureXML",
1046
- )
1047
- else:
1048
- filename = os.path.join(
1049
- os.getcwd(),
1050
- row_dict["sample_name"] + ".featureXML",
1051
- )
1052
- # check if file exists
1053
- if not os.path.exists(filename):
1054
- filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
1055
-
1056
- if not os.path.exists(filename):
1057
- self.features_maps.append(None)
1058
- continue
1059
-
1060
- fh = oms.FeatureXMLFile()
1061
- fm = oms.FeatureMap()
1062
- fh.load(filename, fm)
1063
- self.features_maps.append(fm)
1064
- self.logger.debug("Features loaded successfully.")
1065
-
1066
-
1067
- def _load_consensusXML(self, filename="alignment.consensusXML"):
1068
- """
1069
- Load a consensus map from a file.
1070
- """
1071
- if not os.path.exists(filename):
1072
- self.logger.error(f"File {filename} does not exist.")
1073
- return
1074
- fh = oms.ConsensusXMLFile()
1075
- self.consensus_map = oms.ConsensusMap()
1076
- fh.load(filename, self.consensus_map)
1077
- self.logger.debug(f"Loaded consensus map from {filename}.")
1078
-
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import concurrent.futures
5
+ from datetime import datetime
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import pyopenms as oms
10
+
11
+ from tqdm import tqdm
12
+
13
+ from masster.chromatogram import Chromatogram
14
+ from masster.study.defaults import fill_defaults
15
+ from masster.sample.sample import Sample
16
+ from masster.spectrum import Spectrum
17
+
18
+
19
+ # Pre-import heavy modules to avoid repeated loading in add_sample()
20
+ try:
21
+ import alpharaw.sciex
22
+
23
+ ALPHARAW_AVAILABLE = True
24
+ except ImportError:
25
+ ALPHARAW_AVAILABLE = False
26
+
27
+ try:
28
+ import pythonnet
29
+
30
+ PYTHONNET_AVAILABLE = True
31
+ except ImportError:
32
+ PYTHONNET_AVAILABLE = False
33
+
34
+ import glob
35
+
36
+
37
+ def add(
38
+ self,
39
+ folder=None,
40
+ reset=False,
41
+ adducts=None,
42
+ max_files=None,
43
+ ):
44
+ if folder is None:
45
+ if self.folder is not None:
46
+ folder = self.folder
47
+ else:
48
+ folder = os.getcwd()
49
+
50
+ self.logger.debug(f"Adding files from: {folder}")
51
+
52
+ # Define file extensions to search for in order of priority
53
+ extensions = [".sample5", ".wiff", ".raw", ".mzML"]
54
+
55
+ # Check if folder contains glob patterns
56
+ if not any(char in folder for char in ["*", "?", "[", "]"]):
57
+ search_folder = folder
58
+ else:
59
+ search_folder = os.path.dirname(folder) if os.path.dirname(folder) else folder
60
+
61
+ # Blacklist to track filenames without extensions that have already been processed
62
+ blacklist = set()
63
+ counter = 0
64
+ not_zero = False
65
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
66
+
67
+ # Search for files in order of priority
68
+ for ext in extensions:
69
+ if max_files is not None and counter >= max_files:
70
+ break
71
+
72
+ # Build search pattern
73
+ if any(char in folder for char in ["*", "?", "[", "]"]):
74
+ # If folder already contains glob patterns, modify the extension
75
+ if folder.endswith("*.sample5"):
76
+ pattern = folder.replace("*.sample5", f"*{ext}")
77
+ else:
78
+ pattern = os.path.join(search_folder, "**", f"*{ext}")
79
+ else:
80
+ pattern = os.path.join(search_folder, "**", f"*{ext}")
81
+
82
+ files = glob.glob(pattern, recursive=True)
83
+
84
+ if len(files) > 0:
85
+ # Limit files if max_files is specified
86
+ remaining_slots = max_files - counter if max_files is not None else len(files)
87
+ files = files[:remaining_slots]
88
+
89
+ self.logger.debug(f"Found {len(files)} {ext} files")
90
+
91
+ # Process files
92
+ for i, file in enumerate(
93
+ tqdm(
94
+ files,
95
+ total=len(files),
96
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
97
+ disable=tdqm_disable,
98
+ ),
99
+ ):
100
+ if max_files is not None and counter >= max_files:
101
+ break
102
+
103
+ # Get filename without extension for blacklist check
104
+ basename = os.path.basename(file)
105
+ filename_no_ext = os.path.splitext(basename)[0]
106
+
107
+ # Check if this filename (without extension) is already in blacklist
108
+ if filename_no_ext in blacklist:
109
+ self.logger.debug(f"Skipping {file} - filename already processed")
110
+ continue
111
+
112
+ self.logger.debug(f"Add file {counter + 1}: {file}")
113
+
114
+ # Try to add the sample
115
+ try:
116
+ self.add_sample(file=file, reset=reset, adducts=adducts)
117
+ # If successful, add to blacklist and increment counter
118
+ blacklist.add(filename_no_ext)
119
+ counter += 1
120
+ not_zero = True
121
+ except Exception as e:
122
+ self.logger.warning(f"Failed to add sample {file}: {e}")
123
+ continue
124
+
125
+ if max_files is not None and counter >= max_files:
126
+ self.logger.debug(
127
+ f"Reached maximum number of files to add: {max_files}. Stopping further additions.",
128
+ )
129
+
130
+ if not not_zero:
131
+ self.logger.warning(
132
+ f"No files found in {folder}. Please check the folder path or file patterns.",
133
+ )
134
+ else:
135
+ self.logger.debug(f"Successfully added {counter} samples to the study.")
136
+
137
+
138
+ # TODO type is not used
139
+ def add_sample(self, file, type=None, reset=False, adducts=None):
140
+ self.logger.debug(f"Adding: {file}")
141
+
142
+ # Extract sample name by removing any known extension
143
+ basename = os.path.basename(file)
144
+ sample_name = os.path.splitext(basename)[0]
145
+
146
+ # check if sample_name is already in the samples_df
147
+ if sample_name in self.samples_df["sample_name"].to_list():
148
+ self.logger.warning(
149
+ f"Sample {sample_name} already exists in the study. Skipping.",
150
+ )
151
+ return
152
+
153
+ # check if file exists
154
+ if not os.path.exists(file):
155
+ self.logger.error(f"File {file} does not exist.")
156
+ return
157
+
158
+ # Check for supported file extensions
159
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
160
+ self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
161
+ return
162
+
163
+ # Load the sample based on file type
164
+ ddaobj = Sample()
165
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
166
+
167
+ if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
168
+ ddaobj.load(file)
169
+ else:
170
+ self.logger.error(f"Unsupported file format: {file}")
171
+ return
172
+ if ddaobj.features_df is None and not reset:
173
+ self.logger.warning(
174
+ f"File {file} will be newly processed.",
175
+ )
176
+ ddaobj.features = None
177
+
178
+ if ddaobj.features is None or reset:
179
+ ddaobj.find_features()
180
+ ddaobj.find_adducts(adducts=adducts)
181
+ ddaobj.find_ms2()
182
+
183
+ self.features_maps.append(ddaobj.features)
184
+
185
+ sample_type = "sample" if type is None else type
186
+ if "qc" in sample_name.lower():
187
+ sample_type = "qc"
188
+ if "blank" in sample_name.lower():
189
+ sample_type = "blank"
190
+ map_id_value = str(ddaobj.features.getUniqueId())
191
+
192
+ new_sample = pl.DataFrame(
193
+ {
194
+ "sample_uid": [int(len(self.samples_df) + 1)],
195
+ "sample_name": [sample_name],
196
+ "sample_path": [file],
197
+ "sample_type": [sample_type],
198
+ "size": [int(ddaobj.features.size())],
199
+ "map_id": [map_id_value],
200
+ "file_source": [getattr(ddaobj, 'file_source', file)],
201
+ },
202
+ schema={
203
+ "sample_uid": pl.Int64,
204
+ "sample_name": pl.Utf8,
205
+ "sample_path": pl.Utf8,
206
+ "sample_type": pl.Utf8,
207
+ "size": pl.Int64,
208
+ "map_id": pl.Utf8,
209
+ "file_source": pl.Utf8,
210
+ },
211
+ )
212
+ # save ddaobj to folder if it is set
213
+ if self.folder is not None:
214
+ if not os.path.exists(self.folder):
215
+ os.makedirs(self.folder)
216
+ basename = os.path.basename(file)
217
+ sample_name = os.path.splitext(basename)[0]
218
+ ddaobj.save(os.path.join(self.folder, sample_name + ".sample5"))
219
+ self.samples_df = pl.concat([self.samples_df, new_sample])
220
+
221
+ # Optimized DataFrame operations - chain operations instead of multiple clones
222
+ columns_to_add = [
223
+ pl.lit(len(self.samples_df)).alias("sample_uid"),
224
+ pl.lit(False).alias("filled"),
225
+ pl.lit(-1.0).alias("chrom_area"),
226
+ ]
227
+
228
+ # Only add rt_original if it doesn't exist
229
+ if "rt_original" not in ddaobj.features_df.columns:
230
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
231
+
232
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
233
+
234
+ if self.features_df.is_empty():
235
+ # Create new features_df with feature_uid column
236
+ self.features_df = f_df.with_columns(
237
+ pl.int_range(pl.len()).add(1).alias("feature_uid"),
238
+ ).select(
239
+ ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
240
+ )
241
+ else:
242
+ offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
243
+ # Chain operations and add to existing DataFrame
244
+ f_df = f_df.with_columns(
245
+ pl.int_range(pl.len()).add(offset).alias("feature_uid"),
246
+ ).select(
247
+ ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
248
+ )
249
+ self.features_df = pl.concat([self.features_df, f_df])
250
+ self.logger.debug(
251
+ f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
252
+ )
253
+
254
+
255
+ def load(self, filename=None):
256
+ """
257
+ Load a study from an HDF5 file.
258
+
259
+ Args:
260
+ study: The study object to load into
261
+ filename (str, optional): The path to the HDF5 file to load the study from.
262
+ """
263
+
264
+ # Handle default filename
265
+ if filename is None:
266
+ if self.folder is not None:
267
+ # search for *.study5 in folder
268
+ study5_files = glob.glob(os.path.join(self.folder, "*.study5"))
269
+ if study5_files:
270
+ filename = study5_files[-1]
271
+ else:
272
+ self.logger.error("No .study5 files found in folder")
273
+ return
274
+ else:
275
+ self.logger.error("Either filename or folder must be provided")
276
+ return
277
+
278
+ #self.logger.info(f"Loading study from {filename}")
279
+ self._load_study5(filename)
280
+ # After loading the study, check if consensus XML exists and load it
281
+ consensus_xml_path = filename.replace(".study5", ".consensusXML")
282
+ if os.path.exists(consensus_xml_path):
283
+ self._load_consensusXML(filename=consensus_xml_path)
284
+ # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
285
+ else:
286
+ self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
287
+ self.filename = filename
288
+
289
+
290
+ def _fill_chrom_single_impl(
291
+ self,
292
+ uids=None,
293
+ mz_tol: float = 0.010,
294
+ rt_tol: float = 10.0,
295
+ min_samples_rel: float = 0.0,
296
+ min_samples_abs: int = 2,
297
+ ):
298
+ """Fill missing chromatograms by extracting from raw data.
299
+
300
+ Simplified version that loads one sample at a time without preloading or batching.
301
+
302
+ Args:
303
+ uids: Consensus UIDs to process (default: all)
304
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
305
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
306
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
307
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
308
+ """
309
+ uids = self._get_consensus_uids(uids)
310
+
311
+ self.logger.info("Gap filling...")
312
+ self.logger.debug(
313
+ f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}",
314
+ )
315
+
316
+ # Apply minimum sample filters
317
+ min_number_rel = 1
318
+ min_number_abs = 1
319
+ if isinstance(min_samples_rel, float) and min_samples_rel > 0:
320
+ min_number_rel = int(min_samples_rel * len(self.samples_df))
321
+ if isinstance(min_samples_abs, int) and min_samples_abs > 0:
322
+ min_number_abs = int(min_samples_abs)
323
+ min_number = max(min_number_rel, min_number_abs)
324
+ self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
325
+
326
+ if min_number > 0:
327
+ original_count = len(uids)
328
+ uids = self.consensus_df.filter(
329
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
330
+ )["consensus_uid"].to_list()
331
+ self.logger.debug(
332
+ f"Features to fill: {original_count} -> {len(uids)}",
333
+ )
334
+ self.logger.debug("Identifying missing features...")
335
+ # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
336
+ missing_combinations = self._get_missing_consensus_sample_combinations(uids)
337
+ if not missing_combinations:
338
+ self.logger.info("No missing features found to fill.")
339
+ return
340
+
341
+ # Build lookup dictionaries
342
+ self.logger.debug("Building lookup dictionaries...")
343
+ consensus_info = {}
344
+ consensus_subset = self.consensus_df.select([
345
+ "consensus_uid",
346
+ "rt_start_mean",
347
+ "rt_end_mean",
348
+ "mz",
349
+ "rt",
350
+ ]).filter(pl.col("consensus_uid").is_in(uids))
351
+
352
+ for row in consensus_subset.iter_rows(named=True):
353
+ consensus_info[row["consensus_uid"]] = {
354
+ "rt_start_mean": row["rt_start_mean"],
355
+ "rt_end_mean": row["rt_end_mean"],
356
+ "mz": row["mz"],
357
+ "rt": row["rt"],
358
+ }
359
+
360
+ # Process each sample individually
361
+ # Group missing combinations by sample for efficient processing
362
+ missing_by_sample = {}
363
+ for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
364
+ if sample_name not in missing_by_sample:
365
+ missing_by_sample[sample_name] = {
366
+ "sample_uid": sample_uid,
367
+ "sample_path": sample_path,
368
+ "missing_consensus_uids": [],
369
+ }
370
+ missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
371
+
372
+ new_features: list[dict] = []
373
+ new_mapping: list[dict] = []
374
+ counter = 0
375
+
376
+ self.logger.debug(
377
+ f"Missing features: {len(missing_combinations)} in {len(missing_by_sample)} samples...",
378
+ )
379
+
380
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
381
+
382
+ for sample_name, sample_info in tqdm(
383
+ missing_by_sample.items(),
384
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}File",
385
+ disable=tdqm_disable,
386
+ ):
387
+ # Load this sample
388
+ sample_uid = sample_info["sample_uid"]
389
+ sample_path = sample_info["sample_path"]
390
+ missing_consensus_uids = sample_info["missing_consensus_uids"]
391
+
392
+ try:
393
+ # self.logger.debug(f"Loading sample: {sample_path}")
394
+ file = Sample()
395
+ file.logger_update("WARNING")
396
+ file.load(sample_path)
397
+ except Exception as e:
398
+ self.logger.warning(f"Failed to load sample {sample_name}: {e}")
399
+ continue
400
+
401
+ self.logger.debug(
402
+ f"Sample {sample_name}: Processing {len(missing_consensus_uids)} missing features",
403
+ )
404
+
405
+ # Process each missing feature
406
+ for consensus_uid in missing_consensus_uids:
407
+ cons = consensus_info[consensus_uid]
408
+ mz = cons["mz"]
409
+ rt = cons["rt"]
410
+ rt_start_mean = cons["rt_start_mean"]
411
+ rt_end_mean = cons["rt_end_mean"]
412
+
413
+ # Filter MS1 data for this feature
414
+ if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
415
+ d = file.ms1_df.filter(
416
+ (pl.col("mz") >= mz - mz_tol)
417
+ & (pl.col("mz") <= mz + mz_tol)
418
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
419
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
420
+ )
421
+ else:
422
+ d = pl.DataFrame()
423
+
424
+ # Create chromatogram
425
+ if d.is_empty():
426
+ self.logger.debug(
427
+ f"Feature {consensus_uid}: No MS1 data found, creating empty chromatogram",
428
+ )
429
+ eic = Chromatogram(
430
+ rt=np.array([rt_start_mean, rt_end_mean]),
431
+ inty=np.array([0.0, 0.0]),
432
+ label=f"EIC mz={mz:.4f}",
433
+ file=sample_path,
434
+ mz=mz,
435
+ mz_tol=mz_tol,
436
+ feature_start=rt_start_mean,
437
+ feature_end=rt_end_mean,
438
+ feature_apex=rt,
439
+ )
440
+ max_inty = 0.0
441
+ area = 0.0
442
+ else:
443
+ self.logger.debug(
444
+ f"Feature {consensus_uid}: Found {len(d)} MS1 points, creating EIC",
445
+ )
446
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
447
+
448
+ if len(eic_rt) > 4:
449
+ eic = Chromatogram(
450
+ eic_rt["rt"].to_numpy(),
451
+ eic_rt["inty"].to_numpy(),
452
+ label=f"EIC mz={mz:.4f}",
453
+ file=sample_path,
454
+ mz=mz,
455
+ mz_tol=mz_tol,
456
+ feature_start=rt_start_mean,
457
+ feature_end=rt_end_mean,
458
+ feature_apex=rt,
459
+ ).find_peaks()
460
+ max_inty = np.max(eic.inty)
461
+ area = eic.feature_area
462
+ else:
463
+ eic = Chromatogram(
464
+ eic_rt["rt"].to_numpy(),
465
+ eic_rt["inty"].to_numpy(),
466
+ label=f"EIC mz={mz:.4f}",
467
+ file=sample_path,
468
+ mz=mz,
469
+ mz_tol=mz_tol,
470
+ feature_start=rt_start_mean,
471
+ feature_end=rt_end_mean,
472
+ feature_apex=rt,
473
+ )
474
+ max_inty = 0.0
475
+ area = 0.0
476
+
477
+ # Generate feature UID
478
+ feature_uid = (
479
+ self.features_df["feature_uid"].max() + len(new_features) + 1
480
+ if not self.features_df.is_empty()
481
+ else len(new_features) + 1
482
+ )
483
+
484
+ # Create new feature entry
485
+ new_feature = {
486
+ "sample_uid": sample_uid,
487
+ "feature_uid": feature_uid,
488
+ "feature_id": None,
489
+ "mz": mz,
490
+ "rt": rt,
491
+ "rt_original": None,
492
+ "rt_start": rt_start_mean,
493
+ "rt_end": rt_end_mean,
494
+ "rt_delta": rt_end_mean - rt_start_mean,
495
+ "mz_start": None,
496
+ "mz_end": None,
497
+ "inty": max_inty,
498
+ "quality": None,
499
+ "charge": None,
500
+ "iso": None,
501
+ "iso_of": None,
502
+ "adduct": None,
503
+ "adduct_mass": None,
504
+ "adduct_group": None,
505
+ "chrom": eic,
506
+ "chrom_coherence": None,
507
+ "chrom_prominence": None,
508
+ "chrom_prominence_scaled": None,
509
+ "chrom_height_scaled": None,
510
+ "ms2_scans": None,
511
+ "ms2_specs": None,
512
+ "filled": True,
513
+ "chrom_area": area,
514
+ }
515
+
516
+ new_features.append(new_feature)
517
+ new_mapping.append({
518
+ "consensus_uid": consensus_uid,
519
+ "sample_uid": sample_uid,
520
+ "feature_uid": feature_uid,
521
+ })
522
+ counter += 1
523
+
524
+ # Add new features to DataFrames
525
+ self.logger.debug(f"Adding {len(new_features)} new features to DataFrame...")
526
+ if new_features:
527
+ # Create properly formatted rows
528
+ rows_to_add = []
529
+ for feature_dict in new_features:
530
+ new_row = {}
531
+ for col in self.features_df.columns:
532
+ if col in feature_dict:
533
+ new_row[col] = feature_dict[col]
534
+ else:
535
+ new_row[col] = None
536
+ rows_to_add.append(new_row)
537
+
538
+ # Create and add new DataFrame
539
+ new_df = pl.from_dicts(rows_to_add)
540
+
541
+ # Cast columns to match existing schema
542
+ cast_exprs = []
543
+ for col in self.features_df.columns:
544
+ existing_dtype = self.features_df[col].dtype
545
+ cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
546
+
547
+ new_df = new_df.with_columns(cast_exprs)
548
+ self.features_df = self.features_df.vstack(new_df)
549
+
550
+ # Add consensus mapping
551
+ new_mapping_df = pl.DataFrame(new_mapping)
552
+ self.consensus_mapping_df = pl.concat(
553
+ [self.consensus_mapping_df, new_mapping_df],
554
+ how="diagonal",
555
+ )
556
+
557
+ self.logger.info(f"Filled {counter} chromatograms from raw data.")
558
+
559
+
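# A minimal, self-contained sketch of the EIC extraction step used above: slice
# an MS1 table by an m/z window and a tolerance-padded RT window, then keep the
# maximum intensity per retention-time point. Column names (rt, mz, inty) follow
# the sample ms1_df layout used in this module; the values are synthetic.
import polars as pl

ms1_df = pl.DataFrame({
    "rt":   [60.0, 60.0, 61.0, 62.0, 63.0],
    "mz":   [200.101, 200.130, 200.102, 200.104, 200.103],
    "inty": [1.0e4, 2.0e4, 5.0e4, 8.0e4, 3.0e4],
})
mz, mz_tol = 200.105, 0.010
rt_start_mean, rt_end_mean, rt_tol = 59.0, 64.0, 10.0

d = ms1_df.filter(
    (pl.col("mz") >= mz - mz_tol)
    & (pl.col("mz") <= mz + mz_tol)
    & (pl.col("rt") >= rt_start_mean - rt_tol)
    & (pl.col("rt") <= rt_end_mean + rt_tol)
)
eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
# eic_rt["rt"]   -> [60.0, 61.0, 62.0, 63.0]
# eic_rt["inty"] -> [10000.0, 50000.0, 80000.0, 30000.0]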
560
+ def fill_single(self, **kwargs):
561
+ """Fill missing chromatograms by extracting from raw data.
562
+
563
+ Simplified version that loads one sample at a time without preloading or batching.
564
+
565
+ Parameters:
566
+ **kwargs: Keyword arguments for fill_single parameters. Can include:
567
+ - A fill_defaults instance to set all parameters at once
568
+ - Individual parameter names and values (see fill_defaults for details)
569
+
570
+ Key Parameters:
571
+ uids: Consensus UIDs to process (default: all)
572
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
573
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
574
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
575
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
576
+ """
577
+ # parameters initialization
578
+ from masster.study.defaults import fill_defaults
579
+ params = fill_defaults()
580
+
581
+ for key, value in kwargs.items():
582
+ if isinstance(value, fill_defaults):
583
+ params = value
584
+ self.logger.debug("Using provided fill_defaults parameters")
585
+ else:
586
+ if hasattr(params, key):
587
+ if params.set(key, value, validate=True):
588
+ self.logger.debug(f"Updated parameter {key} = {value}")
589
+ else:
590
+ self.logger.warning(
591
+ f"Failed to set parameter {key} = {value} (validation failed)",
592
+ )
593
+ else:
594
+ self.logger.debug(f"Unknown parameter {key} ignored")
595
+ # end of parameter initialization
596
+
597
+ # Store parameters in the Study object
598
+ self.store_history(["fill_single"], params.to_dict())
599
+ self.logger.debug("Parameters stored to fill_single")
600
+
601
+ # Call the original fill_chrom_single function with extracted parameters
602
+ return _fill_chrom_single_impl(
603
+ self,
604
+ uids=params.get("uids"),
605
+ mz_tol=params.get("mz_tol"),
606
+ rt_tol=params.get("rt_tol"),
607
+ min_samples_rel=params.get("min_samples_rel"),
608
+ min_samples_abs=params.get("min_samples_abs"),
609
+ )
610
+
611
+
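# A usage sketch for fill_single() above. The Study instance and its file are
# hypothetical and therefore commented out; only the parameter-passing pattern
# is illustrated. Parameters can be given individually or bundled into a single
# fill_defaults instance, since the kwargs loop above accepts both forms.
from masster.study.defaults import fill_defaults

# study.fill_single(mz_tol=0.005, rt_tol=5.0, min_samples_abs=2)

# Equivalent call using a defaults object:
params = fill_defaults()
params.set("mz_tol", 0.005, validate=True)
params.set("rt_tol", 5.0, validate=True)
# study.fill_single(params)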
612
+ def _process_sample_for_parallel_fill(
613
+ self,
614
+ sample_info,
615
+ consensus_info,
616
+ uids,
617
+ mz_tol,
618
+ rt_tol,
619
+ missing_combinations_df,
620
+ features_df_max_uid,
621
+ ):
622
+ """Process a single sample for parallel gap filling."""
623
+ sample_uid = sample_info["sample_uid"]
624
+ sample_path = sample_info["sample_path"]
625
+
626
+ new_features: list[dict] = []
627
+ new_mapping: list[dict] = []
628
+ counter = 0
629
+
630
+ try:
631
+ # Load this sample
632
+ file = Sample()
633
+ file.logger_update(level="WARNING")
634
+ file.load(sample_path)
635
+ except Exception:
636
+ # Skip this sample if loading fails
637
+ return new_features, new_mapping, counter
638
+
639
+ # Find missing features for this sample from precomputed combinations
640
+ sample_missing = missing_combinations_df.filter(
641
+ pl.col("sample_uid") == sample_uid,
642
+ )["consensus_uid"].to_list()
643
+
644
+ if not sample_missing:
645
+ return new_features, new_mapping, counter
646
+
647
+ # Process each missing feature
648
+ for consensus_uid in sample_missing:
649
+ cons = consensus_info[consensus_uid]
650
+ mz = cons["mz"]
651
+ rt = cons["rt"]
652
+ rt_start_mean = cons["rt_start_mean"]
653
+ rt_end_mean = cons["rt_end_mean"]
654
+
655
+ # Filter MS1 data for this feature
656
+ if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
657
+ d = file.ms1_df.filter(
658
+ (pl.col("mz") >= mz - mz_tol)
659
+ & (pl.col("mz") <= mz + mz_tol)
660
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
661
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
662
+ )
663
+ else:
664
+ d = pl.DataFrame()
665
+
666
+ # Create chromatogram
667
+ if d.is_empty():
668
+ eic = Chromatogram(
669
+ rt=np.array([rt_start_mean, rt_end_mean]),
670
+ inty=np.array([0.0, 0.0]),
671
+ label=f"EIC mz={mz:.4f}",
672
+ file=sample_path,
673
+ mz=mz,
674
+ mz_tol=mz_tol,
675
+ feature_start=rt_start_mean,
676
+ feature_end=rt_end_mean,
677
+ feature_apex=rt,
678
+ )
679
+ max_inty = 0.0
680
+ area = 0.0
681
+ else:
682
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
683
+
684
+ if len(eic_rt) > 4:
685
+ eic = Chromatogram(
686
+ eic_rt["rt"].to_numpy(),
687
+ eic_rt["inty"].to_numpy(),
688
+ label=f"EIC mz={mz:.4f}",
689
+ file=sample_path,
690
+ mz=mz,
691
+ mz_tol=mz_tol,
692
+ feature_start=rt_start_mean,
693
+ feature_end=rt_end_mean,
694
+ feature_apex=rt,
695
+ ).find_peaks()
696
+ max_inty = np.max(eic.inty)
697
+ area = eic.feature_area
698
+ else:
699
+ eic = Chromatogram(
700
+ eic_rt["rt"].to_numpy(),
701
+ eic_rt["inty"].to_numpy(),
702
+ label=f"EIC mz={mz:.4f}",
703
+ file=sample_path,
704
+ mz=mz,
705
+ mz_tol=mz_tol,
706
+ feature_start=rt_start_mean,
707
+ feature_end=rt_end_mean,
708
+ feature_apex=rt,
709
+ )
710
+ max_inty = 0.0
711
+ area = 0.0
712
+
713
+ # Generate feature UID (will be adjusted later to ensure global uniqueness)
714
+ feature_uid = features_df_max_uid + len(new_features) + 1
715
+
716
+ # Create new feature entry
717
+ new_feature = {
718
+ "sample_uid": sample_uid,
719
+ "feature_uid": feature_uid,
720
+ "feature_id": None,
721
+ "mz": mz,
722
+ "rt": rt,
723
+ "rt_original": None,
724
+ "rt_start": rt_start_mean,
725
+ "rt_end": rt_end_mean,
726
+ "rt_delta": rt_end_mean - rt_start_mean,
727
+ "mz_start": None,
728
+ "mz_end": None,
729
+ "inty": max_inty,
730
+ "quality": None,
731
+ "charge": None,
732
+ "iso": None,
733
+ "iso_of": None,
734
+ "adduct": None,
735
+ "adduct_mass": None,
736
+ "adduct_group": None,
737
+ "chrom": eic,
738
+ "filled": True,
739
+ "chrom_area": area,
740
+ "chrom_coherence": None,
741
+ "chrom_prominence": None,
742
+ "chrom_prominence_scaled": None,
743
+ "chrom_height_scaled": None,
744
+ "ms2_scans": None,
745
+ "ms2_specs": None,
746
+ }
747
+
748
+ new_features.append(new_feature)
749
+ new_mapping.append({
750
+ "consensus_uid": consensus_uid,
751
+ "sample_uid": sample_uid,
752
+ "feature_uid": feature_uid,
753
+ })
754
+ counter += 1
755
+
756
+ return new_features, new_mapping, counter
757
+
758
+
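# A compact sketch of the chromatogram branch used in the worker above, relying
# only on the Chromatogram API visible in this file (constructor, find_peaks(),
# .inty, .feature_area). The import path and the numeric values are assumptions
# made for illustration.
import numpy as np
from masster.chromatogram import Chromatogram

rt = np.array([58.0, 59.0, 60.0, 61.0, 62.0, 63.0])
inty = np.array([0.0, 1.0e4, 8.0e4, 6.0e4, 1.0e4, 0.0])

if len(rt) > 4:
    # enough points: build the EIC and pick peaks to obtain apex height and area
    eic = Chromatogram(rt, inty, label="EIC mz=200.1050", mz=200.105,
                       mz_tol=0.010, feature_start=58.0, feature_end=63.0,
                       feature_apex=60.0).find_peaks()
    max_inty, area = float(np.max(eic.inty)), eic.feature_area
else:
    # too few points: keep a flat placeholder trace with zero height and area
    eic = Chromatogram(rt, inty, label="EIC mz=200.1050")
    max_inty, area = 0.0, 0.0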
759
+ def _fill_chrom_impl(
760
+ self,
761
+ uids=None,
762
+ mz_tol: float = 0.010,
763
+ rt_tol: float = 10.0,
764
+ min_samples_rel: float = 0.0,
765
+ min_samples_abs: int = 2,
766
+ num_workers=4,
767
+ ):
768
+ """Fill missing chromatograms by extracting from raw data using parallel processing.
769
+
770
+ Args:
771
+ uids: Consensus UIDs to process (default: all)
772
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
773
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
774
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
775
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
776
+ num_workers: Number of parallel workers (default: 4)
777
+ """
778
+ uids = self._get_consensus_uids(uids)
779
+
780
+ self.logger.info(f"Gap filling with {num_workers} workers...")
781
+ self.logger.debug(
782
+ f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
783
+ )
784
+
785
+ # Apply minimum sample filters
786
+ min_number_rel = 1
787
+ min_number_abs = 1
788
+ if isinstance(min_samples_rel, float) and min_samples_rel > 0:
789
+ min_number_rel = int(min_samples_rel * len(self.samples_df))
790
+ if isinstance(min_samples_abs, int) and min_samples_abs > 0:
791
+ min_number_abs = int(min_samples_abs)
792
+ min_number = max(min_number_rel, min_number_abs)
793
+
794
+ self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
795
+
796
+ if min_number > 0:
797
+ original_count = len(uids)
798
+ uids = self.consensus_df.filter(
799
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
800
+ )["consensus_uid"].to_list()
801
+ self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
802
+
803
+ # Get missing consensus/sample combinations using the optimized method
804
+ self.logger.debug("Identifying missing features...")
805
+ missing_combinations = self._get_missing_consensus_sample_combinations(uids)
806
+
807
+ if not missing_combinations or len(missing_combinations) == 0:
808
+ self.logger.info("No missing features found to fill.")
809
+ return
810
+
811
+ # Convert to DataFrame for easier processing
812
+ missing_combinations_df = pl.DataFrame(
813
+ missing_combinations,
814
+ schema={
815
+ "consensus_uid": pl.Int64,
816
+ "sample_uid": pl.Int64,
817
+ "sample_name": pl.Utf8,
818
+ "sample_path": pl.Utf8,
819
+ },
820
+ orient="row",
821
+ )
822
+
823
+ # Build lookup dictionaries
824
+ self.logger.debug("Building lookup dictionaries...")
825
+ consensus_info = {}
826
+ consensus_subset = self.consensus_df.select([
827
+ "consensus_uid",
828
+ "rt_start_mean",
829
+ "rt_end_mean",
830
+ "mz",
831
+ "rt",
832
+ ]).filter(pl.col("consensus_uid").is_in(uids))
833
+
834
+ for row in consensus_subset.iter_rows(named=True):
835
+ consensus_info[row["consensus_uid"]] = {
836
+ "rt_start_mean": row["rt_start_mean"],
837
+ "rt_end_mean": row["rt_end_mean"],
838
+ "mz": row["mz"],
839
+ "rt": row["rt"],
840
+ }
841
+
842
+ # Get sample info for all samples that need processing
843
+ samples_to_process = []
844
+ unique_sample_uids = missing_combinations_df["sample_uid"].unique().to_list()
845
+
846
+ for row in self.samples_df.filter(
847
+ pl.col("sample_uid").is_in(unique_sample_uids),
848
+ ).iter_rows(named=True):
849
+ samples_to_process.append({
850
+ "sample_name": row["sample_name"],
851
+ "sample_uid": row["sample_uid"],
852
+ "sample_path": row["sample_path"],
853
+ })
854
+
855
+ total_missing = len(missing_combinations_df)
856
+ total_samples = len(samples_to_process)
857
+
858
+ self.logger.debug(
859
+ f"Gap filling for {total_missing} missing features...",
860
+ )
861
+
862
+ # Calculate current max feature_uid to avoid conflicts
863
+ features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
864
+
865
+ # Process samples in parallel
866
+ all_new_features: list[dict] = []
867
+ all_new_mapping: list[dict] = []
868
+ total_counter = 0
869
+
870
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
871
+
872
+ with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
873
+ # Submit all samples for processing
874
+ future_to_sample = {}
875
+ for sample_info in samples_to_process:
876
+ future = executor.submit(
877
+ self._process_sample_for_parallel_fill,
878
+ sample_info,
879
+ consensus_info,
880
+ uids,
881
+ mz_tol,
882
+ rt_tol,
883
+ missing_combinations_df,
884
+ features_df_max_uid,
885
+ )
886
+ future_to_sample[future] = sample_info
887
+
888
+ # Collect results with progress bar
889
+ with tqdm(
890
+ total=len(samples_to_process),
891
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Processing samples",
892
+ disable=tqdm_disable,
893
+ ) as pbar:
894
+ for future in concurrent.futures.as_completed(future_to_sample):
895
+ try:
896
+ new_features, new_mapping, counter = future.result()
897
+
898
+ # Adjust feature UIDs to ensure global uniqueness
899
+ uid_offset = features_df_max_uid + len(all_new_features)
900
+ for i, feature in enumerate(new_features):
901
+ feature["feature_uid"] = uid_offset + i + 1
902
+ for i, mapping in enumerate(new_mapping):
903
+ mapping["feature_uid"] = uid_offset + i + 1
904
+
905
+ all_new_features.extend(new_features)
906
+ all_new_mapping.extend(new_mapping)
907
+ total_counter += counter
908
+
909
+ except Exception as e:
910
+ sample_info = future_to_sample[future]
911
+ self.logger.warning(
912
+ f"Sample {sample_info['sample_name']} failed: {e}",
913
+ )
914
+
915
+ pbar.update(1)
916
+
917
+ # Add new features to DataFrames
918
+ self.logger.debug(f"Adding {len(all_new_features)} new features to DataFrame...")
919
+ if all_new_features:
920
+ # Create properly formatted rows
921
+ rows_to_add = []
922
+ for feature_dict in all_new_features:
923
+ new_row = {}
924
+ for col in self.features_df.columns:
925
+ if col in feature_dict:
926
+ new_row[col] = feature_dict[col]
927
+ else:
928
+ new_row[col] = None
929
+ rows_to_add.append(new_row)
930
+
931
+ # Create and add new DataFrame
932
+ new_df = pl.from_dicts(rows_to_add)
933
+
934
+ # Cast columns to match existing schema
935
+ cast_exprs = []
936
+ for col in self.features_df.columns:
937
+ existing_dtype = self.features_df[col].dtype
938
+ cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
939
+
940
+ new_df = new_df.with_columns(cast_exprs)
941
+ self.features_df = self.features_df.vstack(new_df)
942
+
943
+ # Add consensus mapping
944
+ new_mapping_df = pl.DataFrame(all_new_mapping)
945
+ self.consensus_mapping_df = pl.concat(
946
+ [self.consensus_mapping_df, new_mapping_df],
947
+ how="diagonal",
948
+ )
949
+
950
+ self.logger.info(
951
+ f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
952
+ )
953
+
954
+
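# The feature_uid values produced inside each worker are provisional; the
# result-collection loop above reassigns them with a running offset so that UIDs
# stay unique across samples. A toy sketch of that reassignment:
features_df_max_uid = 100
all_new_features, all_new_mapping = [], []
worker_results = [
    ([{"feature_uid": 101}], [{"feature_uid": 101}]),   # results from sample A
    ([{"feature_uid": 101}], [{"feature_uid": 101}]),   # results from sample B
]
for new_features, new_mapping in worker_results:
    uid_offset = features_df_max_uid + len(all_new_features)
    for i, feature in enumerate(new_features):
        feature["feature_uid"] = uid_offset + i + 1
    for i, mapping in enumerate(new_mapping):
        mapping["feature_uid"] = uid_offset + i + 1
    all_new_features.extend(new_features)
    all_new_mapping.extend(new_mapping)
# sample A's feature ends up with uid 101, sample B's with uid 102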
955
+ def fill(self, **kwargs):
956
+ """Fill missing chromatograms by extracting from raw data using parallel processing.
957
+
958
+ Parameters:
959
+ **kwargs: Keyword arguments for fill parameters. Can include:
960
+ - A fill_defaults instance to set all parameters at once
961
+ - Individual parameter names and values (see fill_defaults for details)
962
+
963
+ Key Parameters:
964
+ uids: Consensus UIDs to process (default: all)
965
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
966
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
967
+ min_samples_rel: Relative minimum sample threshold (default: 0.05)
968
+ min_samples_abs: Absolute minimum sample threshold (default: 5)
969
+ num_workers: Number of parallel workers (default: 4)
970
+ """
971
+ # parameters initialization
972
+ params = fill_defaults()
973
+ num_workers = kwargs.get("num_workers", 4) # Default parameter not in defaults class
974
+
975
+ for key, value in kwargs.items():
976
+ if isinstance(value, fill_defaults):
977
+ params = value
978
+ self.logger.debug("Using provided fill_defaults parameters")
979
+ else:
980
+ if hasattr(params, key):
981
+ if params.set(key, value, validate=True):
982
+ self.logger.debug(f"Updated parameter {key} = {value}")
983
+ else:
984
+ self.logger.warning(
985
+ f"Failed to set parameter {key} = {value} (validation failed)",
986
+ )
987
+ elif key != "num_workers": # Allow num_workers as valid parameter
988
+ self.logger.debug(f"Unknown parameter {key} ignored")
989
+ # end of parameter initialization
990
+
991
+ # Store parameters in the Study object
992
+ self.store_history(["fill"], params.to_dict())
993
+ self.logger.debug("Parameters stored to fill")
994
+
995
+ # Call the original fill_chrom function with extracted parameters
996
+ return _fill_chrom_impl(
997
+ self,
998
+ uids=params.get("uids"),
999
+ mz_tol=params.get("mz_tol"),
1000
+ rt_tol=params.get("rt_tol"),
1001
+ min_samples_rel=params.get("min_samples_rel"),
1002
+ min_samples_abs=params.get("min_samples_abs"),
1003
+ num_workers=num_workers,
1004
+ )
1005
+
1006
+
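# A usage sketch for fill() above (hypothetical Study instance, hence commented
# out; num_workers is passed as a plain kwarg because it is not part of
# fill_defaults):
# study.fill(mz_tol=0.010, rt_tol=10.0, min_samples_abs=5, num_workers=8)

# Worked example of the gap-filling threshold computed in _fill_chrom_impl:
# the relative and absolute minima are combined with max().
n_samples = 40
min_samples_rel, min_samples_abs = 0.05, 5
min_number = max(int(min_samples_rel * n_samples), int(min_samples_abs))
assert min_number == 5   # only features detected in >= 5 samples are gap-filled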
1007
+ # Backward compatibility alias
1008
+ fill_chrom = fill
1009
+
1010
+
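# The row-append step used by both fill implementations above: new rows are
# built as plain dicts, cast column-by-column to the existing schema, then
# vstacked. A compact, self-contained polars sketch:
import polars as pl

features_df = pl.DataFrame({"feature_uid": [1, 2], "mz": [100.1, 200.2]})
rows_to_add = [{"feature_uid": 3, "mz": None}]   # missing values become nulls

new_df = pl.from_dicts(rows_to_add).with_columns([
    pl.col(c).cast(features_df[c].dtype, strict=False) for c in features_df.columns
])
features_df = features_df.vstack(new_df)
# features_df now holds 3 rows with a consistent schema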
1011
+ def _get_missing_consensus_sample_combinations(self, uids):
1012
+ """
1013
+ Efficiently identify which consensus_uid/sample combinations are missing.
1014
+ Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
1015
+ """
1016
+ # Get all consensus UIDs we're interested in
1017
+ consensus_uids_set = set(uids)
1018
+
1019
+ # Get all sample UIDs and create lookup
1020
+ all_sample_info = {}
1021
+ for row in self.samples_df.select([
1022
+ "sample_uid",
1023
+ "sample_name",
1024
+ "sample_path",
1025
+ ]).iter_rows(named=True):
1026
+ all_sample_info[row["sample_uid"]] = {
1027
+ "sample_name": row["sample_name"],
1028
+ "sample_path": row["sample_path"],
1029
+ }
1030
+
1031
+ # Get existing consensus/sample combinations from consensus_mapping_df
1032
+ existing_combinations = set()
1033
+ consensus_mapping_filtered = self.consensus_mapping_df.filter(
1034
+ pl.col("consensus_uid").is_in(list(consensus_uids_set)),
1035
+ )
1036
+
1037
+ # Join with features_df to get sample_uid information
1038
+ existing_features = consensus_mapping_filtered.join(
1039
+ self.features_df.select(["feature_uid", "sample_uid"]),
1040
+ on="feature_uid",
1041
+ how="inner",
1042
+ )
1043
+
1044
+ for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
1045
+ existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
1046
+
1047
+ # Find missing combinations
1048
+ missing_combinations = []
1049
+ for consensus_uid in consensus_uids_set:
1050
+ for sample_uid, sample_info in all_sample_info.items():
1051
+ if (consensus_uid, sample_uid) not in existing_combinations:
1052
+ missing_combinations.append((
1053
+ consensus_uid,
1054
+ sample_uid,
1055
+ sample_info["sample_name"],
1056
+ sample_info["sample_path"],
1057
+ ))
1058
+
1059
+ return missing_combinations
1060
+
1061
+
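# The helper above is essentially a set difference between "every consensus x
# sample pair of interest" and "pairs that already have a detected feature"
# (taken from consensus_mapping_df joined to features_df). Toy sketch:
consensus_uids = {1, 2}
sample_uids = {10, 11}
existing = {(1, 10), (2, 10), (2, 11)}   # (consensus_uid, sample_uid) pairs
missing = [
    (c, s)
    for c in consensus_uids
    for s in sample_uids
    if (c, s) not in existing
]
# missing -> [(1, 11)]: consensus feature 1 was never detected in sample 11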
1062
+ def sanitize(self):
1063
+ """
1064
+ Sanitize features DataFrame to ensure all complex objects are properly typed.
1065
+ Convert serialized objects back to their proper types (Chromatogram, Spectrum).
1066
+ """
1067
+ if self.features_df is None or self.features_df.is_empty():
1068
+ return
1069
+
1070
+ self.logger.debug(
1071
+ "Sanitizing features DataFrame to ensure all complex objects are properly typed.",
1072
+ )
1073
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1074
+
1075
+ # Check if we have object columns that need sanitization
1076
+ has_chrom = "chrom" in self.features_df.columns
1077
+ has_ms2_specs = "ms2_specs" in self.features_df.columns
1078
+
1079
+ if not has_chrom and not has_ms2_specs:
1080
+ self.logger.debug("No object columns found that need sanitization.")
1081
+ return
1082
+
1083
+ # Convert to list of dictionaries for easier manipulation
1084
+ rows_data = []
1085
+
1086
+ for row_dict in tqdm(
1087
+ self.features_df.iter_rows(named=True),
1088
+ total=len(self.features_df),
1089
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO |{self.log_label}Sanitize features",
1090
+ disable=tqdm_disable,
1091
+ ):
1092
+ row_data = dict(row_dict)
1093
+
1094
+ # Sanitize chrom column
1095
+ if has_chrom and row_data["chrom"] is not None:
1096
+ if not isinstance(row_data["chrom"], Chromatogram):
1097
+ try:
1098
+ # Create new Chromatogram and populate from dict if needed
1099
+ new_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
1100
+ if hasattr(row_data["chrom"], "__dict__"):
1101
+ new_chrom.from_dict(row_data["chrom"].__dict__)
1102
+ else:
1103
+ # If it's already a dict
1104
+ new_chrom.from_dict(row_data["chrom"])
1105
+ row_data["chrom"] = new_chrom
1106
+ except Exception as e:
1107
+ self.logger.warning(f"Failed to sanitize chrom object: {e}")
1108
+ row_data["chrom"] = None
1109
+
1110
+ # Sanitize ms2_specs column
1111
+ if has_ms2_specs and row_data["ms2_specs"] is not None:
1112
+ if isinstance(row_data["ms2_specs"], list):
1113
+ sanitized_specs = []
1114
+ for ms2_specs in row_data["ms2_specs"]:
1115
+ if not isinstance(ms2_specs, Spectrum):
1116
+ try:
1117
+ new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1118
+ if hasattr(ms2_specs, "__dict__"):
1119
+ new_ms2_specs.from_dict(ms2_specs.__dict__)
1120
+ else:
1121
+ new_ms2_specs.from_dict(ms2_specs)
1122
+ sanitized_specs.append(new_ms2_specs)
1123
+ except Exception as e:
1124
+ self.logger.warning(
1125
+ f"Failed to sanitize ms2_specs object: {e}",
1126
+ )
1127
+ sanitized_specs.append(None)
1128
+ else:
1129
+ sanitized_specs.append(ms2_specs)
1130
+ row_data["ms2_specs"] = sanitized_specs
1131
+ elif not isinstance(row_data["ms2_specs"], Spectrum):
1132
+ try:
1133
+ new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1134
+ if hasattr(row_data["ms2_specs"], "__dict__"):
1135
+ new_ms2_specs.from_dict(row_data["ms2_specs"].__dict__)
1136
+ else:
1137
+ new_ms2_specs.from_dict(row_data["ms2_specs"])
1138
+ row_data["ms2_specs"] = new_ms2_specs
1139
+ except Exception as e:
1140
+ self.logger.warning(f"Failed to sanitize ms2_specs object: {e}")
1141
+ row_data["ms2_specs"] = None
1142
+
1143
+ rows_data.append(row_data)
1144
+
1145
+ # Recreate the DataFrame with sanitized data
1146
+ try:
1147
+ self.features_df = pl.DataFrame(rows_data)
1148
+ self.logger.success("Features DataFrame sanitization completed successfully.")
1149
+ except Exception as e:
1150
+ self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
1151
+
1152
+
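# Sketch of the round-trip that sanitize() relies on: chromatograms or spectra
# that arrive as plain attribute dicts (e.g. after deserialization) are rebuilt
# by calling from_dict() on a fresh object. The import path is an assumption;
# the dict layout is whatever Chromatogram serializes itself to.
import numpy as np
from masster.chromatogram import Chromatogram

original = Chromatogram(rt=np.array([1.0, 2.0]), inty=np.array([5.0, 7.0]))
as_dict = original.__dict__                     # what a stored row may contain

restored = Chromatogram(rt=np.array([]), inty=np.array([]))
restored.from_dict(as_dict)                     # a proper Chromatogram again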
1153
+ def load_features(self):
1154
+ """Load feature maps from per-sample .featureXML files into self.features_maps."""
1155
+
1156
+ self.features_maps = []
1157
+ self.logger.debug("Loading features from featureXML files.")
1158
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1159
+ for _index, row_dict in tqdm(
1160
+ enumerate(self.samples_df.iter_rows(named=True)),
1161
+ total=len(self.samples_df),
1162
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
1163
+ disable=tqdm_disable,
1164
+ ):
1165
+ if self.folder is not None:
1166
+ filename = os.path.join(
1167
+ self.folder,
1168
+ row_dict["sample_name"] + ".featureXML",
1169
+ )
1170
+ else:
1171
+ filename = os.path.join(
1172
+ os.getcwd(),
1173
+ row_dict["sample_name"] + ".featureXML",
1174
+ )
1175
+ # check if file exists
1176
+ if not os.path.exists(filename):
1177
+ filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
1178
+
1179
+ if not os.path.exists(filename):
1180
+ self.features_maps.append(None)
1181
+ continue
1182
+
1183
+ fh = oms.FeatureXMLFile()
1184
+ fm = oms.FeatureMap()
1185
+ fh.load(filename, fm)
1186
+ self.features_maps.append(fm)
1187
+ self.logger.debug("Features loaded successfully.")
1188
+
1189
+
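# Minimal pyopenms sketch mirroring the loader above (the path is a placeholder
# for a real .featureXML file):
import pyopenms as oms

fm = oms.FeatureMap()
oms.FeatureXMLFile().load("example.featureXML", fm)
print(f"{fm.size()} features loaded")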
1190
+ def _load_consensusXML(self, filename="alignment.consensusXML"):
1191
+ """
1192
+ Load a consensus map from a file.
1193
+ """
1194
+ if not os.path.exists(filename):
1195
+ self.logger.error(f"File {filename} does not exist.")
1196
+ return
1197
+ fh = oms.ConsensusXMLFile()
1198
+ self.consensus_map = oms.ConsensusMap()
1199
+ fh.load(filename, self.consensus_map)
1200
+ self.logger.debug(f"Loaded consensus map from {filename}.")
1201
+
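# Minimal pyopenms sketch for the consensus loader above (placeholder path):
import pyopenms as oms

cm = oms.ConsensusMap()
oms.ConsensusXMLFile().load("alignment.consensusXML", cm)
print(f"{cm.size()} consensus features loaded")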