masster-0.2.5-py3-none-any.whl → masster-0.3.1-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of masster might be problematic.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1685 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +393 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1406 -886
  41. masster/study/helpers.py +1713 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1231 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +161 -134
  48. masster/study/study.py +612 -522
  49. masster/study/study5_schema.json +253 -241
  50. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
  51. masster-0.3.1.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
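
For readers who want to reproduce this file-level comparison locally, the following is a minimal sketch; it is not part of the package or of the registry tooling. It assumes both wheels have already been downloaded into the working directory (for example with `pip download masster==0.2.5 --no-deps` and `pip download masster==0.3.1 --no-deps`), and the wheel paths are placeholders.

    # Minimal sketch: compare the file listings of two wheels (zip archives).
    # Assumption: both wheel files exist locally at the paths given below.
    import zipfile

    def wheel_members(path: str) -> dict[str, int]:
        """Map each file inside a wheel to its uncompressed size in bytes."""
        with zipfile.ZipFile(path) as zf:
            return {info.filename: info.file_size for info in zf.infolist()}

    old = wheel_members("masster-0.2.5-py3-none-any.whl")
    new = wheel_members("masster-0.3.1-py3-none-any.whl")

    added = sorted(set(new) - set(old))      # e.g. masster/sample/lib.py, masster/sample/sciex.py
    removed = sorted(set(old) - set(new))    # e.g. the old 0.2.5 RECORD file
    changed = sorted(f for f in set(old) & set(new) if old[f] != new[f])

    print(f"{len(added)} added, {len(removed)} removed, {len(changed)} changed")

This only compares the file listings and sizes of the two archives; the line-level changes shown below come from the registry's own rendering of the diff.
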
masster/study/load.py CHANGED
@@ -1,1078 +1,1231 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import concurrent.futures
5
- from datetime import datetime
6
-
7
- import numpy as np
8
- import polars as pl
9
- import pyopenms as oms
10
-
11
- from tqdm import tqdm
12
-
13
- from masster.chromatogram import Chromatogram
14
- from masster.study.defaults import fill_chrom_defaults
15
- from masster.sample.sample import Sample
16
- from masster.spectrum import Spectrum
17
-
18
-
19
- # Pre-import heavy modules to avoid repeated loading in add_sample()
20
- try:
21
- import alpharaw.sciex
22
-
23
- ALPHARAW_AVAILABLE = True
24
- except ImportError:
25
- ALPHARAW_AVAILABLE = False
26
-
27
- try:
28
- import pythonnet
29
-
30
- PYTHONNET_AVAILABLE = True
31
- except ImportError:
32
- PYTHONNET_AVAILABLE = False
33
-
34
- import glob
35
-
36
-
37
- def add_folder(
38
- self,
39
- folder=None,
40
- reset=False,
41
- adducts=None,
42
- max_files=None,
43
- ):
44
- if folder is None:
45
- if self.default_folder is not None:
46
- folder = self.default_folder
47
- else:
48
- folder = os.getcwd()
49
-
50
- files = []
51
-
52
- if not any(char in folder for char in ["*", "?", "[", "]"]):
53
- folder = os.path.join(folder, "**", "*.sample5")
54
-
55
- self.logger.debug(f"Adding files from: {folder}")
56
- files = glob.glob(folder, recursive=True)
57
-
58
- not_zero = False
59
-
60
- bname = []
61
- counter = 1
62
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
63
- if len(files) > 0:
64
- for i, file in enumerate(
65
- tqdm(
66
- files,
67
- total=len(files),
68
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add sample5 files",
69
- disable=tdqm_disable,
70
- ),
71
- ):
72
- # check if file is already in the study
73
- if max_files is not None and counter > max_files:
74
- break
75
- self.logger.debug(f"Add file {i + 1}/{len(files)}: {file}")
76
- self.add_sample(file, reset=reset, adducts=adducts)
77
- bname.append(os.path.basename(file))
78
- bname.append(os.path.basename(file).replace(".sample5", ".wiff"))
79
- not_zero = True
80
- counter += 1
81
-
82
- if max_files is not None and counter < max_files:
83
- files = glob.glob(folder.replace('.sample5','*.wiff'), recursive=True)
84
-
85
- if len(files) > 0:
86
- # iterate over all files
87
- for i, file in enumerate(
88
- tqdm(
89
- files,
90
- total=len(files),
91
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add wiff files",
92
- disable=tdqm_disable,
93
- ),
94
- ):
95
- # check if file is already in the study
96
- if os.path.basename(file) in bname:
97
- continue
98
- if max_files is not None and counter > max_files:
99
- break
100
- self.logger.debug(f"Add file {i + 1}/{len(files)}: {file}")
101
- self.add_sample(file, reset=reset, adducts=adducts)
102
- not_zero = True
103
-
104
- if max_files is not None and counter > max_files:
105
- self.logger.info(
106
- f"Reached maximum number of files to add: {max_files}. Stopping further additions.",
107
- )
108
-
109
- if not not_zero:
110
- self.logger.error(
111
- f"No files found in {folder}. Please check the folder path or file patterns.",
112
- )
113
-
114
-
115
-
116
- # TODO type is not used
117
- def add_sample(self, file, type=None, reset=False, adducts=None):
118
- self.logger.debug(f"Adding: {file}")
119
- sample_name = (
120
- os.path.basename(file)
121
- .replace(".mzpkl", "")
122
- .replace(".wiff", "")
123
- .replace(".h5", "")
124
- .replace(".sample5", "")
125
- )
126
- # check is sample_name is already in the samples_df
127
- if sample_name in self.samples_df["sample_name"].to_list():
128
- self.logger.warning(
129
- f"Sample {sample_name} already exists in the study. Skipping.",
130
- )
131
- return
132
-
133
- # check if file exists
134
- if not os.path.exists(file):
135
- self.logger.error(f"File {file} does not exist.")
136
- return
137
-
138
- if not file.endswith((".sample5", ".wiff", ".mzML")):
139
- self.logger.error(f"File {file} is not a valid sample5 file.")
140
- return
141
-
142
- # try:
143
- if file.endswith((".sample5")):
144
- ddaobj = Sample()
145
- ddaobj.logger_update(level='WARNING', label=os.path.basename(file))
146
- ddaobj.load(file)
147
- elif file.endswith(".wiff"):
148
- ddaobj = Sample()
149
- ddaobj.logger_update(level='WARNING', label=os.path.basename(file))
150
- ddaobj.load(file)
151
- if ddaobj.features_df is None and not reset:
152
- self.logger.warning(
153
- f"File {file} will be newly processed.",
154
- )
155
- ddaobj.features = None
156
-
157
- if ddaobj.features is None or reset:
158
- ddaobj.find_features()
159
- ddaobj.find_adducts(adducts=adducts)
160
- ddaobj.filter_features(coherence=0.3, prominence_scaled=1.0)
161
- ddaobj.find_ms2()
162
-
163
- self.features_maps.append(ddaobj.features)
164
-
165
- sample_type = "sample" if type is None else type
166
- if "qc" in sample_name.lower():
167
- sample_type = "qc"
168
- if "blank" in sample_name.lower():
169
- sample_type = "blank"
170
- map_id_value = str(ddaobj.features.getUniqueId())
171
-
172
- new_sample = pl.DataFrame(
173
- {
174
- "sample_uid": [int(len(self.samples_df) + 1)],
175
- "sample_name": [sample_name],
176
- "sample_path": [file],
177
- "sample_type": [sample_type],
178
- "size": [int(ddaobj.features.size())],
179
- "map_id": [map_id_value],
180
- },
181
- schema={
182
- "sample_uid": pl.Int64,
183
- "sample_name": pl.Utf8,
184
- "sample_path": pl.Utf8,
185
- "sample_type": pl.Utf8,
186
- "size": pl.Int64,
187
- "map_id": pl.Utf8,
188
- },
189
- )
190
- # save ddaobj to default_folder if it is set
191
- if self.default_folder is not None:
192
- if not os.path.exists(self.default_folder):
193
- os.makedirs(self.default_folder)
194
- basename = os.path.basename(file)
195
- sample_name = os.path.splitext(basename)[0]
196
- ddaobj.save(os.path.join(self.default_folder, sample_name + ".sample5"))
197
- self.samples_df = pl.concat([self.samples_df, new_sample])
198
-
199
- # Optimized DataFrame operations - chain operations instead of multiple clones
200
- columns_to_add = [
201
- pl.lit(len(self.samples_df)).alias("sample_uid"),
202
- pl.lit(False).alias("filled"),
203
- pl.lit(-1.0).alias("chrom_area"),
204
- ]
205
-
206
- # Only add rt_original if it doesn't exist
207
- if "rt_original" not in ddaobj.features_df.columns:
208
- columns_to_add.append(pl.col("rt").alias("rt_original"))
209
-
210
- f_df = ddaobj.features_df.with_columns(columns_to_add)
211
-
212
- if self.features_df.is_empty():
213
- # Create new features_df with feature_uid column
214
- self.features_df = f_df.with_columns(
215
- pl.int_range(pl.len()).add(1).alias("feature_uid"),
216
- ).select(
217
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
218
- )
219
- else:
220
- offset = (
221
- self.features_df["feature_uid"].max() + 1
222
- if not self.features_df.is_empty()
223
- else 1
224
- )
225
- # Chain operations and add to existing DataFrame
226
- f_df = f_df.with_columns(
227
- pl.int_range(pl.len()).add(offset).alias("feature_uid"),
228
- ).select(
229
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
230
- )
231
- self.features_df = pl.concat([self.features_df, f_df])
232
- self.logger.debug(
233
- f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
234
- )
235
-
236
-
237
- def load(self, filename=None):
238
- """
239
- Load a study from an HDF5 file.
240
-
241
- Args:
242
- study: The study object to load into
243
- filename (str, optional): The path to the HDF5 file to load the study from.
244
- """
245
-
246
- # Handle default filename
247
- if filename is None:
248
- if self.default_folder is not None:
249
- # search for *.study5 in default_folder
250
- study5_files = glob.glob(os.path.join(self.default_folder, "*.study5"))
251
- if study5_files:
252
- filename = study5_files[-1]
253
- else:
254
- self.logger.error("No .study5 files found in default_folder")
255
- return
256
- else:
257
- self.logger.error("Either filename or default_folder must be provided")
258
- return
259
-
260
- self.logger.info(f"Loading study from {filename}")
261
- self._load_study5(filename)
262
- # After loading the study, check if consensus XML exists and load it
263
- consensus_xml_path = filename.replace(".study5", ".consensusXML")
264
- if os.path.exists(consensus_xml_path):
265
- self._load_consensusXML(filename=consensus_xml_path)
266
- # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
267
- else:
268
- self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
269
-
270
-
271
- def fill_chrom_single(
272
- self,
273
- uids=None,
274
- mz_tol: float = 0.010,
275
- rt_tol: float = 10.0,
276
- min_samples_rel: float = 0.0,
277
- min_samples_abs: int = 2,
278
- ):
279
- """Fill missing chromatograms by extracting from raw data.
280
-
281
- Simplified version that loads one sample at a time without preloading or batching.
282
-
283
- Args:
284
- uids: Consensus UIDs to process (default: all)
285
- mz_tol: m/z tolerance for extraction (default: 0.010 Da)
286
- rt_tol: RT tolerance for extraction (default: 10.0 seconds)
287
- min_samples_rel: Relative minimum sample threshold (default: 0.0)
288
- min_samples_abs: Absolute minimum sample threshold (default: 2)
289
- """
290
- uids = self._get_consensus_uids(uids)
291
-
292
- self.logger.info("Gap filling...")
293
- self.logger.debug(
294
- f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}",
295
- )
296
-
297
- # Apply minimum sample filters
298
- min_number_rel = 1
299
- min_number_abs = 1
300
- if isinstance(min_samples_rel, float) and min_samples_rel > 0:
301
- min_number_rel = int(min_samples_rel * len(self.samples_df))
302
- if isinstance(min_samples_abs, int) and min_samples_abs > 0:
303
- min_number_abs = int(min_samples_abs)
304
- min_number = max(min_number_rel, min_number_abs)
305
- self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
306
-
307
- if min_number > 0:
308
- original_count = len(uids)
309
- uids = self.consensus_df.filter(
310
- (pl.col("number_samples") >= min_number)
311
- & (pl.col("consensus_uid").is_in(uids)),
312
- )["consensus_uid"].to_list()
313
- self.logger.debug(
314
- f"Features to fill: {original_count} -> {len(uids)}",
315
- )
316
- self.logger.debug("Identifying missing features...")
317
- # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
318
- missing_combinations = self._get_missing_consensus_sample_combinations(uids)
319
- if not missing_combinations:
320
- self.logger.info("No missing features found to fill.")
321
- return
322
-
323
- # Build lookup dictionaries
324
- self.logger.debug("Building lookup dictionaries...")
325
- consensus_info = {}
326
- consensus_subset = self.consensus_df.select([
327
- "consensus_uid",
328
- "rt_start_mean",
329
- "rt_end_mean",
330
- "mz",
331
- "rt",
332
- ]).filter(pl.col("consensus_uid").is_in(uids))
333
-
334
- for row in consensus_subset.iter_rows(named=True):
335
- consensus_info[row["consensus_uid"]] = {
336
- "rt_start_mean": row["rt_start_mean"],
337
- "rt_end_mean": row["rt_end_mean"],
338
- "mz": row["mz"],
339
- "rt": row["rt"],
340
- }
341
-
342
- # Process each sample individually
343
- # Group missing combinations by sample for efficient processing
344
- missing_by_sample = {}
345
- for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
346
- if sample_name not in missing_by_sample:
347
- missing_by_sample[sample_name] = {
348
- "sample_uid": sample_uid,
349
- "sample_path": sample_path,
350
- "missing_consensus_uids": [],
351
- }
352
- missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
353
-
354
- new_features: list[dict] = []
355
- new_mapping: list[dict] = []
356
- counter = 0
357
-
358
- self.logger.debug(
359
- f"Missing features: {len(missing_combinations)} in {len(missing_by_sample)} samples...",
360
- )
361
-
362
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
363
-
364
- for sample_name, sample_info in tqdm(
365
- missing_by_sample.items(),
366
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}File",
367
- disable=tdqm_disable,
368
- ):
369
- # Load this sample
370
- sample_uid = sample_info["sample_uid"]
371
- sample_path = sample_info["sample_path"]
372
- missing_consensus_uids = sample_info["missing_consensus_uids"]
373
-
374
- try:
375
- # self.logger.debug(f"Loading sample: {sample_path}")
376
- file = Sample()
377
- file.logger_update("WARNING")
378
- file.load(sample_path)
379
- except Exception as e:
380
- self.logger.warning(f"Failed to load sample {sample_name}: {e}")
381
- continue
382
-
383
- self.logger.debug(
384
- f"Sample {sample_name}: Processing {len(missing_consensus_uids)} missing features",
385
- )
386
-
387
- # Process each missing feature
388
- for consensus_uid in missing_consensus_uids:
389
- cons = consensus_info[consensus_uid]
390
- mz = cons["mz"]
391
- rt = cons["rt"]
392
- rt_start_mean = cons["rt_start_mean"]
393
- rt_end_mean = cons["rt_end_mean"]
394
-
395
- # Filter MS1 data for this feature
396
- if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
397
- d = file.ms1_df.filter(
398
- (pl.col("mz") >= mz - mz_tol)
399
- & (pl.col("mz") <= mz + mz_tol)
400
- & (pl.col("rt") >= rt_start_mean - rt_tol)
401
- & (pl.col("rt") <= rt_end_mean + rt_tol),
402
- )
403
- else:
404
- d = pl.DataFrame()
405
-
406
- # Create chromatogram
407
- if d.is_empty():
408
- self.logger.debug(
409
- f"Feature {consensus_uid}: No MS1 data found, creating empty chromatogram",
410
- )
411
- eic = Chromatogram(
412
- rt=np.array([rt_start_mean, rt_end_mean]),
413
- inty=np.array([0.0, 0.0]),
414
- label=f"EIC mz={mz:.4f}",
415
- file=sample_path,
416
- mz=mz,
417
- mz_tol=mz_tol,
418
- feature_start=rt_start_mean,
419
- feature_end=rt_end_mean,
420
- feature_apex=rt,
421
- )
422
- max_inty = 0.0
423
- area = 0.0
424
- else:
425
- self.logger.debug(
426
- f"Feature {consensus_uid}: Found {len(d)} MS1 points, creating EIC",
427
- )
428
- eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
429
-
430
- if len(eic_rt) > 4:
431
- eic = Chromatogram(
432
- eic_rt["rt"].to_numpy(),
433
- eic_rt["inty"].to_numpy(),
434
- label=f"EIC mz={mz:.4f}",
435
- file=sample_path,
436
- mz=mz,
437
- mz_tol=mz_tol,
438
- feature_start=rt_start_mean,
439
- feature_end=rt_end_mean,
440
- feature_apex=rt,
441
- ).find_peaks()
442
- max_inty = np.max(eic.inty)
443
- area = eic.feature_area
444
- else:
445
- eic = Chromatogram(
446
- eic_rt["rt"].to_numpy(),
447
- eic_rt["inty"].to_numpy(),
448
- label=f"EIC mz={mz:.4f}",
449
- file=sample_path,
450
- mz=mz,
451
- mz_tol=mz_tol,
452
- feature_start=rt_start_mean,
453
- feature_end=rt_end_mean,
454
- feature_apex=rt,
455
- )
456
- max_inty = 0.0
457
- area = 0.0
458
-
459
- # Generate feature UID
460
- feature_uid = (
461
- self.features_df["feature_uid"].max() + len(new_features) + 1
462
- if not self.features_df.is_empty()
463
- else len(new_features) + 1
464
- )
465
-
466
- # Create new feature entry
467
- new_feature = {
468
- "sample_uid": sample_uid,
469
- "feature_uid": feature_uid,
470
- "feature_id": None,
471
- "mz": mz,
472
- "rt": rt,
473
- "rt_original": None,
474
- "rt_start": rt_start_mean,
475
- "rt_end": rt_end_mean,
476
- "rt_delta": rt_end_mean - rt_start_mean,
477
- "mz_start": None,
478
- "mz_end": None,
479
- "inty": max_inty,
480
- "quality": None,
481
- "charge": None,
482
- "iso": None,
483
- "iso_of": None,
484
- "adduct": None,
485
- "adduct_mass": None,
486
- "adduct_group": None,
487
- "chrom": eic,
488
- "chrom_coherence": None,
489
- "chrom_prominence": None,
490
- "chrom_prominence_scaled": None,
491
- "chrom_height_scaled": None,
492
- "ms2_scans": None,
493
- "ms2_specs": None,
494
- "filled": True,
495
- "chrom_area": area,
496
- }
497
-
498
- new_features.append(new_feature)
499
- new_mapping.append({
500
- "consensus_uid": consensus_uid,
501
- "sample_uid": sample_uid,
502
- "feature_uid": feature_uid,
503
- })
504
- counter += 1
505
-
506
- # Add new features to DataFrames
507
- self.logger.debug(f"Adding {len(new_features)} new features to DataFrame...")
508
- if new_features:
509
- # Create properly formatted rows
510
- rows_to_add = []
511
- for feature_dict in new_features:
512
- new_row = {}
513
- for col in self.features_df.columns:
514
- if col in feature_dict:
515
- new_row[col] = feature_dict[col]
516
- else:
517
- new_row[col] = None
518
- rows_to_add.append(new_row)
519
-
520
- # Create and add new DataFrame
521
- new_df = pl.from_dicts(rows_to_add)
522
-
523
- # Cast columns to match existing schema
524
- cast_exprs = []
525
- for col in self.features_df.columns:
526
- existing_dtype = self.features_df[col].dtype
527
- cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
528
-
529
- new_df = new_df.with_columns(cast_exprs)
530
- self.features_df = self.features_df.vstack(new_df)
531
-
532
- # Add consensus mapping
533
- new_mapping_df = pl.DataFrame(new_mapping)
534
- self.consensus_mapping_df = pl.concat(
535
- [self.consensus_mapping_df, new_mapping_df],
536
- how="diagonal",
537
- )
538
-
539
- self.logger.info(f"Filled {counter} chromatograms from raw data.")
540
-
541
-
542
- def _process_sample_for_parallel_fill(
543
- self,
544
- sample_info,
545
- consensus_info,
546
- uids,
547
- mz_tol,
548
- rt_tol,
549
- missing_combinations_df,
550
- features_df_max_uid,
551
- ):
552
- """Process a single sample for parallel gap filling."""
553
- sample_uid = sample_info["sample_uid"]
554
- sample_path = sample_info["sample_path"]
555
-
556
- new_features: list[dict] = []
557
- new_mapping: list[dict] = []
558
- counter = 0
559
-
560
- try:
561
- # Load this sample
562
- file = Sample()
563
- file.logger_update(level="WARNING")
564
- file.load(sample_path)
565
- except Exception:
566
- # Skip this sample if loading fails
567
- return new_features, new_mapping, counter
568
-
569
- # Find missing features for this sample from precomputed combinations
570
- sample_missing = missing_combinations_df.filter(
571
- pl.col("sample_uid") == sample_uid,
572
- )["consensus_uid"].to_list()
573
-
574
- if not sample_missing:
575
- return new_features, new_mapping, counter
576
-
577
- # Process each missing feature
578
- for consensus_uid in sample_missing:
579
- cons = consensus_info[consensus_uid]
580
- mz = cons["mz"]
581
- rt = cons["rt"]
582
- rt_start_mean = cons["rt_start_mean"]
583
- rt_end_mean = cons["rt_end_mean"]
584
-
585
- # Filter MS1 data for this feature
586
- if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
587
- d = file.ms1_df.filter(
588
- (pl.col("mz") >= mz - mz_tol)
589
- & (pl.col("mz") <= mz + mz_tol)
590
- & (pl.col("rt") >= rt_start_mean - rt_tol)
591
- & (pl.col("rt") <= rt_end_mean + rt_tol),
592
- )
593
- else:
594
- d = pl.DataFrame()
595
-
596
- # Create chromatogram
597
- if d.is_empty():
598
- eic = Chromatogram(
599
- rt=np.array([rt_start_mean, rt_end_mean]),
600
- inty=np.array([0.0, 0.0]),
601
- label=f"EIC mz={mz:.4f}",
602
- file=sample_path,
603
- mz=mz,
604
- mz_tol=mz_tol,
605
- feature_start=rt_start_mean,
606
- feature_end=rt_end_mean,
607
- feature_apex=rt,
608
- )
609
- max_inty = 0.0
610
- area = 0.0
611
- else:
612
- eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
613
-
614
- if len(eic_rt) > 4:
615
- eic = Chromatogram(
616
- eic_rt["rt"].to_numpy(),
617
- eic_rt["inty"].to_numpy(),
618
- label=f"EIC mz={mz:.4f}",
619
- file=sample_path,
620
- mz=mz,
621
- mz_tol=mz_tol,
622
- feature_start=rt_start_mean,
623
- feature_end=rt_end_mean,
624
- feature_apex=rt,
625
- ).find_peaks()
626
- max_inty = np.max(eic.inty)
627
- area = eic.feature_area
628
- else:
629
- eic = Chromatogram(
630
- eic_rt["rt"].to_numpy(),
631
- eic_rt["inty"].to_numpy(),
632
- label=f"EIC mz={mz:.4f}",
633
- file=sample_path,
634
- mz=mz,
635
- mz_tol=mz_tol,
636
- feature_start=rt_start_mean,
637
- feature_end=rt_end_mean,
638
- feature_apex=rt,
639
- )
640
- max_inty = 0.0
641
- area = 0.0
642
-
643
- # Generate feature UID (will be adjusted later to ensure global uniqueness)
644
- feature_uid = features_df_max_uid + len(new_features) + 1
645
-
646
- # Create new feature entry
647
- new_feature = {
648
- "sample_uid": sample_uid,
649
- "feature_uid": feature_uid,
650
- "feature_id": None,
651
- "mz": mz,
652
- "rt": rt,
653
- "rt_original": None,
654
- "rt_start": rt_start_mean,
655
- "rt_end": rt_end_mean,
656
- "rt_delta": rt_end_mean - rt_start_mean,
657
- "mz_start": None,
658
- "mz_end": None,
659
- "inty": max_inty,
660
- "quality": None,
661
- "charge": None,
662
- "iso": None,
663
- "iso_of": None,
664
- "adduct": None,
665
- "adduct_mass": None,
666
- "adduct_group": None,
667
- "chrom": eic,
668
- "filled": True,
669
- "chrom_area": area,
670
- "chrom_coherence": None,
671
- "chrom_prominence": None,
672
- "chrom_prominence_scaled": None,
673
- "chrom_height_scaled": None,
674
- "ms2_scans": None,
675
- "ms2_specs": None,
676
- }
677
-
678
- new_features.append(new_feature)
679
- new_mapping.append({
680
- "consensus_uid": consensus_uid,
681
- "sample_uid": sample_uid,
682
- "feature_uid": feature_uid,
683
- })
684
- counter += 1
685
-
686
- return new_features, new_mapping, counter
687
-
688
-
689
- def fill_chrom(
690
- self,
691
- uids=None,
692
- mz_tol: float = 0.010,
693
- rt_tol: float = 10.0,
694
- min_samples_rel: float = 0.0,
695
- min_samples_abs: int = 2,
696
- num_workers=4,
697
- ):
698
- """Fill missing chromatograms by extracting from raw data using parallel processing.
699
-
700
- Args:
701
- uids: Consensus UIDs to process (default: all)
702
- mz_tol: m/z tolerance for extraction (default: 0.010 Da)
703
- rt_tol: RT tolerance for extraction (default: 10.0 seconds)
704
- min_samples_rel: Relative minimum sample threshold (default: 0.0)
705
- min_samples_abs: Absolute minimum sample threshold (default: 2)
706
- num_workers: Number of parallel workers (default: 4)
707
- """
708
- uids = self._get_consensus_uids(uids)
709
-
710
- self.logger.info(f"Gap filling with {num_workers} workers...")
711
- self.logger.debug(
712
- f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
713
- )
714
-
715
- # Apply minimum sample filters
716
- min_number_rel = 1
717
- min_number_abs = 1
718
- if isinstance(min_samples_rel, float) and min_samples_rel > 0:
719
- min_number_rel = int(min_samples_rel * len(self.samples_df))
720
- if isinstance(min_samples_abs, int) and min_samples_abs > 0:
721
- min_number_abs = int(min_samples_abs)
722
- min_number = max(min_number_rel, min_number_abs)
723
-
724
- self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
725
-
726
- if min_number > 0:
727
- original_count = len(uids)
728
- uids = self.consensus_df.filter(
729
- (pl.col("number_samples") >= min_number)
730
- & (pl.col("consensus_uid").is_in(uids)),
731
- )["consensus_uid"].to_list()
732
- self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
733
-
734
- # Get missing consensus/sample combinations using the optimized method
735
- self.logger.debug("Identifying missing features...")
736
- missing_combinations = self._get_missing_consensus_sample_combinations(uids)
737
-
738
- if not missing_combinations or len(missing_combinations) == 0:
739
- self.logger.info("No missing features found to fill.")
740
- return
741
-
742
- # Convert to DataFrame for easier processing
743
- missing_combinations_df = pl.DataFrame(
744
- missing_combinations,
745
- schema={
746
- "consensus_uid": pl.Int64,
747
- "sample_uid": pl.Int64,
748
- "sample_name": pl.Utf8,
749
- "sample_path": pl.Utf8,
750
- },
751
- orient="row",
752
- )
753
-
754
- # Build lookup dictionaries
755
- self.logger.debug("Building lookup dictionaries...")
756
- consensus_info = {}
757
- consensus_subset = self.consensus_df.select([
758
- "consensus_uid",
759
- "rt_start_mean",
760
- "rt_end_mean",
761
- "mz",
762
- "rt",
763
- ]).filter(pl.col("consensus_uid").is_in(uids))
764
-
765
- for row in consensus_subset.iter_rows(named=True):
766
- consensus_info[row["consensus_uid"]] = {
767
- "rt_start_mean": row["rt_start_mean"],
768
- "rt_end_mean": row["rt_end_mean"],
769
- "mz": row["mz"],
770
- "rt": row["rt"],
771
- }
772
-
773
- # Get sample info for all samples that need processing
774
- samples_to_process = []
775
- unique_sample_uids = missing_combinations_df["sample_uid"].unique().to_list()
776
-
777
- for row in self.samples_df.filter(
778
- pl.col("sample_uid").is_in(unique_sample_uids),
779
- ).iter_rows(named=True):
780
- samples_to_process.append({
781
- "sample_name": row["sample_name"],
782
- "sample_uid": row["sample_uid"],
783
- "sample_path": row["sample_path"],
784
- })
785
-
786
- total_missing = len(missing_combinations_df)
787
- total_samples = len(samples_to_process)
788
-
789
- self.logger.info(
790
- f"Gap filling for {total_missing} missing features from {total_samples} samples using {num_workers} workers...",
791
- )
792
-
793
- # Calculate current max feature_uid to avoid conflicts
794
- features_df_max_uid = (
795
- self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
796
- )
797
-
798
- # Process samples in parallel
799
- all_new_features: list[dict] = []
800
- all_new_mapping: list[dict] = []
801
- total_counter = 0
802
-
803
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
804
-
805
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
806
- # Submit all samples for processing
807
- future_to_sample = {}
808
- for sample_info in samples_to_process:
809
- future = executor.submit(
810
- self._process_sample_for_parallel_fill,
811
- sample_info,
812
- consensus_info,
813
- uids,
814
- mz_tol,
815
- rt_tol,
816
- missing_combinations_df,
817
- features_df_max_uid,
818
- )
819
- future_to_sample[future] = sample_info
820
-
821
- # Collect results with progress bar
822
- with tqdm(
823
- total=len(samples_to_process),
824
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Processing samples",
825
- disable=tdqm_disable,
826
- ) as pbar:
827
- for future in concurrent.futures.as_completed(future_to_sample):
828
- try:
829
- new_features, new_mapping, counter = future.result()
830
-
831
- # Adjust feature UIDs to ensure global uniqueness
832
- uid_offset = features_df_max_uid + len(all_new_features)
833
- for i, feature in enumerate(new_features):
834
- feature["feature_uid"] = uid_offset + i + 1
835
- for i, mapping in enumerate(new_mapping):
836
- mapping["feature_uid"] = uid_offset + i + 1
837
-
838
- all_new_features.extend(new_features)
839
- all_new_mapping.extend(new_mapping)
840
- total_counter += counter
841
-
842
- except Exception as e:
843
- sample_info = future_to_sample[future]
844
- self.logger.warning(
845
- f"Sample {sample_info['sample_name']} failed: {e}",
846
- )
847
-
848
- pbar.update(1)
849
-
850
- # Add new features to DataFrames
851
- self.logger.debug(f"Adding {len(all_new_features)} new features to DataFrame...")
852
- if all_new_features:
853
- # Create properly formatted rows
854
- rows_to_add = []
855
- for feature_dict in all_new_features:
856
- new_row = {}
857
- for col in self.features_df.columns:
858
- if col in feature_dict:
859
- new_row[col] = feature_dict[col]
860
- else:
861
- new_row[col] = None
862
- rows_to_add.append(new_row)
863
-
864
- # Create and add new DataFrame
865
- new_df = pl.from_dicts(rows_to_add)
866
-
867
- # Cast columns to match existing schema
868
- cast_exprs = []
869
- for col in self.features_df.columns:
870
- existing_dtype = self.features_df[col].dtype
871
- cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
872
-
873
- new_df = new_df.with_columns(cast_exprs)
874
- self.features_df = self.features_df.vstack(new_df)
875
-
876
- # Add consensus mapping
877
- new_mapping_df = pl.DataFrame(all_new_mapping)
878
- self.consensus_mapping_df = pl.concat(
879
- [self.consensus_mapping_df, new_mapping_df],
880
- how="diagonal",
881
- )
882
-
883
- self.logger.info(
884
- f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
885
- )
886
-
887
-
888
- def _get_missing_consensus_sample_combinations(self, uids):
889
- """
890
- Efficiently identify which consensus_uid/sample combinations are missing.
891
- Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
892
- """
893
- # Get all consensus UIDs we're interested in
894
- consensus_uids_set = set(uids)
895
-
896
- # Get all sample UIDs and create lookup
897
- all_sample_info = {}
898
- for row in self.samples_df.select([
899
- "sample_uid",
900
- "sample_name",
901
- "sample_path",
902
- ]).iter_rows(named=True):
903
- all_sample_info[row["sample_uid"]] = {
904
- "sample_name": row["sample_name"],
905
- "sample_path": row["sample_path"],
906
- }
907
-
908
- # Get existing consensus/sample combinations from consensus_mapping_df
909
- existing_combinations = set()
910
- consensus_mapping_filtered = self.consensus_mapping_df.filter(
911
- pl.col("consensus_uid").is_in(list(consensus_uids_set)),
912
- )
913
-
914
- # Join with features_df to get sample_uid information
915
- existing_features = consensus_mapping_filtered.join(
916
- self.features_df.select(["feature_uid", "sample_uid"]),
917
- on="feature_uid",
918
- how="inner",
919
- )
920
-
921
- for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
922
- existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
923
-
924
- # Find missing combinations
925
- missing_combinations = []
926
- for consensus_uid in consensus_uids_set:
927
- for sample_uid, sample_info in all_sample_info.items():
928
- if (consensus_uid, sample_uid) not in existing_combinations:
929
- missing_combinations.append((
930
- consensus_uid,
931
- sample_uid,
932
- sample_info["sample_name"],
933
- sample_info["sample_path"],
934
- ))
935
-
936
- return missing_combinations
937
-
938
-
939
- def sanitize(self):
940
- """
941
- Sanitize features DataFrame to ensure all complex objects are properly typed.
942
- Convert serialized objects back to their proper types (Chromatogram, Spectrum).
943
- """
944
- if self.features_df is None or self.features_df.is_empty():
945
- return
946
-
947
- self.logger.debug(
948
- "Sanitizing features DataFrame to ensure all complex objects are properly typed.",
949
- )
950
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
951
-
952
- # Check if we have object columns that need sanitization
953
- has_chrom = "chrom" in self.features_df.columns
954
- has_ms2_specs = "ms2_specs" in self.features_df.columns
955
-
956
- if not has_chrom and not has_ms2_specs:
957
- self.logger.debug("No object columns found that need sanitization.")
958
- return
959
-
960
- # Convert to list of dictionaries for easier manipulation
961
- rows_data = []
962
-
963
- for row_dict in tqdm(
964
- self.features_df.iter_rows(named=True),
965
- total=len(self.features_df),
966
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO |{self.log_label}Sanitize features",
967
- disable=tdqm_disable,
968
- ):
969
- row_data = dict(row_dict)
970
-
971
- # Sanitize chrom column
972
- if has_chrom and row_data["chrom"] is not None:
973
- if not isinstance(row_data["chrom"], Chromatogram):
974
- try:
975
- # Create new Chromatogram and populate from dict if needed
976
- new_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
977
- if hasattr(row_data["chrom"], "__dict__"):
978
- new_chrom.from_dict(row_data["chrom"].__dict__)
979
- else:
980
- # If it's already a dict
981
- new_chrom.from_dict(row_data["chrom"])
982
- row_data["chrom"] = new_chrom
983
- except Exception as e:
984
- self.logger.warning(f"Failed to sanitize chrom object: {e}")
985
- row_data["chrom"] = None
986
-
987
- # Sanitize ms2_specs column
988
- if has_ms2_specs and row_data["ms2_specs"] is not None:
989
- if isinstance(row_data["ms2_specs"], list):
990
- sanitized_specs = []
991
- for ms2_specs in row_data["ms2_specs"]:
992
- if not isinstance(ms2_specs, Spectrum):
993
- try:
994
- new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
995
- if hasattr(ms2_specs, "__dict__"):
996
- new_ms2_specs.from_dict(ms2_specs.__dict__)
997
- else:
998
- new_ms2_specs.from_dict(ms2_specs)
999
- sanitized_specs.append(new_ms2_specs)
1000
- except Exception as e:
1001
- self.logger.warning(
1002
- f"Failed to sanitize ms2_specs object: {e}",
1003
- )
1004
- sanitized_specs.append(None)
1005
- else:
1006
- sanitized_specs.append(ms2_specs)
1007
- row_data["ms2_specs"] = sanitized_specs
1008
- elif not isinstance(row_data["ms2_specs"], Spectrum):
1009
- try:
1010
- new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1011
- if hasattr(row_data["ms2_specs"], "__dict__"):
1012
- new_ms2_specs.from_dict(row_data["ms2_specs"].__dict__)
1013
- else:
1014
- new_ms2_specs.from_dict(row_data["ms2_specs"])
1015
- row_data["ms2_specs"] = new_ms2_specs
1016
- except Exception as e:
1017
- self.logger.warning(f"Failed to sanitize ms2_specs object: {e}")
1018
- row_data["ms2_specs"] = None
1019
-
1020
- rows_data.append(row_data)
1021
-
1022
- # Recreate the DataFrame with sanitized data
1023
- try:
1024
- self.features_df = pl.DataFrame(rows_data)
1025
- self.logger.success("Features DataFrame sanitization completed successfully.")
1026
- except Exception as e:
1027
- self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
1028
-
1029
-
1030
- def load_features(self):
1031
- # iterate over all samples in samples_df
1032
-
1033
- self.features_maps = []
1034
- self.logger.debug("Loading features from featureXML files.")
1035
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1036
- for _index, row_dict in tqdm(
1037
- enumerate(self.samples_df.iter_rows(named=True)),
1038
- total=len(self.samples_df),
1039
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
1040
- disable=tdqm_disable,
1041
- ):
1042
- if self.default_folder is not None:
1043
- filename = os.path.join(
1044
- self.default_folder,
1045
- row_dict["sample_name"] + ".featureXML",
1046
- )
1047
- else:
1048
- filename = os.path.join(
1049
- os.getcwd(),
1050
- row_dict["sample_name"] + ".featureXML",
1051
- )
1052
- # check if file exists
1053
- if not os.path.exists(filename):
1054
- filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
1055
-
1056
- if not os.path.exists(filename):
1057
- self.features_maps.append(None)
1058
- continue
1059
-
1060
- fh = oms.FeatureXMLFile()
1061
- fm = oms.FeatureMap()
1062
- fh.load(filename, fm)
1063
- self.features_maps.append(fm)
1064
- self.logger.debug("Features loaded successfully.")
1065
-
1066
-
1067
- def _load_consensusXML(self, filename="alignment.consensusXML"):
1068
- """
1069
- Load a consensus map from a file.
1070
- """
1071
- if not os.path.exists(filename):
1072
- self.logger.error(f"File {filename} does not exist.")
1073
- return
1074
- fh = oms.ConsensusXMLFile()
1075
- self.consensus_map = oms.ConsensusMap()
1076
- fh.load(filename, self.consensus_map)
1077
- self.logger.debug(f"Loaded consensus map from {filename}.")
1078
-
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import concurrent.futures
5
+ from datetime import datetime
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import pyopenms as oms
10
+
11
+ from tqdm import tqdm
12
+
13
+ from masster.chromatogram import Chromatogram
14
+ from masster.study.defaults import fill_defaults
15
+ from masster.sample.sample import Sample
16
+ from masster.spectrum import Spectrum
17
+
18
+
19
+ # Pre-import heavy modules to avoid repeated loading in add_sample()
20
+ try:
21
+ import alpharaw.sciex
22
+
23
+ ALPHARAW_AVAILABLE = True
24
+ except ImportError:
25
+ ALPHARAW_AVAILABLE = False
26
+
27
+ try:
28
+ import pythonnet
29
+
30
+ PYTHONNET_AVAILABLE = True
31
+ except ImportError:
32
+ PYTHONNET_AVAILABLE = False
33
+
34
+ import glob
35
+
36
+
37
+ def add(
38
+ self,
39
+ folder=None,
40
+ reset=False,
41
+ adducts=None,
42
+ max_files=None,
43
+ ):
44
+ if folder is None:
45
+ if self.folder is not None:
46
+ folder = self.folder
47
+ else:
48
+ folder = os.getcwd()
49
+
50
+ self.logger.debug(f"Adding files from: {folder}")
51
+
52
+ # Define file extensions to search for in order of priority
53
+ extensions = [".sample5", ".wiff", ".raw", ".mzML"]
54
+
55
+ # Check if folder contains glob patterns
56
+ if not any(char in folder for char in ["*", "?", "[", "]"]):
57
+ search_folder = folder
58
+ else:
59
+ search_folder = os.path.dirname(folder) if os.path.dirname(folder) else folder
60
+
61
+ # Blacklist to track filenames without extensions that have already been processed
62
+ blacklist = set()
63
+ counter = 0
64
+ not_zero = False
65
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
66
+
67
+ # Search for files in order of priority
68
+ for ext in extensions:
69
+ if max_files is not None and counter >= max_files:
70
+ break
71
+
72
+ # Build search pattern
73
+ if any(char in folder for char in ["*", "?", "[", "]"]):
74
+ # If folder already contains glob patterns, modify the extension
75
+ if folder.endswith("*.sample5"):
76
+ pattern = folder.replace("*.sample5", f"*{ext}")
77
+ else:
78
+ pattern = os.path.join(search_folder, "**", f"*{ext}")
79
+ else:
80
+ pattern = os.path.join(search_folder, "**", f"*{ext}")
81
+
82
+ files = glob.glob(pattern, recursive=True)
83
+
84
+ if len(files) > 0:
85
+ # Limit files if max_files is specified
86
+ remaining_slots = max_files - counter if max_files is not None else len(files)
87
+ files = files[:remaining_slots]
88
+
89
+ self.logger.debug(f"Found {len(files)} {ext} files")
90
+
91
+ # Process files
92
+ for i, file in enumerate(
93
+ tqdm(
94
+ files,
95
+ total=len(files),
96
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
97
+ disable=tdqm_disable,
98
+ ),
99
+ ):
100
+ if max_files is not None and counter >= max_files:
101
+ break
102
+
103
+ # Get filename without extension for blacklist check
104
+ basename = os.path.basename(file)
105
+ filename_no_ext = os.path.splitext(basename)[0]
106
+
107
+ # Check if this filename (without extension) is already in blacklist
108
+ if filename_no_ext in blacklist:
109
+ self.logger.debug(f"Skipping {file} - filename already processed")
110
+ continue
111
+
112
+ self.logger.debug(f"Add file {counter + 1}: {file}")
113
+
114
+ # Try to add the sample
115
+ try:
116
+ self.add_sample(file=file, reset=reset, adducts=adducts)
117
+ # If successful, add to blacklist and increment counter
118
+ blacklist.add(filename_no_ext)
119
+ counter += 1
120
+ not_zero = True
121
+ except Exception as e:
122
+ self.logger.warning(f"Failed to add sample {file}: {e}")
123
+ continue
124
+
125
+ if max_files is not None and counter >= max_files:
126
+ self.logger.debug(
127
+ f"Reached maximum number of files to add: {max_files}. Stopping further additions.",
128
+ )
129
+
130
+ if not not_zero:
131
+ self.logger.warning(
132
+ f"No files found in {folder}. Please check the folder path or file patterns.",
133
+ )
134
+ else:
135
+ self.logger.debug(f"Successfully added {counter} samples to the study.")
136
+
137
+
138
+ # TODO type is not used
139
+ def add_sample(self, file, type=None, reset=False, adducts=None):
140
+ self.logger.debug(f"Adding: {file}")
141
+
142
+ # Extract sample name by removing any known extension
143
+ basename = os.path.basename(file)
144
+ sample_name = os.path.splitext(basename)[0]
145
+
146
+ # check if sample_name is already in the samples_df
147
+ if sample_name in self.samples_df["sample_name"].to_list():
148
+ self.logger.warning(
149
+ f"Sample {sample_name} already exists in the study. Skipping.",
150
+ )
151
+ return
152
+
153
+ # check if file exists
154
+ if not os.path.exists(file):
155
+ self.logger.error(f"File {file} does not exist.")
156
+ return
157
+
158
+ # Check for supported file extensions
159
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
160
+ self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
161
+ return
162
+
163
+ # Load the sample based on file type
164
+ ddaobj = Sample()
165
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
166
+
167
+ if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
168
+ ddaobj.load(file)
169
+ else:
170
+ self.logger.error(f"Unsupported file format: {file}")
171
+ return
172
+ if ddaobj.features_df is None and not reset:
173
+ self.logger.warning(
174
+ f"File {file} will be newly processed.",
175
+ )
176
+ ddaobj.features = None
177
+
178
+ if ddaobj.features is None or reset:
179
+ ddaobj.find_features()
180
+ ddaobj.find_adducts(adducts=adducts)
181
+ ddaobj.find_ms2()
182
+
183
+ self.features_maps.append(ddaobj.features)
184
+
185
+ sample_type = "sample" if type is None else type
186
+ if "qc" in sample_name.lower():
187
+ sample_type = "qc"
188
+ if "blank" in sample_name.lower():
189
+ sample_type = "blank"
190
+ map_id_value = str(ddaobj.features.getUniqueId())
191
+
192
+ # Determine the final sample path based on file type
193
+ if file.endswith(".sample5"):
194
+ # If input is already .sample5, keep it in original location
195
+ final_sample_path = file
196
+ self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
197
+
198
+ # Check if there's a corresponding featureXML file in the same directory
199
+ featurexml_path = file.replace(".sample5", ".featureXML")
200
+ if os.path.exists(featurexml_path):
201
+ self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
202
+ else:
203
+ self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
204
+ else:
205
+ # For .wiff, .mzML, .raw files, save to study folder (original behavior)
206
+ if self.folder is not None:
207
+ if not os.path.exists(self.folder):
208
+ os.makedirs(self.folder)
209
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
210
+ ddaobj.save(final_sample_path)
211
+ self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
212
+ else:
213
+ # If no study folder is set, save in current directory
214
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
215
+ ddaobj.save(final_sample_path)
216
+ self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
217
+
218
+ # Count MS1 and MS2 scans from the loaded sample
219
+ ms1_count = 0
220
+ ms2_count = 0
221
+ if hasattr(ddaobj, 'scans_df') and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
222
+ ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
223
+ ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
224
+
225
+ new_sample = pl.DataFrame(
226
+ {
227
+ "sample_uid": [int(len(self.samples_df) + 1)],
228
+ "sample_name": [sample_name],
229
+ "sample_path": [final_sample_path], # Use the determined path
230
+ "sample_type": [sample_type],
231
+ "size": [int(ddaobj.features.size())],
232
+ "map_id": [map_id_value],
233
+ "file_source": [getattr(ddaobj, 'file_source', file)],
234
+ "ms1": [ms1_count],
235
+ "ms2": [ms2_count],
236
+ },
237
+ schema={
238
+ "sample_uid": pl.Int64,
239
+ "sample_name": pl.Utf8,
240
+ "sample_path": pl.Utf8,
241
+ "sample_type": pl.Utf8,
242
+ "size": pl.Int64,
243
+ "map_id": pl.Utf8,
244
+ "file_source": pl.Utf8,
245
+ "ms1": pl.Int64,
246
+ "ms2": pl.Int64,
247
+ },
248
+ )
249
+ self.samples_df = pl.concat([self.samples_df, new_sample])
250
+
251
+ # Optimized DataFrame operations - chain operations instead of multiple clones
252
+ columns_to_add = [
253
+ pl.lit(len(self.samples_df)).alias("sample_uid"),
254
+ pl.lit(False).alias("filled"),
255
+ pl.lit(-1.0).alias("chrom_area"),
256
+ ]
257
+
258
+ # Only add rt_original if it doesn't exist
259
+ if "rt_original" not in ddaobj.features_df.columns:
260
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
261
+
262
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
263
+
264
+ if self.features_df.is_empty():
265
+ # Create new features_df with feature_uid column
266
+ self.features_df = f_df.with_columns(
267
+ pl.int_range(pl.len()).add(1).alias("feature_uid"),
268
+ ).select(
269
+ ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
270
+ )
271
+ else:
272
+ offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
273
+ # Chain operations and add to existing DataFrame
274
+ f_df = f_df.with_columns(
275
+ pl.int_range(pl.len()).add(offset).alias("feature_uid"),
276
+ ).select(
277
+ ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
278
+ )
279
+ self.features_df = pl.concat([self.features_df, f_df])
280
+ self.logger.debug(
281
+ f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
282
+ )
283
+
284
+
285
+ def load(self, filename=None):
286
+ """
287
+ Load a study from an HDF5 file.
288
+
289
+ Args:
290
+ study: The study object to load into
291
+ filename (str, optional): The path to the HDF5 file to load the study from.
292
+ """
293
+
294
+ # Handle default filename
295
+ if filename is None:
296
+ if self.folder is not None:
297
+ # search for *.study5 in folder
298
+ study5_files = glob.glob(os.path.join(self.folder, "*.study5"))
299
+ if study5_files:
300
+ filename = study5_files[-1]
301
+ else:
302
+ self.logger.error("No .study5 files found in folder")
303
+ return
304
+ else:
305
+ self.logger.error("Either filename or folder must be provided")
306
+ return
307
+
308
+ #self.logger.info(f"Loading study from {filename}")
309
+ self._load_study5(filename)
310
+ # After loading the study, check if consensus XML exists and load it
311
+ consensus_xml_path = filename.replace(".study5", ".consensusXML")
312
+ if os.path.exists(consensus_xml_path):
313
+ self._load_consensusXML(filename=consensus_xml_path)
314
+ # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
315
+ else:
316
+ self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
317
+ self.filename = filename
318
+
319
+
320
+ def _fill_chrom_single_impl(
321
+ self,
322
+ uids=None,
323
+ mz_tol: float = 0.010,
324
+ rt_tol: float = 10.0,
325
+ min_samples_rel: float = 0.0,
326
+ min_samples_abs: int = 2,
327
+ ):
328
+ """Fill missing chromatograms by extracting from raw data.
329
+
330
+ Simplified version that loads one sample at a time without preloading or batching.
331
+
332
+ Args:
333
+ uids: Consensus UIDs to process (default: all)
334
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
335
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
336
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
337
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
338
+ """
339
+ uids = self._get_consensus_uids(uids)
340
+
341
+ self.logger.info("Gap filling...")
342
+ self.logger.debug(
343
+ f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}",
344
+ )
345
+
346
+ # Apply minimum sample filters
347
+ min_number_rel = 1
348
+ min_number_abs = 1
349
+ if isinstance(min_samples_rel, float) and min_samples_rel > 0:
350
+ min_number_rel = int(min_samples_rel * len(self.samples_df))
351
+ if isinstance(min_samples_abs, int) and min_samples_abs > 0:
352
+ min_number_abs = int(min_samples_abs)
353
+ min_number = max(min_number_rel, min_number_abs)
354
+ self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
355
+
356
+ if min_number > 0:
357
+ original_count = len(uids)
358
+ uids = self.consensus_df.filter(
359
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
360
+ )["consensus_uid"].to_list()
361
+ self.logger.debug(
362
+ f"Features to fill: {original_count} -> {len(uids)}",
363
+ )
364
+ self.logger.debug("Identifying missing features...")
365
+ # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
366
+ missing_combinations = self._get_missing_consensus_sample_combinations(uids)
367
+ if not missing_combinations:
368
+ self.logger.info("No missing features found to fill.")
369
+ return
370
+
371
+ # Build lookup dictionaries
372
+ self.logger.debug("Building lookup dictionaries...")
373
+ consensus_info = {}
374
+ consensus_subset = self.consensus_df.select([
375
+ "consensus_uid",
376
+ "rt_start_mean",
377
+ "rt_end_mean",
378
+ "mz",
379
+ "rt",
380
+ ]).filter(pl.col("consensus_uid").is_in(uids))
381
+
382
+ for row in consensus_subset.iter_rows(named=True):
383
+ consensus_info[row["consensus_uid"]] = {
384
+ "rt_start_mean": row["rt_start_mean"],
385
+ "rt_end_mean": row["rt_end_mean"],
386
+ "mz": row["mz"],
387
+ "rt": row["rt"],
388
+ }
389
+
390
+ # Process each sample individually
391
+ # Group missing combinations by sample for efficient processing
392
+ missing_by_sample = {}
393
+ for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
394
+ if sample_name not in missing_by_sample:
395
+ missing_by_sample[sample_name] = {
396
+ "sample_uid": sample_uid,
397
+ "sample_path": sample_path,
398
+ "missing_consensus_uids": [],
399
+ }
400
+ missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
401
+
402
+ new_features: list[dict] = []
403
+ new_mapping: list[dict] = []
404
+ counter = 0
405
+
406
+ self.logger.debug(
407
+ f"Missing features: {len(missing_combinations)} in {len(missing_by_sample)} samples...",
408
+ )
409
+
410
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
411
+
412
+ for sample_name, sample_info in tqdm(
413
+ missing_by_sample.items(),
414
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}File",
415
+ disable=tdqm_disable,
416
+ ):
417
+ # Load this sample
418
+ sample_uid = sample_info["sample_uid"]
419
+ sample_path = sample_info["sample_path"]
420
+ missing_consensus_uids = sample_info["missing_consensus_uids"]
421
+
422
+ try:
423
+ # self.logger.debug(f"Loading sample: {sample_path}")
424
+ file = Sample()
425
+ file.logger_update("WARNING")
426
+ file.load(sample_path)
427
+ except Exception as e:
428
+ self.logger.warning(f"Failed to load sample {sample_name}: {e}")
429
+ continue
430
+
431
+ self.logger.debug(
432
+ f"Sample {sample_name}: Processing {len(missing_consensus_uids)} missing features",
433
+ )
434
+
435
+ # Process each missing feature
436
+ for consensus_uid in missing_consensus_uids:
437
+ cons = consensus_info[consensus_uid]
438
+ mz = cons["mz"]
439
+ rt = cons["rt"]
440
+ rt_start_mean = cons["rt_start_mean"]
441
+ rt_end_mean = cons["rt_end_mean"]
442
+
443
+ # Filter MS1 data for this feature
444
+ if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
445
+ d = file.ms1_df.filter(
446
+ (pl.col("mz") >= mz - mz_tol)
447
+ & (pl.col("mz") <= mz + mz_tol)
448
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
449
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
450
+ )
451
+ else:
452
+ d = pl.DataFrame()
453
+
454
+ # Create chromatogram
455
+ if d.is_empty():
456
+ self.logger.debug(
457
+ f"Feature {consensus_uid}: No MS1 data found, creating empty chromatogram",
458
+ )
459
+ eic = Chromatogram(
460
+ rt=np.array([rt_start_mean, rt_end_mean]),
461
+ inty=np.array([0.0, 0.0]),
462
+ label=f"EIC mz={mz:.4f}",
463
+ file=sample_path,
464
+ mz=mz,
465
+ mz_tol=mz_tol,
466
+ feature_start=rt_start_mean,
467
+ feature_end=rt_end_mean,
468
+ feature_apex=rt,
469
+ )
470
+ max_inty = 0.0
471
+ area = 0.0
472
+ else:
473
+ self.logger.debug(
474
+ f"Feature {consensus_uid}: Found {len(d)} MS1 points, creating EIC",
475
+ )
476
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
477
+
478
+ if len(eic_rt) > 4:
479
+ eic = Chromatogram(
480
+ eic_rt["rt"].to_numpy(),
481
+ eic_rt["inty"].to_numpy(),
482
+ label=f"EIC mz={mz:.4f}",
483
+ file=sample_path,
484
+ mz=mz,
485
+ mz_tol=mz_tol,
486
+ feature_start=rt_start_mean,
487
+ feature_end=rt_end_mean,
488
+ feature_apex=rt,
489
+ ).find_peaks()
490
+ max_inty = np.max(eic.inty)
491
+ area = eic.feature_area
492
+ else:
493
+ eic = Chromatogram(
494
+ eic_rt["rt"].to_numpy(),
495
+ eic_rt["inty"].to_numpy(),
496
+ label=f"EIC mz={mz:.4f}",
497
+ file=sample_path,
498
+ mz=mz,
499
+ mz_tol=mz_tol,
500
+ feature_start=rt_start_mean,
501
+ feature_end=rt_end_mean,
502
+ feature_apex=rt,
503
+ )
504
+ max_inty = 0.0
505
+ area = 0.0
506
+
507
+ # Generate feature UID
508
+ feature_uid = (
509
+ self.features_df["feature_uid"].max() + len(new_features) + 1
510
+ if not self.features_df.is_empty()
511
+ else len(new_features) + 1
512
+ )
513
+
514
+ # Create new feature entry
515
+ new_feature = {
516
+ "sample_uid": sample_uid,
517
+ "feature_uid": feature_uid,
518
+ "feature_id": None,
519
+ "mz": mz,
520
+ "rt": rt,
521
+ "rt_original": None,
522
+ "rt_start": rt_start_mean,
523
+ "rt_end": rt_end_mean,
524
+ "rt_delta": rt_end_mean - rt_start_mean,
525
+ "mz_start": None,
526
+ "mz_end": None,
527
+ "inty": max_inty,
528
+ "quality": None,
529
+ "charge": None,
530
+ "iso": None,
531
+ "iso_of": None,
532
+ "adduct": None,
533
+ "adduct_mass": None,
534
+ "adduct_group": None,
535
+ "chrom": eic,
536
+ "chrom_coherence": None,
537
+ "chrom_prominence": None,
538
+ "chrom_prominence_scaled": None,
539
+ "chrom_height_scaled": None,
540
+ "ms2_scans": None,
541
+ "ms2_specs": None,
542
+ "filled": True,
543
+ "chrom_area": area,
544
+ }
545
+
546
+ new_features.append(new_feature)
547
+ new_mapping.append({
548
+ "consensus_uid": consensus_uid,
549
+ "sample_uid": sample_uid,
550
+ "feature_uid": feature_uid,
551
+ })
552
+ counter += 1
553
+
554
+ # Add new features to DataFrames
555
+ self.logger.debug(f"Adding {len(new_features)} new features to DataFrame...")
556
+ if new_features:
557
+ # Create properly formatted rows
558
+ rows_to_add = []
559
+ for feature_dict in new_features:
560
+ new_row = {}
561
+ for col in self.features_df.columns:
562
+ if col in feature_dict:
563
+ new_row[col] = feature_dict[col]
564
+ else:
565
+ new_row[col] = None
566
+ rows_to_add.append(new_row)
567
+
568
+ # Create and add new DataFrame
569
+ new_df = pl.from_dicts(rows_to_add)
570
+
571
+ # Cast columns to match existing schema
572
+ cast_exprs = []
573
+ for col in self.features_df.columns:
574
+ existing_dtype = self.features_df[col].dtype
575
+ cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
576
+
577
+ new_df = new_df.with_columns(cast_exprs)
578
+ self.features_df = self.features_df.vstack(new_df)
579
+
580
+ # Add consensus mapping
581
+ new_mapping_df = pl.DataFrame(new_mapping)
582
+ self.consensus_mapping_df = pl.concat(
583
+ [self.consensus_mapping_df, new_mapping_df],
584
+ how="diagonal",
585
+ )
586
+
587
+ self.logger.info(f"Filled {counter} chromatograms from raw data.")
588
+
589
+
590
+ def fill_single(self, **kwargs):
591
+ """Fill missing chromatograms by extracting from raw data.
592
+
593
+ Simplified version that loads one sample at a time without preloading or batching.
594
+
595
+ Parameters:
596
+ **kwargs: Keyword arguments for fill_single parameters. Can include:
597
+ - A fill_defaults instance to set all parameters at once
598
+ - Individual parameter names and values (see fill_defaults for details)
599
+
600
+ Key Parameters:
601
+ uids: Consensus UIDs to process (default: all)
602
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
603
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
604
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
605
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
606
+ """
607
+ # parameters initialization
608
+ from masster.study.defaults import fill_defaults
609
+ params = fill_defaults()
610
+
611
+ for key, value in kwargs.items():
612
+ if isinstance(value, fill_defaults):
613
+ params = value
614
+ self.logger.debug("Using provided fill_defaults parameters")
615
+ else:
616
+ if hasattr(params, key):
617
+ if params.set(key, value, validate=True):
618
+ self.logger.debug(f"Updated parameter {key} = {value}")
619
+ else:
620
+ self.logger.warning(
621
+ f"Failed to set parameter {key} = {value} (validation failed)",
622
+ )
623
+ else:
624
+ self.logger.debug(f"Unknown parameter {key} ignored")
625
+ # end of parameter initialization
626
+
627
+ # Store parameters in the Study object
628
+ self.store_history(["fill_single"], params.to_dict())
629
+ self.logger.debug("Parameters stored to fill_single")
630
+
631
+ # Call the original fill_chrom_single function with extracted parameters
632
+ return _fill_chrom_single_impl(
633
+ self,
634
+ uids=params.get("uids"),
635
+ mz_tol=params.get("mz_tol"),
636
+ rt_tol=params.get("rt_tol"),
637
+ min_samples_rel=params.get("min_samples_rel"),
638
+ min_samples_abs=params.get("min_samples_abs"),
639
+ )
640
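A minimal usage sketch for the sequential gap filler (illustrative only: it assumes `study` is an existing masster Study whose consensus features have already been built, and the values shown are arbitrary examples, not recommended settings):

    # Hypothetical call; parameter names come from the docstring above.
    study.fill_single(
        mz_tol=0.010,        # m/z extraction window in Da
        rt_tol=10.0,         # RT extraction window in seconds
        min_samples_abs=2,   # only fill features detected in at least 2 samples
    )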
+
641
+
642
+ def _process_sample_for_parallel_fill(
643
+ self,
644
+ sample_info,
645
+ consensus_info,
646
+ uids,
647
+ mz_tol,
648
+ rt_tol,
649
+ missing_combinations_df,
650
+ features_df_max_uid,
651
+ ):
652
+ """Process a single sample for parallel gap filling."""
653
+ sample_uid = sample_info["sample_uid"]
654
+ sample_path = sample_info["sample_path"]
655
+
656
+ new_features: list[dict] = []
657
+ new_mapping: list[dict] = []
658
+ counter = 0
659
+
660
+ try:
661
+ # Load this sample
662
+ file = Sample()
663
+ file.logger_update(level="WARNING")
664
+ file.load(sample_path)
665
+ except Exception:
666
+ # Skip this sample if loading fails
667
+ return new_features, new_mapping, counter
668
+
669
+ # Find missing features for this sample from precomputed combinations
670
+ sample_missing = missing_combinations_df.filter(
671
+ pl.col("sample_uid") == sample_uid,
672
+ )["consensus_uid"].to_list()
673
+
674
+ if not sample_missing:
675
+ return new_features, new_mapping, counter
676
+
677
+ # Process each missing feature
678
+ for consensus_uid in sample_missing:
679
+ cons = consensus_info[consensus_uid]
680
+ mz = cons["mz"]
681
+ rt = cons["rt"]
682
+ rt_start_mean = cons["rt_start_mean"]
683
+ rt_end_mean = cons["rt_end_mean"]
684
+
685
+ # Filter MS1 data for this feature
686
+ if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
687
+ d = file.ms1_df.filter(
688
+ (pl.col("mz") >= mz - mz_tol)
689
+ & (pl.col("mz") <= mz + mz_tol)
690
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
691
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
692
+ )
693
+ else:
694
+ d = pl.DataFrame()
695
+
696
+ # Create chromatogram
697
+ if d.is_empty():
698
+ eic = Chromatogram(
699
+ rt=np.array([rt_start_mean, rt_end_mean]),
700
+ inty=np.array([0.0, 0.0]),
701
+ label=f"EIC mz={mz:.4f}",
702
+ file=sample_path,
703
+ mz=mz,
704
+ mz_tol=mz_tol,
705
+ feature_start=rt_start_mean,
706
+ feature_end=rt_end_mean,
707
+ feature_apex=rt,
708
+ )
709
+ max_inty = 0.0
710
+ area = 0.0
711
+ else:
712
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
713
+
714
+ if len(eic_rt) > 4:
715
+ eic = Chromatogram(
716
+ eic_rt["rt"].to_numpy(),
717
+ eic_rt["inty"].to_numpy(),
718
+ label=f"EIC mz={mz:.4f}",
719
+ file=sample_path,
720
+ mz=mz,
721
+ mz_tol=mz_tol,
722
+ feature_start=rt_start_mean,
723
+ feature_end=rt_end_mean,
724
+ feature_apex=rt,
725
+ ).find_peaks()
726
+ max_inty = np.max(eic.inty)
727
+ area = eic.feature_area
728
+ else:
729
+ eic = Chromatogram(
730
+ eic_rt["rt"].to_numpy(),
731
+ eic_rt["inty"].to_numpy(),
732
+ label=f"EIC mz={mz:.4f}",
733
+ file=sample_path,
734
+ mz=mz,
735
+ mz_tol=mz_tol,
736
+ feature_start=rt_start_mean,
737
+ feature_end=rt_end_mean,
738
+ feature_apex=rt,
739
+ )
740
+ max_inty = 0.0
741
+ area = 0.0
742
+
743
+ # Generate feature UID (will be adjusted later to ensure global uniqueness)
744
+ feature_uid = features_df_max_uid + len(new_features) + 1
745
+
746
+ # Create new feature entry
747
+ new_feature = {
748
+ "sample_uid": sample_uid,
749
+ "feature_uid": feature_uid,
750
+ "feature_id": None,
751
+ "mz": mz,
752
+ "rt": rt,
753
+ "rt_original": None,
754
+ "rt_start": rt_start_mean,
755
+ "rt_end": rt_end_mean,
756
+ "rt_delta": rt_end_mean - rt_start_mean,
757
+ "mz_start": None,
758
+ "mz_end": None,
759
+ "inty": max_inty,
760
+ "quality": None,
761
+ "charge": None,
762
+ "iso": None,
763
+ "iso_of": None,
764
+ "adduct": None,
765
+ "adduct_mass": None,
766
+ "adduct_group": None,
767
+ "chrom": eic,
768
+ "filled": True,
769
+ "chrom_area": area,
770
+ "chrom_coherence": None,
771
+ "chrom_prominence": None,
772
+ "chrom_prominence_scaled": None,
773
+ "chrom_height_scaled": None,
774
+ "ms2_scans": None,
775
+ "ms2_specs": None,
776
+ }
777
+
778
+ new_features.append(new_feature)
779
+ new_mapping.append({
780
+ "consensus_uid": consensus_uid,
781
+ "sample_uid": sample_uid,
782
+ "feature_uid": feature_uid,
783
+ })
784
+ counter += 1
785
+
786
+ return new_features, new_mapping, counter
787
+
788
+
789
+ def _fill_chrom_impl(
790
+ self,
791
+ uids=None,
792
+ mz_tol: float = 0.010,
793
+ rt_tol: float = 10.0,
794
+ min_samples_rel: float = 0.0,
795
+ min_samples_abs: int = 2,
796
+ num_workers=4,
797
+ ):
798
+ """Fill missing chromatograms by extracting from raw data using parallel processing.
799
+
800
+ Args:
801
+ uids: Consensus UIDs to process (default: all)
802
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
803
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
804
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
805
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
806
+ num_workers: Number of parallel workers (default: 4)
807
+ """
808
+ uids = self._get_consensus_uids(uids)
809
+
810
+ self.logger.info(f"Gap filling with {num_workers} workers...")
811
+ self.logger.debug(
812
+ f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
813
+ )
814
+
815
+ # Apply minimum sample filters
816
+ min_number_rel = 1
817
+ min_number_abs = 1
818
+ if isinstance(min_samples_rel, float) and min_samples_rel > 0:
819
+ min_number_rel = int(min_samples_rel * len(self.samples_df))
820
+ if isinstance(min_samples_abs, int) and min_samples_abs > 0:
821
+ min_number_abs = int(min_samples_abs)
822
+ min_number = max(min_number_rel, min_number_abs)
823
+
824
+ self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
825
+
826
+ if min_number > 0:
827
+ original_count = len(uids)
828
+ uids = self.consensus_df.filter(
829
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
830
+ )["consensus_uid"].to_list()
831
+ self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
832
+
833
+ # Get missing consensus/sample combinations using the optimized method
834
+ self.logger.debug("Identifying missing features...")
835
+ missing_combinations = self._get_missing_consensus_sample_combinations(uids)
836
+
837
+ if not missing_combinations or len(missing_combinations) == 0:
838
+ self.logger.info("No missing features found to fill.")
839
+ return
840
+
841
+ # Convert to DataFrame for easier processing
842
+ missing_combinations_df = pl.DataFrame(
843
+ missing_combinations,
844
+ schema={
845
+ "consensus_uid": pl.Int64,
846
+ "sample_uid": pl.Int64,
847
+ "sample_name": pl.Utf8,
848
+ "sample_path": pl.Utf8,
849
+ },
850
+ orient="row",
851
+ )
852
+
853
+ # Build lookup dictionaries
854
+ self.logger.debug("Building lookup dictionaries...")
855
+ consensus_info = {}
856
+ consensus_subset = self.consensus_df.select([
857
+ "consensus_uid",
858
+ "rt_start_mean",
859
+ "rt_end_mean",
860
+ "mz",
861
+ "rt",
862
+ ]).filter(pl.col("consensus_uid").is_in(uids))
863
+
864
+ for row in consensus_subset.iter_rows(named=True):
865
+ consensus_info[row["consensus_uid"]] = {
866
+ "rt_start_mean": row["rt_start_mean"],
867
+ "rt_end_mean": row["rt_end_mean"],
868
+ "mz": row["mz"],
869
+ "rt": row["rt"],
870
+ }
871
+
872
+ # Get sample info for all samples that need processing
873
+ samples_to_process = []
874
+ unique_sample_uids = missing_combinations_df["sample_uid"].unique().to_list()
875
+
876
+ for row in self.samples_df.filter(
877
+ pl.col("sample_uid").is_in(unique_sample_uids),
878
+ ).iter_rows(named=True):
879
+ samples_to_process.append({
880
+ "sample_name": row["sample_name"],
881
+ "sample_uid": row["sample_uid"],
882
+ "sample_path": row["sample_path"],
883
+ })
884
+
885
+ total_missing = len(missing_combinations_df)
886
+ total_samples = len(samples_to_process)
887
+
888
+ self.logger.debug(
889
+ f"Gap filling for {total_missing} missing features...",
890
+ )
891
+
892
+ # Calculate current max feature_uid to avoid conflicts
893
+ features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
894
+
895
+ # Process samples in parallel
896
+ all_new_features: list[dict] = []
897
+ all_new_mapping: list[dict] = []
898
+ total_counter = 0
899
+
900
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
901
+
902
+ with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
903
+ # Submit all samples for processing
904
+ future_to_sample = {}
905
+ for sample_info in samples_to_process:
906
+ future = executor.submit(
907
+ self._process_sample_for_parallel_fill,
908
+ sample_info,
909
+ consensus_info,
910
+ uids,
911
+ mz_tol,
912
+ rt_tol,
913
+ missing_combinations_df,
914
+ features_df_max_uid,
915
+ )
916
+ future_to_sample[future] = sample_info
917
+
918
+ # Collect results with progress bar
919
+ with tqdm(
920
+ total=len(samples_to_process),
921
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Processing samples",
922
+ disable=tqdm_disable,
923
+ ) as pbar:
924
+ for future in concurrent.futures.as_completed(future_to_sample):
925
+ try:
926
+ new_features, new_mapping, counter = future.result()
927
+
928
+ # Adjust feature UIDs to ensure global uniqueness
929
+ uid_offset = features_df_max_uid + len(all_new_features)
930
+ for i, feature in enumerate(new_features):
931
+ feature["feature_uid"] = uid_offset + i + 1
932
+ for i, mapping in enumerate(new_mapping):
933
+ mapping["feature_uid"] = uid_offset + i + 1
934
+
935
+ all_new_features.extend(new_features)
936
+ all_new_mapping.extend(new_mapping)
937
+ total_counter += counter
938
+
939
+ except Exception as e:
940
+ sample_info = future_to_sample[future]
941
+ self.logger.warning(
942
+ f"Sample {sample_info['sample_name']} failed: {e}",
943
+ )
944
+
945
+ pbar.update(1)
946
+
947
+ # Add new features to DataFrames
948
+ self.logger.debug(f"Adding {len(all_new_features)} new features to DataFrame...")
949
+ if all_new_features:
950
+ # Create properly formatted rows
951
+ rows_to_add = []
952
+ for feature_dict in all_new_features:
953
+ new_row = {}
954
+ for col in self.features_df.columns:
955
+ if col in feature_dict:
956
+ new_row[col] = feature_dict[col]
957
+ else:
958
+ new_row[col] = None
959
+ rows_to_add.append(new_row)
960
+
961
+ # Create and add new DataFrame
962
+ new_df = pl.from_dicts(rows_to_add)
963
+
964
+ # Cast columns to match existing schema
965
+ cast_exprs = []
966
+ for col in self.features_df.columns:
967
+ existing_dtype = self.features_df[col].dtype
968
+ cast_exprs.append(pl.col(col).cast(existing_dtype, strict=False))
969
+
970
+ new_df = new_df.with_columns(cast_exprs)
971
+ self.features_df = self.features_df.vstack(new_df)
972
+
973
+ # Add consensus mapping
974
+ new_mapping_df = pl.DataFrame(all_new_mapping)
975
+ self.consensus_mapping_df = pl.concat(
976
+ [self.consensus_mapping_df, new_mapping_df],
977
+ how="diagonal",
978
+ )
979
+
980
+ self.logger.info(
981
+ f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
982
+ )
983
+
984
+
985
+ def fill(self, **kwargs):
986
+ """Fill missing chromatograms by extracting from raw data using parallel processing.
987
+
988
+ Parameters:
989
+ **kwargs: Keyword arguments for fill parameters. Can include:
990
+ - A fill_defaults instance to set all parameters at once
991
+ - Individual parameter names and values (see fill_defaults for details)
992
+
993
+ Key Parameters:
994
+ uids: Consensus UIDs to process (default: all)
995
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
996
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
997
+ min_samples_rel: Relative minimum sample threshold (default: 0.05)
998
+ min_samples_abs: Absolute minimum sample threshold (default: 5)
999
+ num_workers: Number of parallel workers (default: 4)
1000
+ """
1001
+ # parameters initialization
1002
+ params = fill_defaults()
1003
+ num_workers = kwargs.get("num_workers", 4) # Default parameter not in defaults class
1004
+
1005
+ for key, value in kwargs.items():
1006
+ if isinstance(value, fill_defaults):
1007
+ params = value
1008
+ self.logger.debug("Using provided fill_defaults parameters")
1009
+ else:
1010
+ if hasattr(params, key):
1011
+ if params.set(key, value, validate=True):
1012
+ self.logger.debug(f"Updated parameter {key} = {value}")
1013
+ else:
1014
+ self.logger.warning(
1015
+ f"Failed to set parameter {key} = {value} (validation failed)",
1016
+ )
1017
+ elif key != "num_workers": # Allow num_workers as valid parameter
1018
+ self.logger.debug(f"Unknown parameter {key} ignored")
1019
+ # end of parameter initialization
1020
+
1021
+ # Store parameters in the Study object
1022
+ self.store_history(["fill"], params.to_dict())
1023
+ self.logger.debug("Parameters stored to fill")
1024
+
1025
+ # Call the original fill_chrom function with extracted parameters
1026
+ return _fill_chrom_impl(
1027
+ self,
1028
+ uids=params.get("uids"),
1029
+ mz_tol=params.get("mz_tol"),
1030
+ rt_tol=params.get("rt_tol"),
1031
+ min_samples_rel=params.get("min_samples_rel"),
1032
+ min_samples_abs=params.get("min_samples_abs"),
1033
+ num_workers=num_workers,
1034
+ )
1035
+
1036
+
1037
+ # Backward compatibility alias
1038
+ fill_chrom = fill
1039
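For the threaded `fill()` wrapper, a comparable sketch (same assumptions as the `fill_single` example above; `num_workers` is forwarded separately from the `fill_defaults` parameters):

    # Hypothetical call of the parallel gap filler.
    study.fill(
        mz_tol=0.010,
        rt_tol=10.0,
        num_workers=8,       # threads used to load samples and extract EICs
    )
    # study.fill_chrom(...) keeps working via the backward-compatible alias.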
+
1040
+
1041
+ def _get_missing_consensus_sample_combinations(self, uids):
1042
+ """
1043
+ Efficiently identify which consensus_uid/sample combinations are missing.
1044
+ Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
1045
+ """
1046
+ # Get all consensus UIDs we're interested in
1047
+ consensus_uids_set = set(uids)
1048
+
1049
+ # Get all sample UIDs and create lookup
1050
+ all_sample_info = {}
1051
+ for row in self.samples_df.select([
1052
+ "sample_uid",
1053
+ "sample_name",
1054
+ "sample_path",
1055
+ ]).iter_rows(named=True):
1056
+ all_sample_info[row["sample_uid"]] = {
1057
+ "sample_name": row["sample_name"],
1058
+ "sample_path": row["sample_path"],
1059
+ }
1060
+
1061
+ # Get existing consensus/sample combinations from consensus_mapping_df
1062
+ existing_combinations = set()
1063
+ consensus_mapping_filtered = self.consensus_mapping_df.filter(
1064
+ pl.col("consensus_uid").is_in(list(consensus_uids_set)),
1065
+ )
1066
+
1067
+ # Join with features_df to get sample_uid information
1068
+ existing_features = consensus_mapping_filtered.join(
1069
+ self.features_df.select(["feature_uid", "sample_uid"]),
1070
+ on="feature_uid",
1071
+ how="inner",
1072
+ )
1073
+
1074
+ for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
1075
+ existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
1076
+
1077
+ # Find missing combinations
1078
+ missing_combinations = []
1079
+ for consensus_uid in consensus_uids_set:
1080
+ for sample_uid, sample_info in all_sample_info.items():
1081
+ if (consensus_uid, sample_uid) not in existing_combinations:
1082
+ missing_combinations.append((
1083
+ consensus_uid,
1084
+ sample_uid,
1085
+ sample_info["sample_name"],
1086
+ sample_info["sample_path"],
1087
+ ))
1088
+
1089
+ return missing_combinations
1090
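The nested loops above amount to a set difference over (consensus_uid, sample_uid) pairs; the same result could be phrased as a Polars anti-join. A rough sketch of that equivalent formulation, where `uids`, `samples_df`, `features_df`, and `consensus_mapping_df` stand in for the corresponding study attributes:

    import polars as pl

    # All candidate (consensus, sample) pairs.
    all_pairs = pl.DataFrame({"consensus_uid": uids}).join(
        samples_df.select(["sample_uid", "sample_name", "sample_path"]),
        how="cross",
    )
    # Pairs that already have a feature, via the mapping -> features join.
    existing = consensus_mapping_df.join(
        features_df.select(["feature_uid", "sample_uid"]),
        on="feature_uid",
        how="inner",
    ).select(["consensus_uid", "sample_uid"])
    # The anti-join keeps only the missing pairs.
    missing = all_pairs.join(existing, on=["consensus_uid", "sample_uid"], how="anti")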
+
1091
+
1092
+ def sanitize(self):
1093
+ """
1094
+ Sanitize features DataFrame to ensure all complex objects are properly typed.
1095
+ Convert serialized objects back to their proper types (Chromatogram, Spectrum).
1096
+ """
1097
+ if self.features_df is None or self.features_df.is_empty():
1098
+ return
1099
+
1100
+ self.logger.debug(
1101
+ "Sanitizing features DataFrame to ensure all complex objects are properly typed.",
1102
+ )
1103
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1104
+
1105
+ # Check if we have object columns that need sanitization
1106
+ has_chrom = "chrom" in self.features_df.columns
1107
+ has_ms2_specs = "ms2_specs" in self.features_df.columns
1108
+
1109
+ if not has_chrom and not has_ms2_specs:
1110
+ self.logger.debug("No object columns found that need sanitization.")
1111
+ return
1112
+
1113
+ # Convert to list of dictionaries for easier manipulation
1114
+ rows_data = []
1115
+
1116
+ for row_dict in tqdm(
1117
+ self.features_df.iter_rows(named=True),
1118
+ total=len(self.features_df),
1119
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Sanitize features",
1120
+ disable=tqdm_disable,
1121
+ ):
1122
+ row_data = dict(row_dict)
1123
+
1124
+ # Sanitize chrom column
1125
+ if has_chrom and row_data["chrom"] is not None:
1126
+ if not isinstance(row_data["chrom"], Chromatogram):
1127
+ try:
1128
+ # Create new Chromatogram and populate from dict if needed
1129
+ new_chrom = Chromatogram(rt=np.array([]), inty=np.array([]))
1130
+ if hasattr(row_data["chrom"], "__dict__"):
1131
+ new_chrom.from_dict(row_data["chrom"].__dict__)
1132
+ else:
1133
+ # If it's already a dict
1134
+ new_chrom.from_dict(row_data["chrom"])
1135
+ row_data["chrom"] = new_chrom
1136
+ except Exception as e:
1137
+ self.logger.warning(f"Failed to sanitize chrom object: {e}")
1138
+ row_data["chrom"] = None
1139
+
1140
+ # Sanitize ms2_specs column
1141
+ if has_ms2_specs and row_data["ms2_specs"] is not None:
1142
+ if isinstance(row_data["ms2_specs"], list):
1143
+ sanitized_specs = []
1144
+ for ms2_specs in row_data["ms2_specs"]:
1145
+ if not isinstance(ms2_specs, Spectrum):
1146
+ try:
1147
+ new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1148
+ if hasattr(ms2_specs, "__dict__"):
1149
+ new_ms2_specs.from_dict(ms2_specs.__dict__)
1150
+ else:
1151
+ new_ms2_specs.from_dict(ms2_specs)
1152
+ sanitized_specs.append(new_ms2_specs)
1153
+ except Exception as e:
1154
+ self.logger.warning(
1155
+ f"Failed to sanitize ms2_specs object: {e}",
1156
+ )
1157
+ sanitized_specs.append(None)
1158
+ else:
1159
+ sanitized_specs.append(ms2_specs)
1160
+ row_data["ms2_specs"] = sanitized_specs
1161
+ elif not isinstance(row_data["ms2_specs"], Spectrum):
1162
+ try:
1163
+ new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
1164
+ if hasattr(row_data["ms2_specs"], "__dict__"):
1165
+ new_ms2_specs.from_dict(row_data["ms2_specs"].__dict__)
1166
+ else:
1167
+ new_ms2_specs.from_dict(row_data["ms2_specs"])
1168
+ row_data["ms2_specs"] = new_ms2_specs
1169
+ except Exception as e:
1170
+ self.logger.warning(f"Failed to sanitize ms2_specs object: {e}")
1171
+ row_data["ms2_specs"] = None
1172
+
1173
+ rows_data.append(row_data)
1174
+
1175
+ # Recreate the DataFrame with sanitized data
1176
+ try:
1177
+ self.features_df = pl.DataFrame(rows_data)
1178
+ self.logger.success("Features DataFrame sanitization completed successfully.")
1179
+ except Exception as e:
1180
+ self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
1181
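Judging from the docstring, `sanitize()` matters mainly when a study has been re-loaded and the object columns still hold serialized dictionaries; a small sketch of that situation (assuming an existing `study` instance):

    study.sanitize()
    # "chrom" cells are now Chromatogram objects (or None) instead of plain dicts.
    first_chrom = study.features_df["chrom"][0]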
+
1182
+
1183
+ def load_features(self):
1184
+ # iterate over all samples in samples_df
1185
+
1186
+ self.features_maps = []
1187
+ self.logger.debug("Loading features from featureXML files.")
1188
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1189
+ for _index, row_dict in tqdm(
1190
+ enumerate(self.samples_df.iter_rows(named=True)),
1191
+ total=len(self.samples_df),
1192
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
1193
+ disable=tqdm_disable,
1194
+ ):
1195
+ if self.folder is not None:
1196
+ filename = os.path.join(
1197
+ self.folder,
1198
+ row_dict["sample_name"] + ".featureXML",
1199
+ )
1200
+ else:
1201
+ filename = os.path.join(
1202
+ os.getcwd(),
1203
+ row_dict["sample_name"] + ".featureXML",
1204
+ )
1205
+ # check if file exists
1206
+ if not os.path.exists(filename):
1207
+ filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
1208
+
1209
+ if not os.path.exists(filename):
1210
+ self.features_maps.append(None)
1211
+ continue
1212
+
1213
+ fh = oms.FeatureXMLFile()
1214
+ fm = oms.FeatureMap()
1215
+ fh.load(filename, fm)
1216
+ self.features_maps.append(fm)
1217
+ self.logger.debug("Features loaded successfully.")
1218
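After `load_features()`, `study.features_maps` holds one entry per row of `samples_df`: an `oms.FeatureMap`, or `None` when no featureXML file was found. A quick sketch of inspecting the result (assuming an existing `study`):

    study.load_features()
    n_loaded = sum(fm is not None for fm in study.features_maps)
    sizes = [fm.size() if fm is not None else 0 for fm in study.features_maps]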
+
1219
+
1220
+ def _load_consensusXML(self, filename="alignment.consensusXML"):
1221
+ """
1222
+ Load a consensus map from a file.
1223
+ """
1224
+ if not os.path.exists(filename):
1225
+ self.logger.error(f"File {filename} does not exist.")
1226
+ return
1227
+ fh = oms.ConsensusXMLFile()
1228
+ self.consensus_map = oms.ConsensusMap()
1229
+ fh.load(filename, self.consensus_map)
1230
+ self.logger.debug(f"Loaded consensus map from {filename}.")
1231
+
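Finally, a short sketch of reloading a previously written consensus map through this private helper (the filename shown is the helper's default and only an example):

    # Assumes the file exists; otherwise the helper only logs an error and returns.
    study._load_consensusXML("alignment.consensusXML")
    print(study.consensus_map.size())   # number of consensus features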