masster 0.4.19__py3-none-any.whl → 0.4.21__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of masster might be problematic.

masster/study/merge.py CHANGED
@@ -10,7 +10,8 @@ from datetime import datetime
10
10
  from tqdm import tqdm
11
11
  import pyopenms as oms
12
12
  import polars as pl
13
- from concurrent.futures import ProcessPoolExecutor, as_completed
13
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
14
+ from concurrent.futures.process import BrokenProcessPool
14
15
  from masster.study.defaults import merge_defaults
15
16
 
16
17
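The two new imports above back a fallback used later in the chunked merges: BrokenProcessPool is what concurrent.futures raises when a worker process dies abruptly (a common failure mode on Windows spawn-based multiprocessing without freeze_support). A minimal, self-contained illustration of that exception, not code from masster:

    import os
    from concurrent.futures import ProcessPoolExecutor
    from concurrent.futures.process import BrokenProcessPool

    def crash():
        os._exit(1)  # simulate a worker process dying abruptly

    if __name__ == "__main__":
        try:
            with ProcessPoolExecutor(max_workers=1) as pool:
                pool.submit(crash).result()
        except BrokenProcessPool as exc:
            print("pool broke:", exc)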
 
@@ -367,6 +368,17 @@ def merge(self, **kwargs) -> None:
367
368
  if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
368
369
  raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
369
370
 
371
+ # Check if chunked method is advisable for large datasets
372
+ num_samples = len(self.samples_df) if hasattr(self, 'samples_df') and self.samples_df is not None else 0
373
+ if num_samples > 500:
374
+ chunked_methods = {'kd_chunked', 'qt_chunked'}
375
+ if params.method not in chunked_methods:
376
+ self.logger.warning(
377
+ f"Large dataset detected ({num_samples} samples > 500). "
378
+ f"For better performance and memory efficiency, consider using a chunked method: "
379
+ f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
380
+ )
381
+
370
382
  # Persist last used params for diagnostics
371
383
  try:
372
384
  self._merge_params_last = params.to_dict()
@@ -385,10 +397,10 @@ def merge(self, **kwargs) -> None:
385
397
  # Ensure feature maps are available for merging (regenerate if needed)
386
398
  if len(self.features_maps) < len(self.samples_df):
387
399
  self.features_maps = []
388
- self.load_features()
400
+ # Feature maps will be generated on-demand within each merge method
389
401
 
390
402
  self.logger.info(
391
- f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
403
+ f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
392
404
  )
393
405
 
394
406
  # Initialize
@@ -433,9 +445,16 @@ def merge(self, **kwargs) -> None:
433
445
  consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
434
446
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
435
447
 
448
+ # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
449
+ if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
450
+ self._consensus_cleanup(params.rt_tol, params.mz_tol)
451
+
436
452
  # Perform adduct grouping
437
453
  self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
438
454
 
455
+ # Identify coeluting consensus features by mass shifts and update adduct information
456
+ self._identify_adduct_by_mass_shift(params.rt_tol, cached_adducts_df)
457
+
439
458
  # Link MS2 if requested
440
459
  if params.link_ms2:
441
460
  self._finalize_merge(params.link_ms2, params.min_samples)
@@ -448,10 +467,13 @@ def merge(self, **kwargs) -> None:
448
467
  def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
449
468
  """KD-tree based merge (fast, recommended)"""
450
469
 
470
+ # Generate temporary feature maps on-demand from features_df
471
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
472
+
451
473
  consensus_map = oms.ConsensusMap()
452
474
  file_descriptions = consensus_map.getColumnHeaders()
453
475
 
454
- for i, feature_map in enumerate(self.features_maps):
476
+ for i, feature_map in enumerate(temp_feature_maps):
455
477
  file_description = file_descriptions.get(i, oms.ColumnHeader())
456
478
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
457
479
  file_description.size = feature_map.size()
@@ -477,22 +499,145 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
477
499
  #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
478
500
 
479
501
  grouper.setParameters(params_oms)
480
- grouper.group(self.features_maps, consensus_map)
502
+ grouper.group(temp_feature_maps, consensus_map)
481
503
 
482
504
  return consensus_map
483
505
 
484
506
 
507
+ def _generate_feature_maps_on_demand(study):
508
+ """
509
+ Generate feature maps on-demand from study.features_df for merge operations.
510
+ Returns temporary feature maps that are not cached in the study.
511
+
512
+ Args:
513
+ study: Study object containing features_df and samples_df
514
+
515
+ Returns:
516
+ list: List of temporary FeatureMap objects
517
+ """
518
+ import polars as pl
519
+ import pyopenms as oms
520
+ import numpy as np
521
+
522
+ if study.features_df is None or len(study.features_df) == 0:
523
+ study.logger.error("No features_df available for generating feature maps")
524
+ return []
525
+
526
+ temp_feature_maps = []
527
+ n_samples = len(study.samples_df)
528
+ n_features = len(study.features_df)
529
+
530
+ # Performance optimization: use efficient polars groupby for large datasets
531
+ use_groupby_optimization = n_features > 5000
532
+ if use_groupby_optimization:
533
+ study.logger.debug(f"Using polars groupby optimization for {n_features} features across {n_samples} samples")
534
+
535
+ # Pre-group features by sample_uid - this is much more efficient than repeated filtering
536
+ features_by_sample = study.features_df.group_by("sample_uid").agg([
537
+ pl.col("feature_id"),
538
+ pl.col("mz"),
539
+ pl.col("rt"),
540
+ pl.col("inty"),
541
+ pl.col("quality").fill_null(1.0),
542
+ pl.col("charge").fill_null(0)
543
+ ])
544
+
545
+ # Convert to dictionary for fast lookups
546
+ sample_feature_dict = {}
547
+ for row in features_by_sample.iter_rows(named=True):
548
+ sample_uid = row["sample_uid"]
549
+ # Convert lists to numpy arrays for vectorized operations
550
+ sample_feature_dict[sample_uid] = {
551
+ "feature_id": np.array(row["feature_id"]),
552
+ "mz": np.array(row["mz"]),
553
+ "rt": np.array(row["rt"]),
554
+ "inty": np.array(row["inty"]),
555
+ "quality": np.array(row["quality"]),
556
+ "charge": np.array(row["charge"])
557
+ }
558
+
559
+ # Process each sample in order
560
+ for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
561
+ sample_uid = row_dict["sample_uid"]
562
+
563
+ if use_groupby_optimization:
564
+ # Use pre-grouped data with vectorized operations
565
+ if sample_uid not in sample_feature_dict:
566
+ feature_map = oms.FeatureMap()
567
+ temp_feature_maps.append(feature_map)
568
+ continue
569
+
570
+ sample_data = sample_feature_dict[sample_uid]
571
+ n_sample_features = len(sample_data["feature_id"])
572
+
573
+ if n_sample_features == 0:
574
+ feature_map = oms.FeatureMap()
575
+ temp_feature_maps.append(feature_map)
576
+ continue
577
+
578
+ # Create new FeatureMap
579
+ feature_map = oms.FeatureMap()
580
+
581
+ # Use vectorized data directly (no conversion needed)
582
+ for i in range(n_sample_features):
583
+ try:
584
+ feature = oms.Feature()
585
+ feature.setUniqueId(int(sample_data["feature_id"][i]))
586
+ feature.setMZ(float(sample_data["mz"][i]))
587
+ feature.setRT(float(sample_data["rt"][i]))
588
+ feature.setIntensity(float(sample_data["inty"][i]))
589
+ feature.setOverallQuality(float(sample_data["quality"][i]))
590
+ feature.setCharge(int(sample_data["charge"][i]))
591
+ feature_map.push_back(feature)
592
+ except (ValueError, TypeError) as e:
593
+ study.logger.warning(f"Skipping feature due to conversion error: {e}")
594
+ continue
595
+ else:
596
+ # Use original polars-based approach for smaller datasets
597
+ sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
598
+
599
+ # Create new FeatureMap
600
+ feature_map = oms.FeatureMap()
601
+
602
+ # Convert DataFrame features to OpenMS Features
603
+ for feature_row in sample_features.iter_rows(named=True):
604
+ feature = oms.Feature()
605
+
606
+ # Set properties from DataFrame (handle missing values gracefully)
607
+ try:
608
+ feature.setUniqueId(int(feature_row["feature_id"]))
609
+ feature.setMZ(float(feature_row["mz"]))
610
+ feature.setRT(float(feature_row["rt"]))
611
+ feature.setIntensity(float(feature_row["inty"]))
612
+ feature.setOverallQuality(float(feature_row["quality"]))
613
+ feature.setCharge(int(feature_row["charge"]))
614
+
615
+ # Add to feature map
616
+ feature_map.push_back(feature)
617
+ except (ValueError, TypeError) as e:
618
+ study.logger.warning(f"Skipping feature due to conversion error: {e}")
619
+ continue
620
+
621
+ temp_feature_maps.append(feature_map)
622
+
623
+ study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df")
624
+ return temp_feature_maps
625
+
626
+
485
627
  def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
486
628
  """QT (Quality Threshold) based merge"""
487
629
 
488
- n_samples = len(self.features_maps)
630
+ # Generate temporary feature maps on-demand from features_df
631
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
632
+
633
+ n_samples = len(temp_feature_maps)
489
634
  if n_samples > 1000:
490
635
  self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
491
636
 
492
637
  consensus_map = oms.ConsensusMap()
493
638
  file_descriptions = consensus_map.getColumnHeaders()
494
639
 
495
- for i, feature_map in enumerate(self.features_maps):
640
+ for i, feature_map in enumerate(temp_feature_maps):
496
641
  file_description = file_descriptions.get(i, oms.ColumnHeader())
497
642
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
498
643
  file_description.size = feature_map.size()
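The new _generate_feature_maps_on_demand helper above avoids one features_df.filter() scan per sample by pre-grouping the rows once with polars and converting each group to numpy arrays. A minimal, self-contained sketch of that pre-grouping idiom (illustrative data and trimmed columns, not code from masster):

    import numpy as np
    import polars as pl

    features = pl.DataFrame({
        "sample_uid": ["s1", "s1", "s2"],
        "mz": [100.05, 200.10, 100.05],
        "rt": [12.3, 45.6, 12.4],
        "inty": [1.0e5, 5.0e4, 9.0e4],
    })

    # One group_by pass instead of len(samples) filter() scans:
    grouped = features.group_by("sample_uid").agg([pl.col("mz"), pl.col("rt"), pl.col("inty")])
    per_sample = {
        row["sample_uid"]: {k: np.array(row[k]) for k in ("mz", "rt", "inty")}
        for row in grouped.iter_rows(named=True)
    }
    print(per_sample["s1"]["mz"])  # array([100.05, 200.1 ])

In the diff this path is only taken above roughly 5000 features; smaller studies keep the simpler per-sample filter branch.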
@@ -515,7 +660,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
515
660
  params_oms.setValue("nr_partitions", params.nr_partitions)
516
661
 
517
662
  grouper.setParameters(params_oms)
518
- grouper.group(self.features_maps, consensus_map)
663
+ grouper.group(temp_feature_maps, consensus_map)
519
664
 
520
665
  return consensus_map
521
666
 
@@ -560,11 +705,11 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
560
705
  optimized_params = params
561
706
 
562
707
  # Phase 1: Standard KD clustering
563
- self.logger.info("Initial KD clustering")
708
+ self.logger.debug("Initial KD clustering")
564
709
  consensus_map = _merge_kd(self, optimized_params)
565
710
 
566
711
  # Phase 2: Post-processing quality control
567
- self.logger.info("Post-processing quality control")
712
+ self.logger.debug("Post-processing quality control")
568
713
  consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
569
714
 
570
715
  return consensus_map
@@ -766,7 +911,7 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
766
911
  final_feature_count = len(self.consensus_df)
767
912
  reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
768
913
 
769
- self.logger.info(f"Post-processing complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
914
+ self.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
770
915
 
771
916
  # Create a new consensus map for compatibility (the processed data is in consensus_df)
772
917
  processed_consensus_map = oms.ConsensusMap()
@@ -1013,10 +1158,13 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
1013
1158
  def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1014
1159
  """KD-tree based merge without RT warping"""
1015
1160
 
1161
+ # Generate temporary feature maps on-demand from features_df
1162
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1163
+
1016
1164
  consensus_map = oms.ConsensusMap()
1017
1165
  file_descriptions = consensus_map.getColumnHeaders()
1018
1166
 
1019
- for i, feature_map in enumerate(self.features_maps):
1167
+ for i, feature_map in enumerate(temp_feature_maps):
1020
1168
  file_description = file_descriptions.get(i, oms.ColumnHeader())
1021
1169
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
1022
1170
  file_description.size = feature_map.size()
@@ -1040,7 +1188,7 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1040
1188
  #params_oms.setValue("link:charge_merging", "Any")
1041
1189
 
1042
1190
  grouper.setParameters(params_oms)
1043
- grouper.group(self.features_maps, consensus_map)
1191
+ grouper.group(temp_feature_maps, consensus_map)
1044
1192
 
1045
1193
  return consensus_map
1046
1194
 
@@ -1048,7 +1196,10 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1048
1196
  def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1049
1197
  """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1050
1198
 
1051
- n_samples = len(self.features_maps)
1199
+ # Generate temporary feature maps on-demand from features_df
1200
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1201
+
1202
+ n_samples = len(temp_feature_maps)
1052
1203
  if n_samples <= params.chunk_size:
1053
1204
  self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
1054
1205
  consensus_map = _merge_kd(self, params)
@@ -1060,7 +1211,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1060
1211
  chunks = []
1061
1212
  for i in range(0, n_samples, params.chunk_size):
1062
1213
  chunk_end = min(i + params.chunk_size, n_samples)
1063
- chunks.append((i, self.features_maps[i:chunk_end]))
1214
+ chunks.append((i, temp_feature_maps[i:chunk_end]))
1064
1215
 
1065
1216
  self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1066
1217
 
@@ -1146,28 +1297,66 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1146
1297
  }
1147
1298
  chunk_data_list.append(chunk_data)
1148
1299
 
1149
- # Process chunks in parallel
1150
- with ProcessPoolExecutor(max_workers=params.threads) as executor:
1151
- # Submit all chunk processing tasks
1152
- future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
1153
- for i, chunk_data in enumerate(chunk_data_list)}
1154
-
1155
- # Collect results with progress tracking
1156
- completed_chunks = 0
1157
- total_chunks = len(chunk_data_list)
1158
- serialized_chunk_results = []
1159
-
1160
- for future in as_completed(future_to_chunk):
1161
- chunk_idx = future_to_chunk[future]
1162
- try:
1163
- chunk_start_idx, consensus_features = future.result()
1164
- serialized_chunk_results.append((chunk_start_idx, consensus_features))
1165
- completed_chunks += 1
1166
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1167
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1168
- except Exception as exc:
1169
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1170
- raise exc
1300
+ # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
1301
+ try:
1302
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
1303
+ # Submit all chunk processing tasks
1304
+ future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
1305
+ for i, chunk_data in enumerate(chunk_data_list)}
1306
+
1307
+ # Collect results with progress tracking
1308
+ completed_chunks = 0
1309
+ total_chunks = len(chunk_data_list)
1310
+ serialized_chunk_results = []
1311
+
1312
+ for future in as_completed(future_to_chunk):
1313
+ chunk_idx = future_to_chunk[future]
1314
+ try:
1315
+ chunk_start_idx, consensus_features = future.result()
1316
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1317
+ completed_chunks += 1
1318
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1319
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1320
+ except Exception as exc:
1321
+ # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1322
+ if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1323
+ # Convert to RuntimeError so outer except block can catch it for fallback
1324
+ raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1325
+ else:
1326
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1327
+ raise exc
1328
+
1329
+ except (RuntimeError, OSError, BrokenProcessPool) as e:
1330
+ # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1331
+ if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1332
+ "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1333
+ self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1334
+ self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1335
+
1336
+ with ThreadPoolExecutor(max_workers=params.threads) as executor:
1337
+ # Submit all chunk processing tasks
1338
+ future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
1339
+ for i, chunk_data in enumerate(chunk_data_list)}
1340
+
1341
+ # Collect results with progress tracking
1342
+ completed_chunks = 0
1343
+ total_chunks = len(chunk_data_list)
1344
+ serialized_chunk_results = []
1345
+
1346
+ for future in as_completed(future_to_chunk):
1347
+ chunk_idx = future_to_chunk[future]
1348
+ try:
1349
+ chunk_start_idx, consensus_features = future.result()
1350
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1351
+ completed_chunks += 1
1352
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1353
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1354
+ except Exception as exc:
1355
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1356
+ raise exc
1357
+ else:
1358
+ # Re-raise other exceptions
1359
+ raise
1171
1360
 
1172
1361
  # Store serialized results for _merge_chunk_results to handle directly
1173
1362
  chunk_consensus_maps = []
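The block above wraps the original ProcessPoolExecutor loop in a try/except and repeats the same submission loop with ThreadPoolExecutor when the process pool breaks (typically Windows spawn or freeze_support problems). A generic, self-contained sketch of the pattern, not the masster implementation:

    from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
    from concurrent.futures.process import BrokenProcessPool

    def work(x):  # must be module-level (picklable) for the process pool
        return x * x

    def run_all(items, max_workers=4):
        def run(executor_cls):
            with executor_cls(max_workers=max_workers) as pool:
                futures = {pool.submit(work, it): it for it in items}
                return [f.result() for f in as_completed(futures)]
        try:
            return run(ProcessPoolExecutor)
        except (BrokenProcessPool, OSError, RuntimeError):
            # CPU-bound work runs slower in threads, but it completes instead of crashing.
            return run(ThreadPoolExecutor)

    if __name__ == "__main__":
        print(sorted(run_all(range(5))))  # [0, 1, 4, 9, 16]

Note that BrokenProcessPool already derives from RuntimeError, so listing it alongside RuntimeError in the outer except clause of the diff is redundant but harmless.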
@@ -1187,7 +1376,10 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1187
1376
  def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1188
1377
  """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1189
1378
 
1190
- n_samples = len(self.features_maps)
1379
+ # Generate temporary feature maps on-demand from features_df
1380
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1381
+
1382
+ n_samples = len(temp_feature_maps)
1191
1383
  if n_samples <= params.chunk_size:
1192
1384
  self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
1193
1385
  consensus_map = _merge_qt(self, params)
@@ -1199,7 +1391,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1199
1391
  chunks = []
1200
1392
  for i in range(0, n_samples, params.chunk_size):
1201
1393
  chunk_end = min(i + params.chunk_size, n_samples)
1202
- chunks.append((i, self.features_maps[i:chunk_end]))
1394
+ chunks.append((i, temp_feature_maps[i:chunk_end]))
1203
1395
 
1204
1396
  self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1205
1397
 
@@ -1277,28 +1469,69 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1277
1469
  }
1278
1470
  chunk_data_list.append(chunk_data)
1279
1471
 
1280
- # Process chunks in parallel
1281
- with ProcessPoolExecutor(max_workers=params.threads) as executor:
1282
- # Submit all chunk processing tasks
1283
- future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1284
- for i, chunk_data in enumerate(chunk_data_list)}
1285
-
1286
- # Collect results with progress tracking
1287
- completed_chunks = 0
1288
- total_chunks = len(chunk_data_list)
1289
- serialized_chunk_results = []
1290
-
1291
- for future in as_completed(future_to_chunk):
1292
- chunk_idx = future_to_chunk[future]
1293
- try:
1294
- chunk_start_idx, consensus_features = future.result()
1295
- serialized_chunk_results.append((chunk_start_idx, consensus_features))
1296
- completed_chunks += 1
1297
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1298
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1299
- except Exception as exc:
1300
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1301
- raise exc
1472
+ # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
1473
+ executor_class = ProcessPoolExecutor
1474
+ executor_name = "processes"
1475
+
1476
+ try:
1477
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
1478
+ # Submit all chunk processing tasks
1479
+ future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1480
+ for i, chunk_data in enumerate(chunk_data_list)}
1481
+
1482
+ # Collect results with progress tracking
1483
+ completed_chunks = 0
1484
+ total_chunks = len(chunk_data_list)
1485
+ serialized_chunk_results = []
1486
+
1487
+ for future in as_completed(future_to_chunk):
1488
+ chunk_idx = future_to_chunk[future]
1489
+ try:
1490
+ chunk_start_idx, consensus_features = future.result()
1491
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1492
+ completed_chunks += 1
1493
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1494
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1495
+ except Exception as exc:
1496
+ # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1497
+ if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1498
+ # Convert to RuntimeError so outer except block can catch it for fallback
1499
+ raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1500
+ else:
1501
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1502
+ raise exc
1503
+
1504
+ except (RuntimeError, OSError, BrokenProcessPool) as e:
1505
+ # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1506
+ if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1507
+ "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1508
+ self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1509
+ self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1510
+
1511
+ with ThreadPoolExecutor(max_workers=params.threads) as executor:
1512
+ # Submit all chunk processing tasks
1513
+ future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1514
+ for i, chunk_data in enumerate(chunk_data_list)}
1515
+
1516
+ # Collect results with progress tracking
1517
+ completed_chunks = 0
1518
+ total_chunks = len(chunk_data_list)
1519
+ serialized_chunk_results = []
1520
+
1521
+ for future in as_completed(future_to_chunk):
1522
+ chunk_idx = future_to_chunk[future]
1523
+ try:
1524
+ chunk_start_idx, consensus_features = future.result()
1525
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1526
+ completed_chunks += 1
1527
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1528
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1529
+ except Exception as exc:
1530
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1531
+ raise exc
1532
+ else:
1533
+ # Re-raise other exceptions
1534
+ raise
1302
1535
 
1303
1536
  # Store serialized results for _merge_chunk_results to handle directly
1304
1537
  chunk_consensus_maps = []
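This QT fallback block repeats the KD-chunked version above almost verbatim, differing only in the worker function (_process_qt_chunk_parallel instead of _process_kd_chunk_parallel). A shared helper along these lines (hypothetical, not part of masster) could remove the duplication:

    from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
    from concurrent.futures.process import BrokenProcessPool

    def run_chunks_with_fallback(worker, chunk_payloads, max_workers):
        """Run worker over chunk_payloads in processes; retry in threads if the pool breaks."""
        last_error = None
        for executor_cls in (ProcessPoolExecutor, ThreadPoolExecutor):
            try:
                with executor_cls(max_workers=max_workers) as pool:
                    futures = {pool.submit(worker, payload): idx
                               for idx, payload in enumerate(chunk_payloads)}
                    return [future.result() for future in as_completed(futures)]
            except (BrokenProcessPool, OSError) as exc:
                last_error = exc
        raise last_error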
@@ -1458,8 +1691,12 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1458
1691
  mz_min_local = mz_max_local = consensus_mz
1459
1692
 
1460
1693
  # Store chunk consensus with feature tracking
1694
+ # Generate unique 16-character consensus_id string
1695
+ import uuid
1696
+ consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
1697
+
1461
1698
  chunk_consensus_data = {
1462
- 'consensus_id': consensus_id_counter,
1699
+ 'consensus_id': consensus_id_str,
1463
1700
  'chunk_idx': chunk_idx,
1464
1701
  'chunk_start_idx': chunk_start_idx,
1465
1702
  'mz': consensus_mz,
@@ -1477,7 +1714,6 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1477
1714
  }
1478
1715
 
1479
1716
  all_chunk_consensus.append(chunk_consensus_data)
1480
- consensus_id_counter += 1
1481
1717
 
1482
1718
  if not all_chunk_consensus:
1483
1719
  # No valid consensus features found
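Several places in this diff now label consensus features with a random 16-character id instead of an incrementing counter or the OpenMS unique id, and the now-unused consensus_id_counter increment is removed above. A self-contained sketch of the id scheme and why collisions are not a practical concern:

    import uuid

    def new_consensus_id() -> str:  # hypothetical helper name, mirrors the inline code in the diff
        # Same value as str(uuid.uuid4()).replace('-', '')[:16]: 16 hex chars = 64 random bits.
        return uuid.uuid4().hex[:16]

    # Birthday bound: for n ids the collision probability is roughly n**2 / 2**65,
    # negligible at realistic consensus-feature counts.
    print(new_consensus_id())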
@@ -1861,9 +2097,13 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1861
2097
  ms2_count += len(ms2_scans)
1862
2098
 
1863
2099
  # Build consensus metadata
2100
+ # Generate unique 16-character consensus_id string
2101
+ import uuid
2102
+ consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
2103
+
1864
2104
  return {
1865
2105
  "consensus_uid": int(consensus_uid),
1866
- "consensus_id": str(consensus_uid), # Use simple string ID
2106
+ "consensus_id": consensus_id_str, # Use unique 16-char string ID
1867
2107
  "quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
1868
2108
  "number_samples": number_samples if number_samples is not None else len(feature_data_list),
1869
2109
  "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
@@ -1885,6 +2125,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1885
2125
  "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
1886
2126
  "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
1887
2127
  "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
2128
+ "iso": None, # Will be filled by find_iso() function
1888
2129
  "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
1889
2130
  "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
1890
2131
  "number_ms2": int(ms2_count),
@@ -2276,10 +2517,14 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2276
2517
  if ms2_scans is not None:
2277
2518
  ms2_count += len(ms2_scans)
2278
2519
 
2520
+ # Generate unique 16-character consensus_id string (UUID-based)
2521
+ import uuid
2522
+ consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
2523
+
2279
2524
  metadata_list.append(
2280
2525
  {
2281
2526
  "consensus_uid": int(i), # "consensus_id": i,
2282
- "consensus_id": str(feature.getUniqueId()),
2527
+ "consensus_id": consensus_id_str, # Use unique 16-char string ID
2283
2528
  "quality": round(float(feature.getQuality()), 3),
2284
2529
  "number_samples": len(feature_data_list),
2285
2530
  # "number_ext": int(len(features_list)),
@@ -2344,6 +2589,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2344
2589
  )
2345
2590
  if len(height_scaled_values) > 0
2346
2591
  else 0.0,
2592
+ "iso": None, # Will be filled by find_iso() function
2347
2593
  "iso_mean": round(float(np.mean(iso_values)), 2)
2348
2594
  if len(iso_values) > 0
2349
2595
  else 0.0,
@@ -2445,6 +2691,595 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
2445
2691
  )
2446
2692
 
2447
2693
 
2694
+ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2695
+ """
2696
+ Count consensus features grouped in tight clusters.
2697
+
2698
+ Args:
2699
+ mz_tol: m/z tolerance in Daltons for cluster detection
2700
+ rt_tol: RT tolerance in seconds for cluster detection
2701
+
2702
+ Returns:
2703
+ Number of tight clusters found
2704
+ """
2705
+ if len(self.consensus_df) < 2:
2706
+ return 0
2707
+
2708
+ # Extract consensus feature data
2709
+ consensus_data = []
2710
+ for row in self.consensus_df.iter_rows(named=True):
2711
+ consensus_data.append({
2712
+ 'consensus_uid': row['consensus_uid'],
2713
+ 'mz': row['mz'],
2714
+ 'rt': row['rt']
2715
+ })
2716
+
2717
+ # Build spatial index using bins
2718
+ rt_bin_size = rt_tol / 2
2719
+ mz_bin_size = mz_tol / 2
2720
+
2721
+ bins = defaultdict(list)
2722
+ for feature in consensus_data:
2723
+ rt_bin = int(feature['rt'] / rt_bin_size)
2724
+ mz_bin = int(feature['mz'] / mz_bin_size)
2725
+ bins[(rt_bin, mz_bin)].append(feature)
2726
+
2727
+ processed_features = set()
2728
+ tight_clusters_count = 0
2729
+
2730
+ for bin_key, bin_features in bins.items():
2731
+ if len(bin_features) < 2:
2732
+ continue
2733
+
2734
+ # Check neighboring bins for additional features
2735
+ rt_bin, mz_bin = bin_key
2736
+ all_nearby_features = list(bin_features)
2737
+
2738
+ # Check 8 neighboring bins
2739
+ for drt in [-1, 0, 1]:
2740
+ for dmz in [-1, 0, 1]:
2741
+ if drt == 0 and dmz == 0:
2742
+ continue
2743
+ neighbor_key = (rt_bin + drt, mz_bin + dmz)
2744
+ if neighbor_key in bins:
2745
+ all_nearby_features.extend(bins[neighbor_key])
2746
+
2747
+ # Filter to features within actual tolerances and not yet processed
2748
+ valid_cluster_features = []
2749
+ for feature in all_nearby_features:
2750
+ if feature['consensus_uid'] in processed_features:
2751
+ continue
2752
+
2753
+ # Check if this feature is within tolerances of any bin feature
2754
+ for bin_feature in bin_features:
2755
+ rt_diff = abs(feature['rt'] - bin_feature['rt'])
2756
+ mz_diff = abs(feature['mz'] - bin_feature['mz'])
2757
+
2758
+ if rt_diff <= rt_tol and mz_diff <= mz_tol:
2759
+ valid_cluster_features.append(feature)
2760
+ break
2761
+
2762
+ # Count as tight cluster if we have multiple features
2763
+ if len(valid_cluster_features) >= 2:
2764
+ tight_clusters_count += 1
2765
+ for feature in valid_cluster_features:
2766
+ processed_features.add(feature['consensus_uid'])
2767
+
2768
+ return tight_clusters_count
2769
+
2770
+
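_count_tight_clusters above hashes features into an (RT, m/z) grid with cells of half the tolerance, so only the eight neighbouring cells need to be searched for close partners. A stripped-down, self-contained sketch of that binning idea (it counts close pairs rather than clusters and, like the original, only inspects the immediate neighbour cells):

    from collections import defaultdict

    def count_close_pairs(features, mz_tol=0.04, rt_tol=0.3):
        """features: iterable of (rt, mz) tuples; counts pairs within both tolerances."""
        bins = defaultdict(list)
        for rt, mz in features:
            bins[(int(rt / (rt_tol / 2)), int(mz / (mz_tol / 2)))].append((rt, mz))
        pairs = 0
        for (rb, mb), members in bins.items():
            for drt in (-1, 0, 1):
                for dmz in (-1, 0, 1):
                    for other in bins.get((rb + drt, mb + dmz), []):
                        for mine in members:
                            if mine is other:
                                continue
                            if abs(mine[0] - other[0]) <= rt_tol and abs(mine[1] - other[1]) <= mz_tol:
                                pairs += 1
        return pairs // 2  # every counted pair is visited from both sides

    print(count_close_pairs([(100.0, 250.000), (100.1, 250.010), (300.0, 400.0)]))  # 1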
2771
+ def _consensus_cleanup(self, rt_tol, mz_tol):
2772
+ """
2773
+ Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2774
+
2775
+ This function:
2776
+ 1. Identifies and merges consensus features that are likely over-segmented
2777
+ (too many features in very tight m/z and RT windows)
2778
+ 2. Performs deisotoping to remove +1 and +2 isotopic features
2779
+ """
2780
+ if len(self.consensus_df) == 0:
2781
+ return
2782
+
2783
+ initial_count = len(self.consensus_df)
2784
+
2785
+ # Only perform enhanced post-clustering if there are many features
2786
+ if initial_count < 50:
2787
+ return
2788
+
2789
+ self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2790
+
2791
+ # Find tight clusters using spatial binning
2792
+ consensus_data = []
2793
+ for row in self.consensus_df.iter_rows(named=True):
2794
+ consensus_data.append({
2795
+ 'consensus_uid': row['consensus_uid'],
2796
+ 'mz': row['mz'],
2797
+ 'rt': row['rt'],
2798
+ 'inty_mean': row.get('inty_mean', 0),
2799
+ 'number_samples': row.get('number_samples', 0)
2800
+ })
2801
+
2802
+ # Parameters for tight clustering detection - more lenient for effective merging
2803
+ tight_rt_tol = min(0.5, rt_tol * 0.5) # More lenient RT tolerance (max 0.5s)
2804
+ tight_mz_tol = min(0.05, max(0.03, mz_tol * 2.0)) # More lenient m/z tolerance (min 30 mDa, max 50 mDa)
2805
+
2806
+ # Build spatial index using smaller RT and m/z bins for better coverage
2807
+ rt_bin_size = tight_rt_tol / 4 # Smaller bins to ensure nearby features are captured
2808
+ mz_bin_size = tight_mz_tol / 4 # Smaller bins to ensure nearby features are captured
2809
+
2810
+ bins = defaultdict(list)
2811
+ for feature in consensus_data:
2812
+ rt_bin = int(feature['rt'] / rt_bin_size)
2813
+ mz_bin = int(feature['mz'] / mz_bin_size)
2814
+ bins[(rt_bin, mz_bin)].append(feature)
2815
+
2816
+ # Find clusters that need merging
2817
+ merge_groups = []
2818
+ processed_uids = set()
2819
+
2820
+ for bin_key, bin_features in bins.items():
2821
+ # Check current bin and extended neighboring bins for complete cluster
2822
+ rt_bin, mz_bin = bin_key
2823
+ cluster_features = list(bin_features)
2824
+
2825
+ # Check a larger neighborhood (±2 bins) to ensure we capture all nearby features
2826
+ for dr in [-2, -1, 0, 1, 2]:
2827
+ for dm in [-2, -1, 0, 1, 2]:
2828
+ if dr == 0 and dm == 0:
2829
+ continue
2830
+ neighbor_key = (rt_bin + dr, mz_bin + dm)
2831
+ if neighbor_key in bins:
2832
+ cluster_features.extend(bins[neighbor_key])
2833
+
2834
+ # Remove duplicates
2835
+ seen_uids = set()
2836
+ unique_features = []
2837
+ for f in cluster_features:
2838
+ if f['consensus_uid'] not in seen_uids:
2839
+ unique_features.append(f)
2840
+ seen_uids.add(f['consensus_uid'])
2841
+
2842
+ # Only proceed if we have at least 2 features after including neighbors
2843
+ if len(unique_features) < 2:
2844
+ continue
2845
+
2846
+ # Calculate cluster bounds
2847
+ mzs = [f['mz'] for f in unique_features]
2848
+ rts = [f['rt'] for f in unique_features]
2849
+
2850
+ mz_spread = max(mzs) - min(mzs)
2851
+ rt_spread = max(rts) - min(rts)
2852
+
2853
+ # Only merge if features are tightly clustered
2854
+ if mz_spread <= tight_mz_tol and rt_spread <= tight_rt_tol:
2855
+ # Filter out features that were already processed
2856
+ uids_in_cluster = {f['consensus_uid'] for f in unique_features}
2857
+ unprocessed_features = [f for f in unique_features if f['consensus_uid'] not in processed_uids]
2858
+
2859
+ # Only proceed if we have at least 2 unprocessed features that still form a tight cluster
2860
+ if len(unprocessed_features) >= 2:
2861
+ # Recalculate bounds for unprocessed features only
2862
+ unprocessed_mzs = [f['mz'] for f in unprocessed_features]
2863
+ unprocessed_rts = [f['rt'] for f in unprocessed_features]
2864
+
2865
+ unprocessed_mz_spread = max(unprocessed_mzs) - min(unprocessed_mzs)
2866
+ unprocessed_rt_spread = max(unprocessed_rts) - min(unprocessed_rts)
2867
+
2868
+ # Check if unprocessed features still meet tight clustering criteria
2869
+ if unprocessed_mz_spread <= tight_mz_tol and unprocessed_rt_spread <= tight_rt_tol:
2870
+ merge_groups.append(unprocessed_features)
2871
+ processed_uids.update({f['consensus_uid'] for f in unprocessed_features})
2872
+
2873
+ if not merge_groups:
2874
+ return
2875
+
2876
+ self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2877
+
2878
+ # Merge clusters by keeping the most representative feature
2879
+ uids_to_remove = set()
2880
+
2881
+ for group in merge_groups:
2882
+ if len(group) < 2:
2883
+ continue
2884
+
2885
+ # Find the most representative feature (highest intensity and sample count)
2886
+ best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
2887
+
2888
+ # Mark other features for removal
2889
+ for f in group:
2890
+ if f['consensus_uid'] != best_feature['consensus_uid']:
2891
+ uids_to_remove.add(f['consensus_uid'])
2892
+
2893
+ if uids_to_remove:
2894
+ # Remove merged features from consensus_df
2895
+ self.consensus_df = self.consensus_df.filter(
2896
+ ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2897
+ )
2898
+
2899
+ # Also update consensus_mapping_df if it exists
2900
+ if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2901
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
2902
+ ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2903
+ )
2904
+
2905
+ final_count = len(self.consensus_df)
2906
+ reduction = initial_count - final_count
2907
+ reduction_pct = (reduction / initial_count) * 100
2908
+
2909
+ if reduction > 0:
2910
+ self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2911
+
2912
+ # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
2913
+ pre_deisotoping_count = len(self.consensus_df)
2914
+ isotope_uids_to_remove = set()
2915
+
2916
+ # Use strict tolerances for deisotoping (same as declustering)
2917
+ deisotope_rt_tol = min(0.3, rt_tol * 0.3) # Strict RT tolerance for isotope detection
2918
+ deisotope_mz_tol = min(0.01, mz_tol * 0.5) # Strict m/z tolerance for isotope detection
2919
+
2920
+ # Get current consensus data for isotope detection
2921
+ current_consensus_data = []
2922
+ for row in self.consensus_df.iter_rows(named=True):
2923
+ current_consensus_data.append({
2924
+ 'consensus_uid': row['consensus_uid'],
2925
+ 'mz': row['mz'],
2926
+ 'rt': row['rt'],
2927
+ 'number_samples': row.get('number_samples', 0)
2928
+ })
2929
+
2930
+ # Sort by m/z for efficient searching
2931
+ current_consensus_data.sort(key=lambda x: x['mz'])
2932
+ n_current = len(current_consensus_data)
2933
+
2934
+ for i in range(n_current):
2935
+ feature_i = current_consensus_data[i]
2936
+
2937
+ # Skip if already marked for removal
2938
+ if feature_i['consensus_uid'] in isotope_uids_to_remove:
2939
+ continue
2940
+
2941
+ # Look for potential +1 and +2 isotopes (higher m/z)
2942
+ for j in range(i + 1, n_current):
2943
+ feature_j = current_consensus_data[j]
2944
+
2945
+ # Skip if already marked for removal
2946
+ if feature_j['consensus_uid'] in isotope_uids_to_remove:
2947
+ continue
2948
+
2949
+ mz_diff = feature_j['mz'] - feature_i['mz']
2950
+
2951
+ # Break if m/z difference is too large (features are sorted by m/z)
2952
+ if mz_diff > 2.1: # Beyond +2 isotope range
2953
+ break
2954
+
2955
+ rt_diff = abs(feature_j['rt'] - feature_i['rt'])
2956
+
2957
+ # Check for +1 isotope (C13 mass difference ≈ 1.003354 Da)
2958
+ if (0.995 <= mz_diff <= 1.011) and rt_diff <= deisotope_rt_tol:
2959
+ # Potential +1 isotope - should have fewer samples than main feature
2960
+ if feature_j['number_samples'] < feature_i['number_samples']:
2961
+ isotope_uids_to_remove.add(feature_j['consensus_uid'])
2962
+ continue
2963
+
2964
+ # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
2965
+ if (1.995 <= mz_diff <= 2.018) and rt_diff <= deisotope_rt_tol:
2966
+ # Potential +2 isotope - should have fewer samples than main feature
2967
+ if feature_j['number_samples'] < feature_i['number_samples']:
2968
+ isotope_uids_to_remove.add(feature_j['consensus_uid'])
2969
+ continue
2970
+
2971
+ # Remove isotopic features
2972
+ if isotope_uids_to_remove:
2973
+ self.consensus_df = self.consensus_df.filter(
2974
+ ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2975
+ )
2976
+
2977
+ # Also update consensus_mapping_df if it exists
2978
+ if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2979
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
2980
+ ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2981
+ )
2982
+
2983
+ post_deisotoping_count = len(self.consensus_df)
2984
+ isotope_reduction = pre_deisotoping_count - post_deisotoping_count
2985
+
2986
+ if isotope_reduction > 0:
2987
+ self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2988
+
2989
+ # Final summary
2990
+ final_count = len(self.consensus_df)
2991
+ total_reduction = initial_count - final_count
2992
+ if total_reduction > 0:
2993
+ total_reduction_pct = (total_reduction / initial_count) * 100
2994
+ self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2995
+
2996
+
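The deisotoping step above keys on the 13C isotope spacing (about 1.003355 Da) and removes the higher-m/z partner when it is seen in fewer samples. A quick, self-contained check that the acceptance windows used above bracket the expected spacings; note that, as written, they match singly charged features, since for charge z the isotope spacing in m/z is 1.003355 / z:

    C13_SPACING = 1.003355  # Da, mass difference between 13C and 12C

    windows = {1: (0.995, 1.011), 2: (1.995, 2.018)}  # +1 and +2 windows from the diff
    for k, (lo, hi) in windows.items():
        expected = k * C13_SPACING
        assert lo <= expected <= hi
        print(f"+{k} isotope: expected {expected:.6f} Da inside window [{lo}, {hi}]")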
2997
+ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
2998
+ """
2999
+ Identify coeluting consensus features by characteristic mass shifts between adducts
3000
+ and update their adduct information accordingly.
3001
+
3002
+ This function:
3003
+ 1. Generates a catalogue of mass shifts between adducts using _get_adducts()
3004
+ 2. Searches for pairs of consensus features with same RT (within strict RT tolerance)
3005
+ and matching m/z shifts (±0.005 Da)
3006
+ 3. Updates adduct_* columns based on identified relationships
3007
+
3008
+ Args:
3009
+ rt_tol: RT tolerance in seconds (strict tolerance for coelution detection)
3010
+ cached_adducts_df: Pre-computed adducts DataFrame for performance
3011
+ """
3012
+ import polars as pl
3013
+ import numpy as np
3014
+ from collections import defaultdict
3015
+
3016
+ # Check if consensus_df exists and has features
3017
+ if len(self.consensus_df) == 0:
3018
+ self.logger.debug("No consensus features for adduct identification by mass shift")
3019
+ return
3020
+
3021
+ self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
3022
+
3023
+ # Get adducts DataFrame if not provided
3024
+ if cached_adducts_df is None or cached_adducts_df.is_empty():
3025
+ try:
3026
+ # Use lower min_probability for better adduct coverage in mass shift identification
3027
+ cached_adducts_df = self._get_adducts(min_probability=0.01)
3028
+ except Exception as e:
3029
+ self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3030
+ return
3031
+
3032
+ if cached_adducts_df.is_empty():
3033
+ self.logger.debug("No adducts available for mass shift identification")
3034
+ return
3035
+
3036
+ # Build catalogue of mass shifts between adducts
3037
+ mass_shift_catalog = {}
3038
+ adduct_info = {}
3039
+
3040
+ # Extract adduct information
3041
+ adducts_data = cached_adducts_df.select(["name", "charge", "mass_shift"]).to_dicts()
3042
+
3043
+ for adduct in adducts_data:
3044
+ name = adduct["name"]
3045
+ charge = adduct["charge"]
3046
+ mass_shift = adduct["mass_shift"]
3047
+
3048
+ adduct_info[name] = {
3049
+ "charge": charge,
3050
+ "mass_shift": mass_shift
3051
+ }
3052
+
3053
+ # Generate pairwise mass differences for catalog
3054
+ for adduct1 in adducts_data:
3055
+ for adduct2 in adducts_data:
3056
+ if adduct1["name"] == adduct2["name"]:
3057
+ continue
3058
+
3059
+ name1, charge1, ms1 = adduct1["name"], adduct1["charge"], adduct1["mass_shift"]
3060
+ name2, charge2, ms2 = adduct2["name"], adduct2["charge"], adduct2["mass_shift"]
3061
+
3062
+ # Only consider shifts between adducts that have the same charge (same ionization state)
3063
+ if charge1 != charge2:
3064
+ continue
3065
+
3066
+ # Calculate expected m/z difference
3067
+ if charge1 != 0 and charge2 != 0:
3068
+ mz_diff = (ms1 - ms2) / abs(charge1)
3069
+ else:
3070
+ continue # Skip neutral adducts for this analysis
3071
+
3072
+ # Store the mass shift relationship
3073
+ shift_key = round(mz_diff, 4) # Round to 4 decimal places for matching
3074
+ if shift_key not in mass_shift_catalog:
3075
+ mass_shift_catalog[shift_key] = []
3076
+ mass_shift_catalog[shift_key].append({
3077
+ "from_adduct": name1,
3078
+ "to_adduct": name2,
3079
+ "mz_shift": mz_diff,
3080
+ "from_charge": charge1,
3081
+ "to_charge": charge2
3082
+ })
3083
+
3084
+ self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3085
+
3086
+ # Get consensus features data
3087
+ consensus_data = []
3088
+ for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
3089
+ consensus_data.append({
3090
+ "index": i,
3091
+ "consensus_uid": row["consensus_uid"],
3092
+ "rt": row["rt"],
3093
+ "mz": row["mz"],
3094
+ "adduct_top": row.get("adduct_top", "[M+?]1+"),
3095
+ "adduct_charge_top": row.get("adduct_charge_top", 1),
3096
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
3097
+ "adduct_mass_shift_top": row.get("adduct_mass_shift_top"),
3098
+ "inty_mean": row.get("inty_mean", 0)
3099
+ })
3100
+
3101
+ # Sort by RT for efficient searching
3102
+ consensus_data.sort(key=lambda x: x["rt"])
3103
+ n_features = len(consensus_data)
3104
+
3105
+ # Track updates to make
3106
+ adduct_updates = {} # consensus_uid -> new_adduct_info
3107
+
3108
+ # Strict RT tolerance for coelution (convert to minutes)
3109
+ rt_tol_strict = rt_tol * 0.5 # Use half the merge tolerance for strict coelution
3110
+ mz_tol_shift = 0.005 # ±5 mDa tolerance for mass shift matching
3111
+
3112
+ # Search for coeluting pairs with characteristic mass shifts
3113
+ updated_count = 0
3114
+
3115
+ for i in range(n_features):
3116
+ feature1 = consensus_data[i]
3117
+ rt1 = feature1["rt"]
3118
+ mz1 = feature1["mz"]
3119
+ adduct1 = feature1["adduct_top"]
3120
+
3121
+ # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
3122
+ # if adduct1 and "?" not in adduct1:
3123
+ # continue
3124
+
3125
+ # Search for coeluting features within strict RT tolerance
3126
+ for j in range(i + 1, n_features):
3127
+ feature2 = consensus_data[j]
3128
+ rt2 = feature2["rt"]
3129
+
3130
+ # Break if RT difference exceeds tolerance (sorted by RT)
3131
+ if abs(rt2 - rt1) > rt_tol_strict:
3132
+ break
3133
+
3134
+ mz2 = feature2["mz"]
3135
+ adduct2 = feature2["adduct_top"]
3136
+
3137
+ # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
3138
+ # if adduct2 and "?" not in adduct2:
3139
+ # continue
3140
+
3141
+ # Calculate observed m/z difference
3142
+ mz_diff = mz2 - mz1
3143
+ shift_key = round(mz_diff, 4)
3144
+
3145
+ # Check if this mass shift matches any known adduct relationships
3146
+ for catalog_shift, relationships in mass_shift_catalog.items():
3147
+ if abs(shift_key - catalog_shift) <= mz_tol_shift:
3148
+ # Found a matching mass shift!
3149
+
3150
+ # Choose the best relationship based on common adducts
3151
+ best_rel = None
3152
+ best_score = 0
3153
+
3154
+ for rel in relationships:
3155
+ # Prioritize common adducts ([M+H]+, [M+Na]+, [M+NH4]+)
3156
+ score = 0
3157
+ if "H]" in rel["from_adduct"]: score += 3
3158
+ if "Na]" in rel["from_adduct"]: score += 2
3159
+ if "NH4]" in rel["from_adduct"]: score += 2
3160
+ if "H]" in rel["to_adduct"]: score += 3
3161
+ if "Na]" in rel["to_adduct"]: score += 2
3162
+ if "NH4]" in rel["to_adduct"]: score += 2
3163
+
3164
+ if score > best_score:
3165
+ best_score = score
3166
+ best_rel = rel
3167
+
3168
+ if best_rel:
3169
+ # Determine which feature gets which adduct based on intensity
3170
+ inty1 = feature1["inty_mean"]
3171
+ inty2 = feature2["inty_mean"]
3172
+
3173
+ # Assign higher intensity to [M+H]+ if possible
3174
+ if "H]" in best_rel["from_adduct"] and inty1 >= inty2:
3175
+ # Feature 1 = from_adduct, Feature 2 = to_adduct
3176
+ from_feature = feature1
3177
+ to_feature = feature2
3178
+ from_adduct_name = best_rel["from_adduct"]
3179
+ to_adduct_name = best_rel["to_adduct"]
3180
+ elif "H]" in best_rel["to_adduct"] and inty2 >= inty1:
3181
+ # Feature 2 = to_adduct (reverse), Feature 1 = from_adduct
3182
+ from_feature = feature2
3183
+ to_feature = feature1
3184
+ from_adduct_name = best_rel["to_adduct"]
3185
+ to_adduct_name = best_rel["from_adduct"]
3186
+ else:
3187
+ # Assignment based on mass shift direction
3188
+ # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
3189
+ # If catalog_shift > 0: from_adduct has higher m/z than to_adduct
3190
+ # If catalog_shift < 0: from_adduct has lower m/z than to_adduct
3191
+ # observed mz_diff = mz2 - mz1
3192
+ # If mz_diff matches catalog_shift: feature2 should get to_adduct, feature1 should get from_adduct
3193
+ # If mz_diff matches -catalog_shift: assignments are swapped
3194
+
3195
+ if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
3196
+ # mz_diff matches catalog_shift direction
3197
+ from_feature = feature1
3198
+ to_feature = feature2
3199
+ from_adduct_name = best_rel["from_adduct"]
3200
+ to_adduct_name = best_rel["to_adduct"]
3201
+ else:
3202
+ # mz_diff matches reverse direction of catalog_shift
3203
+ from_feature = feature2
3204
+ to_feature = feature1
3205
+ from_adduct_name = best_rel["to_adduct"]
3206
+ to_adduct_name = best_rel["from_adduct"]
3207
+
3208
+ # Get adduct details from catalog
3209
+ from_adduct_info = adduct_info.get(from_adduct_name, {})
3210
+ to_adduct_info = adduct_info.get(to_adduct_name, {})
3211
+
3212
+ # Calculate neutral masses
3213
+ from_charge = from_adduct_info.get("charge", 1)
3214
+ to_charge = to_adduct_info.get("charge", 1)
3215
+ from_mass_shift = from_adduct_info.get("mass_shift", 1.007825)
3216
+ to_mass_shift = to_adduct_info.get("mass_shift", 1.007825)
3217
+
3218
+ from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
3219
+ to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
3220
+
3221
+ # Store updates
3222
+ adduct_updates[from_feature["consensus_uid"]] = {
3223
+ "adduct_top": from_adduct_name,
3224
+ "adduct_charge_top": from_charge,
3225
+ "adduct_mass_neutral_top": from_neutral_mass,
3226
+ "adduct_mass_shift_top": from_mass_shift
3227
+ }
3228
+
3229
+ adduct_updates[to_feature["consensus_uid"]] = {
3230
+ "adduct_top": to_adduct_name,
3231
+ "adduct_charge_top": to_charge,
3232
+ "adduct_mass_neutral_top": to_neutral_mass,
3233
+ "adduct_mass_shift_top": to_mass_shift
3234
+ }
3235
+
3236
+ updated_count += 2
3237
+ self.logger.debug(
3238
+ f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
3239
+ f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
3240
+ f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
3241
+ )
3242
+ break # Found match, no need to check other relationships
3243
+
3244
+ # Apply updates to consensus_df
3245
+ if adduct_updates:
3246
+ # Prepare update data
3247
+ consensus_uids = self.consensus_df["consensus_uid"].to_list()
3248
+
3249
+ new_adduct_top = []
3250
+ new_adduct_charge_top = []
3251
+ new_adduct_mass_neutral_top = []
3252
+ new_adduct_mass_shift_top = []
3253
+
3254
+ for uid in consensus_uids:
3255
+ if uid in adduct_updates:
3256
+ update = adduct_updates[uid]
3257
+ new_adduct_top.append(update["adduct_top"])
3258
+ new_adduct_charge_top.append(update["adduct_charge_top"])
3259
+ new_adduct_mass_neutral_top.append(update["adduct_mass_neutral_top"])
3260
+ new_adduct_mass_shift_top.append(update["adduct_mass_shift_top"])
3261
+ else:
3262
+ # Keep existing values
3263
+ row_idx = consensus_uids.index(uid)
3264
+ row = self.consensus_df.row(row_idx, named=True)
3265
+ new_adduct_top.append(row.get("adduct_top"))
3266
+ new_adduct_charge_top.append(row.get("adduct_charge_top"))
3267
+ new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
3268
+ new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
3269
+
3270
+ # Update the DataFrame
3271
+ self.consensus_df = self.consensus_df.with_columns([
3272
+ pl.Series("adduct_top", new_adduct_top),
3273
+ pl.Series("adduct_charge_top", new_adduct_charge_top),
3274
+ pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3275
+ pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3276
+ ])
3277
+
3278
+ self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3279
+ else:
3280
+ self.logger.debug("No consensus features updated based on mass shift analysis")
3281
+
3282
+
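The mass-shift identification above pairs co-eluting consensus features (within rt_tol * 0.5) whose m/z difference matches a catalogued adduct-to-adduct shift within ±0.005 Da. A small, self-contained worked example of such shifts for common positive-mode adducts (approximate charge-carrier mass shifts, not taken from masster's adduct table):

    # Approximate mass shifts (Da) of the charge carrier for singly charged adducts.
    ION_SHIFT = {"[M+H]1+": 1.007276, "[M+Na]1+": 22.989218, "[M+NH4]1+": 18.033823}

    # For equal charge states, the expected m/z difference is just the difference of shifts.
    for a in ION_SHIFT:
        for b in ION_SHIFT:
            if a < b:
                print(f"{b} vs {a}: {ION_SHIFT[b] - ION_SHIFT[a]:+.4f} Da")
    # e.g. [M+Na]1+ vs [M+H]1+ -> +21.9819 Da, [M+NH4]1+ vs [M+H]1+ -> +17.0265 Da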
2448
3283
  def _finalize_merge(self, link_ms2, min_samples):
2449
3284
  """Complete the merge process with final calculations and cleanup."""
2450
3285
  import polars as pl
@@ -2483,21 +3318,29 @@ def _finalize_merge(self, link_ms2, min_samples):
2483
3318
  )
2484
3319
 
2485
3320
  # Calculate the completeness of the consensus map
3321
+ # Log completion with tight cluster metrics
2486
3322
  if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
2487
3323
  c = (
2488
3324
  len(self.consensus_mapping_df)
2489
3325
  / len(self.consensus_df)
2490
3326
  / len(self.samples_df)
2491
3327
  )
3328
+
3329
+ # Count tight clusters with specified thresholds
3330
+ tight_clusters = _count_tight_clusters(self,mz_tol=0.04, rt_tol=0.3)
3331
+
2492
3332
  self.logger.info(
2493
- f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
3333
+ f"Merging completed. Consensus features: {len(self.consensus_df)}. "
3334
+ f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
2494
3335
  )
2495
3336
  else:
2496
3337
  self.logger.warning(
2497
3338
  f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
2498
3339
  f"This may be due to min_samples ({min_samples}) being too high for the available data.",
2499
3340
  )
2500
-
3341
+
3342
+ # add iso data from raw files.
3343
+ self.find_iso()
2501
3344
  if link_ms2:
2502
3345
  self.find_ms2()
2503
3346
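For reference, the completeness figure logged in _finalize_merge is the filled fraction of the consensus-feature by sample matrix, and the new tight-cluster count reuses _count_tight_clusters with fixed thresholds (0.04 Da, 0.3 s). A one-line worked example of the completeness arithmetic:

    n_links, n_consensus, n_samples = 1800, 500, 6    # rows in consensus_mapping_df, consensus_df, samples_df
    completeness = n_links / n_consensus / n_samples  # same expression as in the diff
    print(f"Completeness: {completeness:.2f}")        # 0.60

The finalize step also now calls find_iso() before optional MS2 linking, which is what fills the new "iso" column added to the consensus metadata earlier in this diff.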