masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

@@ -2,12 +2,13 @@
 import.py
 
 Module providing import functionality for Study class, specifically for importing
-oracle identification data into consensus features.
+oracle and TIMA identification data into consensus features.
 """
 
 from __future__ import annotations
 
 import os
+import glob
 import pandas as pd
 import polars as pl
 
@@ -320,3 +321,385 @@ def import_oracle(
             "id_matches": len(self.id_df),
         },
     )
+
+
+def import_tima(self, folder, file="results_annotation"):
+    """
+    Import TIMA identification data and map it to consensus features.
+
+    This method reads TIMA identification results matching folder/*results_annotation*.tsv
+    and creates lib_df and id_df DataFrames with detailed library and identification information.
+    It also updates consensus_df with top identification results.
+
+    Parameters:
+        folder (str): Path to TIMA folder containing *results_annotation*.tsv files
+        file (str, optional): Base name of the TIMA results file (default: "results_annotation")
+
+    Returns:
+        None: Updates consensus_df and creates lib_df and id_df in place with TIMA identification data
+
+    Raises:
+        FileNotFoundError: If the TIMA results file doesn't exist
+        ValueError: If consensus_df is empty or doesn't have required columns
+
+    Example:
+        >>> study.import_tima(folder="path/to/tima_results")
+    """
+
+    self.logger.info(f"Starting TIMA import from folder: {folder}")
+
+    # Validate inputs
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        raise ValueError("consensus_df is empty or not available. Run merge() first.")
+
+    if "consensus_id" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_id' column")
+
+    # Find TIMA file
+    tima_pattern = os.path.join(folder, f"*{file}*.tsv")
+    tima_files = glob.glob(tima_pattern)
+
+    if not tima_files:
+        raise FileNotFoundError(f"TIMA results file not found with pattern: {tima_pattern}")
+
+    tima_file_path = tima_files[0]
+    self.logger.debug(f"Loading TIMA data from: {tima_file_path}")
+
+    try:
+        # Read TIMA data using polars
+        tima_data = pl.read_csv(
+            tima_file_path,
+            separator="\t",
+            schema_overrides={
+                "feature_id": pl.Utf8,  # Read as Utf8 string
+            },
+            infer_schema_length=10000,
+        )
+        self.logger.info(f"TIMA data loaded successfully with {len(tima_data)} rows")
+    except Exception as e:
+        self.logger.error(f"Could not read {tima_file_path}: {e}")
+        raise
+
+    # Check if TIMA feature_ids match consensus_df consensus_id column
+    if "consensus_id" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_id' column")
+
+    # Compare TIMA feature_ids with consensus_df consensus_ids
+    consensus_ids = set(self.consensus_df["consensus_id"].to_list())
+    tima_ids = set(tima_data["feature_id"].to_list())
+
+    matching_ids = consensus_ids.intersection(tima_ids)
+    non_matching_ids = tima_ids - consensus_ids
+
+    if non_matching_ids:
+        self.logger.warning(
+            f"Found {len(non_matching_ids)} feature_ids in TIMA data that do not match any consensus_id in consensus_df. "
+            f"These will be filtered out. Matching features: {len(matching_ids)}/{len(tima_ids)}"
+        )
+        # Filter to only matching feature_ids
+        tima_data = tima_data.filter(pl.col("feature_id").is_in(list(consensus_ids)))
+
+    if len(tima_data) == 0:
+        self.logger.error("No TIMA feature_ids match consensus_df consensus_id values")
+        raise ValueError("No matching features found between TIMA data and consensus_df")
+
+    self.logger.debug(f"Matched {len(tima_data)} TIMA entries to consensus_df consensus_id values")
+
+    # Filter to only rows with identification data (non-empty label_compound)
+    initial_count = len(tima_data)
+    tima_data = tima_data.filter(
+        pl.col("label_compound").is_not_null() & (pl.col("label_compound").cast(pl.Utf8).str.strip_chars() != "")
+    )
+
+    self.logger.debug(f"Filtered to {len(tima_data)}/{initial_count} TIMA entries with identifications")
+
+    if len(tima_data) == 0:
+        self.logger.warning("No TIMA entries with identifications found")
+        return
+
+    # === CREATE LIB_DF ===
+    self.logger.debug("Creating lib_df from TIMA annotation data")
+    self.logger.debug(f"TIMA data shape before lib_df creation: {tima_data.shape}")
+
+    # Create unique lib_uid for each library entry
+    tima_data = tima_data.with_columns(pl.arange(0, len(tima_data)).alias("lib_uid"))
+
+    # Map TIMA columns to lib_df schema
+    lib_data = []
+    for row in tima_data.iter_rows(named=True):
+        # Extract z (charge) from adduct
+        z = None
+        adduct_str = str(row.get("adduct", ""))
+        if "+" in adduct_str:
+            z = 1
+        elif "-" in adduct_str:
+            z = -1
+
+        # Get SMILES
+        smiles = row.get("smiles_no_stereo", None)
+        if smiles is None or (isinstance(smiles, str) and smiles.strip() == ""):
+            smiles = None
+
+        # Calculate InChI from SMILES if available
+        inchi = None
+        if smiles:
+            try:
+                from rdkit import Chem
+
+                mol_rdkit = Chem.MolFromSmiles(smiles)
+                if mol_rdkit:
+                    inchi = Chem.MolToInchi(mol_rdkit)
+            except ImportError:
+                pass  # RDKit not available
+            except Exception:
+                pass
+
+        # Calculate formula from SMILES if available
+        formula = None
+        if smiles:
+            try:
+                from rdkit import Chem
+
+                mol_rdkit = Chem.MolFromSmiles(smiles)
+                if mol_rdkit:
+                    formula = Chem.rdMolDescriptors.CalcMolFormula(mol_rdkit)
+            except ImportError:
+                pass  # RDKit not available
+            except Exception:
+                pass
+
+        # Calculate mass from m/z and charge
+        m = None
+        mz_value = row.get("mz", None)
+        if mz_value is not None and z is not None:
+            try:
+                m = float(mz_value) * abs(z)
+            except (ValueError, TypeError):
+                pass
+
+        # Get class and clean NaN values
+        class_value = row.get("label_classyfire", None)
+        if class_value is None or (isinstance(class_value, str) and class_value.upper() == "NAN"):
+            class_value = None
+
+        lib_entry = {
+            "lib_uid": row["lib_uid"],
+            "cmpd_uid": row["lib_uid"],  # Use lib_uid as compound identifier
+            "source_id": None,  # Leave empty as requested
+            "name": row.get("label_compound", None),
+            "shortname": None,  # Not available in TIMA data
+            "class": class_value,
+            "smiles": smiles,
+            "inchi": inchi,
+            "inchikey": row.get("inchikey_connectivity_layer", None),
+            "formula": formula,
+            "iso": 0,  # Fixed isotope value
+            "adduct": row.get("adduct", None),
+            "probability": row.get("score", None),
+            "m": m,
+            "z": z,
+            "mz": row.get("mz", None),
+            "rt": None,  # Set to null as requested
+            "quant_group": None,
+            "db_id": None,  # Not available in TIMA data
+            "db": row.get("library", None),
+        }
+        lib_data.append(lib_entry)
+
+    self.logger.debug(f"Created {len(lib_data)} lib_data entries")
+
+    # Create lib_df as Polars DataFrame with error handling for mixed types
+    try:
+        lib_df_temp = pl.DataFrame(lib_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating lib_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        lib_df_pandas = pd.DataFrame(lib_data)
+        lib_df_temp = pl.from_pandas(lib_df_pandas)
+
+    # Ensure uniqueness by name and adduct combination
+    # Sort by lib_uid and keep first occurrence (earliest in processing order)
+    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
+
+    self.logger.info(
+        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
+    )
+
+    # === CREATE ID_DF ===
+    self.logger.debug("Creating id_df from TIMA identification matches")
+
+    # Create a mapping from consensus_id to consensus_uid
+    # TIMA data has feature_id which matches consensus_id, map to consensus_uid for id_df
+    consensus_id_to_uid_map = dict(
+        zip(self.consensus_df["consensus_id"].to_list(), self.consensus_df["consensus_uid"].to_list())
+    )
+
+    # Create identification matches
+    id_data = []
+    for row in tima_data.iter_rows(named=True):
+        # Map TIMA feature_id to consensus_df consensus_uid
+        tima_feature_id = row["feature_id"]
+        consensus_uid = consensus_id_to_uid_map.get(tima_feature_id)
+
+        if consensus_uid is None:
+            # Skip if we can't find the mapping (shouldn't happen after filtering)
+            continue
+
+        # Use error_mz for mz_delta
+        mz_delta = None
+        error_mz = row.get("error_mz", None)
+        if error_mz is not None:
+            try:
+                mz_delta = float(error_mz)
+            except (ValueError, TypeError):
+                pass
+
+        # Use error_rt for rt_delta
+        rt_delta = None
+        rt_err_value = row.get("error_rt", None)
+        if rt_err_value is not None:
+            try:
+                rt_delta = float(rt_err_value)
+            except (ValueError, TypeError):
+                pass
+
+        # Create matcher as "tima-" + library
+        matcher = "tima"  # default fallback
+        library_value = row.get("library", None)
+        if library_value is not None:
+            try:
+                library = str(library_value)
+                matcher = f"tima-{library}"
+            except (ValueError, TypeError):
+                pass
+
+        id_entry = {
+            "consensus_uid": consensus_uid,  # Use mapped consensus_uid from consensus_df
+            "lib_uid": row["lib_uid"],
+            "mz_delta": mz_delta,
+            "rt_delta": rt_delta,
+            "matcher": matcher,
+            "score": row.get("score", None),
+        }
+        id_data.append(id_entry)
+
+    # Create id_df as Polars DataFrame with explicit schema to avoid inference issues
+    # Match consensus_uid type to consensus_df
+    consensus_uid_dtype = self.consensus_df["consensus_uid"].dtype
+    id_schema = {
+        "consensus_uid": consensus_uid_dtype,  # Match the type from consensus_df
+        "lib_uid": pl.Int64,
+        "mz_delta": pl.Float64,
+        "rt_delta": pl.Float64,
+        "matcher": pl.Utf8,
+        "score": pl.Float64,
+    }
+    id_df_temp = pl.DataFrame(id_data, schema=id_schema)
+
+    # Filter id_df to only include lib_uids that exist in the final unique lib_df
+    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
+    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
+
+    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
+
+    # === UPDATE CONSENSUS_DF ===
+    self.logger.debug("Updating consensus_df with top identification results")
+
+    # tima_data is already a polars DataFrame
+    tima_pl = tima_data
+
+    # Group by feature_id and select the best identification (highest score)
+    # In case of ties, take the first one
+    best_ids = (
+        tima_pl.group_by("feature_id")
+        .agg([pl.col("score").max().alias("max_score")])
+        .join(tima_pl, on="feature_id")
+        .filter(pl.col("score") == pl.col("max_score"))
+        .group_by("feature_id")
+        .first()  # In case of ties, take the first
+    )
+
+    # Join with consensus_df to map consensus_id to consensus_uid
+    best_ids = best_ids.join(
+        self.consensus_df.select(["consensus_id", "consensus_uid"]), left_on="feature_id", right_on="consensus_id", how="left"
+    )
+
+    self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
+
+    # Prepare the identification columns
+    id_columns = {
+        "id_top_name": best_ids.select("consensus_uid", "label_compound"),
+        "id_top_adduct": best_ids.select("consensus_uid", "adduct"),
+        "id_top_class": best_ids.select("consensus_uid", "label_classyfire"),
+        "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
+    }
+
+    # Initialize identification columns in consensus_df if they don't exist
+    for col_name in id_columns.keys():
+        if col_name not in self.consensus_df.columns:
+            if col_name == "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
+            else:
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))
+
+    # Update consensus_df with TIMA identifications
+    for col_name, id_data_col in id_columns.items():
+        tima_column = id_data_col.columns[1]  # second column (after consensus_uid)
+
+        # Create update dataframe
+        update_data = id_data_col.rename({tima_column: col_name})
+
+        # Join and update
+        self.consensus_df = (
+            self.consensus_df.join(update_data, on="consensus_uid", how="left", suffix="_tima")
+            .with_columns(pl.coalesce([f"{col_name}_tima", col_name]).alias(col_name))
+            .drop(f"{col_name}_tima")
+        )
+
+    # Replace NaN values with None in identification columns
+    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
+    for col_name in id_col_names:
+        if col_name in self.consensus_df.columns:
+            # For string columns, replace empty strings and "nan" with None
+            if col_name != "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(
+                        pl.col(col_name).is_null()
+                        | (pl.col(col_name) == "")
+                        | (pl.col(col_name) == "nan")
+                        | (pl.col(col_name) == "NaN")
+                    )
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+            # For numeric columns, replace NaN with None
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+
+    # Count how many consensus features were updated
+    updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
+    total_consensus = len(self.consensus_df)
+
+    self.logger.success(
+        f"TIMA import completed. {updated_count}/{total_consensus} "
+        f"consensus features now have identifications ({updated_count / total_consensus * 100:.1f}%)"
+    )
+
+    # Update history
+    self.update_history(
+        ["import_tima"],
+        {
+            "folder": folder,
+            "file": file,
+            "updated_features": updated_count,
+            "total_features": total_consensus,
+            "lib_entries": len(self.lib_df),
+            "id_matches": len(self.id_df),
+        },
+    )
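
For orientation, here is a minimal usage sketch of the new method. It assumes `study` is an existing Study instance whose merge() has already populated consensus_df; the folder name is a hypothetical example.

```python
# Minimal sketch, assuming `study` is a Study instance whose merge() has
# already populated consensus_df. "tima_output" is a hypothetical folder name.
study.import_tima(folder="tima_output")  # scans tima_output/*results_annotation*.tsv

# The call updates three structures in place:
print(len(study.lib_df))  # unique (name, adduct) library entries
print(len(study.id_df))   # identification matches pointing into lib_df via lib_uid
print(
    study.consensus_df.select(
        ["consensus_id", "id_top_name", "id_top_adduct", "id_top_score"]
    ).head()
)
```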
masster/study/load.py CHANGED
@@ -191,17 +191,6 @@ def load(self, filename=None):
 
     _load_study5(self, filename)
 
-    # After loading the study, check if we have consensus features before loading consensus XML
-    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
-    #     consensus_xml_path = filename.replace(".study5", ".consensusXML")
-    #     if os.path.exists(consensus_xml_path):
-    #         self._load_consensusXML(filename=consensus_xml_path)
-    #         self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
-    #     else:
-    #         self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    # else:
-    #     self.logger.debug("No consensus features found, skipping consensusXML loading")
-
     self.filename = filename
 
 
masster/study/merge.py CHANGED
@@ -441,9 +441,15 @@ def merge(study, **kwargs) -> None:
     cached_valid_adducts = None
     try:
         cached_adducts_df = study._get_adducts()
+        # Remove all adducts with wrong polarity
+        if study.polarity == "positive":
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") >= 0)
+        else:
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") <= 0)
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
+            study.logger.warning(f"No valid adducts found for polarity '{study.polarity}'")
            cached_valid_adducts = set()
     except Exception as e:
         study.logger.warning(f"Could not retrieve study adducts: {e}")
@@ -452,6 +458,13 @@ def merge(study, **kwargs) -> None:
     # Always allow '?' adducts
     cached_valid_adducts.add("?")
 
+    # Bypass for single sample case
+    if len(study.samples_df) == 1:
+        study.logger.info("Single sample detected - bypassing merge algorithm and using direct feature mapping")
+        _handle_single_sample_merge(study, cached_adducts_df, cached_valid_adducts)
+        # Skip all post-processing for single sample case
+        return
+
     # Route to algorithm implementation
     if params.method == "kd":
         consensus_map = _merge_kd(study, params)
@@ -1719,6 +1732,10 @@ def _calculate_consensus_statistics(
         mz_values: m/z values from chunk consensus features
         intensity_values: Intensity values from chunk consensus features
         quality_values: Quality values from chunk consensus features
+        number_features: Number of unique features contributing
+        number_samples: Number of unique samples contributing
+        cached_adducts_df: Cached DataFrame of valid adducts for the study
+        cached_valid_adducts: Cached set of valid adduct names for the study
 
     Returns:
         Dictionary with consensus feature metadata
@@ -3612,6 +3629,142 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     return adduct_group_list, adduct_of_list
 
 
+def _handle_single_sample_merge(study, cached_adducts_df=None, cached_valid_adducts=None):
+    """
+    Handle merge for the special case of a single sample.
+    Directly populate consensus_df from the sample's features_df without any filtering.
+
+    Args:
+        study: Study object with single sample
+        cached_adducts_df: Pre-computed adducts DataFrame (optional)
+        cached_valid_adducts: Set of valid adduct names (optional)
+    """
+    import polars as pl
+    import uuid
+
+    if len(study.samples_df) != 1:
+        raise ValueError("_handle_single_sample_merge should only be called with exactly one sample")
+
+    # Get the single sample's features
+    sample_row = study.samples_df.row(0, named=True)
+    sample_uid = sample_row["sample_uid"]
+
+    # Filter features for this sample
+    sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+    if len(sample_features) == 0:
+        study.logger.warning("No features found for single sample")
+        study.consensus_df = pl.DataFrame()
+        study.consensus_mapping_df = pl.DataFrame()
+        return
+
+    study.logger.info(f"Creating consensus from {len(sample_features)} features in single sample")
+
+    # Create consensus features directly from sample features
+    consensus_list = []
+    mapping_list = []
+
+    # Cache valid adducts
+    valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
+    valid_adducts.add("?")  # Always allow '?' adducts
+
+    for i, feature_row in enumerate(sample_features.iter_rows(named=True)):
+        # Generate unique consensus ID
+        consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
+
+        # Handle adduct information
+        adduct = feature_row.get("adduct")
+        if adduct is None or adduct not in valid_adducts:
+            # Set default adduct based on study polarity
+            study_polarity = getattr(study, "polarity", "positive")
+            if study_polarity in ["negative", "neg"]:
+                adduct = "[M-?]1-"
+                adduct_charge = -1
+                adduct_mass_shift = -1.007825
+            else:
+                adduct = "[M+?]1+"
+                adduct_charge = 1
+                adduct_mass_shift = 1.007825
+        else:
+            # Try to get charge and mass shift from cached adducts
+            adduct_charge = 1
+            adduct_mass_shift = 1.007825
+            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
+                matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct)
+                if not matching_adduct.is_empty():
+                    adduct_row = matching_adduct.row(0, named=True)
+                    adduct_charge = adduct_row["charge"]
+                    adduct_mass_shift = adduct_row["mass_shift"]
+
+        # Calculate neutral mass
+        mz = feature_row.get("mz", 0.0)
+        if adduct_charge and adduct_mass_shift is not None:
+            adduct_mass_neutral = mz * abs(adduct_charge) - adduct_mass_shift
+        else:
+            adduct_mass_neutral = None
+
+        # Count MS2 scans
+        ms2_scans = feature_row.get("ms2_scans", [])
+        ms2_count = len(ms2_scans) if ms2_scans else 0
+
+        # Create consensus feature metadata
+        consensus_feature = {
+            "consensus_uid": i,
+            "consensus_id": consensus_id_str,
+            "quality": feature_row.get("quality", 1.0),
+            "number_samples": 1,  # Always 1 for single sample
+            "rt": feature_row.get("rt", 0.0),
+            "mz": mz,
+            "rt_min": feature_row.get("rt", 0.0),
+            "rt_max": feature_row.get("rt", 0.0),
+            "rt_mean": feature_row.get("rt", 0.0),
+            "rt_start_mean": feature_row.get("rt_start", 0.0),
+            "rt_end_mean": feature_row.get("rt_end", 0.0),
+            "rt_delta_mean": feature_row.get("rt_delta", 0.0),
+            "mz_min": mz,
+            "mz_max": mz,
+            "mz_mean": mz,
+            "mz_start_mean": feature_row.get("mz_start", 0.0),
+            "mz_end_mean": feature_row.get("mz_end", 0.0),
+            "inty_mean": feature_row.get("inty", 0.0),
+            "bl": -1.0,
+            "chrom_coherence_mean": feature_row.get("chrom_coherence", 0.0),
+            "chrom_prominence_mean": feature_row.get("chrom_prominence", 0.0),
+            "chrom_prominence_scaled_mean": feature_row.get("chrom_prominence_scaled", 0.0),
+            "chrom_height_scaled_mean": feature_row.get("chrom_height_scaled", 0.0),
+            "iso": None,  # Will be filled by find_iso() function
+            "iso_mean": feature_row.get("iso", 0.0),
+            "charge_mean": feature_row.get("charge", 0.0),
+            "number_ms2": ms2_count,
+            "adducts": [[adduct, 1, 100.0]],  # Single adduct with 100% frequency
+            "adduct_top": adduct,
+            "adduct_charge_top": adduct_charge,
+            "adduct_mass_neutral_top": adduct_mass_neutral,
+            "adduct_mass_shift_top": adduct_mass_shift,
+            "id_top_name": None,
+            "id_top_class": None,
+            "id_top_adduct": None,
+            "id_top_score": None,
+            "id_source": None,
+        }
+
+        consensus_list.append(consensus_feature)
+
+        # Create mapping entry
+        mapping_entry = {
+            "consensus_uid": i,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_row.get("feature_uid"),
+        }
+        mapping_list.append(mapping_entry)
+
+    # Create DataFrames
+    study.consensus_df = pl.DataFrame(consensus_list, strict=False)
+    study.consensus_mapping_df = pl.DataFrame(mapping_list, strict=False)
+
+    study.logger.info(f"Created {len(consensus_list)} consensus features from single sample")
+
+
 def _fast_correlation(x, y):
     """
     Fast correlation coefficient calculation for consensus matrix data.
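
The neutral-mass bookkeeping in _handle_single_sample_merge reduces to adduct_mass_neutral = mz * |z| - mass_shift. A short sketch with hypothetical m/z values, showing how the positive and negative defaults land on essentially the same neutral mass:

```python
# Sketch of the neutral-mass formula used above; the m/z values are made up.
def neutral_mass(mz: float, charge: int, mass_shift: float) -> float:
    # adduct_mass_neutral = mz * |z| - mass_shift
    return mz * abs(charge) - mass_shift

# Positive default "[M+?]1+": charge +1, mass shift +1.007825
print(neutral_mass(151.0390, 1, 1.007825))    # -> 150.031175

# Negative default "[M-?]1-": charge -1, mass shift -1.007825
print(neutral_mass(149.0244, -1, -1.007825))  # -> 150.032225
```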