masster 0.5.28-py3-none-any.whl → 0.6.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +715 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +26 -5
- masster/sample/sample5_schema.json +99 -1
- masster/sample/save.py +724 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/export.py +216 -65
- masster/study/id.py +59 -12
- masster/study/importers.py +384 -1
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +6 -4
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +13 -14
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/METADATA +17 -18
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/RECORD +30 -29
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/WHEEL +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/entry_points.txt +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/licenses/LICENSE +0 -0
masster/study/importers.py
CHANGED
@@ -2,12 +2,13 @@
 import.py
 
 Module providing import functionality for Study class, specifically for importing
-oracle identification data into consensus features.
+oracle and TIMA identification data into consensus features.
 """
 
 from __future__ import annotations
 
 import os
+import glob
 import pandas as pd
 import polars as pl
 
@@ -320,3 +321,385 @@ def import_oracle(
             "id_matches": len(self.id_df),
         },
     )
+
+
+def import_tima(self, folder, file="results_annotation"):
+    """
+    Import TIMA identification data and map it to consensus features.
+
+    This method reads TIMA identification results from folder/results_annotation_*.tsv
+    and creates lib_df and id_df DataFrames with detailed library and identification information.
+    It also updates consensus_df with top identification results.
+
+    Parameters:
+        folder (str): Path to TIMA folder containing results_annotation_*.tsv files
+        file (str, optional): Base name of TIMA results file (default: "results_annotation")
+
+    Returns:
+        None: Updates consensus_df, creates lib_df and id_df in-place with TIMA identification data
+
+    Raises:
+        FileNotFoundError: If the TIMA results file doesn't exist
+        ValueError: If consensus_df is empty or doesn't have required columns
+
+    Example:
+        >>> study.import_tima(folder="path/to/tima_results")
+    """
+
+    self.logger.info(f"Starting TIMA import from folder: {folder}")
+
+    # Validate inputs
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        raise ValueError("consensus_df is empty or not available. Run merge() first.")
+
+    if "consensus_id" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_id' column")
+
+    # Find TIMA file
+    tima_pattern = os.path.join(folder, f"*{file}*.tsv")
+    tima_files = glob.glob(tima_pattern)
+
+    if not tima_files:
+        raise FileNotFoundError(f"TIMA results file not found with pattern: {tima_pattern}")
+
+    tima_file_path = tima_files[0]
+    self.logger.debug(f"Loading TIMA data from: {tima_file_path}")
+
+    try:
+        # Read TIMA data using polars
+        tima_data = pl.read_csv(
+            tima_file_path,
+            separator="\t",
+            schema_overrides={
+                "feature_id": pl.Utf8,  # Read as Utf8 string
+            },
+            infer_schema_length=10000,
+        )
+        self.logger.info(f"TIMA data loaded successfully with {len(tima_data)} rows")
+    except Exception as e:
+        self.logger.error(f"Could not read {tima_file_path}: {e}")
+        raise
+
+    # Check if TIMA feature_ids match consensus_df consensus_id column
+    if "consensus_id" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_id' column")
+
+    # Compare TIMA feature_ids with consensus_df consensus_ids
+    consensus_ids = set(self.consensus_df["consensus_id"].to_list())
+    tima_ids = set(tima_data["feature_id"].to_list())
+
+    matching_ids = consensus_ids.intersection(tima_ids)
+    non_matching_ids = tima_ids - consensus_ids
+
+    if non_matching_ids:
+        self.logger.warning(
+            f"Found {len(non_matching_ids)} feature_ids in TIMA data that do not match any consensus_id in consensus_df. "
+            f"These will be filtered out. Matching features: {len(matching_ids)}/{len(tima_ids)}"
+        )
+        # Filter to only matching feature_ids
+        tima_data = tima_data.filter(pl.col("feature_id").is_in(list(consensus_ids)))
+
+    if len(tima_data) == 0:
+        self.logger.error("No TIMA feature_ids match consensus_df consensus_id values")
+        raise ValueError("No matching features found between TIMA data and consensus_df")
+
+    self.logger.debug(f"Matched {len(tima_data)} TIMA entries to consensus_df consensus_id values")
+
+    # Filter to only rows with identification data (non-empty label_compound)
+    initial_count = len(tima_data)
+    tima_data = tima_data.filter(
+        pl.col("label_compound").is_not_null() & (pl.col("label_compound").cast(pl.Utf8).str.strip_chars() != "")
+    )
+
+    self.logger.debug(f"Filtered to {len(tima_data)}/{initial_count} TIMA entries with identifications")
+
+    if len(tima_data) == 0:
+        self.logger.warning("No TIMA entries with identifications found")
+        return
+
+    # === CREATE LIB_DF ===
+    self.logger.debug("Creating lib_df from TIMA annotation data")
+    self.logger.debug(f"TIMA data shape before lib_df creation: {tima_data.shape}")
+
+    # Create unique lib_uid for each library entry
+    tima_data = tima_data.with_columns(pl.arange(0, len(tima_data)).alias("lib_uid"))
+
+    # Map TIMA columns to lib_df schema
+    lib_data = []
+    for row in tima_data.iter_rows(named=True):
+        # Extract z (charge) from adduct
+        z = None
+        adduct_str = str(row.get("adduct", ""))
+        if "+" in adduct_str:
+            z = 1
+        elif "-" in adduct_str:
+            z = -1
+
+        # Get SMILES
+        smiles = row.get("smiles_no_stereo", None)
+        if smiles is None or (isinstance(smiles, str) and smiles.strip() == ""):
+            smiles = None
+
+        # Calculate InChI from SMILES if available
+        inchi = None
+        if smiles:
+            try:
+                from rdkit import Chem
+
+                mol_rdkit = Chem.MolFromSmiles(smiles)
+                if mol_rdkit:
+                    inchi = Chem.MolToInchi(mol_rdkit)
+            except ImportError:
+                pass  # RDKit not available
+            except Exception:
+                pass
+
+        # Calculate formula from SMILES if available
+        formula = None
+        if smiles:
+            try:
+                from rdkit import Chem
+
+                mol_rdkit = Chem.MolFromSmiles(smiles)
+                if mol_rdkit:
+                    formula = Chem.rdMolDescriptors.CalcMolFormula(mol_rdkit)
+            except ImportError:
+                pass  # RDKit not available
+            except Exception:
+                pass
+
+        # Calculate mass from m/z and charge
+        m = None
+        mz_value = row.get("mz", None)
+        if mz_value is not None and z is not None:
+            try:
+                m = float(mz_value) * abs(z)
+            except (ValueError, TypeError):
+                pass
+
+        # Get class and clean NaN values
+        class_value = row.get("label_classyfire", None)
+        if class_value is None or (isinstance(class_value, str) and class_value.upper() == "NAN"):
+            class_value = None
+
+        lib_entry = {
+            "lib_uid": row["lib_uid"],
+            "cmpd_uid": row["lib_uid"],  # Use lib_uid as compound identifier
+            "source_id": None,  # Leave empty as requested
+            "name": row.get("label_compound", None),
+            "shortname": None,  # Not available in TIMA data
+            "class": class_value,
+            "smiles": smiles,
+            "inchi": inchi,
+            "inchikey": row.get("inchikey_connectivity_layer", None),
+            "formula": formula,
+            "iso": 0,  # Fixed isotope value
+            "adduct": row.get("adduct", None),
+            "probability": row.get("score", None),
+            "m": m,
+            "z": z,
+            "mz": row.get("mz", None),
+            "rt": None,  # Set to null as requested
+            "quant_group": None,
+            "db_id": None,  # Not available in TIMA data
+            "db": row.get("library", None),
+        }
+        lib_data.append(lib_entry)
+
+    self.logger.debug(f"Created {len(lib_data)} lib_data entries")
+
+    # Create lib_df as Polars DataFrame with error handling for mixed types
+    try:
+        lib_df_temp = pl.DataFrame(lib_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating lib_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        lib_df_pandas = pd.DataFrame(lib_data)
+        lib_df_temp = pl.from_pandas(lib_df_pandas)
+
+    # Ensure uniqueness by name and adduct combination
+    # Sort by lib_uid and keep first occurrence (earliest in processing order)
+    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
+
+    self.logger.info(
+        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
+    )
+
+    # === CREATE ID_DF ===
+    self.logger.debug("Creating id_df from TIMA identification matches")
+
+    # Create a mapping from consensus_id to consensus_uid
+    # TIMA data has feature_id which matches consensus_id, map to consensus_uid for id_df
+    consensus_id_to_uid_map = dict(
+        zip(self.consensus_df["consensus_id"].to_list(), self.consensus_df["consensus_uid"].to_list())
+    )
+
+    # Create identification matches
+    id_data = []
+    for row in tima_data.iter_rows(named=True):
+        # Map TIMA feature_id to consensus_df consensus_uid
+        tima_feature_id = row["feature_id"]
+        consensus_uid = consensus_id_to_uid_map.get(tima_feature_id)
+
+        if consensus_uid is None:
+            # Skip if we can't find the mapping (shouldn't happen after filtering)
+            continue
+
+        # Use error_mz for mz_delta
+        mz_delta = None
+        error_mz = row.get("error_mz", None)
+        if error_mz is not None:
+            try:
+                mz_delta = float(error_mz)
+            except (ValueError, TypeError):
+                pass
+
+        # Use error_rt for rt_delta
+        rt_delta = None
+        rt_err_value = row.get("error_rt", None)
+        if rt_err_value is not None:
+            try:
+                rt_delta = float(rt_err_value)
+            except (ValueError, TypeError):
+                pass
+
+        # Create matcher as "tima-" + library
+        matcher = "tima"  # default fallback
+        library_value = row.get("library", None)
+        if library_value is not None:
+            try:
+                library = str(library_value)
+                matcher = f"tima-{library}"
+            except (ValueError, TypeError):
+                pass
+
+        id_entry = {
+            "consensus_uid": consensus_uid,  # Use mapped consensus_uid from consensus_df
+            "lib_uid": row["lib_uid"],
+            "mz_delta": mz_delta,
+            "rt_delta": rt_delta,
+            "matcher": matcher,
+            "score": row.get("score", None),
+        }
+        id_data.append(id_entry)
+
+    # Create id_df as Polars DataFrame with explicit schema to avoid inference issues
+    # Match consensus_uid type to consensus_df
+    consensus_uid_dtype = self.consensus_df["consensus_uid"].dtype
+    id_schema = {
+        "consensus_uid": consensus_uid_dtype,  # Match the type from consensus_df
+        "lib_uid": pl.Int64,
+        "mz_delta": pl.Float64,
+        "rt_delta": pl.Float64,
+        "matcher": pl.Utf8,
+        "score": pl.Float64,
+    }
+    id_df_temp = pl.DataFrame(id_data, schema=id_schema)
+
+    # Filter id_df to only include lib_uids that exist in the final unique lib_df
+    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
+    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
+
+    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
+
+    # === UPDATE CONSENSUS_DF ===
+    self.logger.debug("Updating consensus_df with top identification results")
+
+    # tima_data is already a polars DataFrame
+    tima_pl = tima_data
+
+    # Group by feature_id and select the best identification (highest score)
+    # In case of ties, take the first one
+    best_ids = (
+        tima_pl.group_by("feature_id")
+        .agg([pl.col("score").max().alias("max_score")])
+        .join(tima_pl, on="feature_id")
+        .filter(pl.col("score") == pl.col("max_score"))
+        .group_by("feature_id")
+        .first()  # In case of ties, take the first
+    )
+
+    # Join with consensus_df to map consensus_id to consensus_uid
+    best_ids = best_ids.join(
+        self.consensus_df.select(["consensus_id", "consensus_uid"]), left_on="feature_id", right_on="consensus_id", how="left"
+    )
+
+    self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
+
+    # Prepare the identification columns
+    id_columns = {
+        "id_top_name": best_ids.select("consensus_uid", "label_compound"),
+        "id_top_adduct": best_ids.select("consensus_uid", "adduct"),
+        "id_top_class": best_ids.select("consensus_uid", "label_classyfire"),
+        "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
+    }
+
+    # Initialize identification columns in consensus_df if they don't exist
+    for col_name in id_columns.keys():
+        if col_name not in self.consensus_df.columns:
+            if col_name == "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
+            else:
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))
+
+    # Update consensus_df with TIMA identifications
+    for col_name, id_data_col in id_columns.items():
+        tima_column = id_data_col.columns[1]  # second column (after consensus_uid)
+
+        # Create update dataframe
+        update_data = id_data_col.rename({tima_column: col_name})
+
+        # Join and update
+        self.consensus_df = (
+            self.consensus_df.join(update_data, on="consensus_uid", how="left", suffix="_tima")
+            .with_columns(pl.coalesce([f"{col_name}_tima", col_name]).alias(col_name))
+            .drop(f"{col_name}_tima")
+        )
+
+    # Replace NaN values with None in identification columns
+    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
+    for col_name in id_col_names:
+        if col_name in self.consensus_df.columns:
+            # For string columns, replace empty strings and "nan" with None
+            if col_name != "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(
+                        pl.col(col_name).is_null()
+                        | (pl.col(col_name) == "")
+                        | (pl.col(col_name) == "nan")
+                        | (pl.col(col_name) == "NaN")
+                    )
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+            # For numeric columns, replace NaN with None
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+
+    # Count how many consensus features were updated
+    updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
+    total_consensus = len(self.consensus_df)
+
+    self.logger.success(
+        f"TIMA import completed. {updated_count}/{total_consensus} "
+        f"consensus features now have identifications ({updated_count / total_consensus * 100:.1f}%)"
+    )
+
+    # Update history
+    self.update_history(
+        ["import_tima"],
+        {
+            "folder": folder,
+            "file": file,
+            "updated_features": updated_count,
+            "total_features": total_consensus,
+            "lib_entries": len(self.lib_df),
+            "id_matches": len(self.id_df),
+        },
+    )
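The new import_tima entry point mirrors the existing import_oracle flow: read the TSV results, map TIMA feature_id values onto consensus_id, build lib_df and id_df, and write the top-scoring hit per feature into consensus_df. A minimal usage sketch follows; the package-root Study import and the file and folder names are assumptions for illustration, not taken from the package documentation.

# Usage sketch (assumptions: Study is importable from masster, a merged study
# exists on disk, and a TIMA output folder contains results_annotation_*.tsv
# whose feature_id column holds the study's consensus_id strings).
from masster import Study  # assumed import path

study = Study()
study.load("experiment.study5")          # hypothetical .study5 file
study.import_tima(folder="tima_output")  # hypothetical TIMA results folder

# Top-scoring annotations land in consensus_df; full matches in lib_df/id_df.
print(study.consensus_df.select(["consensus_id", "id_top_name", "id_top_adduct", "id_top_score"]))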
masster/study/load.py
CHANGED
@@ -191,17 +191,6 @@ def load(self, filename=None):
 
     _load_study5(self, filename)
 
-    # After loading the study, check if we have consensus features before loading consensus XML
-    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
-    #     consensus_xml_path = filename.replace(".study5", ".consensusXML")
-    #     if os.path.exists(consensus_xml_path):
-    #         self._load_consensusXML(filename=consensus_xml_path)
-    #         self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
-    #     else:
-    #         self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    # else:
-    #     self.logger.debug("No consensus features found, skipping consensusXML loading")
-
     self.filename = filename
 
 
masster/study/merge.py
CHANGED
@@ -441,9 +441,15 @@ def merge(study, **kwargs) -> None:
     cached_valid_adducts = None
     try:
         cached_adducts_df = study._get_adducts()
+        # Remove all adducts with wrong polarity
+        if study.polarity == "positive":
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") >= 0)
+        else:
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") <= 0)
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
+            study.logger.warning(f"No valid adducts found for polarity '{study.polarity}'")
             cached_valid_adducts = set()
     except Exception as e:
         study.logger.warning(f"Could not retrieve study adducts: {e}")
@@ -452,6 +458,13 @@ def merge(study, **kwargs) -> None:
     # Always allow '?' adducts
     cached_valid_adducts.add("?")
 
+    # Bypass for single sample case
+    if len(study.samples_df) == 1:
+        study.logger.info("Single sample detected - bypassing merge algorithm and using direct feature mapping")
+        _handle_single_sample_merge(study, cached_adducts_df, cached_valid_adducts)
+        # Skip all post-processing for single sample case
+        return
+
     # Route to algorithm implementation
     if params.method == "kd":
         consensus_map = _merge_kd(study, params)
@@ -1719,6 +1732,10 @@ def _calculate_consensus_statistics(
         mz_values: m/z values from chunk consensus features
         intensity_values: Intensity values from chunk consensus features
         quality_values: Quality values from chunk consensus features
+        number_features: Number of unique features contributing
+        number_samples: Number of unique samples contributing
+        cached_adducts_df: Cached DataFrame of valid adducts for the study
+        cached_valid_adducts: Cached set of valid adduct names for the study
 
     Returns:
         Dictionary with consensus feature metadata
@@ -3612,6 +3629,142 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     return adduct_group_list, adduct_of_list
 
 
+def _handle_single_sample_merge(study, cached_adducts_df=None, cached_valid_adducts=None):
+    """
+    Handle merge for the special case of a single sample.
+    Directly populate consensus_df from the sample's features_df without any filtering.
+
+    Args:
+        study: Study object with single sample
+        cached_adducts_df: Pre-computed adducts DataFrame (optional)
+        cached_valid_adducts: Set of valid adduct names (optional)
+    """
+    import polars as pl
+    import uuid
+
+    if len(study.samples_df) != 1:
+        raise ValueError("_handle_single_sample_merge should only be called with exactly one sample")
+
+    # Get the single sample's features
+    sample_row = study.samples_df.row(0, named=True)
+    sample_uid = sample_row["sample_uid"]
+
+    # Filter features for this sample
+    sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+    if len(sample_features) == 0:
+        study.logger.warning("No features found for single sample")
+        study.consensus_df = pl.DataFrame()
+        study.consensus_mapping_df = pl.DataFrame()
+        return
+
+    study.logger.info(f"Creating consensus from {len(sample_features)} features in single sample")
+
+    # Create consensus features directly from sample features
+    consensus_list = []
+    mapping_list = []
+
+    # Cache valid adducts
+    valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
+    valid_adducts.add("?")  # Always allow '?' adducts
+
+    for i, feature_row in enumerate(sample_features.iter_rows(named=True)):
+        # Generate unique consensus ID
+        consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
+
+        # Handle adduct information
+        adduct = feature_row.get("adduct")
+        if adduct is None or adduct not in valid_adducts:
+            # Set default adduct based on study polarity
+            study_polarity = getattr(study, "polarity", "positive")
+            if study_polarity in ["negative", "neg"]:
+                adduct = "[M-?]1-"
+                adduct_charge = -1
+                adduct_mass_shift = -1.007825
+            else:
+                adduct = "[M+?]1+"
+                adduct_charge = 1
+                adduct_mass_shift = 1.007825
+        else:
+            # Try to get charge and mass shift from cached adducts
+            adduct_charge = 1
+            adduct_mass_shift = 1.007825
+            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
+                matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct)
+                if not matching_adduct.is_empty():
+                    adduct_row = matching_adduct.row(0, named=True)
+                    adduct_charge = adduct_row["charge"]
+                    adduct_mass_shift = adduct_row["mass_shift"]
+
+        # Calculate neutral mass
+        mz = feature_row.get("mz", 0.0)
+        if adduct_charge and adduct_mass_shift is not None:
+            adduct_mass_neutral = mz * abs(adduct_charge) - adduct_mass_shift
+        else:
+            adduct_mass_neutral = None
+
+        # Count MS2 scans
+        ms2_scans = feature_row.get("ms2_scans", [])
+        ms2_count = len(ms2_scans) if ms2_scans else 0
+
+        # Create consensus feature metadata
+        consensus_feature = {
+            "consensus_uid": i,
+            "consensus_id": consensus_id_str,
+            "quality": feature_row.get("quality", 1.0),
+            "number_samples": 1,  # Always 1 for single sample
+            "rt": feature_row.get("rt", 0.0),
+            "mz": mz,
+            "rt_min": feature_row.get("rt", 0.0),
+            "rt_max": feature_row.get("rt", 0.0),
+            "rt_mean": feature_row.get("rt", 0.0),
+            "rt_start_mean": feature_row.get("rt_start", 0.0),
+            "rt_end_mean": feature_row.get("rt_end", 0.0),
+            "rt_delta_mean": feature_row.get("rt_delta", 0.0),
+            "mz_min": mz,
+            "mz_max": mz,
+            "mz_mean": mz,
+            "mz_start_mean": feature_row.get("mz_start", 0.0),
+            "mz_end_mean": feature_row.get("mz_end", 0.0),
+            "inty_mean": feature_row.get("inty", 0.0),
+            "bl": -1.0,
+            "chrom_coherence_mean": feature_row.get("chrom_coherence", 0.0),
+            "chrom_prominence_mean": feature_row.get("chrom_prominence", 0.0),
+            "chrom_prominence_scaled_mean": feature_row.get("chrom_prominence_scaled", 0.0),
+            "chrom_height_scaled_mean": feature_row.get("chrom_height_scaled", 0.0),
+            "iso": None,  # Will be filled by find_iso() function
+            "iso_mean": feature_row.get("iso", 0.0),
+            "charge_mean": feature_row.get("charge", 0.0),
+            "number_ms2": ms2_count,
+            "adducts": [[adduct, 1, 100.0]],  # Single adduct with 100% frequency
+            "adduct_top": adduct,
+            "adduct_charge_top": adduct_charge,
+            "adduct_mass_neutral_top": adduct_mass_neutral,
+            "adduct_mass_shift_top": adduct_mass_shift,
+            "id_top_name": None,
+            "id_top_class": None,
+            "id_top_adduct": None,
+            "id_top_score": None,
+            "id_source": None,
+        }
+
+        consensus_list.append(consensus_feature)
+
+        # Create mapping entry
+        mapping_entry = {
+            "consensus_uid": i,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_row.get("feature_uid"),
+        }
+        mapping_list.append(mapping_entry)
+
+    # Create DataFrames
+    study.consensus_df = pl.DataFrame(consensus_list, strict=False)
+    study.consensus_mapping_df = pl.DataFrame(mapping_list, strict=False)
+
+    study.logger.info(f"Created {len(consensus_list)} consensus features from single sample")
+
+
 def _fast_correlation(x, y):
     """
     Fast correlation coefficient calculation for consensus matrix data.