masster 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. The information in this diff is provided for informational purposes only.

Potentially problematic release.


This version of masster might be problematic; consult the package registry listing for more details.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.6.1"
4
+ __version__ = "0.6.2"
5
5
 
6
6
 
7
7
  def get_version():
@@ -314,3 +314,402 @@ def import_oracle(
314
314
  "id_matches": len(self.id_df),
315
315
  },
316
316
  )
317
+
318
+
319
+ def import_tima(
320
+ self,
321
+ folder,
322
+ file="mini",
323
+ ):
324
+ """
325
+ Import TIMA identification data and map it to features.
326
+
327
+ This method reads TIMA identification results from folder/*results_{file}.tsv
328
+ and creates lib_df and id_df DataFrames with detailed library and identification information.
329
+ It also updates features_df with top identification results.
330
+
331
+ Parameters:
332
+ folder (str): Path to folder containing TIMA results TSV file
333
+ file (str): File suffix to search for (default: "mini")
334
+
335
+ Returns:
336
+ None: Updates features_df, creates lib_df and id_df in-place with TIMA identification data
337
+
338
+ Raises:
339
+ FileNotFoundError: If the TIMA results file doesn't exist
340
+ ValueError: If features_df is empty or doesn't have required columns
341
+
342
+ Example:
343
+ >>> sample.import_tima(
344
+ ... folder="path/to/tima_results",
345
+ ... file="mini"
346
+ ... )
347
+ """
348
+
349
+ self.logger.info(f"Starting TIMA import from folder: {folder}")
350
+
351
+ # Validate inputs
352
+ if self.features_df is None or self.features_df.is_empty():
353
+ raise ValueError("features_df is empty or not available. Run find_features() first.")
354
+
355
+ if "feature_uid" not in self.features_df.columns:
356
+ raise ValueError("features_df must contain 'feature_uid' column")
357
+
358
+ # Find TIMA file
359
+ import glob
360
+
361
+ tima_pattern = os.path.join(folder, f"*results_{file}.tsv")
362
+ tima_files = glob.glob(tima_pattern)
363
+
364
+ if not tima_files:
365
+ raise FileNotFoundError(f"TIMA results file not found with pattern: {tima_pattern}")
366
+
367
+ tima_file_path = tima_files[0]
368
+ self.logger.debug(f"Loading TIMA data from: {tima_file_path}")
369
+
370
+ try:
371
+ # Read TIMA data using polars
372
+ tima_data = pl.read_csv(
373
+ tima_file_path,
374
+ separator="\t",
375
+ schema_overrides={
376
+ "feature_id": pl.Utf8, # Read as Utf8 string
377
+ },
378
+ infer_schema_length=10000
379
+ )
380
+ self.logger.info(f"TIMA data loaded successfully with {len(tima_data)} rows")
381
+ except Exception as e:
382
+ self.logger.error(f"Could not read {tima_file_path}: {e}")
383
+ raise
384
+
385
+ # Check if TIMA feature_ids match features_df feature_id column
386
+ if "feature_id" not in self.features_df.columns:
387
+ raise ValueError("features_df must contain 'feature_id' column")
388
+
389
+ # Compare TIMA feature_ids with features_df feature_ids
390
+ features_ids = set(self.features_df["feature_id"].to_list())
391
+ tima_ids = set(tima_data["feature_id"].to_list())
392
+
393
+ matching_ids = features_ids.intersection(tima_ids)
394
+ non_matching_ids = tima_ids - features_ids
395
+
396
+ if non_matching_ids:
397
+ self.logger.warning(
398
+ f"Found {len(non_matching_ids)} feature_ids in TIMA data that do not match any feature_id in features_df. "
399
+ f"These will be filtered out. Matching features: {len(matching_ids)}/{len(tima_ids)}"
400
+ )
401
+ # Filter to only matching feature_ids
402
+ tima_data = tima_data.filter(pl.col("feature_id").is_in(list(features_ids)))
403
+
404
+ if len(tima_data) == 0:
405
+ self.logger.error("No TIMA feature_ids match features_df feature_id values")
406
+ raise ValueError("No matching features found between TIMA data and features_df")
407
+
408
+ self.logger.debug(f"Matched {len(tima_data)} TIMA entries to features_df feature_id values")
409
+
410
+ # Filter to only rows with identification data (non-empty label_compound)
411
+ initial_count = len(tima_data)
412
+ tima_data = tima_data.filter(
413
+ pl.col("label_compound").is_not_null() &
414
+ (pl.col("label_compound").cast(pl.Utf8).str.strip_chars() != "")
415
+ )
416
+
417
+ self.logger.debug(f"Filtered to {len(tima_data)}/{initial_count} TIMA entries with identifications")
418
+
419
+ if len(tima_data) == 0:
420
+ self.logger.warning("No TIMA entries with identifications found")
421
+ return
422
+
423
+ # === CREATE LIB_DF ===
424
+ self.logger.debug("Creating lib_df from TIMA annotation data")
425
+ self.logger.debug(f"TIMA data shape before lib_df creation: {tima_data.shape}")
426
+
427
+ # Create unique lib_uid for each library entry
428
+ tima_data = tima_data.with_columns(
429
+ pl.arange(0, len(tima_data)).alias("lib_uid")
430
+ )
431
+
432
+ # Map TIMA columns to lib_df schema
433
+ lib_data = []
434
+ for row in tima_data.iter_rows(named=True):
435
+ # Extract z (charge) from adduct
436
+ z = None
437
+ adduct_str = str(row.get("adduct", ""))
438
+ if "+" in adduct_str:
439
+ z = 1
440
+ elif "-" in adduct_str:
441
+ z = -1
442
+
443
+ # Get SMILES
444
+ smiles = row.get("smiles_no_stereo", None)
445
+ if smiles is None or (isinstance(smiles, str) and smiles.strip() == ""):
446
+ smiles = None
447
+
448
+ # Calculate InChI from SMILES if available
449
+ inchi = None
450
+ if smiles:
451
+ try:
452
+ # Try to get InChI from SMILES using RDKit if available
453
+ try:
454
+ from rdkit import Chem
455
+ mol_rdkit = Chem.MolFromSmiles(smiles)
456
+ if mol_rdkit:
457
+ inchi = Chem.MolToInchi(mol_rdkit)
458
+ except ImportError:
459
+ pass # RDKit not available
460
+ except Exception:
461
+ pass
462
+
463
+ # Calculate formula from SMILES if available
464
+ formula = None
465
+ if smiles:
466
+ try:
467
+ from rdkit import Chem
468
+ mol_rdkit = Chem.MolFromSmiles(smiles)
469
+ if mol_rdkit:
470
+ formula = Chem.rdMolDescriptors.CalcMolFormula(mol_rdkit)
471
+ except ImportError:
472
+ pass # RDKit not available
473
+ except Exception:
474
+ pass
475
+
476
+ # Calculate mass from m/z and charge
477
+ m = None
478
+ mz_value = row.get("mz", None)
479
+ if mz_value is not None and z is not None:
480
+ try:
481
+ m = float(mz_value) * abs(z)
482
+ except (ValueError, TypeError):
483
+ pass
484
+
485
+ # Get class and clean NaN values
486
+ class_value = row.get("label_classyfire", None)
487
+ if class_value is None or (isinstance(class_value, str) and class_value.upper() == "NAN"):
488
+ class_value = None
489
+
490
+ lib_entry = {
491
+ "lib_uid": row["lib_uid"],
492
+ "cmpd_uid": row["lib_uid"], # Use lib_uid as compound identifier
493
+ "source_id": None, # Leave empty as requested
494
+ "name": row.get("label_compound", None),
495
+ "shortname": None, # Not available in TIMA data
496
+ "class": class_value,
497
+ "smiles": smiles,
498
+ "inchi": inchi,
499
+ "inchikey": row.get("inchikey_connectivity_layer", None),
500
+ "formula": formula,
501
+ "iso": 0, # Fixed isotope value
502
+ "adduct": row.get("adduct", None),
503
+ "probability": row.get("score", None),
504
+ "m": m,
505
+ "z": z,
506
+ "mz": row.get("mz", None),
507
+ "rt": None, # Set to null as requested
508
+ "quant_group": None,
509
+ "db_id": None, # Not available in TIMA data
510
+ "db": row.get("library", None),
511
+ }
512
+ lib_data.append(lib_entry)
513
+
514
+ self.logger.debug(f"Created {len(lib_data)} lib_data entries")
515
+
516
+ # Create lib_df as Polars DataFrame with error handling for mixed types
517
+ try:
518
+ lib_df_temp = pl.DataFrame(lib_data)
519
+ except Exception as e:
520
+ self.logger.warning(f"Error creating lib_df with polars: {e}")
521
+ # Fallback: convert to pandas first, then to polars
522
+ lib_df_pandas = pd.DataFrame(lib_data)
523
+ lib_df_temp = pl.from_pandas(lib_df_pandas)
524
+
525
+ # Ensure uniqueness by name and adduct combination
526
+ # Sort by lib_uid and keep first occurrence (earliest in processing order)
527
+ self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
528
+
529
+ self.logger.info(
530
+ f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
531
+ )
532
+
533
+ # === CREATE ID_DF ===
534
+ self.logger.debug("Creating id_df from TIMA identification matches")
535
+
536
+ # Create a mapping from feature_id to feature_uid
537
+ # TIMA data has feature_id which must be mapped to features_df feature_uid for id_df
538
+ feature_id_to_uid_map = dict(zip(
539
+ self.features_df["feature_id"].to_list(),
540
+ self.features_df["feature_uid"].to_list()
541
+ ))
542
+
543
+ # Create identification matches
544
+ id_data = []
545
+ for row in tima_data.iter_rows(named=True):
546
+ # Map TIMA feature_id to features_df feature_uid
547
+ tima_feature_id = row["feature_id"]
548
+ feature_uid = feature_id_to_uid_map.get(tima_feature_id)
549
+
550
+ if feature_uid is None:
551
+ # Skip if we can't find the mapping (shouldn't happen after filtering)
552
+ continue
553
+
554
+ # Use error_mz for mz_delta
555
+ mz_delta = None
556
+ error_mz = row.get("error_mz", None)
557
+ if error_mz is not None:
558
+ try:
559
+ mz_delta = float(error_mz)
560
+ except (ValueError, TypeError):
561
+ pass
562
+
563
+ # Use error_rt for rt_delta
564
+ rt_delta = None
565
+ rt_err_value = row.get("error_rt", None)
566
+ if rt_err_value is not None:
567
+ try:
568
+ rt_delta = float(rt_err_value)
569
+ except (ValueError, TypeError):
570
+ pass
571
+
572
+ # Create matcher as "tima-" + library
573
+ matcher = "tima" # default fallback
574
+ library_value = row.get("library", None)
575
+ if library_value is not None:
576
+ try:
577
+ library = str(library_value)
578
+ matcher = f"tima-{library}"
579
+ except (ValueError, TypeError):
580
+ pass
581
+
582
+ id_entry = {
583
+ "feature_uid": feature_uid, # Use mapped feature_uid from features_df
584
+ "lib_uid": row["lib_uid"],
585
+ "mz_delta": mz_delta,
586
+ "rt_delta": rt_delta,
587
+ "matcher": matcher,
588
+ "score": row.get("score", None),
589
+ "iso": 0, # Fixed isotope value for TIMA imports
590
+ }
591
+ id_data.append(id_entry)
592
+
593
+ # Create id_df as Polars DataFrame with explicit schema to avoid inference issues
594
+ # Match feature_uid type to features_df
595
+ feature_uid_dtype = self.features_df["feature_uid"].dtype
596
+ id_schema = {
597
+ "feature_uid": feature_uid_dtype, # Match the type from features_df
598
+ "lib_uid": pl.Int64,
599
+ "mz_delta": pl.Float64,
600
+ "rt_delta": pl.Float64,
601
+ "matcher": pl.Utf8,
602
+ "score": pl.Float64,
603
+ "iso": pl.Int64,
604
+ }
605
+ id_df_temp = pl.DataFrame(id_data, schema=id_schema)
606
+
607
+ # Filter id_df to only include lib_uids that exist in the final unique lib_df
608
+ unique_lib_uids = self.lib_df.select("lib_uid").to_series()
609
+ self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
610
+
611
+ self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
612
+
613
+ # === UPDATE FEATURES_DF ===
614
+ self.logger.debug("Updating features_df with top identification results")
615
+
616
+ # tima_data is already a polars DataFrame
617
+ tima_pl = tima_data
618
+
619
+ # Group by feature_id and select the best identification (highest score)
620
+ # In case of ties, take the first one
621
+ best_ids = (
622
+ tima_pl.group_by("feature_id")
623
+ .agg([pl.col("score").max().alias("max_score")])
624
+ .join(tima_pl, on="feature_id")
625
+ .filter(pl.col("score") == pl.col("max_score"))
626
+ .group_by("feature_id")
627
+ .first() # In case of ties, take the first
628
+ )
629
+
630
+ # Join with features_df to map feature_id to feature_uid
631
+ best_ids = best_ids.join(
632
+ self.features_df.select(["feature_id", "feature_uid"]),
633
+ on="feature_id",
634
+ how="left"
635
+ )
636
+
637
+ self.logger.debug(f"Selected best identifications for {len(best_ids)} features")
638
+
639
+ # Prepare the identification columns
640
+ id_columns = {
641
+ "id_top_name": best_ids.select("feature_uid", "label_compound"),
642
+ "id_top_adduct": best_ids.select("feature_uid", "adduct"),
643
+ "id_top_class": best_ids.select("feature_uid", "label_classyfire"),
644
+ "id_top_score": best_ids.select("feature_uid", pl.col("score").round(3).alias("score")),
645
+ }
646
+
647
+ # Initialize identification columns in features_df if they don't exist
648
+ for col_name in id_columns.keys():
649
+ if col_name not in self.features_df.columns:
650
+ if col_name == "id_top_score":
651
+ self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
652
+ else:
653
+ self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))
654
+
655
+ # Update features_df with TIMA identifications
656
+ for col_name, id_data_col in id_columns.items():
657
+ tima_column = id_data_col.columns[1] # second column (after feature_uid)
658
+
659
+ # Create update dataframe
660
+ update_data = id_data_col.rename({tima_column: col_name})
661
+
662
+ # Join and update
663
+ self.features_df = (
664
+ self.features_df.join(update_data, on="feature_uid", how="left", suffix="_tima")
665
+ .with_columns(pl.coalesce([f"{col_name}_tima", col_name]).alias(col_name))
666
+ .drop(f"{col_name}_tima")
667
+ )
668
+
669
+ # Replace NaN values with None in identification columns
670
+ id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
671
+ for col_name in id_col_names:
672
+ if col_name in self.features_df.columns:
673
+ # For string columns, replace empty strings and "nan" with None
674
+ if col_name != "id_top_score":
675
+ self.features_df = self.features_df.with_columns(
676
+ pl.when(
677
+ pl.col(col_name).is_null()
678
+ | (pl.col(col_name) == "")
679
+ | (pl.col(col_name) == "nan")
680
+ | (pl.col(col_name) == "NaN")
681
+ )
682
+ .then(None)
683
+ .otherwise(pl.col(col_name))
684
+ .alias(col_name)
685
+ )
686
+ # For numeric columns, replace NaN with None
687
+ else:
688
+ self.features_df = self.features_df.with_columns(
689
+ pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
690
+ .then(None)
691
+ .otherwise(pl.col(col_name))
692
+ .alias(col_name)
693
+ )
694
+
695
+ # Count how many features were updated
696
+ updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
697
+ total_features = len(self.features_df)
698
+
699
+ self.logger.success(
700
+ f"TIMA import completed. {updated_count}/{total_features} "
701
+ f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
702
+ )
703
+
704
+ # Update history
705
+ self.store_history(
706
+ ["import_tima"],
707
+ {
708
+ "folder": folder,
709
+ "file": file,
710
+ "updated_features": updated_count,
711
+ "total_features": total_features,
712
+ "lib_entries": len(self.lib_df),
713
+ "id_matches": len(self.id_df),
714
+ },
715
+ )
masster/sample/sample.py CHANGED
@@ -135,6 +135,7 @@ from masster.sample.id import get_id
135
135
  from masster.sample.id import id_reset
136
136
  from masster.sample.id import lib_reset
137
137
  from masster.sample.importers import import_oracle
138
+ from masster.sample.importers import import_tima
138
139
  from masster.sample.load import chrom_extract
139
140
  from masster.sample.load import _index_file
140
141
  from masster.sample.load import load
@@ -170,7 +171,9 @@ from masster.sample.save import export_chrom
170
171
  from masster.sample.save import export_dda_stats
171
172
  from masster.sample.save import export_features
172
173
  from masster.sample.save import export_mgf
173
- from masster.sample.save import export_xlsx
174
+ from masster.sample.save import export_excel
175
+ from masster.sample.save import export_slaw
176
+ from masster.sample.save import export_mztab
174
177
  from masster.sample.save import save
175
178
 
176
179
 
@@ -307,11 +310,14 @@ class Sample:
307
310
  lib_reset = lib_reset
308
311
  # Importers from importers.py
309
312
  import_oracle = import_oracle
313
+ import_tima = import_tima
310
314
  export_features = export_features
311
- export_xlsx = export_xlsx
315
+ export_excel = export_excel
316
+ export_slaw = export_slaw
312
317
  export_mgf = export_mgf
313
318
  export_chrom = export_chrom
314
319
  export_dda_stats = export_dda_stats
320
+ export_mztab = export_mztab
315
321
  plot_2d = plot_2d
316
322
  plot_2d_oracle = plot_2d_oracle
317
323
  plot_dda_stats = plot_dda_stats