masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

@@ -0,0 +1,715 @@
"""
importers.py

Module providing import functionality for the Sample class, specifically for
importing external identification results (oracle and TIMA) into features.
"""

from __future__ import annotations

import glob
import os

import pandas as pd
import polars as pl


def import_oracle(
    self,
    folder,
    min_id_level=None,
    max_id_level=None,
):
    """
    Import oracle identification data and map it to features.

    This method reads oracle identification results from folder/diag/annotation_full.csv
    and creates lib_df and id_df DataFrames with detailed library and identification
    information. It also updates features_df with the top identification results.

    Parameters:
        folder (str): Path to oracle folder containing diag/annotation_full.csv
        min_id_level (int, optional): Minimum identification level to include
        max_id_level (int, optional): Maximum identification level to include

    Returns:
        None: Updates features_df and creates lib_df and id_df in place with
        oracle identification data

    Raises:
        FileNotFoundError: If the oracle annotation file doesn't exist
        ValueError: If features_df is empty or doesn't have required columns

    Example:
        >>> sample.import_oracle(
        ...     folder="path/to/oracle_results",
        ...     min_id_level=2,
        ...     max_id_level=4
        ... )
    """

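    # Columns consumed from annotation_full.csv below: scan_title, level, name,
    # species, hg, formula, ion, score, mz, dmz, rt_err, score_metric, lib_id,
    # and lib.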
    self.logger.info(f"Starting oracle import from folder: {folder}")

    # Validate inputs
    if self.features_df is None or self.features_df.is_empty():
        raise ValueError("features_df is empty or not available. Run find_features() first.")

    if "feature_uid" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_uid' column")

    # Check if the oracle file exists
    oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
    if not os.path.exists(oracle_file_path):
        raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")

    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")

    try:
        # Read oracle data using pandas first for easier processing
        oracle_data = pd.read_csv(oracle_file_path)
        self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")
    except Exception as e:
        self.logger.error(f"Could not read {oracle_file_path}: {e}")
        raise

    # Extract feature_uid from scan_title column (format: "uid:XYZ, ...")
    self.logger.debug("Extracting feature UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
    oracle_data["feature_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
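    # Example: a scan_title of "uid:1234, ..." yields feature_uid 1234; rows
    # whose scan_title does not match the pattern get a null feature_uid and
    # are dropped below.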

    # Remove rows where feature_uid extraction failed
    initial_count = len(oracle_data)
    oracle_data = oracle_data.dropna(subset=["feature_uid"])
    oracle_data["feature_uid"] = oracle_data["feature_uid"].astype(int)

    self.logger.debug(f"Extracted feature UIDs for {len(oracle_data)}/{initial_count} oracle entries")

    # Apply id_level filters if specified
    if min_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")

    if max_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")

    if len(oracle_data) == 0:
        self.logger.warning("No oracle entries remain after filtering")
        return

    # === CREATE LIB_DF ===
    self.logger.debug("Creating lib_df from Oracle annotation data")
    self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")

    # Create a unique lib_uid for each library entry
    oracle_data["lib_uid"] = range(len(oracle_data))

    # Map Oracle columns to the lib_df schema
    lib_data = []
    for _, row in oracle_data.iterrows():
        # Convert cmpd_uid to integer, using lib_uid as fallback
        cmpd_uid = row["lib_uid"]  # Use lib_uid as integer compound identifier
        try:
            if row.get("lib_id") is not None:
                cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
        except (ValueError, TypeError):
            pass  # Keep lib_uid as fallback

        # Guard against missing/NaN ion values before the charge heuristic
        ion_value = row.get("ion")
        ion_str = "" if pd.isna(ion_value) else str(ion_value)

        lib_entry = {
            "lib_uid": row["lib_uid"],
            "cmpd_uid": cmpd_uid,  # Integer compound identifier
            "source_id": "LipidOracle",  # Fixed source identifier
            "name": row.get("name", None),
            "shortname": row.get("species", None),
            "class": row.get("hg", None),
            "smiles": None,  # Not available in Oracle data
            "inchi": None,  # Not available in Oracle data
            "inchikey": None,  # Not available in Oracle data
            "formula": row.get("formula", None),
            "iso": 0,  # Fixed isotope value
            "adduct": row.get("ion", None),
            "probability": row.get("score", None),
            "m": None,  # Would need to calculate from formula
            "z": 1 if "+" in ion_str else (-1 if "-" in ion_str else None),
            "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
            "rt": None,  # Set to null as requested
            "quant_group": None,  # Set to null as requested
            "db_id": row.get("lib_id", None),
            "db": row.get("lib", None),
        }
        lib_data.append(lib_entry)

    self.logger.debug(f"Created {len(lib_data)} lib_data entries")

    # Create lib_df as a Polars DataFrame with error handling for mixed types
    try:
        lib_df_temp = pl.DataFrame(lib_data)
    except Exception as e:
        self.logger.warning(f"Error creating lib_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        lib_df_pandas = pd.DataFrame(lib_data)
        lib_df_temp = pl.from_pandas(lib_df_pandas)

    # Ensure uniqueness by name and adduct combination
    # Sort by lib_uid and keep the first occurrence (earliest in processing order)
    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
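    # polars' unique(keep="first") retains the first row per (name, adduct)
    # combination in the current row order, which the preceding sort pins to
    # the smallest lib_uid.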

    self.logger.info(
        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
    )

    # === CREATE ID_DF ===
    self.logger.debug("Creating id_df from Oracle identification matches")

    # Create identification matches
    id_data = []
    for _, row in oracle_data.iterrows():
        # Use dmz from annotation_full.csv directly for mz_delta, None if NaN
        mz_delta = None
        dmz_value = row.get("dmz")
        if dmz_value is not None and not pd.isna(dmz_value):
            try:
                mz_delta = float(dmz_value)
            except (ValueError, TypeError):
                pass

        # Use rt_err from annotation_full.csv for rt_delta, None if NaN
        rt_delta = None
        rt_err_value = row.get("rt_err")
        if rt_err_value is not None and not (isinstance(rt_err_value, float) and pd.isna(rt_err_value)):
            try:
                rt_delta = float(rt_err_value)
            except (ValueError, TypeError):
                pass

        # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
        matcher = "lipidoracle"  # default fallback
        if row.get("score_metric") is not None:
            try:
                score_metric = str(row["score_metric"])
                matcher = f"lipidoracle-{score_metric}"
            except (ValueError, TypeError):
                pass

        id_entry = {
            "feature_uid": row["feature_uid"],
            "lib_uid": row["lib_uid"],
            "mz_delta": mz_delta,
            "rt_delta": rt_delta,
            "matcher": matcher,
            "score": row.get("score", None),
            "iso": 0,  # Fixed isotope value for oracle imports
        }
        id_data.append(id_entry)

    # Create id_df as a Polars DataFrame with error handling
    try:
        id_df_temp = pl.DataFrame(id_data)
    except Exception as e:
        self.logger.warning(f"Error creating id_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        id_df_pandas = pd.DataFrame(id_data)
        id_df_temp = pl.from_pandas(id_df_pandas)

    # Filter id_df to only include lib_uids that exist in the final unique lib_df
    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
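    # Note: matches that pointed at a deduplicated (name, adduct) entry are
    # dropped here rather than remapped to the surviving lib_uid.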

    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")

    # === UPDATE FEATURES_DF (adapted from consensus functionality) ===
    self.logger.debug("Updating features_df with top identification results")

    # Convert to polars for efficient joining, with error handling
    try:
        oracle_pl = pl.DataFrame(oracle_data)
    except Exception as e:
        self.logger.warning(f"Error converting oracle_data to polars: {e}")
        # Convert using from_pandas properly
        oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))

    # Group by feature_uid and select the best identification (highest level)
    # In case of ties, take the first one
    best_ids = (
        oracle_pl.group_by("feature_uid")
        .agg([pl.col("level").max().alias("max_level")])
        .join(oracle_pl, on="feature_uid")
        .filter(pl.col("level") == pl.col("max_level"))
        .group_by("feature_uid")
        .first()  # In case of ties, take the first
    )
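    # A shorter, roughly equivalent sketch (assuming a stable sort):
    #     best_ids = (
    #         oracle_pl.sort("level", descending=True)
    #         .group_by("feature_uid", maintain_order=True)
    #         .first()
    #     )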

    self.logger.debug(f"Selected best identifications for {len(best_ids)} features")

    # Prepare the identification columns
    id_columns = {
        "id_top_name": best_ids.select("feature_uid", "name"),
        "id_top_adduct": best_ids.select("feature_uid", "ion"),
        "id_top_class": best_ids.select("feature_uid", "hg"),
        "id_top_score": best_ids.select("feature_uid", pl.col("score").round(3).alias("score")),
    }

    # Initialize identification columns in features_df if they don't exist
    for col_name in id_columns.keys():
        if col_name not in self.features_df.columns:
            if col_name == "id_top_score":
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
            else:
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))

    # Update features_df with oracle identifications
    for col_name, id_data_col in id_columns.items():
        oracle_column = id_data_col.columns[1]  # second column (after feature_uid)

        # Create the update dataframe
        update_data = id_data_col.rename({oracle_column: col_name})

        # Join and update
        self.features_df = (
            self.features_df.join(update_data, on="feature_uid", how="left", suffix="_oracle")
            .with_columns(pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name))
            .drop(f"{col_name}_oracle")
        )

    # Replace NaN values with None in identification columns
    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
    for col_name in id_col_names:
        if col_name in self.features_df.columns:
            # For string columns, replace empty strings and "nan" with None
            if col_name != "id_top_score":
                self.features_df = self.features_df.with_columns(
                    pl.when(
                        pl.col(col_name).is_null()
                        | (pl.col(col_name) == "")
                        | (pl.col(col_name) == "nan")
                        | (pl.col(col_name) == "NaN")
                    )
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )
            # For numeric columns, replace NaN with None
            else:
                self.features_df = self.features_df.with_columns(
                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )

    # Count how many features were updated
    updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
    total_features = len(self.features_df)

    self.logger.success(
        f"LipidOracle import completed. {updated_count}/{total_features} "
        f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
    )

    # Update history
    self.store_history(
        ["import_oracle"],
        {
            "folder": folder,
            "min_id_level": min_id_level,
            "max_id_level": max_id_level,
            "updated_features": updated_count,
            "total_features": total_features,
            "lib_entries": len(self.lib_df),
            "id_matches": len(self.id_df),
        },
    )


def import_tima(
    self,
    folder,
    file="mini",
):
    """
    Import TIMA identification data and map it to features.

    This method reads TIMA identification results from folder/*results_{file}.tsv
    and creates lib_df and id_df DataFrames with detailed library and identification
    information. It also updates features_df with the top identification results.

    Parameters:
        folder (str): Path to folder containing the TIMA results TSV file
        file (str): File suffix to search for (default: "mini")

    Returns:
        None: Updates features_df and creates lib_df and id_df in place with
        TIMA identification data

    Raises:
        FileNotFoundError: If the TIMA results file doesn't exist
        ValueError: If features_df is empty or doesn't have required columns

    Example:
        >>> sample.import_tima(
        ...     folder="path/to/tima_results",
        ...     file="mini"
        ... )
    """

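    # Columns consumed from the results TSV below: feature_id, label_compound,
    # adduct, label_classyfire, score, smiles_no_stereo,
    # inchikey_connectivity_layer, library, error_mz, error_rt, and mz.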
    self.logger.info(f"Starting TIMA import from folder: {folder}")

    # Validate inputs
    if self.features_df is None or self.features_df.is_empty():
        raise ValueError("features_df is empty or not available. Run find_features() first.")

    if "feature_uid" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_uid' column")

    # Find the TIMA results file
    tima_pattern = os.path.join(folder, f"*results_{file}.tsv")
    tima_files = glob.glob(tima_pattern)

    if not tima_files:
        raise FileNotFoundError(f"TIMA results file not found with pattern: {tima_pattern}")

    tima_file_path = tima_files[0]
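    # If several files match the pattern, the first glob hit is used; glob
    # returns paths in arbitrary (filesystem-dependent) order, so sorting
    # tima_files first would make the choice deterministic.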
    self.logger.debug(f"Loading TIMA data from: {tima_file_path}")

    try:
        # Read TIMA data using polars
        tima_data = pl.read_csv(
            tima_file_path,
            separator="\t",
            schema_overrides={
                "feature_id": pl.Utf8,  # Read as Utf8 string
            },
            infer_schema_length=10000,
        )
        self.logger.info(f"TIMA data loaded successfully with {len(tima_data)} rows")
    except Exception as e:
        self.logger.error(f"Could not read {tima_file_path}: {e}")
        raise

    # Check that TIMA feature_ids can be matched against features_df
    if "feature_id" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_id' column")

    # Compare TIMA feature_ids with features_df feature_ids
    features_ids = set(self.features_df["feature_id"].to_list())
    tima_ids = set(tima_data["feature_id"].to_list())

    matching_ids = features_ids.intersection(tima_ids)
    non_matching_ids = tima_ids - features_ids

    if non_matching_ids:
        self.logger.warning(
            f"Found {len(non_matching_ids)} feature_ids in TIMA data that do not match any feature_id in features_df. "
            f"These will be filtered out. Matching features: {len(matching_ids)}/{len(tima_ids)}"
        )
        # Filter to only matching feature_ids
        tima_data = tima_data.filter(pl.col("feature_id").is_in(list(features_ids)))

    if len(tima_data) == 0:
        self.logger.error("No TIMA feature_ids match features_df feature_id values")
        raise ValueError("No matching features found between TIMA data and features_df")

    self.logger.debug(f"Matched {len(tima_data)} TIMA entries to features_df feature_id values")

    # Filter to only rows with identification data (non-empty label_compound)
    initial_count = len(tima_data)
    tima_data = tima_data.filter(
        pl.col("label_compound").is_not_null()
        & (pl.col("label_compound").cast(pl.Utf8).str.strip_chars() != "")
    )

    self.logger.debug(f"Filtered to {len(tima_data)}/{initial_count} TIMA entries with identifications")

    if len(tima_data) == 0:
        self.logger.warning("No TIMA entries with identifications found")
        return

    # === CREATE LIB_DF ===
    self.logger.debug("Creating lib_df from TIMA annotation data")
    self.logger.debug(f"TIMA data shape before lib_df creation: {tima_data.shape}")

    # Create a unique lib_uid for each library entry
    tima_data = tima_data.with_columns(
        pl.arange(0, len(tima_data)).alias("lib_uid")
    )

    # Map TIMA columns to the lib_df schema
    lib_data = []
    for row in tima_data.iter_rows(named=True):
        # Extract z (charge) from the adduct string
        z = None
        adduct_str = str(row.get("adduct", ""))
        if "+" in adduct_str:
            z = 1
        elif "-" in adduct_str:
            z = -1
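        # e.g. "[M+H]+" -> z = 1 and "[M-H]-" -> z = -1; "+" is checked first,
        # so an adduct containing both characters maps to z = 1.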

        # Get SMILES
        smiles = row.get("smiles_no_stereo", None)
        if smiles is None or (isinstance(smiles, str) and smiles.strip() == ""):
            smiles = None

        # Calculate InChI from SMILES if available
        inchi = None
        if smiles:
            try:
                # Try to get InChI from SMILES using RDKit if available
                try:
                    from rdkit import Chem

                    mol_rdkit = Chem.MolFromSmiles(smiles)
                    if mol_rdkit:
                        inchi = Chem.MolToInchi(mol_rdkit)
                except ImportError:
                    pass  # RDKit not available
            except Exception:
                pass

        # Calculate formula from SMILES if available
        formula = None
        if smiles:
            try:
                from rdkit import Chem

                mol_rdkit = Chem.MolFromSmiles(smiles)
                if mol_rdkit:
                    formula = Chem.rdMolDescriptors.CalcMolFormula(mol_rdkit)
            except ImportError:
                pass  # RDKit not available
            except Exception:
                pass

        # Calculate mass from m/z and charge
        m = None
        mz_value = row.get("mz", None)
        if mz_value is not None and z is not None:
            try:
                m = float(mz_value) * abs(z)
            except (ValueError, TypeError):
                pass
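        # Approximation: the neutral mass is taken as mz * |z|, ignoring
        # adduct and electron masses.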

        # Get class and clean NaN values
        class_value = row.get("label_classyfire", None)
        if class_value is None or (isinstance(class_value, str) and class_value.upper() == "NAN"):
            class_value = None

        lib_entry = {
            "lib_uid": row["lib_uid"],
            "cmpd_uid": row["lib_uid"],  # Use lib_uid as compound identifier
            "source_id": None,  # Leave empty as requested
            "name": row.get("label_compound", None),
            "shortname": None,  # Not available in TIMA data
            "class": class_value,
            "smiles": smiles,
            "inchi": inchi,
            "inchikey": row.get("inchikey_connectivity_layer", None),
            "formula": formula,
            "iso": 0,  # Fixed isotope value
            "adduct": row.get("adduct", None),
            "probability": row.get("score", None),
            "m": m,
            "z": z,
            "mz": row.get("mz", None),
            "rt": None,  # Set to null as requested
            "quant_group": None,
            "db_id": None,  # Not available in TIMA data
            "db": row.get("library", None),
        }
        lib_data.append(lib_entry)

    self.logger.debug(f"Created {len(lib_data)} lib_data entries")

    # Create lib_df as a Polars DataFrame with error handling for mixed types
    try:
        lib_df_temp = pl.DataFrame(lib_data)
    except Exception as e:
        self.logger.warning(f"Error creating lib_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        lib_df_pandas = pd.DataFrame(lib_data)
        lib_df_temp = pl.from_pandas(lib_df_pandas)

    # Ensure uniqueness by name and adduct combination
    # Sort by lib_uid and keep the first occurrence (earliest in processing order)
    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")

    self.logger.info(
        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
    )

    # === CREATE ID_DF ===
    self.logger.debug("Creating id_df from TIMA identification matches")

    # Create a mapping from feature_id to feature_uid
    # TIMA data has feature_id, which must be mapped to features_df feature_uid for id_df
    feature_id_to_uid_map = dict(zip(
        self.features_df["feature_id"].to_list(),
        self.features_df["feature_uid"].to_list(),
    ))

    # Create identification matches
    id_data = []
    for row in tima_data.iter_rows(named=True):
        # Map TIMA feature_id to features_df feature_uid
        tima_feature_id = row["feature_id"]
        feature_uid = feature_id_to_uid_map.get(tima_feature_id)

        if feature_uid is None:
            # Skip if we can't find the mapping (shouldn't happen after filtering)
            continue

        # Use error_mz for mz_delta
        mz_delta = None
        error_mz = row.get("error_mz", None)
        if error_mz is not None:
            try:
                mz_delta = float(error_mz)
            except (ValueError, TypeError):
                pass

        # Use error_rt for rt_delta
        rt_delta = None
        rt_err_value = row.get("error_rt", None)
        if rt_err_value is not None:
            try:
                rt_delta = float(rt_err_value)
            except (ValueError, TypeError):
                pass

        # Create matcher as "tima-" + library
        matcher = "tima"  # default fallback
        library_value = row.get("library", None)
        if library_value is not None:
            try:
                library = str(library_value)
                matcher = f"tima-{library}"
            except (ValueError, TypeError):
                pass

        id_entry = {
            "feature_uid": feature_uid,  # Use mapped feature_uid from features_df
            "lib_uid": row["lib_uid"],
            "mz_delta": mz_delta,
            "rt_delta": rt_delta,
            "matcher": matcher,
            "score": row.get("score", None),
            "iso": 0,  # Fixed isotope value for TIMA imports
        }
        id_data.append(id_entry)

    # Create id_df as a Polars DataFrame with an explicit schema to avoid inference issues
    # Match the feature_uid type to features_df
    feature_uid_dtype = self.features_df["feature_uid"].dtype
    id_schema = {
        "feature_uid": feature_uid_dtype,  # Match the type from features_df
        "lib_uid": pl.Int64,
        "mz_delta": pl.Float64,
        "rt_delta": pl.Float64,
        "matcher": pl.Utf8,
        "score": pl.Float64,
        "iso": pl.Int64,
    }
    id_df_temp = pl.DataFrame(id_data, schema=id_schema)
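    # The explicit schema also keeps columns that happen to be all-None in a
    # given import (e.g. rt_delta) from being inferred as the Null dtype.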

    # Filter id_df to only include lib_uids that exist in the final unique lib_df
    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))

    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")

    # === UPDATE FEATURES_DF ===
    self.logger.debug("Updating features_df with top identification results")

    # tima_data is already a polars DataFrame
    tima_pl = tima_data

    # Group by feature_id and select the best identification (highest score)
    # In case of ties, take the first one
    best_ids = (
        tima_pl.group_by("feature_id")
        .agg([pl.col("score").max().alias("max_score")])
        .join(tima_pl, on="feature_id")
        .filter(pl.col("score") == pl.col("max_score"))
        .group_by("feature_id")
        .first()  # In case of ties, take the first
    )
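    # Same max-then-first selection pattern as in import_oracle, keyed on
    # score instead of identification level.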

    # Join with features_df to map feature_id to feature_uid
    best_ids = best_ids.join(
        self.features_df.select(["feature_id", "feature_uid"]),
        on="feature_id",
        how="left",
    )

    self.logger.debug(f"Selected best identifications for {len(best_ids)} features")

    # Prepare the identification columns
    id_columns = {
        "id_top_name": best_ids.select("feature_uid", "label_compound"),
        "id_top_adduct": best_ids.select("feature_uid", "adduct"),
        "id_top_class": best_ids.select("feature_uid", "label_classyfire"),
        "id_top_score": best_ids.select("feature_uid", pl.col("score").round(3).alias("score")),
    }

    # Initialize identification columns in features_df if they don't exist
    for col_name in id_columns.keys():
        if col_name not in self.features_df.columns:
            if col_name == "id_top_score":
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
            else:
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))

    # Update features_df with TIMA identifications
    for col_name, id_data_col in id_columns.items():
        tima_column = id_data_col.columns[1]  # second column (after feature_uid)

        # Create the update dataframe
        update_data = id_data_col.rename({tima_column: col_name})

        # Join and update
        self.features_df = (
            self.features_df.join(update_data, on="feature_uid", how="left", suffix="_tima")
            .with_columns(pl.coalesce([f"{col_name}_tima", col_name]).alias(col_name))
            .drop(f"{col_name}_tima")
        )

    # Replace NaN values with None in identification columns
    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
    for col_name in id_col_names:
        if col_name in self.features_df.columns:
            # For string columns, replace empty strings and "nan" with None
            if col_name != "id_top_score":
                self.features_df = self.features_df.with_columns(
                    pl.when(
                        pl.col(col_name).is_null()
                        | (pl.col(col_name) == "")
                        | (pl.col(col_name) == "nan")
                        | (pl.col(col_name) == "NaN")
                    )
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )
            # For numeric columns, replace NaN with None
            else:
                self.features_df = self.features_df.with_columns(
                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )

    # Count how many features were updated
    updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
    total_features = len(self.features_df)

    self.logger.success(
        f"TIMA import completed. {updated_count}/{total_features} "
        f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
    )

    # Update history
    self.store_history(
        ["import_tima"],
        {
            "folder": folder,
            "file": file,
            "updated_features": updated_count,
            "total_features": total_features,
            "lib_entries": len(self.lib_df),
            "id_matches": len(self.id_df),
        },
    )