masster 0.5.28-py3-none-any.whl → 0.6.2-py3-none-any.whl
This diff shows the changes between package versions as they were released to their public registry, and is provided for informational purposes only.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +715 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +26 -5
- masster/sample/sample5_schema.json +99 -1
- masster/sample/save.py +724 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/export.py +216 -65
- masster/study/id.py +59 -12
- masster/study/importers.py +384 -1
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +6 -4
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +13 -14
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/METADATA +17 -18
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/RECORD +30 -29
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/WHEEL +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/entry_points.txt +0 -0
- {masster-0.5.28.dist-info → masster-0.6.2.dist-info}/licenses/LICENSE +0 -0
masster/sample/importers.py (new file)
@@ -0,0 +1,715 @@
"""
importers.py

Module providing import functionality for Sample class, specifically for importing
oracle identification data into features.
"""

from __future__ import annotations

import os
import pandas as pd
import polars as pl


def import_oracle(
    self,
    folder,
    min_id_level=None,
    max_id_level=None,
):
    """
    Import oracle identification data and map it to features.

    This method reads oracle identification results from folder/diag/annotation_full.csv
    and creates lib_df and id_df DataFrames with detailed library and identification information.
    It also updates features_df with top identification results.

    Parameters:
        folder (str): Path to oracle folder containing diag/annotation_full.csv
        min_id_level (int, optional): Minimum identification level to include
        max_id_level (int, optional): Maximum identification level to include

    Returns:
        None: Updates features_df, creates lib_df and id_df in-place with oracle identification data

    Raises:
        FileNotFoundError: If the oracle annotation file doesn't exist
        ValueError: If features_df is empty or doesn't have required columns

    Example:
        >>> sample.import_oracle(
        ...     folder="path/to/oracle_results",
        ...     min_id_level=2,
        ...     max_id_level=4
        ... )
    """

    self.logger.info(f"Starting oracle import from folder: {folder}")

    # Validate inputs
    if self.features_df is None or self.features_df.is_empty():
        raise ValueError("features_df is empty or not available. Run find_features() first.")

    if "feature_uid" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_uid' column")

    # Check if oracle file exists
    oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
    if not os.path.exists(oracle_file_path):
        raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")

    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")

    try:
        # Read oracle data using pandas first for easier processing
        oracle_data = pd.read_csv(oracle_file_path)
        self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")
    except Exception as e:
        self.logger.error(f"Could not read {oracle_file_path}: {e}")
        raise

    # Extract feature_uid from scan_title column (format: "uid:XYZ, ...")
    self.logger.debug("Extracting feature UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
    oracle_data["feature_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)

    # Remove rows where feature_uid extraction failed
    initial_count = len(oracle_data)
    oracle_data = oracle_data.dropna(subset=["feature_uid"])
    oracle_data["feature_uid"] = oracle_data["feature_uid"].astype(int)

    self.logger.debug(f"Extracted feature UIDs for {len(oracle_data)}/{initial_count} oracle entries")

    # Apply id_level filters if specified
    if min_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")

    if max_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")

    if len(oracle_data) == 0:
        self.logger.warning("No oracle entries remain after filtering")
        return

    # === CREATE LIB_DF ===
    self.logger.debug("Creating lib_df from Oracle annotation data")
    self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")

    # Create unique lib_uid for each library entry
    oracle_data["lib_uid"] = range(len(oracle_data))

    # Map Oracle columns to lib_df schema
    lib_data = []
    for _, row in oracle_data.iterrows():
        # Convert cmpd_uid to integer, using lib_uid as fallback
        cmpd_uid = row["lib_uid"]  # Use lib_uid as integer compound identifier
        try:
            if row.get("lib_id") is not None:
                cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
        except (ValueError, TypeError):
            pass  # Keep lib_uid as fallback

        lib_entry = {
            "lib_uid": row["lib_uid"],
            "cmpd_uid": cmpd_uid,  # Integer compound identifier
            "source_id": "LipidOracle",  # Fixed source identifier
            "name": row.get("name", None),
            "shortname": row.get("species", None),
            "class": row.get("hg", None),
            "smiles": None,  # Not available in Oracle data
            "inchi": None,  # Not available in Oracle data
            "inchikey": None,  # Not available in Oracle data
            "formula": row.get("formula", None),
            "iso": 0,  # Fixed isotope value
            "adduct": row.get("ion", None),
            "probability": row.get("score", None),
            "m": None,  # Would need to calculate from formula
            "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
            "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
            "rt": None,  # Set to null as requested
            "quant_group": None,  # Set to null as requested
            "db_id": row.get("lib_id", None),
            "db": row.get("lib", None),
        }
        lib_data.append(lib_entry)

    self.logger.debug(f"Created {len(lib_data)} lib_data entries")

    # Create lib_df as Polars DataFrame with error handling for mixed types
    try:
        lib_df_temp = pl.DataFrame(lib_data)
    except Exception as e:
        self.logger.warning(f"Error creating lib_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        lib_df_pandas = pd.DataFrame(lib_data)
        lib_df_temp = pl.from_pandas(lib_df_pandas)

    # Ensure uniqueness by name and adduct combination
    # Sort by lib_uid and keep first occurrence (earliest in processing order)
    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")

    self.logger.info(
        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
    )

    # === CREATE ID_DF ===
    self.logger.debug("Creating id_df from Oracle identification matches")

    # Create identification matches
    id_data = []
    for _, row in oracle_data.iterrows():
        # Use dmz from annotation_full.csv directly for mz_delta
        mz_delta = None
        if row.get("dmz") is not None:
            try:
                mz_delta = float(row["dmz"])
            except (ValueError, TypeError):
                pass

        # Use rt_err from annotation_full.csv for rt_delta, None if NaN
        rt_delta = None
        rt_err_value = row.get("rt_err")
        if rt_err_value is not None and not (isinstance(rt_err_value, float) and pd.isna(rt_err_value)):
            try:
                rt_delta = float(rt_err_value)
            except (ValueError, TypeError):
                pass

        # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
        matcher = "lipidoracle"  # default fallback
        if row.get("score_metric") is not None:
            try:
                score_metric = str(row["score_metric"])
                matcher = f"lipidoracle-{score_metric}"
            except (ValueError, TypeError):
                pass

        id_entry = {
            "feature_uid": row["feature_uid"],
            "lib_uid": row["lib_uid"],
            "mz_delta": mz_delta,
            "rt_delta": rt_delta,
            "matcher": matcher,
            "score": row.get("score", None),
            "iso": 0,  # Fixed isotope value for oracle imports
        }
        id_data.append(id_entry)

    # Create id_df as Polars DataFrame with error handling
    try:
        id_df_temp = pl.DataFrame(id_data)
    except Exception as e:
        self.logger.warning(f"Error creating id_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        id_df_pandas = pd.DataFrame(id_data)
        id_df_temp = pl.from_pandas(id_df_pandas)

    # Filter id_df to only include lib_uids that exist in the final unique lib_df
    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))

    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")

    # === UPDATE FEATURES_DF (adapted from consensus functionality) ===
    self.logger.debug("Updating features_df with top identification results")

    # Convert to polars for efficient joining with error handling
    try:
        oracle_pl = pl.DataFrame(oracle_data)
    except Exception as e:
        self.logger.warning(f"Error converting oracle_data to polars: {e}")
        # Convert using from_pandas properly
        oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))

    # Group by feature_uid and select the best identification (highest level)
    # In case of ties, take the first one
    best_ids = (
        oracle_pl.group_by("feature_uid")
        .agg([pl.col("level").max().alias("max_level")])
        .join(oracle_pl, on="feature_uid")
        .filter(pl.col("level") == pl.col("max_level"))
        .group_by("feature_uid")
        .first()  # In case of ties, take the first
    )

    self.logger.debug(f"Selected best identifications for {len(best_ids)} features")

    # Prepare the identification columns
    id_columns = {
        "id_top_name": best_ids.select("feature_uid", "name"),
        "id_top_adduct": best_ids.select("feature_uid", "ion"),
        "id_top_class": best_ids.select("feature_uid", "hg"),
        "id_top_score": best_ids.select("feature_uid", pl.col("score").round(3).alias("score")),
    }

    # Initialize identification columns in features_df if they don't exist
    for col_name in id_columns.keys():
        if col_name not in self.features_df.columns:
            if col_name == "id_top_score":
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
            else:
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))

    # Update features_df with oracle identifications
    for col_name, id_data_col in id_columns.items():
        oracle_column = id_data_col.columns[1]  # second column (after feature_uid)

        # Create update dataframe
        update_data = id_data_col.rename({oracle_column: col_name})

        # Join and update
        self.features_df = (
            self.features_df.join(update_data, on="feature_uid", how="left", suffix="_oracle")
            .with_columns(pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name))
            .drop(f"{col_name}_oracle")
        )

    # Replace NaN values with None in identification columns
    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
    for col_name in id_col_names:
        if col_name in self.features_df.columns:
            # For string columns, replace empty strings and "nan" with None
            if col_name != "id_top_score":
                self.features_df = self.features_df.with_columns(
                    pl.when(
                        pl.col(col_name).is_null()
                        | (pl.col(col_name) == "")
                        | (pl.col(col_name) == "nan")
                        | (pl.col(col_name) == "NaN")
                    )
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )
            # For numeric columns, replace NaN with None
            else:
                self.features_df = self.features_df.with_columns(
                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )

    # Count how many features were updated
    updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
    total_features = len(self.features_df)

    self.logger.success(
        f"LipidOracle import completed. {updated_count}/{total_features} "
        f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
    )

    # Update history
    self.store_history(
        ["import_oracle"],
        {
            "folder": folder,
            "min_id_level": min_id_level,
            "max_id_level": max_id_level,
            "updated_features": updated_count,
            "total_features": total_features,
            "lib_entries": len(self.lib_df),
            "id_matches": len(self.id_df),
        },
    )

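Both importers pick a single "top" annotation per feature with the same polars pattern: aggregate the best value per group, join it back, keep only the rows that attain it, and take the first row per group to break ties. A minimal self-contained sketch of that pattern on toy data (column names mirror the import_oracle code above; this is an illustration, not part of the package):

import polars as pl

oracle_pl = pl.DataFrame({
    "feature_uid": [1, 1, 2],
    "level": [2, 4, 3],
    "name": ["PC 34:1", "PC 34:2", "TG 52:2"],
})

# Highest identification level per feature; first row wins on ties
best_ids = (
    oracle_pl.group_by("feature_uid")
    .agg(pl.col("level").max().alias("max_level"))
    .join(oracle_pl, on="feature_uid")
    .filter(pl.col("level") == pl.col("max_level"))
    .group_by("feature_uid")
    .first()
)
print(best_ids.sort("feature_uid"))  # one row per feature_uid

The listing continues with import_tima, which applies the same approach to TIMA results (keyed on feature_id and ranked by score rather than level):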
def import_tima(
    self,
    folder,
    file="mini",
):
    """
    Import TIMA identification data and map it to features.

    This method reads TIMA identification results from folder/*results_{file}.tsv
    and creates lib_df and id_df DataFrames with detailed library and identification information.
    It also updates features_df with top identification results.

    Parameters:
        folder (str): Path to folder containing TIMA results TSV file
        file (str): File suffix to search for (default: "mini")

    Returns:
        None: Updates features_df, creates lib_df and id_df in-place with TIMA identification data

    Raises:
        FileNotFoundError: If the TIMA results file doesn't exist
        ValueError: If features_df is empty or doesn't have required columns

    Example:
        >>> sample.import_tima(
        ...     folder="path/to/tima_results",
        ...     file="mini"
        ... )
    """

    self.logger.info(f"Starting TIMA import from folder: {folder}")

    # Validate inputs
    if self.features_df is None or self.features_df.is_empty():
        raise ValueError("features_df is empty or not available. Run find_features() first.")

    if "feature_uid" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_uid' column")

    # Find TIMA file
    import glob

    tima_pattern = os.path.join(folder, f"*results_{file}.tsv")
    tima_files = glob.glob(tima_pattern)

    if not tima_files:
        raise FileNotFoundError(f"TIMA results file not found with pattern: {tima_pattern}")

    tima_file_path = tima_files[0]
    self.logger.debug(f"Loading TIMA data from: {tima_file_path}")

    try:
        # Read TIMA data using polars
        tima_data = pl.read_csv(
            tima_file_path,
            separator="\t",
            schema_overrides={
                "feature_id": pl.Utf8,  # Read as Utf8 string
            },
            infer_schema_length=10000
        )
        self.logger.info(f"TIMA data loaded successfully with {len(tima_data)} rows")
    except Exception as e:
        self.logger.error(f"Could not read {tima_file_path}: {e}")
        raise

    # Check if TIMA feature_ids match features_df feature_id column
    if "feature_id" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_id' column")

    # Compare TIMA feature_ids with features_df feature_ids
    features_ids = set(self.features_df["feature_id"].to_list())
    tima_ids = set(tima_data["feature_id"].to_list())

    matching_ids = features_ids.intersection(tima_ids)
    non_matching_ids = tima_ids - features_ids

    if non_matching_ids:
        self.logger.warning(
            f"Found {len(non_matching_ids)} feature_ids in TIMA data that do not match any feature_id in features_df. "
            f"These will be filtered out. Matching features: {len(matching_ids)}/{len(tima_ids)}"
        )
        # Filter to only matching feature_ids
        tima_data = tima_data.filter(pl.col("feature_id").is_in(list(features_ids)))

    if len(tima_data) == 0:
        self.logger.error("No TIMA feature_ids match features_df feature_id values")
        raise ValueError("No matching features found between TIMA data and features_df")

    self.logger.debug(f"Matched {len(tima_data)} TIMA entries to features_df feature_id values")

    # Filter to only rows with identification data (non-empty label_compound)
    initial_count = len(tima_data)
    tima_data = tima_data.filter(
        pl.col("label_compound").is_not_null() &
        (pl.col("label_compound").cast(pl.Utf8).str.strip_chars() != "")
    )

    self.logger.debug(f"Filtered to {len(tima_data)}/{initial_count} TIMA entries with identifications")

    if len(tima_data) == 0:
        self.logger.warning("No TIMA entries with identifications found")
        return

    # === CREATE LIB_DF ===
    self.logger.debug("Creating lib_df from TIMA annotation data")
    self.logger.debug(f"TIMA data shape before lib_df creation: {tima_data.shape}")

    # Create unique lib_uid for each library entry
    tima_data = tima_data.with_columns(
        pl.arange(0, len(tima_data)).alias("lib_uid")
    )

    # Map TIMA columns to lib_df schema
    lib_data = []
    for row in tima_data.iter_rows(named=True):
        # Extract z (charge) from adduct
        z = None
        adduct_str = str(row.get("adduct", ""))
        if "+" in adduct_str:
            z = 1
        elif "-" in adduct_str:
            z = -1

        # Get SMILES
        smiles = row.get("smiles_no_stereo", None)
        if smiles is None or (isinstance(smiles, str) and smiles.strip() == ""):
            smiles = None

        # Calculate InChI from SMILES if available
        inchi = None
        if smiles:
            try:
                # Try to get InChI from SMILES using RDKit if available
                try:
                    from rdkit import Chem
                    mol_rdkit = Chem.MolFromSmiles(smiles)
                    if mol_rdkit:
                        inchi = Chem.MolToInchi(mol_rdkit)
                except ImportError:
                    pass  # RDKit not available
            except Exception:
                pass

        # Calculate formula from SMILES if available
        formula = None
        if smiles:
            try:
                from rdkit import Chem
                mol_rdkit = Chem.MolFromSmiles(smiles)
                if mol_rdkit:
                    formula = Chem.rdMolDescriptors.CalcMolFormula(mol_rdkit)
            except ImportError:
                pass  # RDKit not available
            except Exception:
                pass

        # Calculate mass from m/z and charge
        m = None
        mz_value = row.get("mz", None)
        if mz_value is not None and z is not None:
            try:
                m = float(mz_value) * abs(z)
            except (ValueError, TypeError):
                pass

        # Get class and clean NaN values
        class_value = row.get("label_classyfire", None)
        if class_value is None or (isinstance(class_value, str) and class_value.upper() == "NAN"):
            class_value = None

        lib_entry = {
            "lib_uid": row["lib_uid"],
            "cmpd_uid": row["lib_uid"],  # Use lib_uid as compound identifier
            "source_id": None,  # Leave empty as requested
            "name": row.get("label_compound", None),
            "shortname": None,  # Not available in TIMA data
            "class": class_value,
            "smiles": smiles,
            "inchi": inchi,
            "inchikey": row.get("inchikey_connectivity_layer", None),
            "formula": formula,
            "iso": 0,  # Fixed isotope value
            "adduct": row.get("adduct", None),
            "probability": row.get("score", None),
            "m": m,
            "z": z,
            "mz": row.get("mz", None),
            "rt": None,  # Set to null as requested
            "quant_group": None,
            "db_id": None,  # Not available in TIMA data
            "db": row.get("library", None),
        }
        lib_data.append(lib_entry)

    self.logger.debug(f"Created {len(lib_data)} lib_data entries")

    # Create lib_df as Polars DataFrame with error handling for mixed types
    try:
        lib_df_temp = pl.DataFrame(lib_data)
    except Exception as e:
        self.logger.warning(f"Error creating lib_df with polars: {e}")
        # Fallback: convert to pandas first, then to polars
        lib_df_pandas = pd.DataFrame(lib_data)
        lib_df_temp = pl.from_pandas(lib_df_pandas)

    # Ensure uniqueness by name and adduct combination
    # Sort by lib_uid and keep first occurrence (earliest in processing order)
    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")

    self.logger.info(
        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
    )

    # === CREATE ID_DF ===
    self.logger.debug("Creating id_df from TIMA identification matches")

    # Create a mapping from feature_id to feature_uid
    # TIMA data has feature_id which must be mapped to features_df feature_uid for id_df
    feature_id_to_uid_map = dict(zip(
        self.features_df["feature_id"].to_list(),
        self.features_df["feature_uid"].to_list()
    ))

    # Create identification matches
    id_data = []
    for row in tima_data.iter_rows(named=True):
        # Map TIMA feature_id to features_df feature_uid
        tima_feature_id = row["feature_id"]
        feature_uid = feature_id_to_uid_map.get(tima_feature_id)

        if feature_uid is None:
            # Skip if we can't find the mapping (shouldn't happen after filtering)
            continue

        # Use error_mz for mz_delta
        mz_delta = None
        error_mz = row.get("error_mz", None)
        if error_mz is not None:
            try:
                mz_delta = float(error_mz)
            except (ValueError, TypeError):
                pass

        # Use error_rt for rt_delta
        rt_delta = None
        rt_err_value = row.get("error_rt", None)
        if rt_err_value is not None:
            try:
                rt_delta = float(rt_err_value)
            except (ValueError, TypeError):
                pass

        # Create matcher as "tima-" + library
        matcher = "tima"  # default fallback
        library_value = row.get("library", None)
        if library_value is not None:
            try:
                library = str(library_value)
                matcher = f"tima-{library}"
            except (ValueError, TypeError):
                pass

        id_entry = {
            "feature_uid": feature_uid,  # Use mapped feature_uid from features_df
            "lib_uid": row["lib_uid"],
            "mz_delta": mz_delta,
            "rt_delta": rt_delta,
            "matcher": matcher,
            "score": row.get("score", None),
            "iso": 0,  # Fixed isotope value for TIMA imports
        }
        id_data.append(id_entry)

    # Create id_df as Polars DataFrame with explicit schema to avoid inference issues
    # Match feature_uid type to features_df
    feature_uid_dtype = self.features_df["feature_uid"].dtype
    id_schema = {
        "feature_uid": feature_uid_dtype,  # Match the type from features_df
        "lib_uid": pl.Int64,
        "mz_delta": pl.Float64,
        "rt_delta": pl.Float64,
        "matcher": pl.Utf8,
        "score": pl.Float64,
        "iso": pl.Int64,
    }
    id_df_temp = pl.DataFrame(id_data, schema=id_schema)

    # Filter id_df to only include lib_uids that exist in the final unique lib_df
    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))

    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")

    # === UPDATE FEATURES_DF ===
    self.logger.debug("Updating features_df with top identification results")

    # tima_data is already a polars DataFrame
    tima_pl = tima_data

    # Group by feature_id and select the best identification (highest score)
    # In case of ties, take the first one
    best_ids = (
        tima_pl.group_by("feature_id")
        .agg([pl.col("score").max().alias("max_score")])
        .join(tima_pl, on="feature_id")
        .filter(pl.col("score") == pl.col("max_score"))
        .group_by("feature_id")
        .first()  # In case of ties, take the first
    )

    # Join with features_df to map feature_id to feature_uid
    best_ids = best_ids.join(
        self.features_df.select(["feature_id", "feature_uid"]),
        on="feature_id",
        how="left"
    )

    self.logger.debug(f"Selected best identifications for {len(best_ids)} features")

    # Prepare the identification columns
    id_columns = {
        "id_top_name": best_ids.select("feature_uid", "label_compound"),
        "id_top_adduct": best_ids.select("feature_uid", "adduct"),
        "id_top_class": best_ids.select("feature_uid", "label_classyfire"),
        "id_top_score": best_ids.select("feature_uid", pl.col("score").round(3).alias("score")),
    }

    # Initialize identification columns in features_df if they don't exist
    for col_name in id_columns.keys():
        if col_name not in self.features_df.columns:
            if col_name == "id_top_score":
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
            else:
                self.features_df = self.features_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))

    # Update features_df with TIMA identifications
    for col_name, id_data_col in id_columns.items():
        tima_column = id_data_col.columns[1]  # second column (after feature_uid)

        # Create update dataframe
        update_data = id_data_col.rename({tima_column: col_name})

        # Join and update
        self.features_df = (
            self.features_df.join(update_data, on="feature_uid", how="left", suffix="_tima")
            .with_columns(pl.coalesce([f"{col_name}_tima", col_name]).alias(col_name))
            .drop(f"{col_name}_tima")
        )

    # Replace NaN values with None in identification columns
    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
    for col_name in id_col_names:
        if col_name in self.features_df.columns:
            # For string columns, replace empty strings and "nan" with None
            if col_name != "id_top_score":
                self.features_df = self.features_df.with_columns(
                    pl.when(
                        pl.col(col_name).is_null()
                        | (pl.col(col_name) == "")
                        | (pl.col(col_name) == "nan")
                        | (pl.col(col_name) == "NaN")
                    )
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )
            # For numeric columns, replace NaN with None
            else:
                self.features_df = self.features_df.with_columns(
                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
                    .then(None)
                    .otherwise(pl.col(col_name))
                    .alias(col_name)
                )

    # Count how many features were updated
    updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
    total_features = len(self.features_df)

    self.logger.success(
        f"TIMA import completed. {updated_count}/{total_features} "
        f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
    )

    # Update history
    self.store_history(
        ["import_tima"],
        {
            "folder": folder,
            "file": file,
            "updated_features": updated_count,
            "total_features": total_features,
            "lib_entries": len(self.lib_df),
            "id_matches": len(self.id_df),
        },
    )
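A note on the features_df write-back used by both functions: each id_top_* column is updated with a join-and-coalesce idiom, where the freshly imported value wins and the existing value is kept as a fallback, after which the suffixed helper column is dropped. A minimal self-contained sketch of that idiom on toy data (an illustration, not the package's schema):

import polars as pl

features_df = pl.DataFrame({"feature_uid": [1, 2, 3], "id_top_name": [None, "kept", None]})
update_data = pl.DataFrame({"feature_uid": [1, 3], "id_top_name": ["PC 34:1", "TG 52:2"]})

# Left-join the update column, prefer the new value where present, drop the helper
features_df = (
    features_df.join(update_data, on="feature_uid", how="left", suffix="_new")
    .with_columns(pl.coalesce(["id_top_name_new", "id_top_name"]).alias("id_top_name"))
    .drop("id_top_name_new")
)
# feature 2 keeps "kept"; features 1 and 3 receive the imported names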