masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

@@ -20,23 +20,23 @@ def import_oracle(
 ):
     """
     Import oracle identification data and map it to consensus features.
-
+
     This method reads oracle identification results from folder/diag/annotation_full.csv
     and creates lib_df and id_df DataFrames with detailed library and identification information.
     It also updates consensus_df with top identification results.
-
+
     Parameters:
         folder (str): Path to oracle folder containing diag/annotation_full.csv
         min_id_level (int, optional): Minimum identification level to include
         max_id_level (int, optional): Maximum identification level to include
-
+
     Returns:
         None: Updates consensus_df, creates lib_df and id_df in-place with oracle identification data
-
+
     Raises:
         FileNotFoundError: If the oracle annotation file doesn't exist
         ValueError: If consensus_df is empty or doesn't have required columns
-
+
     Example:
         >>> study.import_oracle(
         ...     folder="path/to/oracle_results",
@@ -44,23 +44,23 @@ def import_oracle(
         ...     max_id_level=4
         ... )
     """
-
+
     self.logger.info(f"Starting oracle import from folder: {folder}")
-
+
     # Validate inputs
     if self.consensus_df is None or self.consensus_df.is_empty():
         raise ValueError("consensus_df is empty or not available. Run merge() first.")
-
+
     if "consensus_uid" not in self.consensus_df.columns:
         raise ValueError("consensus_df must contain 'consensus_uid' column")
-
+
     # Check if oracle file exists
     oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
     if not os.path.exists(oracle_file_path):
         raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")
-
+
     self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
-
+
     try:
         # Read oracle data using pandas first for easier processing
         oracle_data = pd.read_csv(oracle_file_path)
@@ -68,38 +68,38 @@ def import_oracle(
     except Exception as e:
         self.logger.error(f"Could not read {oracle_file_path}: {e}")
         raise
-
+
     # Extract consensus_uid from scan_title column (format: "uid:XYZ, ...")
     self.logger.debug("Extracting consensus UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
     oracle_data["consensus_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
-
+
     # Remove rows where consensus_uid extraction failed
     initial_count = len(oracle_data)
     oracle_data = oracle_data.dropna(subset=["consensus_uid"])
     oracle_data["consensus_uid"] = oracle_data["consensus_uid"].astype(int)
-
+
     self.logger.debug(f"Extracted consensus UIDs for {len(oracle_data)}/{initial_count} oracle entries")
-
+
     # Apply id_level filters if specified
     if min_id_level is not None:
         oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
         self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")
-
+
     if max_id_level is not None:
         oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
         self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")
-
+
     if len(oracle_data) == 0:
         self.logger.warning("No oracle entries remain after filtering")
         return
-
+
     # === CREATE LIB_DF ===
     self.logger.debug("Creating lib_df from Oracle annotation data")
     self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")
-
+
     # Create unique lib_uid for each library entry
     oracle_data["lib_uid"] = range(len(oracle_data))
-
+
     # Map Oracle columns to lib_df schema
     lib_data = []
     for _, row in oracle_data.iterrows():
@@ -110,33 +110,33 @@ def import_oracle(
             cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
         except (ValueError, TypeError):
             pass  # Keep lib_uid as fallback
-
+
         lib_entry = {
             "lib_uid": row["lib_uid"],
             "cmpd_uid": cmpd_uid,  # Integer compound identifier
             "source_id": "LipidOracle",  # Fixed source identifier
             "name": row.get("name", None),
             "shortname": row.get("species", None),
-            "class": row.get("hg", None),
+            "class": row.get("hg", None),
             "smiles": None,  # Not available in Oracle data
-            "inchi": None,  # Not available in Oracle data
-            "inchikey": None,  # Not available in Oracle data
+            "inchi": None,  # Not available in Oracle data
+            "inchikey": None,  # Not available in Oracle data
             "formula": row.get("formula", None),
-            "iso": 0,  # Fixed isotope value
+            "iso": 0,  # Fixed isotope value
             "adduct": row.get("ion", None),
             "probability": row.get("score", None),
-            "m": None,  # Would need to calculate from formula
+            "m": None,  # Would need to calculate from formula
             "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
             "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
-            "rt": None,  # Set to null as requested
+            "rt": None,  # Set to null as requested
             "quant_group": None,  # Set to null as requested
             "db_id": row.get("lib_id", None),
-            "db": row.get("lib", None)
+            "db": row.get("lib", None),
         }
         lib_data.append(lib_entry)
-
+
     self.logger.debug(f"Created {len(lib_data)} lib_data entries")
-
+
     # Create lib_df as Polars DataFrame with error handling for mixed types
     try:
         lib_df_temp = pl.DataFrame(lib_data)
@@ -145,20 +145,18 @@ def import_oracle(
         # Fallback: convert to pandas first, then to polars
         lib_df_pandas = pd.DataFrame(lib_data)
         lib_df_temp = pl.from_pandas(lib_df_pandas)
-
+
     # Ensure uniqueness by name and adduct combination
     # Sort by lib_uid and keep first occurrence (earliest in processing order)
-    self.lib_df = (
-        lib_df_temp
-        .sort("lib_uid")
-        .unique(subset=["name", "adduct"], keep="first")
+    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
+
+    self.logger.info(
+        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
     )
-
-    self.logger.info(f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)")
-
+
     # === CREATE ID_DF ===
     self.logger.debug("Creating id_df from Oracle identification matches")
-
+
     # Create identification matches
     id_data = []
     for _, row in oracle_data.iterrows():
@@ -169,7 +167,7 @@ def import_oracle(
             mz_delta = float(row["dmz"])
         except (ValueError, TypeError):
             pass
-
+
         # Use rt_err from annotation_full.csv for rt_delta, None if NaN
         rt_delta = None
         rt_err_value = row.get("rt_err")
@@ -178,7 +176,7 @@ def import_oracle(
                 rt_delta = float(rt_err_value)
             except (ValueError, TypeError):
                 pass
-
+
         # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
         matcher = "lipidoracle"  # default fallback
         if row.get("score_metric") is not None:
@@ -187,17 +185,17 @@ def import_oracle(
                 matcher = f"lipidoracle-{score_metric}"
             except (ValueError, TypeError):
                 pass
-
+
         id_entry = {
             "consensus_uid": row["consensus_uid"],
             "lib_uid": row["lib_uid"],
             "mz_delta": mz_delta,
             "rt_delta": rt_delta,
             "matcher": matcher,
-            "score": row.get("score", None)
+            "score": row.get("score", None),
         }
         id_data.append(id_entry)
-
+
     # Create id_df as Polars DataFrame with error handling
     try:
         id_df_temp = pl.DataFrame(id_data)
@@ -206,16 +204,16 @@ def import_oracle(
         # Fallback: convert to pandas first, then to polars
        id_df_pandas = pd.DataFrame(id_data)
         id_df_temp = pl.from_pandas(id_df_pandas)
-
+
     # Filter id_df to only include lib_uids that exist in the final unique lib_df
     unique_lib_uids = self.lib_df.select("lib_uid").to_series()
     self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
-
+
     self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
-
+
     # === UPDATE CONSENSUS_DF (existing functionality) ===
     self.logger.debug("Updating consensus_df with top identification results")
-
+
     # Convert to polars for efficient joining with error handling
     try:
         oracle_pl = pl.DataFrame(oracle_data)
@@ -223,67 +221,57 @@ def import_oracle(
         self.logger.warning(f"Error converting oracle_data to polars: {e}")
         # Convert using from_pandas properly
         oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))
-
+
     # Group by consensus_uid and select the best identification (highest level)
     # In case of ties, take the first one
     best_ids = (
-        oracle_pl
-        .group_by("consensus_uid")
-        .agg([
-            pl.col("level").max().alias("max_level")
-        ])
+        oracle_pl.group_by("consensus_uid")
+        .agg([pl.col("level").max().alias("max_level")])
         .join(oracle_pl, on="consensus_uid")
         .filter(pl.col("level") == pl.col("max_level"))
         .group_by("consensus_uid")
        .first()  # In case of ties, take the first
     )
-
+
     self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
-
+
     # Prepare the identification columns
     id_columns = {
         "id_top_name": best_ids.select("consensus_uid", "name"),
-        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
+        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
         "id_top_class": best_ids.select("consensus_uid", "hg"),
         "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
         "id_source": best_ids.select(
-            "consensus_uid",
+            "consensus_uid",
             pl.when(pl.col("level") == 1)
             .then(pl.lit("lipidoracle ms1"))
             .otherwise(pl.lit("lipidoracle ms2"))
-            .alias("id_source")
-        )
+            .alias("id_source"),
+        ),
     }
-
+
     # Initialize identification columns in consensus_df if they don't exist
     for col_name in id_columns.keys():
         if col_name not in self.consensus_df.columns:
             if col_name == "id_top_score":
-                self.consensus_df = self.consensus_df.with_columns(
-                    pl.lit(None, dtype=pl.Float64).alias(col_name)
-                )
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
             else:
-                self.consensus_df = self.consensus_df.with_columns(
-                    pl.lit(None, dtype=pl.String).alias(col_name)
-                )
-
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))
+
     # Update consensus_df with oracle identifications
     for col_name, id_data_col in id_columns.items():
         oracle_column = id_data_col.columns[1]  # second column (after consensus_uid)
-
+
         # Create update dataframe
         update_data = id_data_col.rename({oracle_column: col_name})
-
+
         # Join and update
         self.consensus_df = (
-            self.consensus_df
-            .join(update_data, on="consensus_uid", how="left", suffix="_oracle")
-            .with_columns(
-                pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name)
-            )
+            self.consensus_df.join(update_data, on="consensus_uid", how="left", suffix="_oracle")
+            .with_columns(pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name))
             .drop(f"{col_name}_oracle")
         )
-
+
     # Replace NaN values with None in identification columns
     id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score", "id_source"]
     for col_name in id_col_names:
@@ -292,10 +280,10 @@ def import_oracle(
         if col_name != "id_top_score":
             self.consensus_df = self.consensus_df.with_columns(
                 pl.when(
-                    pl.col(col_name).is_null() |
-                    (pl.col(col_name) == "") |
-                    (pl.col(col_name) == "nan") |
-                    (pl.col(col_name) == "NaN")
+                    pl.col(col_name).is_null()
+                    | (pl.col(col_name) == "")
+                    | (pl.col(col_name) == "nan")
+                    | (pl.col(col_name) == "NaN")
                 )
                 .then(None)
                 .otherwise(pl.col(col_name))
@@ -309,23 +297,26 @@ def import_oracle(
             .otherwise(pl.col(col_name))
             .alias(col_name)
         )
-
+
     # Count how many consensus features were updated
     updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
     total_consensus = len(self.consensus_df)
-
+
     self.logger.success(
         f"LipidOracle import completed. {updated_count}/{total_consensus} "
-        f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
+        f"consensus features now have identifications ({updated_count / total_consensus * 100:.1f}%)"
     )
-
+
     # Update history
-    self.update_history(["import_oracle"], {
-        "folder": folder,
-        "min_id_level": min_id_level,
-        "max_id_level": max_id_level,
-        "updated_features": updated_count,
-        "total_features": total_consensus,
-        "lib_entries": len(self.lib_df),
-        "id_matches": len(self.id_df)
-    })
+    self.update_history(
+        ["import_oracle"],
+        {
+            "folder": folder,
+            "min_id_level": min_id_level,
+            "max_id_level": max_id_level,
+            "updated_features": updated_count,
+            "total_features": total_consensus,
+            "lib_entries": len(self.lib_df),
+            "id_matches": len(self.id_df),
+        },
+    )
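
The change set above is almost entirely cosmetic (trailing whitespace stripped, trailing commas added, long calls and method chains reflowed), consistent with an auto-formatter pass; the underlying logic appears unchanged. For readers skimming the reflowed chains, here is a minimal, self-contained sketch of the two Polars idioms the code relies on: selecting the best identification per consensus feature, and updating columns via a left join plus coalesce. The column names follow the diff; the toy frames and values are invented for illustration.

import polars as pl

# Toy stand-in for oracle_pl: two candidate IDs for feature 1, one for feature 2.
oracle_pl = pl.DataFrame(
    {
        "consensus_uid": [1, 1, 2],
        "name": ["PC 34:1", "PC 34:2", "TG 52:2"],
        "level": [2, 1, 3],
    }
)

# Idiom 1: keep the highest-level identification per consensus_uid.
best_ids = (
    oracle_pl.group_by("consensus_uid")
    .agg([pl.col("level").max().alias("max_level")])  # best level per feature
    .join(oracle_pl, on="consensus_uid")  # re-attach the candidate rows
    .filter(pl.col("level") == pl.col("max_level"))  # keep only the best rows
    .group_by("consensus_uid")
    .first()  # in case of ties, take the first
)

# Idiom 2: left-join onto a consensus table and coalesce, so existing
# values survive wherever no oracle match was found.
consensus_df = pl.DataFrame(
    {"consensus_uid": [1, 2, 3], "id_top_name": [None, None, "kept"]}
)
consensus_df = (
    consensus_df.join(
        best_ids.select("consensus_uid", pl.col("name").alias("id_top_name")),
        on="consensus_uid",
        how="left",
        suffix="_oracle",
    )
    .with_columns(pl.coalesce(["id_top_name_oracle", "id_top_name"]).alias("id_top_name"))
    .drop("id_top_name_oracle")
)
print(consensus_df)  # uids 1 and 2 get oracle names; uid 3 keeps "kept"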