masster 0.5.13__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

@@ -21,20 +21,20 @@ def import_oracle(
21
21
  """
22
22
  Import oracle identification data and map it to consensus features.
23
23
 
24
- This method reads oracle identification results from folder/diag/summary_by_feature.csv
25
- and maps them to consensus features using the 'uit' (feature_uid) column. The oracle
26
- data is used to populate identification columns in consensus_df.
24
+ This method reads oracle identification results from folder/diag/annotation_full.csv
25
+ and creates lib_df and id_df DataFrames with detailed library and identification information.
26
+ It also updates consensus_df with top identification results.
27
27
 
28
28
  Parameters:
29
- folder (str): Path to oracle folder containing diag/summary_by_feature.csv
29
+ folder (str): Path to oracle folder containing diag/annotation_full.csv
30
30
  min_id_level (int, optional): Minimum identification level to include
31
31
  max_id_level (int, optional): Maximum identification level to include
32
32
 
33
33
  Returns:
34
- None: Updates consensus_df in-place with oracle identification data
34
+ None: Updates consensus_df, creates lib_df and id_df in-place with oracle identification data
35
35
 
36
36
  Raises:
37
- FileNotFoundError: If the oracle summary file doesn't exist
37
+ FileNotFoundError: If the oracle annotation file doesn't exist
38
38
  ValueError: If consensus_df is empty or doesn't have required columns
39
39
 
40
40
  Example:
@@ -55,9 +55,9 @@ def import_oracle(
55
55
  raise ValueError("consensus_df must contain 'consensus_uid' column")
56
56
 
57
57
  # Check if oracle file exists
58
- oracle_file_path = os.path.join(folder, "diag", "summary_by_feature.csv")
58
+ oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
59
59
  if not os.path.exists(oracle_file_path):
60
- raise FileNotFoundError(f"Oracle summary file not found: {oracle_file_path}")
60
+ raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")
61
61
 
62
62
  self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
63
63
 
@@ -69,64 +69,171 @@ def import_oracle(
69
69
  self.logger.error(f"Could not read {oracle_file_path}: {e}")
70
70
  raise
71
71
 
72
- # Select relevant columns from oracle data
73
- required_oracle_cols = ["title", "id_level", "id_label", "id_ion", "id_class", "score"]
74
- missing_cols = [col for col in required_oracle_cols if col not in oracle_data.columns]
75
- if missing_cols:
76
- raise ValueError(f"Oracle data missing required columns: {missing_cols}")
77
-
78
- oracle_subset = oracle_data[required_oracle_cols].copy()
79
-
80
- # Extract consensus_uid from title column (format: "uid:XYZ, ...")
81
- self.logger.debug("Extracting consensus UIDs from oracle titles using pattern 'uid:(\\d+)'")
82
- oracle_subset["consensus_uid"] = oracle_subset["title"].str.extract(r"uid:(\d+)")
72
+ # Extract consensus_uid from scan_title column (format: "uid:XYZ, ...")
73
+ self.logger.debug("Extracting consensus UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
74
+ oracle_data["consensus_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
83
75
 
84
76
  # Remove rows where consensus_uid extraction failed
85
- oracle_subset = oracle_subset.dropna(subset=["consensus_uid"])
86
- oracle_subset["consensus_uid"] = oracle_subset["consensus_uid"].astype(int)
77
+ initial_count = len(oracle_data)
78
+ oracle_data = oracle_data.dropna(subset=["consensus_uid"])
79
+ oracle_data["consensus_uid"] = oracle_data["consensus_uid"].astype(int)
87
80
 
88
- self.logger.debug(f"Extracted consensus UIDs for {len(oracle_subset)} oracle entries")
81
+ self.logger.debug(f"Extracted consensus UIDs for {len(oracle_data)}/{initial_count} oracle entries")
89
82
 
90
83
  # Apply id_level filters if specified
91
- initial_count = len(oracle_subset)
92
84
  if min_id_level is not None:
93
- oracle_subset = oracle_subset[oracle_subset["id_level"] >= min_id_level]
94
- self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_subset)} entries")
85
+ oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
86
+ self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")
95
87
 
96
88
  if max_id_level is not None:
97
- oracle_subset = oracle_subset[oracle_subset["id_level"] <= max_id_level]
98
- self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_subset)} entries")
89
+ oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
90
+ self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")
99
91
 
100
- if len(oracle_subset) == 0:
92
+ if len(oracle_data) == 0:
101
93
  self.logger.warning("No oracle entries remain after filtering")
102
94
  return
103
95
 
104
- # Sort by id_level (descending) to prioritize higher confidence identifications
105
- # and remove duplicates by consensus_uid, keeping the first (highest id_level)
106
- oracle_subset = oracle_subset.sort_values(by=["id_level"], ascending=False)
107
- oracle_subset = oracle_subset.drop_duplicates(subset=["consensus_uid"], keep="first")
96
+ # === CREATE LIB_DF ===
97
+ self.logger.debug("Creating lib_df from Oracle annotation data")
98
+ self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")
108
99
 
109
- self.logger.debug(f"After deduplication by consensus_uid: {len(oracle_subset)} unique identifications")
100
+ # Create unique lib_uid for each library entry
101
+ oracle_data["lib_uid"] = range(len(oracle_data))
110
102
 
111
- # Convert to polars for efficient joining
112
- oracle_pl = pl.DataFrame(oracle_subset)
103
+ # Map Oracle columns to lib_df schema
104
+ lib_data = []
105
+ for _, row in oracle_data.iterrows():
106
+ # Convert cmpd_uid to integer, using lib_uid as fallback
107
+ cmpd_uid = row["lib_uid"] # Use lib_uid as integer compound identifier
108
+ try:
109
+ if row.get("lib_id") is not None:
110
+ cmpd_uid = int(float(str(row["lib_id"]))) # Convert to int, handling potential float strings
111
+ except (ValueError, TypeError):
112
+ pass # Keep lib_uid as fallback
113
+
114
+ lib_entry = {
115
+ "lib_uid": row["lib_uid"],
116
+ "cmpd_uid": cmpd_uid, # Integer compound identifier
117
+ "source_id": "LipidOracle", # Fixed source identifier
118
+ "name": row.get("name", None),
119
+ "shortname": row.get("species", None),
120
+ "class": row.get("hg", None),
121
+ "smiles": None, # Not available in Oracle data
122
+ "inchi": None, # Not available in Oracle data
123
+ "inchikey": None, # Not available in Oracle data
124
+ "formula": row.get("formula", None),
125
+ "iso": 0, # Fixed isotope value
126
+ "adduct": row.get("ion", None),
127
+ "probability": row.get("score", None),
128
+ "m": None, # Would need to calculate from formula
129
+ "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
130
+ "mz": row.get("mz", None), # Use mz column from annotation_full.csv
131
+ "rt": None, # Set to null as requested
132
+ "quant_group": None, # Set to null as requested
133
+ "db_id": row.get("lib_id", None),
134
+ "db": row.get("lib", None)
135
+ }
136
+ lib_data.append(lib_entry)
113
137
 
114
- self.logger.debug(f"Oracle data ready for consensus mapping: {len(oracle_pl)} entries")
138
+ self.logger.debug(f"Created {len(lib_data)} lib_data entries")
115
139
 
116
- if oracle_pl.is_empty():
117
- self.logger.warning("No oracle entries could be processed")
118
- return
140
+ # Create lib_df as Polars DataFrame with error handling for mixed types
141
+ try:
142
+ lib_df_temp = pl.DataFrame(lib_data)
143
+ except Exception as e:
144
+ self.logger.warning(f"Error creating lib_df with polars: {e}")
145
+ # Fallback: convert to pandas first, then to polars
146
+ lib_df_pandas = pd.DataFrame(lib_data)
147
+ lib_df_temp = pl.from_pandas(lib_df_pandas)
148
+
149
+ # Ensure uniqueness by name and adduct combination
150
+ # Sort by lib_uid and keep first occurrence (earliest in processing order)
151
+ self.lib_df = (
152
+ lib_df_temp
153
+ .sort("lib_uid")
154
+ .unique(subset=["name", "adduct"], keep="first")
155
+ )
156
+
157
+ self.logger.info(f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)")
158
+
159
+ # === CREATE ID_DF ===
160
+ self.logger.debug("Creating id_df from Oracle identification matches")
161
+
162
+ # Create identification matches
163
+ id_data = []
164
+ for _, row in oracle_data.iterrows():
165
+ # Use dmz from annotation_full.csv directly for mz_delta
166
+ mz_delta = None
167
+ if row.get("dmz") is not None:
168
+ try:
169
+ mz_delta = float(row["dmz"])
170
+ except (ValueError, TypeError):
171
+ pass
172
+
173
+ # Use rt_err from annotation_full.csv for rt_delta, None if NaN
174
+ rt_delta = None
175
+ rt_err_value = row.get("rt_err")
176
+ if rt_err_value is not None and not (isinstance(rt_err_value, float) and pd.isna(rt_err_value)):
177
+ try:
178
+ rt_delta = float(rt_err_value)
179
+ except (ValueError, TypeError):
180
+ pass
181
+
182
+ # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
183
+ matcher = "lipidoracle" # default fallback
184
+ if row.get("score_metric") is not None:
185
+ try:
186
+ score_metric = str(row["score_metric"])
187
+ matcher = f"lipidoracle-{score_metric}"
188
+ except (ValueError, TypeError):
189
+ pass
190
+
191
+ id_entry = {
192
+ "consensus_uid": row["consensus_uid"],
193
+ "lib_uid": row["lib_uid"],
194
+ "mz_delta": mz_delta,
195
+ "rt_delta": rt_delta,
196
+ "matcher": matcher,
197
+ "score": row.get("score", None)
198
+ }
199
+ id_data.append(id_entry)
200
+
201
+ # Create id_df as Polars DataFrame with error handling
202
+ try:
203
+ id_df_temp = pl.DataFrame(id_data)
204
+ except Exception as e:
205
+ self.logger.warning(f"Error creating id_df with polars: {e}")
206
+ # Fallback: convert to pandas first, then to polars
207
+ id_df_pandas = pd.DataFrame(id_data)
208
+ id_df_temp = pl.from_pandas(id_df_pandas)
209
+
210
+ # Filter id_df to only include lib_uids that exist in the final unique lib_df
211
+ unique_lib_uids = self.lib_df.select("lib_uid").to_series()
212
+ self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
213
+
214
+ self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
215
+
216
+ # === UPDATE CONSENSUS_DF (existing functionality) ===
217
+ self.logger.debug("Updating consensus_df with top identification results")
218
+
219
+ # Convert to polars for efficient joining with error handling
220
+ try:
221
+ oracle_pl = pl.DataFrame(oracle_data)
222
+ except Exception as e:
223
+ self.logger.warning(f"Error converting oracle_data to polars: {e}")
224
+ # Convert using from_pandas properly
225
+ oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))
119
226
 
120
- # Group by consensus_uid and select the best identification (highest id_level)
227
+ # Group by consensus_uid and select the best identification (highest level)
121
228
  # In case of ties, take the first one
122
229
  best_ids = (
123
230
  oracle_pl
124
231
  .group_by("consensus_uid")
125
232
  .agg([
126
- pl.col("id_level").max().alias("max_id_level")
233
+ pl.col("level").max().alias("max_level")
127
234
  ])
128
235
  .join(oracle_pl, on="consensus_uid")
129
- .filter(pl.col("id_level") == pl.col("max_id_level"))
236
+ .filter(pl.col("level") == pl.col("max_level"))
130
237
  .group_by("consensus_uid")
131
238
  .first() # In case of ties, take the first
132
239
  )
@@ -135,13 +242,13 @@ def import_oracle(
135
242
 
136
243
  # Prepare the identification columns
137
244
  id_columns = {
138
- "id_top_name": best_ids.select("consensus_uid", "id_label"),
139
- "id_top_adduct": best_ids.select("consensus_uid", "id_ion"),
140
- "id_top_class": best_ids.select("consensus_uid", "id_class"),
245
+ "id_top_name": best_ids.select("consensus_uid", "name"),
246
+ "id_top_adduct": best_ids.select("consensus_uid", "ion"),
247
+ "id_top_class": best_ids.select("consensus_uid", "hg"),
141
248
  "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
142
249
  "id_source": best_ids.select(
143
250
  "consensus_uid",
144
- pl.when(pl.col("id_level") == 1)
251
+ pl.when(pl.col("level") == 1)
145
252
  .then(pl.lit("lipidoracle ms1"))
146
253
  .otherwise(pl.lit("lipidoracle ms2"))
147
254
  .alias("id_source")
@@ -161,11 +268,11 @@ def import_oracle(
161
268
  )
162
269
 
163
270
  # Update consensus_df with oracle identifications
164
- for col_name, id_data in id_columns.items():
165
- oracle_column = id_data.columns[1] # second column (after consensus_uid)
271
+ for col_name, id_data_col in id_columns.items():
272
+ oracle_column = id_data_col.columns[1] # second column (after consensus_uid)
166
273
 
167
274
  # Create update dataframe
168
- update_data = id_data.rename({oracle_column: col_name})
275
+ update_data = id_data_col.rename({oracle_column: col_name})
169
276
 
170
277
  # Join and update
171
278
  self.consensus_df = (
@@ -207,8 +314,8 @@ def import_oracle(
207
314
  updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
208
315
  total_consensus = len(self.consensus_df)
209
316
 
210
- self.logger.info(
211
- f"Oracle import complete: {updated_count}/{total_consensus} "
317
+ self.logger.success(
318
+ f"LipidOracle import completed. {updated_count}/{total_consensus} "
212
319
  f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
213
320
  )
214
321
 
@@ -218,5 +325,7 @@ def import_oracle(
218
325
  "min_id_level": min_id_level,
219
326
  "max_id_level": max_id_level,
220
327
  "updated_features": updated_count,
221
- "total_features": total_consensus
328
+ "total_features": total_consensus,
329
+ "lib_entries": len(self.lib_df),
330
+ "id_matches": len(self.id_df)
222
331
  })
masster/study/merge.py CHANGED
@@ -3023,7 +3023,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
3023
3023
  pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3024
3024
  pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3025
3025
  ])
3026
- study.logger.success(f"Adduct information updated for {updated_count} consensus features.")
3026
+ study.logger.info(f"Adduct information updated for {updated_count} consensus features.")
3027
3027
  else:
3028
3028
  study.logger.debug("No consensus features updated based on mass shift analysis")
3029
3029
 
masster/study/plot.py CHANGED
@@ -631,6 +631,7 @@ def plot_consensus_2d(
631
631
  mz_range=None,
632
632
  rt_range=None,
633
633
  legend="bottom_right",
634
+ show_none=True,
634
635
  ):
635
636
  """
636
637
  Plot consensus features in a 2D scatter plot with retention time vs m/z.
@@ -656,6 +657,7 @@ def plot_consensus_2d(
656
657
  legend (str, optional): Legend position for categorical data. Options: 'top_right', 'top_left',
657
658
  'bottom_right', 'bottom_left', 'right', 'left', 'top', 'bottom'.
658
659
  If None, legend is hidden. Only applies to categorical coloring (default: "bottom_right")
660
+ show_none (bool): Whether to display points with None values for colorby column (default: True)
659
661
  """
660
662
  if self.consensus_df is None:
661
663
  self.logger.error("No consensus map found.")
@@ -734,6 +736,10 @@ def plot_consensus_2d(
734
736
  from bokeh.models.annotations import ColorBar
735
737
  from bokeh.palettes import viridis, Category20
736
738
 
739
+ # Filter out None values for colorby column if show_none=False
740
+ if not show_none and colorby in data.columns:
741
+ data = data.filter(pl.col(colorby).is_not_null())
742
+
737
743
  # Convert Polars DataFrame to pandas for Bokeh compatibility
738
744
  data_pd = data.to_pandas()
739
745
  source = ColumnDataSource(data_pd)
@@ -837,7 +843,7 @@ def plot_consensus_2d(
837
843
  categorical_palette = viridis(min(256, len(unique_values)))
838
844
 
839
845
  # Handle None values with black color FIRST so they appear in the background
840
- if has_none_values:
846
+ if has_none_values and show_none:
841
847
  # Filter data for None values
842
848
  none_data = data.filter(pl.col(colorby).is_null())
843
849
  none_data_pd = none_data.to_pandas()
@@ -947,33 +953,19 @@ def plot_consensus_2d(
947
953
  ("number_samples", "@number_samples"),
948
954
  ("number_ms2", "@number_ms2"),
949
955
  ("inty_mean", "@inty_mean"),
950
- ("coherence_mean", "@chrom_coherence_mean"),
951
- ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
952
956
  ]
953
957
 
954
- # Add adduct_top if it exists in data
955
- if "adduct_top" in data.columns:
956
- tooltips.append(("adduct_top", "@adduct_top"))
957
-
958
- # Add id_top_name if it exists in data
959
- if "id_top_name" in data.columns:
960
- tooltips.append(("id_top_name", "@id_top_name"))
961
-
962
- # Add id_top_adduct if it exists in data
963
- if "id_top_adduct" in data.columns:
964
- tooltips.append(("id_top_adduct", "@id_top_adduct"))
965
-
966
958
  # Add id_top_* columns if they exist and have non-null values
967
- id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
959
+ id_top_columns = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
968
960
  for col in id_top_columns:
969
961
  if col in data.columns:
970
962
  # Check if the column has any non-null values
971
963
  if data.filter(pl.col(col).is_not_null()).height > 0:
972
964
  # Format score column with decimal places, others as strings
973
965
  if col == "id_top_score":
974
- tooltips.append((col.replace("id_top_", "id_"), f"@{col}{{0.0000}}"))
966
+ tooltips.append((col, f"@{col}{{0.0}}"))
975
967
  else:
976
- tooltips.append((col.replace("id_top_", "id_"), f"@{col}"))
968
+ tooltips.append((col, f"@{col}"))
977
969
 
978
970
  hover = HoverTool(
979
971
  tooltips=tooltips,
@@ -321,6 +321,12 @@
321
321
  "name": {
322
322
  "dtype": "pl.String"
323
323
  },
324
+ "shortname": {
325
+ "dtype": "pl.String"
326
+ },
327
+ "class": {
328
+ "dtype": "pl.String"
329
+ },
324
330
  "smiles": {
325
331
  "dtype": "pl.String"
326
332
  },
@@ -339,6 +345,9 @@
339
345
  "adduct": {
340
346
  "dtype": "pl.String"
341
347
  },
348
+ "probability": {
349
+ "dtype": "pl.Float64"
350
+ },
342
351
  "m": {
343
352
  "dtype": "pl.Float64"
344
353
  },
@@ -5,13 +5,13 @@ This module provides the Wizard class for fully automated processing of MS data
5
5
  from raw files to final study results, including batch conversion, assembly,
6
6
  alignment, merging, plotting, and export.
7
7
 
8
- The create_script() function allows immediate generation of standalone analysis
8
+ The create_analysis() function allows immediate generation of standalone analysis
9
9
  scripts without creating a Wizard instance first.
10
10
 
11
- The execute() function combines create_script() with immediate execution of the
11
+ The analyze() function combines create_analysis() with immediate execution of the
12
12
  generated script for fully automated processing.
13
13
  """
14
14
 
15
- from .wizard import Wizard, wizard_def, create_script, execute
15
+ from .wizard import Wizard, wizard_def, create_analysis, create_notebook, analyze
16
16
 
17
- __all__ = ["Wizard", "wizard_def", "create_script", "execute"]
17
+ __all__ = ["Wizard", "wizard_def", "create_analysis", "create_notebook", "analyze"]