masster 0.5.12-py3-none-any.whl → 0.5.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic.
- masster/_version.py +1 -1
- masster/lib/lib.py +371 -57
- masster/study/helpers.py +1 -0
- masster/study/id.py +237 -39
- masster/study/importers.py +331 -0
- masster/study/merge.py +3 -1
- masster/study/plot.py +93 -29
- masster/study/study.py +4 -0
- masster/study/study5_schema.json +12 -0
- masster/wizard/__init__.py +4 -4
- masster/wizard/wizard.py +437 -19
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/METADATA +1 -1
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/RECORD +16 -15
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/WHEEL +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/entry_points.txt +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/licenses/LICENSE +0 -0
masster/study/importers.py
ADDED
@@ -0,0 +1,331 @@
+"""
+import.py
+
+Module providing import functionality for Study class, specifically for importing
+oracle identification data into consensus features.
+"""
+
+from __future__ import annotations
+
+import os
+import pandas as pd
+import polars as pl
+
+
+def import_oracle(
+    self,
+    folder,
+    min_id_level=None,
+    max_id_level=None,
+):
+    """
+    Import oracle identification data and map it to consensus features.
+
+    This method reads oracle identification results from folder/diag/annotation_full.csv
+    and creates lib_df and id_df DataFrames with detailed library and identification information.
+    It also updates consensus_df with top identification results.
+
+    Parameters:
+        folder (str): Path to oracle folder containing diag/annotation_full.csv
+        min_id_level (int, optional): Minimum identification level to include
+        max_id_level (int, optional): Maximum identification level to include
+
+    Returns:
+        None: Updates consensus_df, creates lib_df and id_df in-place with oracle identification data
+
+    Raises:
+        FileNotFoundError: If the oracle annotation file doesn't exist
+        ValueError: If consensus_df is empty or doesn't have required columns
+
+    Example:
+        >>> study.import_oracle(
+        ...     folder="path/to/oracle_results",
+        ...     min_id_level=2,
+        ...     max_id_level=4
+        ... )
+    """
+
+    self.logger.info(f"Starting oracle import from folder: {folder}")
+
+    # Validate inputs
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        raise ValueError("consensus_df is empty or not available. Run merge() first.")
+
+    if "consensus_uid" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_uid' column")
+
+    # Check if oracle file exists
+    oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
+    if not os.path.exists(oracle_file_path):
+        raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")
+
+    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
+
+    try:
+        # Read oracle data using pandas first for easier processing
+        oracle_data = pd.read_csv(oracle_file_path)
+        self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")
+    except Exception as e:
+        self.logger.error(f"Could not read {oracle_file_path}: {e}")
+        raise
+
+    # Extract consensus_uid from scan_title column (format: "uid:XYZ, ...")
+    self.logger.debug("Extracting consensus UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
+    oracle_data["consensus_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
+
+    # Remove rows where consensus_uid extraction failed
+    initial_count = len(oracle_data)
+    oracle_data = oracle_data.dropna(subset=["consensus_uid"])
+    oracle_data["consensus_uid"] = oracle_data["consensus_uid"].astype(int)
+
+    self.logger.debug(f"Extracted consensus UIDs for {len(oracle_data)}/{initial_count} oracle entries")
+
+    # Apply id_level filters if specified
+    if min_id_level is not None:
+        oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
+        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")
+
+    if max_id_level is not None:
+        oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
+        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")
+
+    if len(oracle_data) == 0:
+        self.logger.warning("No oracle entries remain after filtering")
+        return
+
+    # === CREATE LIB_DF ===
+    self.logger.debug("Creating lib_df from Oracle annotation data")
+    self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")
+
+    # Create unique lib_uid for each library entry
+    oracle_data["lib_uid"] = range(len(oracle_data))
+
+    # Map Oracle columns to lib_df schema
+    lib_data = []
+    for _, row in oracle_data.iterrows():
+        # Convert cmpd_uid to integer, using lib_uid as fallback
+        cmpd_uid = row["lib_uid"]  # Use lib_uid as integer compound identifier
+        try:
+            if row.get("lib_id") is not None:
+                cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
+        except (ValueError, TypeError):
+            pass  # Keep lib_uid as fallback
+
+        lib_entry = {
+            "lib_uid": row["lib_uid"],
+            "cmpd_uid": cmpd_uid,  # Integer compound identifier
+            "source_id": "LipidOracle",  # Fixed source identifier
+            "name": row.get("name", None),
+            "shortname": row.get("species", None),
+            "class": row.get("hg", None),
+            "smiles": None,  # Not available in Oracle data
+            "inchi": None,  # Not available in Oracle data
+            "inchikey": None,  # Not available in Oracle data
+            "formula": row.get("formula", None),
+            "iso": 0,  # Fixed isotope value
+            "adduct": row.get("ion", None),
+            "probability": row.get("score", None),
+            "m": None,  # Would need to calculate from formula
+            "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
+            "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
+            "rt": None,  # Set to null as requested
+            "quant_group": None,  # Set to null as requested
+            "db_id": row.get("lib_id", None),
+            "db": row.get("lib", None)
+        }
+        lib_data.append(lib_entry)
+
+    self.logger.debug(f"Created {len(lib_data)} lib_data entries")
+
+    # Create lib_df as Polars DataFrame with error handling for mixed types
+    try:
+        lib_df_temp = pl.DataFrame(lib_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating lib_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        lib_df_pandas = pd.DataFrame(lib_data)
+        lib_df_temp = pl.from_pandas(lib_df_pandas)
+
+    # Ensure uniqueness by name and adduct combination
+    # Sort by lib_uid and keep first occurrence (earliest in processing order)
+    self.lib_df = (
+        lib_df_temp
+        .sort("lib_uid")
+        .unique(subset=["name", "adduct"], keep="first")
+    )
+
+    self.logger.info(f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)")
+
+    # === CREATE ID_DF ===
+    self.logger.debug("Creating id_df from Oracle identification matches")
+
+    # Create identification matches
+    id_data = []
+    for _, row in oracle_data.iterrows():
+        # Use dmz from annotation_full.csv directly for mz_delta
+        mz_delta = None
+        if row.get("dmz") is not None:
+            try:
+                mz_delta = float(row["dmz"])
+            except (ValueError, TypeError):
+                pass
+
+        # Use rt_err from annotation_full.csv for rt_delta, None if NaN
+        rt_delta = None
+        rt_err_value = row.get("rt_err")
+        if rt_err_value is not None and not (isinstance(rt_err_value, float) and pd.isna(rt_err_value)):
+            try:
+                rt_delta = float(rt_err_value)
+            except (ValueError, TypeError):
+                pass
+
+        # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
+        matcher = "lipidoracle"  # default fallback
+        if row.get("score_metric") is not None:
+            try:
+                score_metric = str(row["score_metric"])
+                matcher = f"lipidoracle-{score_metric}"
+            except (ValueError, TypeError):
+                pass
+
+        id_entry = {
+            "consensus_uid": row["consensus_uid"],
+            "lib_uid": row["lib_uid"],
+            "mz_delta": mz_delta,
+            "rt_delta": rt_delta,
+            "matcher": matcher,
+            "score": row.get("score", None)
+        }
+        id_data.append(id_entry)
+
+    # Create id_df as Polars DataFrame with error handling
+    try:
+        id_df_temp = pl.DataFrame(id_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating id_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        id_df_pandas = pd.DataFrame(id_data)
+        id_df_temp = pl.from_pandas(id_df_pandas)
+
+    # Filter id_df to only include lib_uids that exist in the final unique lib_df
+    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
+    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
+
+    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
+
+    # === UPDATE CONSENSUS_DF (existing functionality) ===
+    self.logger.debug("Updating consensus_df with top identification results")
+
+    # Convert to polars for efficient joining with error handling
+    try:
+        oracle_pl = pl.DataFrame(oracle_data)
+    except Exception as e:
+        self.logger.warning(f"Error converting oracle_data to polars: {e}")
+        # Convert using from_pandas properly
+        oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))
+
+    # Group by consensus_uid and select the best identification (highest level)
+    # In case of ties, take the first one
+    best_ids = (
+        oracle_pl
+        .group_by("consensus_uid")
+        .agg([
+            pl.col("level").max().alias("max_level")
+        ])
+        .join(oracle_pl, on="consensus_uid")
+        .filter(pl.col("level") == pl.col("max_level"))
+        .group_by("consensus_uid")
+        .first()  # In case of ties, take the first
+    )
+
+    self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
+
+    # Prepare the identification columns
+    id_columns = {
+        "id_top_name": best_ids.select("consensus_uid", "name"),
+        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
+        "id_top_class": best_ids.select("consensus_uid", "hg"),
+        "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
+        "id_source": best_ids.select(
+            "consensus_uid",
+            pl.when(pl.col("level") == 1)
+            .then(pl.lit("lipidoracle ms1"))
+            .otherwise(pl.lit("lipidoracle ms2"))
+            .alias("id_source")
+        )
+    }
+
+    # Initialize identification columns in consensus_df if they don't exist
+    for col_name in id_columns.keys():
+        if col_name not in self.consensus_df.columns:
+            if col_name == "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.Float64).alias(col_name)
+                )
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.String).alias(col_name)
+                )
+
+    # Update consensus_df with oracle identifications
+    for col_name, id_data_col in id_columns.items():
+        oracle_column = id_data_col.columns[1]  # second column (after consensus_uid)
+
+        # Create update dataframe
+        update_data = id_data_col.rename({oracle_column: col_name})
+
+        # Join and update
+        self.consensus_df = (
+            self.consensus_df
+            .join(update_data, on="consensus_uid", how="left", suffix="_oracle")
+            .with_columns(
+                pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name)
+            )
+            .drop(f"{col_name}_oracle")
+        )
+
+    # Replace NaN values with None in identification columns
+    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score", "id_source"]
+    for col_name in id_col_names:
+        if col_name in self.consensus_df.columns:
+            # For string columns, replace empty strings and "nan" with None
+            if col_name != "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(
+                        pl.col(col_name).is_null() |
+                        (pl.col(col_name) == "") |
+                        (pl.col(col_name) == "nan") |
+                        (pl.col(col_name) == "NaN")
+                    )
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+            # For numeric columns, replace NaN with None
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+
+    # Count how many consensus features were updated
+    updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
+    total_consensus = len(self.consensus_df)
+
+    self.logger.success(
+        f"LipidOracle import completed. {updated_count}/{total_consensus} "
+        f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
+    )
+
+    # Update history
+    self.update_history(["import_oracle"], {
+        "folder": folder,
+        "min_id_level": min_id_level,
+        "max_id_level": max_id_level,
+        "updated_features": updated_count,
+        "total_features": total_consensus,
+        "lib_entries": len(self.lib_df),
+        "id_matches": len(self.id_df)
+    })
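Editor's note: a minimal usage sketch of the new importer. The Study construction and folder path are illustrative assumptions; only merge(), the level filters, and the diag/annotation_full.csv layout are taken from the code above.

from masster import Study  # assumed import path

study = Study()                       # hypothetical setup; load and align samples as usual
study.merge()                         # import_oracle() requires a populated consensus_df
study.import_oracle(
    folder="oracle_results",          # hypothetical path; must contain diag/annotation_full.csv
    min_id_level=2,                   # optional identification-level filters
    max_id_level=4,
)
print(study.lib_df.height, study.id_df.height)  # library entries and identification matches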
masster/study/merge.py
CHANGED
@@ -1792,6 +1792,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
         "id_top_class": None,
         "id_top_adduct": None,
         "id_top_score": None,
+        "id_source": None,
     }

@@ -2194,6 +2195,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
         "id_top_class": None,
         "id_top_adduct": None,
         "id_top_score": None,
+        "id_source": None,
     },
 )

@@ -3021,7 +3023,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
         pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
         pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
     ])
-    study.logger.
+    study.logger.info(f"Adduct information updated for {updated_count} consensus features.")
 else:
     study.logger.debug("No consensus features updated based on mass shift analysis")
masster/study/plot.py
CHANGED
@@ -630,6 +630,8 @@ def plot_consensus_2d(
     height=450,
     mz_range=None,
     rt_range=None,
+    legend="bottom_right",
+    show_none=True,
 ):
     """
     Plot consensus features in a 2D scatter plot with retention time vs m/z.

@@ -652,6 +654,10 @@
         height (int): Plot height in pixels (default: 900)
         mz_range (tuple, optional): m/z range for filtering consensus features (min_mz, max_mz)
         rt_range (tuple, optional): Retention time range for filtering consensus features (min_rt, max_rt)
+        legend (str, optional): Legend position for categorical data. Options: 'top_right', 'top_left',
+            'bottom_right', 'bottom_left', 'right', 'left', 'top', 'bottom'.
+            If None, legend is hidden. Only applies to categorical coloring (default: "bottom_right")
+        show_none (bool): Whether to display points with None values for colorby column (default: True)
     """
     if self.consensus_df is None:
         self.logger.error("No consensus map found.")

@@ -730,6 +736,10 @@
     from bokeh.models.annotations import ColorBar
     from bokeh.palettes import viridis, Category20

+    # Filter out None values for colorby column if show_none=False
+    if not show_none and colorby in data.columns:
+        data = data.filter(pl.col(colorby).is_not_null())
+
     # Convert Polars DataFrame to pandas for Bokeh compatibility
     data_pd = data.to_pandas()
     source = ColumnDataSource(data_pd)

@@ -783,13 +793,20 @@
         # Sorting would break the correspondence between legend labels and point colors
         unique_values = [v for v in data_pd[colorby].unique() if v is not None]

-        if
-
+        # Use the custom palette from cmap if available, otherwise fall back to defaults
+        if len(palette) >= len(unique_values):
+            # Use custom colormap palette - sample evenly across the palette
+            import numpy as np
+            indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+            categorical_palette = [palette[i] for i in indices]
+        elif len(unique_values) <= 20:
+            # Fall back to Category20 if custom palette is too small
+            categorical_palette = Category20[min(20, max(3, len(unique_values)))]
         else:
             # For many categories, use a subset of the viridis palette
-
+            categorical_palette = viridis(min(256, len(unique_values)))

-        color_mapper = factor_cmap(colorby,
+        color_mapper = factor_cmap(colorby, categorical_palette, unique_values)
     else:
         # Handle numeric coloring with LinearColorMapper
         color_mapper = LinearColorMapper(

@@ -809,21 +826,65 @@
     if is_categorical:
         # For categorical data, create separate renderers for each category
         # This enables proper legend interactivity where each category can be toggled independently
-
+        all_unique_values = list(data_pd[colorby].unique())
+        unique_values = [v for v in all_unique_values if v is not None]
+        has_none_values = None in all_unique_values

-        if
-
+        # Use the custom palette from cmap if available, otherwise fall back to defaults
+        if len(palette) >= len(unique_values):
+            # Use custom colormap palette - sample evenly across the palette
+            import numpy as np
+            indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+            categorical_palette = [palette[i] for i in indices]
+        elif len(unique_values) <= 20:
+            # Fall back to Category20 if custom palette is too small
+            categorical_palette = Category20[min(20, max(3, len(unique_values)))]
         else:
-
+            categorical_palette = viridis(min(256, len(unique_values)))

-        #
+        # Handle None values with black color FIRST so they appear in the background
+        if has_none_values and show_none:
+            # Filter data for None values
+            none_data = data.filter(pl.col(colorby).is_null())
+            none_data_pd = none_data.to_pandas()
+            none_source = bp.ColumnDataSource(none_data_pd)
+
+            if scaling.lower() in ["dyn", "dynamic"]:
+                # Calculate appropriate radius for dynamic scaling
+                rt_range = data["rt"].max() - data["rt"].min()
+                mz_range = data["mz"].max() - data["mz"].min()
+                dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+                renderer = p.circle(
+                    x="rt",
+                    y="mz",
+                    radius=dynamic_radius,
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+            else:
+                renderer = p.scatter(
+                    x="rt",
+                    y="mz",
+                    size="markersize",
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+
+        # Create a separate renderer for each non-None category (plotted on top of None values)
         for i, category in enumerate(unique_values):
             # Filter data for this category
             category_data = data.filter(pl.col(colorby) == category)
             category_data_pd = category_data.to_pandas()
             category_source = bp.ColumnDataSource(category_data_pd)

-            color =
+            color = categorical_palette[i % len(categorical_palette)]

             if scaling.lower() in ["dyn", "dynamic"]:
                 # Calculate appropriate radius for dynamic scaling

@@ -892,33 +953,19 @@
         ("number_samples", "@number_samples"),
         ("number_ms2", "@number_ms2"),
         ("inty_mean", "@inty_mean"),
-        ("coherence_mean", "@chrom_coherence_mean"),
-        ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
     ]

-    # Add adduct_top if it exists in data
-    if "adduct_top" in data.columns:
-        tooltips.append(("adduct_top", "@adduct_top"))
-
-    # Add id_top_name if it exists in data
-    if "id_top_name" in data.columns:
-        tooltips.append(("id_top_name", "@id_top_name"))
-
-    # Add id_top_adduct if it exists in data
-    if "id_top_adduct" in data.columns:
-        tooltips.append(("id_top_adduct", "@id_top_adduct"))
-
     # Add id_top_* columns if they exist and have non-null values
-    id_top_columns = ["id_top_name", "
+    id_top_columns = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
     for col in id_top_columns:
         if col in data.columns:
             # Check if the column has any non-null values
             if data.filter(pl.col(col).is_not_null()).height > 0:
                 # Format score column with decimal places, others as strings
                 if col == "id_top_score":
-                    tooltips.append((col
+                    tooltips.append((col, f"@{col}{{0.0}}"))
                 else:
-                    tooltips.append((col
+                    tooltips.append((col, f"@{col}"))

     hover = HoverTool(
         tooltips=tooltips,

@@ -942,8 +989,25 @@
         p.add_layout(color_bar, "right")
     else:
         # For categorical data, configure the legend that was automatically created
-
-
+        if legend is not None:
+            # Map legend position parameter to Bokeh legend position
+            legend_position_map = {
+                "top_right": "top_right",
+                "top_left": "top_left",
+                "bottom_right": "bottom_right",
+                "bottom_left": "bottom_left",
+                "right": "right",
+                "left": "left",
+                "top": "top",
+                "bottom": "bottom"
+            }
+
+            bokeh_legend_pos = legend_position_map.get(legend, "bottom_right")
+            p.legend.location = bokeh_legend_pos
+            p.legend.click_policy = "hide"
+        else:
+            # Hide legend when legend=None
+            p.legend.visible = False

     if filename is not None:
         # Convert relative paths to absolute paths using study folder as base
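Editor's note: a short call exercising the two new keyword arguments, assuming a merged study and a categorical colorby column (colorby is an existing parameter of plot_consensus_2d, used throughout the hunks above).

study.plot_consensus_2d(
    colorby="id_top_class",  # categorical column, so the legend settings apply
    legend="top_left",       # any position from the docstring list; None hides the legend
    show_none=False,         # filter out features whose id_top_class is null instead of plotting them in gray
)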
masster/study/study.py
CHANGED
@@ -109,6 +109,7 @@ from masster.study.parameters import set_parameters_property
 from masster.study.save import save, save_consensus, save_samples
 from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet
 from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset, _get_adducts
+from masster.study.importers import import_oracle

 from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults

@@ -454,6 +455,9 @@ class Study:
     reset_id = id_reset
     lib_reset = lib_reset
     reset_lib = lib_reset
+
+    # === Oracle Import Operations ===
+    import_oracle = import_oracle

     # === Parameter Management ===
     update_history = update_history
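Editor's note: the assignment style above (import_oracle = import_oracle) attaches a module-level function as a class attribute, so the normal descriptor protocol binds self to the Study instance and it behaves like a regular method. A generic sketch of the pattern (names here are illustrative, not masster code):

def describe(self):
    return f"<{type(self).__name__} {self.name}>"

class Record:
    def __init__(self, name):
        self.name = name

    # module-level function bound as a method, same pattern as Study.import_oracle
    describe = describe

assert Record("a").describe() == "<Record a>"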
masster/study/study5_schema.json
CHANGED
@@ -114,6 +114,9 @@
         },
         "id_top_score": {
             "dtype": "pl.Float64"
+        },
+        "id_source": {
+            "dtype": "pl.String"
         }
     }
 },

@@ -318,6 +321,12 @@
         "name": {
             "dtype": "pl.String"
         },
+        "shortname": {
+            "dtype": "pl.String"
+        },
+        "class": {
+            "dtype": "pl.String"
+        },
         "smiles": {
             "dtype": "pl.String"
         },

@@ -336,6 +345,9 @@
         "adduct": {
             "dtype": "pl.String"
         },
+        "probability": {
+            "dtype": "pl.Float64"
+        },
         "m": {
             "dtype": "pl.Float64"
         },
masster/wizard/__init__.py
CHANGED
@@ -5,13 +5,13 @@ This module provides the Wizard class for fully automated processing of MS data
 from raw files to final study results, including batch conversion, assembly,
 alignment, merging, plotting, and export.

-The
+The create_analysis() function allows immediate generation of standalone analysis
 scripts without creating a Wizard instance first.

-The
+The analyze() function combines create_analysis() with immediate execution of the
 generated script for fully automated processing.
 """

-from .wizard import Wizard, wizard_def,
+from .wizard import Wizard, wizard_def, create_analysis, create_notebook, analyze

-__all__ = ["Wizard", "wizard_def", "
+__all__ = ["Wizard", "wizard_def", "create_analysis", "create_notebook", "analyze"]
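Editor's note: with the completed export list, the module-level helpers can be imported directly from the subpackage. The call below is a hypothetical sketch; the actual signatures live in masster/wizard/wizard.py (changed +437 -19 in this release, hunks not shown here).

from masster.wizard import Wizard, analyze, create_analysis, create_notebook

# Hypothetical invocation: the positional argument is an assumption, not the documented API.
# Per the module docstring, analyze() generates the standalone analysis script and runs it immediately.
analyze("path/to/raw_data")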