masster 0.5.28__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +316 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +18 -3
- masster/sample/sample5_schema.json +99 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/id.py +59 -12
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +3 -1
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +11 -12
- {masster-0.5.28.dist-info → masster-0.6.0.dist-info}/METADATA +15 -17
- {masster-0.5.28.dist-info → masster-0.6.0.dist-info}/RECORD +27 -26
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.28.dist-info → masster-0.6.0.dist-info}/WHEEL +0 -0
- {masster-0.5.28.dist-info → masster-0.6.0.dist-info}/entry_points.txt +0 -0
- {masster-0.5.28.dist-info → masster-0.6.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""
|
|
2
|
+
importers.py
|
|
3
|
+
|
|
4
|
+
Module providing import functionality for Sample class, specifically for importing
|
|
5
|
+
oracle identification data into features.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _oracle_value(value):
    """Return ``None`` for a missing scalar (None, NaN, NaT, pd.NA), else the value unchanged."""
    if value is None:
        return None
    try:
        return None if bool(pd.isna(value)) else value
    except (TypeError, ValueError):
        # pd.isna() returns an array for list-likes; treat such values as present.
        return value


def _oracle_text(value):
    """Return ``str(value)``, or ``None`` when the value is missing."""
    value = _oracle_value(value)
    return None if value is None else str(value)


def _oracle_float(value):
    """Return ``float(value)``, or ``None`` when the value is missing or not numeric."""
    value = _oracle_value(value)
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _oracle_int(value, default):
    """Return ``value`` as an int (accepting float-like strings such as "12.0"), else ``default``."""
    value = _oracle_value(value)
    if value is None:
        return default
    try:
        return int(float(str(value)))
    except (TypeError, ValueError, OverflowError):
        return default


def _oracle_charge(ion):
    """Infer the charge sign (+1, -1 or None) from an adduct string.

    The trailing sign wins, so "[M+HCOO]-" is negative even though the adduct
    formula itself contains a "+". Strings without a trailing sign fall back to
    the simple containment check ("+" first, then "-").
    """
    if not isinstance(ion, str):
        return None
    text = ion.strip()
    if text.endswith("+"):
        return 1
    if text.endswith("-"):
        return -1
    if "+" in text:
        return 1
    if "-" in text:
        return -1
    return None


def _oracle_to_polars(records, logger, label):
    """Build a Polars DataFrame from a list of dicts, falling back to a pandas round-trip."""
    try:
        # Scan every row for the schema so a column that is null in the first
        # rows but populated later is still typed correctly.
        return pl.DataFrame(records, infer_schema_length=None)
    except Exception as e:  # polars raises several unrelated error types for mixed data
        logger.warning(f"Error creating {label} with polars: {e}")
        return pl.from_pandas(pd.DataFrame(records))


def import_oracle(
    self,
    folder,
    min_id_level=None,
    max_id_level=None,
):
    """
    Import oracle identification data and map it to features.

    Reads ``folder/diag/annotation_full.csv``, extracts the feature UID from the
    ``scan_title`` column (pattern ``uid:<digits>``), and then:

    * builds ``self.lib_df`` with one library entry per unique (name, adduct) pair,
    * builds ``self.id_df`` with one row per oracle annotation; annotations whose
      (name, adduct) pair duplicates an earlier one point at the surviving
      ``lib_uid`` instead of being discarded,
    * fills ``id_top_name``, ``id_top_adduct``, ``id_top_class`` and ``id_top_score``
      in ``self.features_df`` with the highest-level annotation of each feature
      (the first one in file order wins ties).

    Missing values (NaN) in the CSV are stored as nulls.

    Parameters:
        folder (str): Path to oracle folder containing diag/annotation_full.csv
        min_id_level (int, optional): Minimum identification level to include
        max_id_level (int, optional): Maximum identification level to include

    Returns:
        None: Updates features_df, creates lib_df and id_df in-place with oracle identification data

    Raises:
        FileNotFoundError: If the oracle annotation file doesn't exist
        ValueError: If features_df is empty or doesn't have required columns
        KeyError: If the annotation file lacks the 'scan_title' or 'level' column

    Example:
        >>> sample.import_oracle(
        ...     folder="path/to/oracle_results",
        ...     min_id_level=2,
        ...     max_id_level=4
        ... )
    """

    self.logger.info(f"Starting oracle import from folder: {folder}")

    # Validate inputs
    if self.features_df is None or self.features_df.is_empty():
        raise ValueError("features_df is empty or not available. Run find_features() first.")

    if "feature_uid" not in self.features_df.columns:
        raise ValueError("features_df must contain 'feature_uid' column")

    # Check if oracle file exists
    oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
    if not os.path.exists(oracle_file_path):
        raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")

    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")

    try:
        oracle_data = pd.read_csv(oracle_file_path)
    except Exception as e:
        self.logger.error(f"Could not read {oracle_file_path}: {e}")
        raise
    self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")

    # Fail before touching any state when the columns everything depends on are absent.
    missing_required = [col for col in ("scan_title", "level") if col not in oracle_data.columns]
    if missing_required:
        raise KeyError(f"Oracle annotation file is missing required column(s): {missing_required}")
    missing_optional = [col for col in ("name", "ion", "hg", "score") if col not in oracle_data.columns]
    if missing_optional:
        self.logger.warning(f"Oracle annotation file has no column(s) {missing_optional}; values will be null")

    # Extract feature_uid from scan_title column (format: "uid:XYZ, ...")
    self.logger.debug("Extracting feature UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
    oracle_data["feature_uid"] = oracle_data["scan_title"].astype(str).str.extract(r"uid:(\d+)", expand=False)

    # Remove rows where feature_uid extraction failed. The explicit copy keeps the
    # following column assignment from writing into a view of the original frame.
    initial_count = len(oracle_data)
    oracle_data = oracle_data.dropna(subset=["feature_uid"]).copy()
    oracle_data["feature_uid"] = oracle_data["feature_uid"].astype("int64")

    self.logger.debug(f"Extracted feature UIDs for {len(oracle_data)}/{initial_count} oracle entries")

    # Apply id_level filters if specified
    if min_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")

    if max_id_level is not None:
        oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")

    if len(oracle_data) == 0:
        self.logger.warning("No oracle entries remain after filtering")
        return

    # === CREATE LIB_DF AND ID_DF ===
    self.logger.debug("Creating lib_df from Oracle annotation data")
    self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")

    # Work on an independent frame so adding lib_uid never targets a filtered view.
    oracle_data = oracle_data.copy()
    oracle_data["lib_uid"] = range(len(oracle_data))

    # Plain-Python records (native ints/floats/strs) are cheaper than iterrows().
    records = oracle_data.to_dict("records")

    lib_data = []
    id_data = []
    kept_lib_uid = {}  # (name, adduct) -> lib_uid of the first row carrying that pair
    best_rows = {}  # feature_uid -> highest-level record seen so far
    best_levels = {}  # feature_uid -> that record's level

    for row in records:
        lib_uid = row["lib_uid"]
        feature_uid = row["feature_uid"]
        name = _oracle_value(row.get("name"))
        ion = _oracle_value(row.get("ion"))
        score = _oracle_value(row.get("score"))
        lib_id = _oracle_value(row.get("lib_id"))

        # Library entries are unique per (name, adduct); the first occurrence is kept.
        key = (name, ion)
        if key not in kept_lib_uid:
            kept_lib_uid[key] = lib_uid
            lib_data.append(
                {
                    "lib_uid": lib_uid,
                    "cmpd_uid": _oracle_int(lib_id, lib_uid),  # lib_uid is the fallback identifier
                    "source_id": "LipidOracle",  # Fixed source identifier
                    "name": name,
                    "shortname": _oracle_value(row.get("species")),
                    "class": _oracle_value(row.get("hg")),
                    "smiles": None,  # Not available in Oracle data
                    "inchi": None,  # Not available in Oracle data
                    "inchikey": None,  # Not available in Oracle data
                    "formula": _oracle_value(row.get("formula")),
                    "iso": 0,  # Fixed isotope value
                    "adduct": ion,
                    "probability": score,
                    "m": None,  # Would need to calculate from formula
                    "z": _oracle_charge(ion),
                    "mz": _oracle_value(row.get("mz")),
                    "rt": None,
                    "quant_group": None,
                    "db_id": lib_id,
                    "db": _oracle_value(row.get("lib")),
                }
            )

        # Matcher is "lipidoracle-<score_metric>" when the metric is known.
        score_metric = _oracle_value(row.get("score_metric"))
        matcher = "lipidoracle" if score_metric is None else f"lipidoracle-{score_metric}"

        id_data.append(
            {
                "feature_uid": feature_uid,
                # Point at the surviving library entry so duplicate (name, adduct)
                # annotations on other features are kept rather than dropped.
                "lib_uid": kept_lib_uid[key],
                "mz_delta": _oracle_float(row.get("dmz")),
                "rt_delta": _oracle_float(row.get("rt_err")),
                "matcher": matcher,
                "score": score,
                "iso": 0,  # Fixed isotope value for oracle imports
            }
        )

        # Track the best (highest level) annotation per feature; strict ">" keeps
        # the first row on ties, which makes the choice deterministic.
        level = _oracle_float(row.get("level"))
        if level is not None and (feature_uid not in best_levels or level > best_levels[feature_uid]):
            best_levels[feature_uid] = level
            best_rows[feature_uid] = row

    self.logger.debug(f"Created {len(lib_data)} lib_data entries")

    self.lib_df = _oracle_to_polars(lib_data, self.logger, "lib_df")
    self.logger.info(
        f"Created lib_df with {len(self.lib_df)} library entries ({len(records) - len(self.lib_df)} duplicates removed)"
    )

    self.logger.debug("Creating id_df from Oracle identification matches")
    self.id_df = _oracle_to_polars(id_data, self.logger, "id_df")
    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")

    # === UPDATE FEATURES_DF ===
    self.logger.debug("Updating features_df with top identification results")
    self.logger.debug(f"Selected best identifications for {len(best_rows)} features")

    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
    top_rows = list(best_rows.values())
    updates = pl.DataFrame(
        {
            "feature_uid": list(best_rows),
            "id_top_name": [_oracle_text(r.get("name")) for r in top_rows],
            "id_top_adduct": [_oracle_text(r.get("ion")) for r in top_rows],
            "id_top_class": [_oracle_text(r.get("hg")) for r in top_rows],
            "id_top_score": [_oracle_float(r.get("score")) for r in top_rows],
        },
        schema={
            "feature_uid": pl.Int64,
            "id_top_name": pl.String,
            "id_top_adduct": pl.String,
            "id_top_class": pl.String,
            "id_top_score": pl.Float64,
        },
    ).with_columns(
        # Join keys must share a dtype with features_df.
        pl.col("feature_uid").cast(self.features_df.schema["feature_uid"]),
        pl.col("id_top_score").round(3),
    )

    # Initialize identification columns in features_df if they don't exist
    features = self.features_df
    for col_name in id_col_names:
        if col_name not in features.columns:
            dtype = pl.Float64 if col_name == "id_top_score" else pl.String
            features = features.with_columns(pl.lit(None, dtype=dtype).alias(col_name))

    # Oracle values take precedence; existing values survive where oracle has none.
    features = (
        features.join(updates, on="feature_uid", how="left", suffix="_oracle")
        .with_columns([pl.coalesce([f"{c}_oracle", c]).alias(c) for c in id_col_names])
        .drop([f"{c}_oracle" for c in id_col_names])
    )

    # Normalise placeholder values ("", "nan", "NaN", float NaN) to null.
    for col_name in id_col_names:
        column = pl.col(col_name)
        if col_name == "id_top_score":
            is_blank = column.is_null() | column.is_nan()
        else:
            is_blank = column.is_null() | (column == "") | (column == "nan") | (column == "NaN")
        features = features.with_columns(pl.when(is_blank).then(None).otherwise(column).alias(col_name))

    self.features_df = features

    # Count how many features were updated
    updated_count = self.features_df.filter(pl.col("id_top_name").is_not_null()).height
    total_features = len(self.features_df)

    self.logger.success(
        f"LipidOracle import completed. {updated_count}/{total_features} "
        f"features now have identifications ({updated_count / total_features * 100:.1f}%)"
    )

    # Update history
    self.store_history(
        ["import_oracle"],
        {
            "folder": folder,
            "min_id_level": min_id_level,
            "max_id_level": max_id_level,
            "updated_features": updated_count,
            "total_features": total_features,
            "lib_entries": len(self.lib_df),
            "id_matches": len(self.id_df),
        },
    )
|
masster/sample/plot.py
CHANGED
|
@@ -1115,6 +1115,7 @@ def plot_2d(
|
|
|
1115
1115
|
filename=None,
|
|
1116
1116
|
show_features=True,
|
|
1117
1117
|
show_only_features_with_ms2=False,
|
|
1118
|
+
show_only_features_with_id=False,
|
|
1118
1119
|
show_isotopes=False,
|
|
1119
1120
|
show_ms2=False,
|
|
1120
1121
|
show_in_browser=False,
|
|
@@ -1134,6 +1135,7 @@ def plot_2d(
|
|
|
1134
1135
|
rt_range=None,
|
|
1135
1136
|
legend=None,
|
|
1136
1137
|
colorby=None,
|
|
1138
|
+
tooltip=None,
|
|
1137
1139
|
):
|
|
1138
1140
|
"""
|
|
1139
1141
|
Plot a two-dimensional visualization of MS1 survey scan data with optional overlays
|
|
@@ -1151,6 +1153,9 @@ def plot_2d(
|
|
|
1151
1153
|
show_only_features_with_ms2 (bool, default False):
|
|
1152
1154
|
If True, only display features that have associated MS2 scans. When False,
|
|
1153
1155
|
features without MS2 data are also shown.
|
|
1156
|
+
show_only_features_with_id (bool, default False):
|
|
1157
|
+
If True, only display features with non-null id_top_name (identified features).
|
|
1158
|
+
When False, all features are shown. Only applies when colorby='id'.
|
|
1154
1159
|
show_isotopes (bool, default False):
|
|
1155
1160
|
Whether to overlay isotope information on top of the features.
|
|
1156
1161
|
show_ms2 (bool, default False):
|
|
@@ -1186,6 +1191,9 @@ def plot_2d(
|
|
|
1186
1191
|
Feature property to use for coloring. If None (default), uses current green/red scheme
|
|
1187
1192
|
for features with/without MS2 data. If specified and contains categorical data, applies
|
|
1188
1193
|
categorical coloring with legend support (similar to plot_2d_oracle).
|
|
1194
|
+
tooltip (str, optional):
|
|
1195
|
+
Controls the feature hover tooltip content. Use None or "ms1" (default) to display the
|
|
1196
|
+
full feature details, or "id" to show only rt, m/z, feature_uid, inty, and any id_* columns.
|
|
1189
1197
|
Behavior:
|
|
1190
1198
|
- Checks for a loaded mzML file by verifying that self.file_obj is not None.
|
|
1191
1199
|
- Converts internal MS1 data (a Polars DataFrame) to a Pandas DataFrame and filters out low-intensity
|
|
@@ -1376,8 +1384,104 @@ def plot_2d(
|
|
|
1376
1384
|
# keep only iso==0, i.e. the main
|
|
1377
1385
|
feats = feats[feats["iso"] == 0]
|
|
1378
1386
|
|
|
1387
|
+
tooltip_mode = str(tooltip).lower() if tooltip is not None else "ms1"
|
|
1388
|
+
if tooltip_mode not in {"ms1", "id"}:
|
|
1389
|
+
tooltip_mode = "ms1"
|
|
1390
|
+
|
|
1391
|
+
id_columns = [col for col in feats.columns if isinstance(col, str) and col.startswith("id_")]
|
|
1392
|
+
|
|
1393
|
+
def build_feature_tooltips(*, include_iso=True, include_iso_of=False, include_colorby=None):
    """Assemble the (label, field) pairs used as HoverTool tooltips for feature points.

    Reads ``tooltip_mode`` and ``id_columns`` from the enclosing scope.

    In "id" mode the result is the four core fields (rt, m/z, feature_uid, inty)
    followed by one entry per ``id_*`` column; the keyword flags are ignored.
    Otherwise the core fields are followed by the optional iso / iso_of entries,
    the adduct and chromatographic quality fields, and finally the ``colorby``
    column when one is given.
    """
    entries = [
        ("rt", "@rt"),
        ("m/z", "@mz{0.0000}"),
        ("feature_uid", "@feature_uid"),
        ("inty", "@inty"),
    ]

    # Compact identification view: core fields plus every id_* column only.
    if tooltip_mode == "id":
        return entries + [(name, f"@{name}") for name in id_columns]

    extra_fields = []
    if include_iso:
        extra_fields.append("iso")
    if include_iso_of:
        extra_fields.append("iso_of")
    extra_fields += ["adduct", "chrom_coherence", "chrom_prominence_scaled"]
    # "id" mode has already returned above, so only the colorby value matters here.
    if include_colorby:
        extra_fields.append(include_colorby)

    entries.extend((field, f"@{field}") for field in extra_fields)
    return entries
|
|
1417
|
+
|
|
1418
|
+
handled_colorby = False
|
|
1419
|
+
colorby_id_mode = False
|
|
1420
|
+
|
|
1421
|
+
if colorby == "id":
|
|
1422
|
+
if "id_top_name" not in feats.columns:
|
|
1423
|
+
self.logger.warning("colorby='id' requested but 'id_top_name' column is missing; using default colors")
|
|
1424
|
+
else:
|
|
1425
|
+
handled_colorby = True
|
|
1426
|
+
colorby_id_mode = True
|
|
1427
|
+
id_values = feats["id_top_name"]
|
|
1428
|
+
annotated_mask = id_values.notna() & (id_values.astype(str).str.strip() != "")
|
|
1429
|
+
|
|
1430
|
+
annotated_features = feats[annotated_mask].copy()
|
|
1431
|
+
unannotated_features = feats[~annotated_mask].copy()
|
|
1432
|
+
|
|
1433
|
+
# Apply show_only_features_with_id filter if requested
|
|
1434
|
+
if show_only_features_with_id:
|
|
1435
|
+
# Only keep annotated features, discard unannotated
|
|
1436
|
+
unannotated_features = unannotated_features.iloc[0:0] # Empty dataframe
|
|
1437
|
+
|
|
1438
|
+
feature_hover_annotated = HoverTool(
|
|
1439
|
+
tooltips=build_feature_tooltips(),
|
|
1440
|
+
)
|
|
1441
|
+
feature_hover_unannotated = HoverTool(
|
|
1442
|
+
tooltips=build_feature_tooltips(),
|
|
1443
|
+
)
|
|
1444
|
+
|
|
1445
|
+
# Select only plottable columns for vdims (exclude complex objects like Chromatogram)
|
|
1446
|
+
base_vdims = ["feature_uid", "inty", "iso", "adduct", "chrom_coherence", "chrom_prominence_scaled"]
|
|
1447
|
+
# Add id_* columns if they exist
|
|
1448
|
+
id_vdims = [col for col in feats.columns if isinstance(col, str) and col.startswith("id_")]
|
|
1449
|
+
all_vdims = base_vdims + id_vdims
|
|
1450
|
+
|
|
1451
|
+
if len(annotated_features) > 0:
|
|
1452
|
+
vdims_annotated = [col for col in all_vdims if col in annotated_features.columns]
|
|
1453
|
+
feature_points_1 = hv.Points(
|
|
1454
|
+
annotated_features,
|
|
1455
|
+
kdims=["rt", "mz"],
|
|
1456
|
+
vdims=vdims_annotated,
|
|
1457
|
+
label="Annotated features",
|
|
1458
|
+
).options(
|
|
1459
|
+
color="#2e7d32",
|
|
1460
|
+
marker=marker_type,
|
|
1461
|
+
size=size_1,
|
|
1462
|
+
tools=[feature_hover_annotated],
|
|
1463
|
+
hooks=hooks,
|
|
1464
|
+
show_legend=True,
|
|
1465
|
+
)
|
|
1466
|
+
|
|
1467
|
+
if len(unannotated_features) > 0:
|
|
1468
|
+
vdims_unannotated = [col for col in all_vdims if col in unannotated_features.columns]
|
|
1469
|
+
feature_points_2 = hv.Points(
|
|
1470
|
+
unannotated_features,
|
|
1471
|
+
kdims=["rt", "mz"],
|
|
1472
|
+
vdims=vdims_unannotated,
|
|
1473
|
+
label="Unannotated features",
|
|
1474
|
+
).options(
|
|
1475
|
+
color="#9e9e9e",
|
|
1476
|
+
marker=marker_type,
|
|
1477
|
+
size=size_2,
|
|
1478
|
+
tools=[feature_hover_unannotated],
|
|
1479
|
+
hooks=hooks,
|
|
1480
|
+
show_legend=True,
|
|
1481
|
+
)
|
|
1482
|
+
|
|
1379
1483
|
# Handle colorby parameter
|
|
1380
|
-
if colorby is not None and colorby in feats.columns:
|
|
1484
|
+
if (not handled_colorby) and colorby is not None and colorby in feats.columns:
|
|
1381
1485
|
# Check if colorby data is categorical (string-like)
|
|
1382
1486
|
colorby_values = feats[colorby].dropna()
|
|
1383
1487
|
is_categorical = feats[colorby].dtype in ["object", "string", "category"] or (
|
|
@@ -1424,17 +1528,7 @@ def plot_2d(
|
|
|
1424
1528
|
|
|
1425
1529
|
if len(group_with_ms2) > 0:
|
|
1426
1530
|
feature_hover = HoverTool(
|
|
1427
|
-
tooltips=
|
|
1428
|
-
("rt", "@rt"),
|
|
1429
|
-
("m/z", "@mz{0.0000}"),
|
|
1430
|
-
("feature_uid", "@feature_uid"),
|
|
1431
|
-
("inty", "@inty"),
|
|
1432
|
-
("iso", "@iso"),
|
|
1433
|
-
("adduct", "@adduct"),
|
|
1434
|
-
("chrom_coherence", "@chrom_coherence"),
|
|
1435
|
-
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
1436
|
-
(colorby, f"@{colorby}"),
|
|
1437
|
-
],
|
|
1531
|
+
tooltips=build_feature_tooltips(include_colorby=colorby),
|
|
1438
1532
|
)
|
|
1439
1533
|
group_points_ms2 = hv.Points(
|
|
1440
1534
|
group_with_ms2,
|
|
@@ -1464,17 +1558,7 @@ def plot_2d(
|
|
|
1464
1558
|
|
|
1465
1559
|
if len(group_without_ms2) > 0:
|
|
1466
1560
|
feature_hover = HoverTool(
|
|
1467
|
-
tooltips=
|
|
1468
|
-
("rt", "@rt"),
|
|
1469
|
-
("m/z", "@mz{0.0000}"),
|
|
1470
|
-
("feature_uid", "@feature_uid"),
|
|
1471
|
-
("inty", "@inty"),
|
|
1472
|
-
("iso", "@iso"),
|
|
1473
|
-
("adduct", "@adduct"),
|
|
1474
|
-
("chrom_coherence", "@chrom_coherence"),
|
|
1475
|
-
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
1476
|
-
(colorby, f"@{colorby}"),
|
|
1477
|
-
],
|
|
1561
|
+
tooltips=build_feature_tooltips(include_colorby=colorby),
|
|
1478
1562
|
)
|
|
1479
1563
|
group_points_no_ms2 = hv.Points(
|
|
1480
1564
|
group_without_ms2,
|
|
@@ -1500,22 +1584,15 @@ def plot_2d(
|
|
|
1500
1584
|
feature_points_2 = group_points_no_ms2
|
|
1501
1585
|
else:
|
|
1502
1586
|
feature_points_2 = feature_points_2 * group_points_no_ms2
|
|
1503
|
-
|
|
1587
|
+
|
|
1588
|
+
# Only use default coloring if no special colorby mode was handled
|
|
1589
|
+
if not handled_colorby and not use_categorical_coloring:
|
|
1504
1590
|
# Use original green/red coloring scheme for MS2 presence
|
|
1505
1591
|
# find features with ms2_scans not None and iso==0
|
|
1506
1592
|
features_df = feats[feats["ms2_scans"].notnull()]
|
|
1507
1593
|
# Create feature points with proper sizing method
|
|
1508
1594
|
feature_hover_1 = HoverTool(
|
|
1509
|
-
tooltips=
|
|
1510
|
-
("rt", "@rt"),
|
|
1511
|
-
("m/z", "@mz{0.0000}"),
|
|
1512
|
-
("feature_uid", "@feature_uid"),
|
|
1513
|
-
("inty", "@inty"),
|
|
1514
|
-
("iso", "@iso"),
|
|
1515
|
-
("adduct", "@adduct"),
|
|
1516
|
-
("chrom_coherence", "@chrom_coherence"),
|
|
1517
|
-
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
1518
|
-
],
|
|
1595
|
+
tooltips=build_feature_tooltips(),
|
|
1519
1596
|
)
|
|
1520
1597
|
if len(features_df) > 0:
|
|
1521
1598
|
feature_points_1 = hv.Points(
|
|
@@ -1542,16 +1619,7 @@ def plot_2d(
|
|
|
1542
1619
|
# find features without MS2 data
|
|
1543
1620
|
features_df = feats[feats["ms2_scans"].isnull()]
|
|
1544
1621
|
feature_hover_2 = HoverTool(
|
|
1545
|
-
tooltips=
|
|
1546
|
-
("rt", "@rt"),
|
|
1547
|
-
("m/z", "@mz{0.0000}"),
|
|
1548
|
-
("feature_uid", "@feature_uid"),
|
|
1549
|
-
("inty", "@inty"),
|
|
1550
|
-
("iso", "@iso"),
|
|
1551
|
-
("adduct", "@adduct"),
|
|
1552
|
-
("chrom_coherence", "@chrom_coherence"),
|
|
1553
|
-
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
1554
|
-
],
|
|
1622
|
+
tooltips=build_feature_tooltips(),
|
|
1555
1623
|
)
|
|
1556
1624
|
if len(features_df) > 0:
|
|
1557
1625
|
feature_points_2 = hv.Points(
|
|
@@ -1581,17 +1649,7 @@ def plot_2d(
|
|
|
1581
1649
|
if hasattr(features_df, "to_pandas"):
|
|
1582
1650
|
features_df = features_df.to_pandas()
|
|
1583
1651
|
feature_hover_iso = HoverTool(
|
|
1584
|
-
tooltips=
|
|
1585
|
-
("rt", "@rt"),
|
|
1586
|
-
("m/z", "@mz{0.0000}"),
|
|
1587
|
-
("feature_uid", "@feature_uid"),
|
|
1588
|
-
("inty", "@inty"),
|
|
1589
|
-
("iso", "@iso"),
|
|
1590
|
-
("iso_of", "@iso_of"),
|
|
1591
|
-
("adduct", "@adduct"),
|
|
1592
|
-
("chrom_coherence", "@chrom_coherence"),
|
|
1593
|
-
("chrom_prominence_scaled", "@chrom_prominence_scaled"),
|
|
1594
|
-
],
|
|
1652
|
+
tooltips=build_feature_tooltips(include_iso_of=True),
|
|
1595
1653
|
)
|
|
1596
1654
|
feature_points_iso = hv.Points(
|
|
1597
1655
|
features_df,
|
|
@@ -1676,18 +1734,31 @@ def plot_2d(
|
|
|
1676
1734
|
overlay = overlay * feature_points_4
|
|
1677
1735
|
if feature_points_3 is not None:
|
|
1678
1736
|
overlay = overlay * feature_points_3
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
if
|
|
1682
|
-
|
|
1737
|
+
|
|
1738
|
+
# In colorby='id' mode, draw unannotated (grey) first, then annotated (green) on top
|
|
1739
|
+
if colorby_id_mode:
|
|
1740
|
+
# Draw grey points first (bottom layer)
|
|
1741
|
+
if feature_points_2 is not None:
|
|
1742
|
+
overlay = overlay * feature_points_2
|
|
1743
|
+
# Draw green points last (top layer)
|
|
1744
|
+
if feature_points_1 is not None:
|
|
1745
|
+
overlay = overlay * feature_points_1
|
|
1746
|
+
else:
|
|
1747
|
+
# Default order: green (with MS2) first, then red (without MS2)
|
|
1748
|
+
if feature_points_1 is not None:
|
|
1749
|
+
overlay = overlay * feature_points_1
|
|
1750
|
+
# In non-id mode, only show features without MS2 if show_only_features_with_ms2 is False
|
|
1751
|
+
if not show_only_features_with_ms2 and feature_points_2 is not None:
|
|
1752
|
+
overlay = overlay * feature_points_2
|
|
1753
|
+
|
|
1683
1754
|
if feature_points_iso is not None:
|
|
1684
1755
|
overlay = overlay * feature_points_iso
|
|
1685
1756
|
|
|
1686
1757
|
if title is not None:
|
|
1687
1758
|
overlay = overlay.opts(title=title)
|
|
1688
1759
|
|
|
1689
|
-
# Handle legend positioning for categorical coloring
|
|
1690
|
-
if legend is not None and use_categorical_coloring and len(categorical_groups) > 1:
|
|
1760
|
+
# Handle legend positioning for categorical coloring or colorby='id' mode
|
|
1761
|
+
if legend is not None and (colorby_id_mode or (use_categorical_coloring and len(categorical_groups) > 1)):
|
|
1691
1762
|
# Map legend position parameter to HoloViews legend position
|
|
1692
1763
|
legend_position_map = {
|
|
1693
1764
|
"top_right": "top_right",
|
|
@@ -1704,8 +1775,8 @@ def plot_2d(
|
|
|
1704
1775
|
|
|
1705
1776
|
# Apply legend configuration to the overlay
|
|
1706
1777
|
overlay = overlay.opts(legend_position=hv_legend_pos, legend_opts={"title": "", "padding": 2, "spacing": 2})
|
|
1707
|
-
elif legend is None and use_categorical_coloring:
|
|
1708
|
-
# Explicitly hide legend when legend=None but categorical coloring is used
|
|
1778
|
+
elif legend is None and (colorby_id_mode or use_categorical_coloring):
|
|
1779
|
+
# Explicitly hide legend when legend=None but categorical coloring or id mode is used
|
|
1709
1780
|
overlay = overlay.opts(show_legend=False)
|
|
1710
1781
|
|
|
1711
1782
|
# Handle slider functionality
|
|
@@ -1728,12 +1799,27 @@ def plot_2d(
|
|
|
1728
1799
|
if feature_points_3 is not None:
|
|
1729
1800
|
updated_points_3 = feature_points_3.opts(size=size_val)
|
|
1730
1801
|
feature_overlay = updated_points_3 if feature_overlay is None else feature_overlay * updated_points_3
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1802
|
+
|
|
1803
|
+
# In colorby='id' mode, draw unannotated (grey) first, then annotated (green) on top
|
|
1804
|
+
if colorby_id_mode:
|
|
1805
|
+
# Draw grey points first (bottom layer)
|
|
1806
|
+
if feature_points_2 is not None:
|
|
1807
|
+
updated_points_2 = feature_points_2.opts(size=size_val)
|
|
1808
|
+
feature_overlay = updated_points_2 if feature_overlay is None else feature_overlay * updated_points_2
|
|
1809
|
+
# Draw green points last (top layer)
|
|
1810
|
+
if feature_points_1 is not None:
|
|
1811
|
+
updated_points_1 = feature_points_1.opts(size=size_val)
|
|
1812
|
+
feature_overlay = updated_points_1 if feature_overlay is None else feature_overlay * updated_points_1
|
|
1813
|
+
else:
|
|
1814
|
+
# Default order: green (with MS2) first, then red (without MS2)
|
|
1815
|
+
if feature_points_1 is not None:
|
|
1816
|
+
updated_points_1 = feature_points_1.opts(size=size_val)
|
|
1817
|
+
feature_overlay = updated_points_1 if feature_overlay is None else feature_overlay * updated_points_1
|
|
1818
|
+
# In non-id mode, only show features without MS2 if show_only_features_with_ms2 is False
|
|
1819
|
+
if not show_only_features_with_ms2 and feature_points_2 is not None:
|
|
1820
|
+
updated_points_2 = feature_points_2.opts(size=size_val)
|
|
1821
|
+
feature_overlay = updated_points_2 if feature_overlay is None else feature_overlay * updated_points_2
|
|
1822
|
+
|
|
1737
1823
|
if feature_points_iso is not None:
|
|
1738
1824
|
updated_points_iso = feature_points_iso.opts(size=size_val)
|
|
1739
1825
|
feature_overlay = (
|
|
@@ -2390,10 +2476,28 @@ def plot_dda_stats(
|
|
|
2390
2476
|
"time_ms2_to_ms2",
|
|
2391
2477
|
"time_ms2_to_ms1",
|
|
2392
2478
|
]
|
|
2393
|
-
#
|
|
2479
|
+
# skip cols that are not in stats
|
|
2480
|
+
cols_to_plot = [col for col in cols_to_plot if col in stats.columns]
|
|
2394
2481
|
stats = stats[["scan_uid", "cycle", "rt", *cols_to_plot]]
|
|
2395
2482
|
# set any value < 0 to None
|
|
2396
|
-
|
|
2483
|
+
# Replace negative values with nulls in a polars-friendly way
|
|
2484
|
+
numeric_types = {
|
|
2485
|
+
pl.Float32, pl.Float64,
|
|
2486
|
+
pl.Int8, pl.Int16, pl.Int32, pl.Int64,
|
|
2487
|
+
pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64
|
|
2488
|
+
}
|
|
2489
|
+
exprs = []
|
|
2490
|
+
for col_name, dtype in stats.schema.items():
|
|
2491
|
+
if dtype in numeric_types:
|
|
2492
|
+
exprs.append(
|
|
2493
|
+
pl.when(pl.col(col_name) < 0)
|
|
2494
|
+
.then(None)
|
|
2495
|
+
.otherwise(pl.col(col_name))
|
|
2496
|
+
.alias(col_name)
|
|
2497
|
+
)
|
|
2498
|
+
else:
|
|
2499
|
+
exprs.append(pl.col(col_name))
|
|
2500
|
+
stats = stats.select(exprs)
|
|
2397
2501
|
|
|
2398
2502
|
# Create a Scatter for each column in cols_to_plot stacked vertically, with hover enabled
|
|
2399
2503
|
scatter_plots = []
|