masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/importers.py
CHANGED
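The changes to import_oracle() below are formatter-style cleanups (blank lines stripped of trailing whitespace, trailing commas added, multi-line method chains re-wrapped, inline comments added to the lib_entry mapping) plus a restructured self.update_history() call that now records the import parameters and result counts.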
@@ -20,23 +20,23 @@ def import_oracle(
 ):
     """
     Import oracle identification data and map it to consensus features.
-
+
     This method reads oracle identification results from folder/diag/annotation_full.csv
     and creates lib_df and id_df DataFrames with detailed library and identification information.
     It also updates consensus_df with top identification results.
-
+
     Parameters:
         folder (str): Path to oracle folder containing diag/annotation_full.csv
         min_id_level (int, optional): Minimum identification level to include
         max_id_level (int, optional): Maximum identification level to include
-
+
     Returns:
         None: Updates consensus_df, creates lib_df and id_df in-place with oracle identification data
-
+
     Raises:
         FileNotFoundError: If the oracle annotation file doesn't exist
         ValueError: If consensus_df is empty or doesn't have required columns
-
+
     Example:
         >>> study.import_oracle(
         ...     folder="path/to/oracle_results",
@@ -44,23 +44,23 @@ def import_oracle(
         ...     max_id_level=4
         ... )
     """
-
+
     self.logger.info(f"Starting oracle import from folder: {folder}")
-
+
     # Validate inputs
     if self.consensus_df is None or self.consensus_df.is_empty():
         raise ValueError("consensus_df is empty or not available. Run merge() first.")
-
+
     if "consensus_uid" not in self.consensus_df.columns:
         raise ValueError("consensus_df must contain 'consensus_uid' column")
-
+
     # Check if oracle file exists
     oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
     if not os.path.exists(oracle_file_path):
         raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")
-
+
     self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
-
+
     try:
         # Read oracle data using pandas first for easier processing
         oracle_data = pd.read_csv(oracle_file_path)
@@ -68,38 +68,38 @@ def import_oracle(
     except Exception as e:
         self.logger.error(f"Could not read {oracle_file_path}: {e}")
         raise
-
+
     # Extract consensus_uid from scan_title column (format: "uid:XYZ, ...")
     self.logger.debug("Extracting consensus UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
     oracle_data["consensus_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
-
+
     # Remove rows where consensus_uid extraction failed
     initial_count = len(oracle_data)
     oracle_data = oracle_data.dropna(subset=["consensus_uid"])
     oracle_data["consensus_uid"] = oracle_data["consensus_uid"].astype(int)
-
+
     self.logger.debug(f"Extracted consensus UIDs for {len(oracle_data)}/{initial_count} oracle entries")
-
+
     # Apply id_level filters if specified
     if min_id_level is not None:
         oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
         self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")
-
+
     if max_id_level is not None:
         oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
         self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")
-
+
     if len(oracle_data) == 0:
         self.logger.warning("No oracle entries remain after filtering")
         return
-
+
     # === CREATE LIB_DF ===
     self.logger.debug("Creating lib_df from Oracle annotation data")
     self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")
-
+
     # Create unique lib_uid for each library entry
     oracle_data["lib_uid"] = range(len(oracle_data))
-
+
     # Map Oracle columns to lib_df schema
     lib_data = []
     for _, row in oracle_data.iterrows():
@@ -110,33 +110,33 @@ def import_oracle(
             cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
         except (ValueError, TypeError):
             pass  # Keep lib_uid as fallback
-
+
         lib_entry = {
             "lib_uid": row["lib_uid"],
             "cmpd_uid": cmpd_uid,  # Integer compound identifier
             "source_id": "LipidOracle",  # Fixed source identifier
             "name": row.get("name", None),
             "shortname": row.get("species", None),
-            "class": row.get("hg", None),
+            "class": row.get("hg", None),
             "smiles": None,  # Not available in Oracle data
-            "inchi": None,
-            "inchikey": None,
+            "inchi": None,  # Not available in Oracle data
+            "inchikey": None,  # Not available in Oracle data
             "formula": row.get("formula", None),
-            "iso": 0,
+            "iso": 0,  # Fixed isotope value
             "adduct": row.get("ion", None),
             "probability": row.get("score", None),
-            "m": None,
+            "m": None,  # Would need to calculate from formula
             "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
             "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
-            "rt": None,
+            "rt": None,  # Set to null as requested
             "quant_group": None,  # Set to null as requested
             "db_id": row.get("lib_id", None),
-            "db": row.get("lib", None)
+            "db": row.get("lib", None),
         }
         lib_data.append(lib_entry)
-
+
     self.logger.debug(f"Created {len(lib_data)} lib_data entries")
-
+
     # Create lib_df as Polars DataFrame with error handling for mixed types
     try:
         lib_df_temp = pl.DataFrame(lib_data)
@@ -145,20 +145,18 @@ def import_oracle(
         # Fallback: convert to pandas first, then to polars
         lib_df_pandas = pd.DataFrame(lib_data)
         lib_df_temp = pl.from_pandas(lib_df_pandas)
-
+
     # Ensure uniqueness by name and adduct combination
     # Sort by lib_uid and keep first occurrence (earliest in processing order)
-    self.lib_df = (
-        lib_df_temp
-        .sort("lib_uid")
-        .unique(subset=["name", "adduct"], keep="first")
+    self.lib_df = lib_df_temp.sort("lib_uid").unique(subset=["name", "adduct"], keep="first")
+
+    self.logger.info(
+        f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)"
     )
-
-    self.logger.info(f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)")
-
+
     # === CREATE ID_DF ===
     self.logger.debug("Creating id_df from Oracle identification matches")
-
+
     # Create identification matches
     id_data = []
     for _, row in oracle_data.iterrows():
@@ -169,7 +167,7 @@ def import_oracle(
             mz_delta = float(row["dmz"])
         except (ValueError, TypeError):
             pass
-
+
         # Use rt_err from annotation_full.csv for rt_delta, None if NaN
         rt_delta = None
         rt_err_value = row.get("rt_err")
@@ -178,7 +176,7 @@ def import_oracle(
                 rt_delta = float(rt_err_value)
             except (ValueError, TypeError):
                 pass
-
+
         # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
         matcher = "lipidoracle"  # default fallback
         if row.get("score_metric") is not None:
@@ -187,17 +185,17 @@ def import_oracle(
                 matcher = f"lipidoracle-{score_metric}"
             except (ValueError, TypeError):
                 pass
-
+
         id_entry = {
             "consensus_uid": row["consensus_uid"],
             "lib_uid": row["lib_uid"],
             "mz_delta": mz_delta,
             "rt_delta": rt_delta,
             "matcher": matcher,
-            "score": row.get("score", None)
+            "score": row.get("score", None),
         }
         id_data.append(id_entry)
-
+
     # Create id_df as Polars DataFrame with error handling
     try:
         id_df_temp = pl.DataFrame(id_data)
@@ -206,16 +204,16 @@ def import_oracle(
         # Fallback: convert to pandas first, then to polars
         id_df_pandas = pd.DataFrame(id_data)
         id_df_temp = pl.from_pandas(id_df_pandas)
-
+
     # Filter id_df to only include lib_uids that exist in the final unique lib_df
     unique_lib_uids = self.lib_df.select("lib_uid").to_series()
     self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
-
+
     self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
-
+
     # === UPDATE CONSENSUS_DF (existing functionality) ===
     self.logger.debug("Updating consensus_df with top identification results")
-
+
     # Convert to polars for efficient joining with error handling
     try:
         oracle_pl = pl.DataFrame(oracle_data)
@@ -223,67 +221,57 @@ def import_oracle(
         self.logger.warning(f"Error converting oracle_data to polars: {e}")
         # Convert using from_pandas properly
         oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))
-
+
     # Group by consensus_uid and select the best identification (highest level)
     # In case of ties, take the first one
     best_ids = (
-        oracle_pl
-        .group_by("consensus_uid")
-        .agg([
-            pl.col("level").max().alias("max_level")
-        ])
+        oracle_pl.group_by("consensus_uid")
+        .agg([pl.col("level").max().alias("max_level")])
         .join(oracle_pl, on="consensus_uid")
         .filter(pl.col("level") == pl.col("max_level"))
         .group_by("consensus_uid")
         .first()  # In case of ties, take the first
     )
-
+
     self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
-
+
     # Prepare the identification columns
     id_columns = {
         "id_top_name": best_ids.select("consensus_uid", "name"),
-        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
+        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
         "id_top_class": best_ids.select("consensus_uid", "hg"),
         "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
         "id_source": best_ids.select(
-            "consensus_uid",
+            "consensus_uid",
             pl.when(pl.col("level") == 1)
             .then(pl.lit("lipidoracle ms1"))
             .otherwise(pl.lit("lipidoracle ms2"))
-            .alias("id_source")
-        )
+            .alias("id_source"),
+        ),
     }
-
+
     # Initialize identification columns in consensus_df if they don't exist
     for col_name in id_columns.keys():
         if col_name not in self.consensus_df.columns:
             if col_name == "id_top_score":
-                self.consensus_df = self.consensus_df.with_columns(
-                    pl.lit(None, dtype=pl.Float64).alias(col_name)
-                )
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Float64).alias(col_name))
             else:
-                self.consensus_df = self.consensus_df.with_columns(
-                    pl.lit(None, dtype=pl.String).alias(col_name)
-                )
-
+                self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.String).alias(col_name))
+
     # Update consensus_df with oracle identifications
     for col_name, id_data_col in id_columns.items():
         oracle_column = id_data_col.columns[1]  # second column (after consensus_uid)
-
+
         # Create update dataframe
         update_data = id_data_col.rename({oracle_column: col_name})
-
+
         # Join and update
         self.consensus_df = (
-            self.consensus_df
-            .join(update_data, on="consensus_uid", how="left", suffix="_oracle")
-            .with_columns(
-                pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name)
-            )
+            self.consensus_df.join(update_data, on="consensus_uid", how="left", suffix="_oracle")
+            .with_columns(pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name))
             .drop(f"{col_name}_oracle")
         )
-
+
     # Replace NaN values with None in identification columns
     id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score", "id_source"]
     for col_name in id_col_names:
@@ -292,10 +280,10 @@ def import_oracle(
         if col_name != "id_top_score":
             self.consensus_df = self.consensus_df.with_columns(
                 pl.when(
-                    pl.col(col_name).is_null() |
-                    (pl.col(col_name) == "") |
-                    (pl.col(col_name) == "nan") |
-                    (pl.col(col_name) == "NaN")
+                    pl.col(col_name).is_null()
+                    | (pl.col(col_name) == "")
+                    | (pl.col(col_name) == "nan")
+                    | (pl.col(col_name) == "NaN")
                 )
                 .then(None)
                 .otherwise(pl.col(col_name))
@@ -309,23 +297,26 @@ def import_oracle(
                 .otherwise(pl.col(col_name))
                 .alias(col_name)
             )
-
+
     # Count how many consensus features were updated
     updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
     total_consensus = len(self.consensus_df)
-
+
     self.logger.success(
         f"LipidOracle import completed. {updated_count}/{total_consensus} "
-        f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
+        f"consensus features now have identifications ({updated_count / total_consensus * 100:.1f}%)"
     )
-
+
     # Update history
-    self.update_history(
-        "
-
-
-
-
-
-
-
+    self.update_history(
+        ["import_oracle"],
+        {
+            "folder": folder,
+            "min_id_level": min_id_level,
+            "max_id_level": max_id_level,
+            "updated_features": updated_count,
+            "total_features": total_consensus,
+            "lib_entries": len(self.lib_df),
+            "id_matches": len(self.id_df),
+        },
+    )