masster 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as possibly problematic.
- masster/_version.py +1 -1
- masster/lib/lib.py +371 -57
- masster/study/helpers.py +1 -0
- masster/study/id.py +237 -39
- masster/study/importers.py +331 -0
- masster/study/merge.py +3 -1
- masster/study/plot.py +93 -29
- masster/study/study.py +4 -0
- masster/study/study5_schema.json +12 -0
- masster/wizard/__init__.py +4 -4
- masster/wizard/wizard.py +437 -19
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/METADATA +1 -1
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/RECORD +16 -15
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/WHEEL +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/entry_points.txt +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py
CHANGED
@@ -145,16 +145,61 @@ def lib_load(
             column_order.append("quant_group")
         elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
             column_order.append("iso")
-
-    # Apply the column ordering
-    filtered_lf = filtered_lf.select(column_order)

+
     # Add to existing lib_df instead of replacing
     if (
         hasattr(study, "lib_df")
         and study.lib_df is not None
         and not study.lib_df.is_empty()
     ):
+        # Check for schema compatibility and handle mismatches
+        existing_cols = set(study.lib_df.columns)
+        new_cols = set(filtered_lf.columns)
+
+        # If schemas don't match, we need to align them
+        if existing_cols != new_cols:
+            # Get union of all columns
+            all_cols = existing_cols.union(new_cols)
+
+            # Add missing columns to existing data with appropriate defaults
+            for col in new_cols - existing_cols:
+                if col == "probability":
+                    # Add probability column to existing data - try to calculate from adduct
+                    if "adduct" in study.lib_df.columns:
+                        try:
+                            adduct_prob_map = _get_adduct_probabilities(study)
+                            study.lib_df = study.lib_df.with_columns(
+                                pl.col("adduct").map_elements(
+                                    lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
+                                    return_dtype=pl.Float64
+                                ).alias("probability")
+                            )
+                        except Exception:
+                            study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                elif col == "iso":
+                    study.lib_df = study.lib_df.with_columns(pl.lit(0).cast(pl.Int64).alias("iso"))
+                elif col == "quant_group":
+                    # Set quant_group using cmpd_uid or lib_uid
+                    if "cmpd_uid" in study.lib_df.columns:
+                        study.lib_df = study.lib_df.with_columns(pl.col("cmpd_uid").cast(pl.Int64).alias("quant_group"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.col("lib_uid").cast(pl.Int64).alias("quant_group"))
+                else:
+                    # Default to null for other columns
+                    study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
+
+            # Add missing columns to new data with appropriate defaults
+            for col in existing_cols - new_cols:
+                if col not in ["probability", "iso", "quant_group"]:  # These should already be handled
+                    filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
+
+            # Ensure column order matches for concatenation - use existing column order
+            existing_column_order = list(study.lib_df.columns)
+            filtered_lf = filtered_lf.select(existing_column_order)
+
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
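The added block above pads whichever frame is missing columns with nulls and then enforces a single column order before pl.concat. A minimal, self-contained polars sketch of the same pattern, with invented column names and data (not masster code); how="vertical_relaxed" is used here so the null-typed placeholder columns can coerce to the other frame's dtypes:

import polars as pl

existing = pl.DataFrame({"lib_uid": [1, 2], "name": ["glucose", "alanine"]})
incoming = pl.DataFrame({"lib_uid": [3], "name": ["caffeine"], "iso": [0]})

# Pad each side with the columns only the other side has
for col in set(incoming.columns) - set(existing.columns):
    existing = existing.with_columns(pl.lit(None).alias(col))
for col in set(existing.columns) - set(incoming.columns):
    incoming = incoming.with_columns(pl.lit(None).alias(col))

# Enforce one column order, then concatenate; relaxed mode lets the
# null placeholder columns adopt the concrete dtypes from the other frame
combined = pl.concat([existing, incoming.select(existing.columns)], how="vertical_relaxed")
print(combined)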
@@ -209,8 +254,19 @@ def _setup_identify_parameters(params, kwargs):

     # Override parameters with any provided kwargs
     if kwargs:
+        # Handle parameter name mapping for backwards compatibility
+        param_mapping = {
+            'rt_tolerance': 'rt_tol',
+            'mz_tolerance': 'mz_tol'
+        }
+
         for param_name, value in kwargs.items():
-            if hasattr(params, param_name):
+            # Check if we need to map the parameter name
+            mapped_name = param_mapping.get(param_name, param_name)
+
+            if hasattr(params, mapped_name):
+                setattr(params, mapped_name, value)
+            elif hasattr(params, param_name):
                 setattr(params, param_name, value)

     return params
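The mapping added above keeps older keyword names working: rt_tolerance and mz_tolerance are routed to rt_tol and mz_tol when the parameter object has those attributes. A small self-contained sketch of that behaviour; the _Params class is a stand-in, not the masster parameter object:

# Hypothetical stand-in for the identify() parameter object
class _Params:
    def __init__(self):
        self.mz_tol = 0.01
        self.rt_tol = 10.0

param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}

def apply_kwargs(params, **kwargs):
    for name, value in kwargs.items():
        mapped = param_mapping.get(name, name)   # translate legacy names
        if hasattr(params, mapped):
            setattr(params, mapped, value)
        elif hasattr(params, name):
            setattr(params, name, value)
    return params

p = apply_kwargs(_Params(), rt_tolerance=5.0)  # legacy kwarg ends up as p.rt_tol == 5.0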
@@ -319,9 +375,13 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
                 else None
             )

-            # Get
+            # Get library probability as base score, then multiply by adduct probability
+            lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
             adduct = match_row.get("adduct")
-
+            adduct_probability = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
+            score = lib_probability * adduct_probability
+            # Scale to 0-100 and round to 1 decimal place
+            score = round(score * 100.0, 1)

             match_results.append({
                 "lib_uid": match_row.get("lib_uid"),
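With this change the score is the library probability multiplied by the adduct probability, scaled to 0-100 and rounded to one decimal. For example (illustrative numbers only):

lib_probability = 0.9      # taken from the library's probability column
adduct_probability = 0.8   # taken from the adduct probability map
score = round(lib_probability * adduct_probability * 100.0, 1)
print(score)  # 72.0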
@@ -337,7 +397,11 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t


 def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
-    """
+    """
+    Find library matches using optimized vectorized operations.
+
+    FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
+    """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter(
         (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
@@ -345,43 +409,78 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c

     initial_match_count = len(matches)

-    # Apply RT filter if available
+    # Apply RT filter if available - STRICT VERSION (no fallback)
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-
-
-            (pl.col("rt") >= cons_rt - rt_tol) &
-            (pl.col("rt") <= cons_rt + rt_tol)
-        )
+        # First, check if any m/z matches have RT data
+        rt_candidates = matches.filter(pl.col("rt").is_not_null())

-        if not
-
+        if not rt_candidates.is_empty():
+            # Apply RT filtering to candidates with RT data
+            rt_matches = rt_candidates.filter(
+                (pl.col("rt") >= cons_rt - rt_tol) &
+                (pl.col("rt") <= cons_rt + rt_tol)
+            )
+
+            if not rt_matches.is_empty():
+                matches = rt_matches
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, {len(matches)} after RT filter"
+                    )
+            else:
+                # NO FALLBACK - if RT filtering finds no matches, return empty
+                matches = rt_matches  # This is empty
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
+                    )
+        else:
+            # No RT data in library matches - return empty if strict RT filtering requested
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches but none have library RT data - no matches returned due to RT filtering"
                 )
+            matches = pl.DataFrame()  # Return empty DataFrame
+
+    # FIX 1: Add stricter m/z validation - prioritize more accurate matches
+    if not matches.is_empty():
+        strict_mz_tol = mz_tol * 0.5  # Use 50% of tolerance as strict threshold
+        strict_matches = matches.filter(
+            (pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
+        )
+
+        if not strict_matches.is_empty():
+            # Use strict matches if available
+            matches = strict_matches
+            if logger:
+                logger.debug(f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)")
         else:
             if logger:
-                logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, 0 after RT filter - using m/z matches only"
-                )
+                logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")

-    #
+    # FIX 2: Improved deduplication - prioritize by m/z accuracy
     if not matches.is_empty() and len(matches) > 1:
         if "formula" in matches.columns and "adduct" in matches.columns:
             pre_dedup_count = len(matches)

-            #
+            # Calculate m/z error for sorting
+            matches = matches.with_columns([
+                (pl.col("mz") - cons_mz).abs().alias("mz_error_abs")
+            ])
+
+            # Group by formula and adduct, but keep the most accurate m/z match
             matches = (
                 matches
-                .sort("lib_uid")  #
+                .sort(["mz_error_abs", "lib_uid"])  # Sort by m/z accuracy first, then lib_uid for consistency
                 .group_by(["formula", "adduct"], maintain_order=True)
                 .first()
+                .drop("mz_error_abs")  # Remove the temporary column
             )

             post_dedup_count = len(matches)
             if logger and post_dedup_count < pre_dedup_count:
                 logger.debug(
-                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches"
+                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches (m/z accuracy prioritized)"
                 )

     return matches
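Two ideas drive the rewritten matcher: prefer candidates that fall within half the m/z tolerance when any exist, and, when several library rows share a formula and adduct, keep only the row closest to the consensus m/z. A standalone sketch of that deduplication step on invented candidates (not masster code):

import polars as pl

cons_mz = 180.0634  # hypothetical consensus feature m/z
matches = pl.DataFrame({
    "lib_uid": [1, 2, 3],
    "formula": ["C6H12O6", "C6H12O6", "C7H8N4O2"],
    "adduct":  ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz":      [180.0639, 180.0700, 195.0877],
})

deduped = (
    matches
    .with_columns((pl.col("mz") - cons_mz).abs().alias("mz_error_abs"))
    .sort(["mz_error_abs", "lib_uid"])                  # most accurate m/z first
    .group_by(["formula", "adduct"], maintain_order=True)
    .first()                                            # one row per formula/adduct
    .drop("mz_error_abs")
)
print(deduped)  # lib_uid 1 wins the C6H12O6/[M+H]+ group; lib_uid 3 is kept as is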
@@ -617,7 +716,11 @@ def _apply_scoring_adjustments(study, params):


 def _update_consensus_id_columns(study, logger=None):
-    """
+    """
+    Update consensus_df with top-scoring identification results using safe in-place updates.
+
+    FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
+    """
     try:
         if not hasattr(study, "id_df") or study.id_df is None or study.id_df.is_empty():
             if logger:
@@ -634,14 +737,47 @@ def _update_consensus_id_columns(study, logger=None):
                 logger.debug("No consensus data available")
             return

-        # Get library columns we need
-        lib_columns = ["lib_uid", "name", "adduct"]
+        # Get library columns we need (include mz for validation)
+        lib_columns = ["lib_uid", "name", "adduct", "mz"]
         if "class" in study.lib_df.columns:
             lib_columns.append("class")

-        #
+        # FIX 1: Join identification results with consensus m/z for validation
+        id_with_consensus = study.id_df.join(
+            study.consensus_df.select(["consensus_uid", "mz"]),
+            on="consensus_uid",
+            how="left",
+            suffix="_consensus"
+        )
+
+        # FIX 2: Validate m/z accuracy - filter out poor matches
+        id_with_lib = id_with_consensus.join(
+            study.lib_df.select(["lib_uid", "mz"]),
+            on="lib_uid",
+            how="left",
+            suffix="_lib"
+        )
+
+        # Calculate actual m/z error and filter out excessive errors
+        id_validated = id_with_lib.with_columns([
+            (pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")
+        ])
+
+        # Filter out matches with excessive m/z error
+        max_reasonable_error = 0.02  # 20 millidalton maximum error
+        id_validated = id_validated.filter(
+            (pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
+        )
+
+        if logger:
+            original_count = len(id_with_consensus)
+            validated_count = len(id_validated)
+            if validated_count < original_count:
+                logger.warning(f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)")
+
+        # Get top-scoring identification for each consensus feature (from validated results)
         top_ids = (
-
+            id_validated
             .sort(["consensus_uid", "score"], descending=[False, True])
             .group_by("consensus_uid", maintain_order=True)
             .first()
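The validation added above joins each identification back to its library m/z and drops rows that are more than 0.02 Da off. A toy example of the same join-and-filter, with column names mirroring the diff and made-up values:

import polars as pl

id_df = pl.DataFrame({"consensus_uid": [10, 11], "lib_uid": [1, 2], "mz": [180.063, 255.232]})
lib_df = pl.DataFrame({"lib_uid": [1, 2], "mz": [180.064, 255.300]})

max_reasonable_error = 0.02  # Da
validated = (
    id_df
    .join(lib_df.select(["lib_uid", "mz"]), on="lib_uid", how="left", suffix="_lib")
    .with_columns((pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error"))
    .filter((pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null())
)
print(validated)  # consensus_uid 10 survives; 11 is 0.068 Da off and is dropped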
@@ -656,12 +792,44 @@ def _update_consensus_id_columns(study, logger=None):
             .rename({"name": "id_top_name"})
         )

+        # FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
+        if not top_ids.is_empty():
+            compound_groups = (
+                top_ids
+                .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
+                .group_by(["id_top_name", "id_top_adduct"])
+                .agg([
+                    pl.col("consensus_uid").count().alias("count"),
+                    pl.col("mz").min().alias("mz_min"),
+                    pl.col("mz").max().alias("mz_max")
+                ])
+                .with_columns([
+                    (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
+                ])
+            )
+
+            # Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
+            problematic = compound_groups.filter(
+                (pl.col("count") > 1) & (pl.col("mz_range") > 0.1)
+            )
+
+            if not problematic.is_empty() and logger:
+                for row in problematic.iter_rows(named=True):
+                    name = row["id_top_name"]
+                    adduct = row["id_top_adduct"]
+                    count = row["count"]
+                    mz_range = row["mz_range"]
+                    logger.warning(
+                        f"Identification conflict detected: '{name}' ({adduct}) assigned to {count} features with {mz_range:.4f} Da m/z range"
+                    )
+
         # Ensure we have the id_top columns in consensus_df
         for col_name, dtype in [
             ("id_top_name", pl.String),
             ("id_top_class", pl.String),
             ("id_top_adduct", pl.String),
-            ("id_top_score", pl.Float64)
+            ("id_top_score", pl.Float64),
+            ("id_source", pl.String)
         ]:
             if col_name not in study.consensus_df.columns:
                 study.consensus_df = study.consensus_df.with_columns(
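The conflict check added above groups the top hits by name and adduct and warns when one compound/adduct pair is spread over features more than 0.1 Da apart. Roughly, on invented data:

import polars as pl

top_ids = pl.DataFrame({
    "consensus_uid": [10, 11, 12],
    "id_top_name":   ["glucose", "glucose", "caffeine"],
    "id_top_adduct": ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz":            [180.063, 180.563, 195.088],
})

conflicts = (
    top_ids
    .group_by(["id_top_name", "id_top_adduct"])
    .agg([
        pl.col("consensus_uid").count().alias("count"),
        (pl.col("mz").max() - pl.col("mz").min()).alias("mz_range"),
    ])
    .filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
)
print(conflicts)  # flags the glucose/[M+H]+ pair assigned 0.5 Da apart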
@@ -782,7 +950,7 @@ def identify(study, features=None, params=None, **kwargs):
     if logger:
         features_with_matches = len([r for r in results if len(r["matches"]) > 0])
         total_matches = sum(len(r["matches"]) for r in results)
-        logger.
+        logger.success(
             f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
         )

@@ -805,6 +973,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
+        - shortname (short name from library, if available)
+        - class (compound class from library, if available)
         - formula (molecular formula from library)
         - adduct (adduct type from library)
         - smiles (SMILES notation from library)
@@ -872,6 +1042,8 @@ def get_id(study, features=None) -> pl.DataFrame:
     lib_cols = [
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "smiles",
@@ -900,6 +1072,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
         "lib_uid",
         "name" if "name" in result_df.columns else None,
+        "shortname" if "shortname" in result_df.columns else None,
+        "class" if "class" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
         "mz" if "mz" in result_df.columns else None,
@@ -951,6 +1125,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid",
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "mz",
@@ -1076,7 +1252,7 @@ def id_reset(study):

     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
         if col in study.consensus_df.columns:
             if col == "id_top_score":
                 id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1093,7 +1269,7 @@ def id_reset(study):
         del study.history["identify"]

     if logger:
-        logger.
+        logger.info("Identification data reset completed")


 def lib_reset(study):
@@ -1122,11 +1298,33 @@ def lib_reset(study):
         logger.debug("Checking for consensus features created by lib_to_consensus()")

     try:
-        # Filter for features
-        #
-
-
-
+        # Filter for features created by lib_to_consensus()
+        # These can be identified by:
+        # 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
+        # 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
+
+        # First check if we have any features with number_samples < 1
+        potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
+
+        if potential_lib_features is not None and not potential_lib_features.is_empty():
+            # Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
+            # This ensures we only remove library-derived features, not legitimate features with 0 samples
+            if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
+                lib_consensus_uids = study.consensus_mapping_df.filter(
+                    pl.col("sample_uid") == 0
+                )["consensus_uid"].unique().to_list()
+
+                if lib_consensus_uids:
+                    lib_consensus_features = potential_lib_features.filter(
+                        pl.col("consensus_uid").is_in(lib_consensus_uids)
+                    )
+                else:
+                    lib_consensus_features = pl.DataFrame()  # No library features found
+            else:
+                # If no consensus_mapping_df, fall back to number_samples < 1 only
+                lib_consensus_features = potential_lib_features
+        else:
+            lib_consensus_features = pl.DataFrame()  # No features with number_samples < 1

         if lib_consensus_features is not None and not lib_consensus_features.is_empty():
             num_lib_features = len(lib_consensus_features)
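lib_reset() now treats a consensus feature as library-derived only if it has number_samples < 1 and is mapped to the virtual sample (sample_uid == 0). A minimal sketch of that double filter on toy frames (not masster code):

import polars as pl

consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "number_samples": [5, 0, 0]})
consensus_mapping_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "sample_uid": [7, 0, 4]})

# UIDs that were mapped to the virtual sample created by lib_to_consensus()
lib_uids = (
    consensus_mapping_df
    .filter(pl.col("sample_uid") == 0)["consensus_uid"]
    .unique()
    .to_list()
)

# Only features that satisfy both conditions are removed by the reset
lib_features = consensus_df.filter(
    (pl.col("number_samples") < 1) & pl.col("consensus_uid").is_in(lib_uids)
)
print(lib_features)  # only consensus_uid 2: zero samples AND mapped to the virtual sample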
@@ -1170,7 +1368,7 @@ def lib_reset(study):

     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
         if col in study.consensus_df.columns:
             if col == "id_top_score":
                 id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1198,7 +1396,7 @@ def lib_reset(study):
         del study.history["lib_to_consensus"]

     if logger:
-        logger.
+        logger.info("Library and identification data reset completed")


 def _get_adducts(study, adducts_list: list | None = None, **kwargs):
|