masster 0.4.0-py3-none-any.whl → 0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED

@@ -22,7 +22,7 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
-from
+from masster.chromatogram import Chromatogram
 
 
 # =====================================================================================
@@ -71,12 +71,7 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # fallback to pandas
     try:
         bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
-        bpc_pd = (
-            bpc_pd.groupby("rt")
-            .agg({"inty": "max"})
-            .reset_index()
-            .sort_values("rt")
-        )
+        bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
     except Exception:
         raise
 
@@ -118,16 +113,11 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     mapping_rows = pl.DataFrame()
 
     # If we still have no sample selector, try to infer sample from the Sample object s
-    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
-        s,
-        "sample_path",
-    ):
+    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
         # attempt to match by sample_path or file name
         try:
             # find row where sample_path matches
-            mapping_rows = feats.filter(
-                pl.col("sample_path") == getattr(s, "file", None),
-            )
+            mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
         except Exception:
             mapping_rows = pl.DataFrame()
 
@@ -214,9 +204,7 @@ def get_tic(owner, sample=None, label=None):
         except Exception:
             raise
     else:
-        raise ValueError(
-            "Neither ms1_df nor scans_df available for TIC computation",
-        )
+        raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
 
     if tic_pd.empty:
         raise ValueError("Computed TIC is empty")
@@ -379,17 +367,14 @@ def get_chrom(self, uids=None, samples=None):
     )
     # Pre-filter features_df to only relevant features and samples
    filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
-        & pl.col("sample_uid").is_in(sample_uids),
-    ).select(
-        [
-            "feature_uid",
-            "chrom",
-            "rt",
-            "rt_original",
-            "sample_uid",
-        ],
-    )
+        pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
+    ).select([
+        "feature_uid",
+        "chrom",
+        "rt",
+        "rt_original",
+        "sample_uid",
+    ])
 
     # Pre-filter samples_df
     filtered_samples = self.samples_df.filter(
@@ -424,13 +409,11 @@ def get_chrom(self, uids=None, samples=None):
     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
     chrom_lookup = {}
-    for row in df_combined.select(
-        [
-            "consensus_uid",
-            "sample_name",
-            "chrom",
-        ],
-    ).iter_rows():
+    for row in df_combined.select([
+        "consensus_uid",
+        "sample_name",
+        "chrom",
+    ]).iter_rows():
         key = (row[0], row[1])  # (consensus_uid, sample_name)
         chrom_lookup[key] = row[2]  # chrom object
 
@@ -549,9 +532,7 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Build consensus matrix directly using the consensus_mapping_df
     matrix_dict = {}
-    sample_mapping = dict(
-        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
-    )
+    sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
 
     for row in self.consensus_mapping_df.iter_rows(named=True):
         consensus_uid = row["consensus_uid"]
@@ -569,10 +550,7 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
         # Take max if multiple features map to same consensus/sample combination
         if sample_name in matrix_dict[consensus_uid]:
-            matrix_dict[consensus_uid][sample_name] = max(
-                matrix_dict[consensus_uid][sample_name],
-                value,
-            )
+            matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
         else:
             matrix_dict[consensus_uid][sample_name] = value
 
@@ -591,12 +569,10 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Fill null values with 0 and round numeric columns
     numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
-    df2 = df2.with_columns(
-        [
-            pl.col("consensus_uid").cast(pl.UInt64),
-            *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
-        ],
-    )
+    df2 = df2.with_columns([
+        pl.col("consensus_uid").cast(pl.UInt64),
+        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
+    ])
 
     return df2
 
@@ -816,7 +792,7 @@ def get_sample(self, sample):
 
     This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
     """
-    from
+    from masster.sample.sample import Sample
 
     if isinstance(sample, Sample):
         return sample
@@ -826,9 +802,7 @@ def get_sample(self, sample):
     elif isinstance(sample, str):
         rows = self.samples_df.filter(pl.col("sample_name") == sample)
     else:
-        raise ValueError(
-            "sample must be an int (sample_uid), str (sample_name) or a Sample instance",
-        )
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
 
     if rows.is_empty():
         raise KeyError(f"Sample not found: {sample}")
@@ -862,9 +836,7 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(
-            self.consensus_mapping_df["feature_uid"].to_list(),
-        ),
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
@@ -942,7 +914,7 @@ def restore_features(self, samples=None, maps=False):
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
-    from
+    from masster.sample.sample import Sample
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -962,9 +934,7 @@ def restore_features(self, samples=None, maps=False):
     # Columns to update from sample data
     columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
 
-    self.logger.info(
-        f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...",
-    )
+    self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
     study_feature_mapping = {}
@@ -984,9 +954,7 @@ def restore_features(self, samples=None, maps=False):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
-            self.logger.warning(
-                f"Sample with uid {sample_uid} not found in samples_df.",
-            )
+            self.logger.warning(f"Sample with uid {sample_uid} not found in samples_df.")
             continue
 
         sample_info = sample_row.row(0, named=True)
@@ -994,9 +962,7 @@ def restore_features(self, samples=None, maps=False):
         sample_name = sample_info.get("sample_name")
 
         if not sample_path or not os.path.exists(sample_path):
-            self.logger.warning(
-                f"Sample file not found for {sample_name}: {sample_path}",
-            )
+            self.logger.warning(f"Sample file not found for {sample_name}: {sample_path}")
             continue
 
         try:
@@ -1012,9 +978,7 @@ def restore_features(self, samples=None, maps=False):
             continue
 
         # Check which columns are actually available in the sample
-        available_columns = [
-            col for col in columns_to_update if col in sample.features_df.columns
-        ]
+        available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
         if not available_columns:
             self.logger.debug(f"No target columns found in sample {sample_name}")
             continue
@@ -1037,21 +1001,13 @@ def restore_features(self, samples=None, maps=False):
                 original_dtype = self.features_df[col].dtype
 
                 # Update the specific row and column, preserving dtype
-                mask = (pl.col("feature_uid") == feature_uid) & (
-                    pl.col("sample_uid") == sample_uid
-                )
+                mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
 
                 # Handle object columns (like Chromatogram) differently
                 if original_dtype == pl.Object:
                     self.features_df = self.features_df.with_columns(
                         pl.when(mask)
-                        .then(
-                            pl.lit(
-                                row[col],
-                                dtype=original_dtype,
-                                allow_object=True,
-                            ),
-                        )
+                        .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                         .otherwise(pl.col(col))
                         .alias(col),
                     )
@@ -1065,9 +1021,7 @@ def restore_features(self, samples=None, maps=False):
                 updates_made += 1
 
         if updates_made > 0:
-            self.logger.debug(
-                f"Updated {updates_made} features from sample {sample_name}",
-            )
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
 
         # If maps is True, load featureXML data
         if maps:
@@ -1078,9 +1032,7 @@ def restore_features(self, samples=None, maps=False):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
 
-    self.logger.info(
-        f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
-    )
+    self.logger.info(f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples")
 
 
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
@@ -1100,8 +1052,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     import datetime
     import numpy as np
-    from
-    from
+    from masster.sample.sample import Sample
+    from masster.chromatogram import Chromatogram
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -1177,9 +1129,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 feature_uid = study_feature_mapping[key]
 
                 # Update only the chrom column
-                mask = (pl.col("feature_uid") == feature_uid) & (
-                    pl.col("sample_uid") == sample_uid
-                )
+                mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1192,9 +1142,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
 
-    self.logger.info(
-        f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files",
-    )
+    self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
 
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
@@ -1208,9 +1156,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     )
 
     if empty_chroms == 0:
-        self.logger.info(
-            "All chromatograms restored from .sample5 files. No gap-filling needed.",
-        )
+        self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
 
     # Get consensus info for gap filling
@@ -1254,11 +1200,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
-            if (
-                not hasattr(sample, "ms1_df")
-                or sample.ms1_df is None
-                or sample.ms1_df.is_empty()
-            ):
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue
 
             # Process each missing feature
@@ -1343,9 +1285,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     self.logger.info(
         f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
-    self.logger.info(
-        f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}",
-    )
+    self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
 
 def compress_ms2(self, max_replicates=5):
@@ -1365,28 +1305,17 @@ def compress_ms2(self, max_replicates=5):
 
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
-    self.consensus_ms2 = self.consensus_ms2.with_columns(
-        [
-            (
-                pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
-            ).alias("ranking_score"),
-        ],
-    )
+    self.consensus_ms2 = self.consensus_ms2.with_columns([
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
+    ])
 
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2.with_row_count(
-            "row_id",
-        )  # Add row numbers for stable sorting
-        .sort(
-            ["consensus_uid", "energy", "ranking_score", "row_id"],
-            descending=[False, False, True, False],
-        )
-        .with_columns(
-            [
-                pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
-            ],
-        )
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
+        .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
+        .with_columns([
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
+        ])
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
@@ -1422,9 +1351,7 @@ def compress_chrom(self):
         pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
 
-    self.logger.info(
-        f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df",
-    )
+    self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
 # =====================================================================================
@@ -1475,9 +1402,7 @@ def sample_name_replace(self, replace_dict):
         if name in replace_dict:
             new_names.append(replace_dict[name])
             replaced_count += 1
-            self.logger.debug(
-                f"Replacing sample name: '{name}' -> '{replace_dict[name]}'",
-            )
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
 
@@ -1490,9 +1415,7 @@ def sample_name_replace(self, replace_dict):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
-            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
-        )
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
@@ -1541,9 +1464,7 @@ def sample_name_reset(self):
         name_without_ext = os.path.splitext(name_without_ext)[0]
 
         new_names.append(name_without_ext)
-        self.logger.debug(
-            f"Resetting sample name from path: '{path}' -> '{name_without_ext}'",
-        )
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
 
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
@@ -1554,18 +1475,14 @@ def sample_name_reset(self):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
-            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
-        )
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
 
-    self.logger.info(
-        f"Successfully reset {len(new_names)} sample names from sample paths",
-    )
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
 def set_source(self, filename):
@@ -1595,15 +1512,11 @@ def set_source(self, filename):
 
     new_sources = []
 
-    for i, (current_source, sample_name) in enumerate(
-        zip(current_sources, sample_names),
-    ):
+    for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
             if current_source is None or current_source == "":
-                self.logger.warning(
-                    f"Cannot build path for sample '{sample_name}': no current file_source available",
-                )
+                self.logger.warning(f"Cannot build path for sample '{sample_name}': no current file_source available")
                 new_sources.append(current_source)
                 failed_count += 1
                 continue
@@ -1618,9 +1531,7 @@ def set_source(self, filename):
 
         # Check if the new file exists
         if not os.path.exists(new_file_path):
-            self.logger.warning(
-                f"File does not exist for sample '{sample_name}': {new_file_path}",
-            )
+            self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
             new_sources.append(current_source)
             failed_count += 1
             continue
@@ -1630,9 +1541,7 @@ def set_source(self, filename):
         updated_count += 1
 
         # Log individual updates at debug level
-        self.logger.debug(
-            f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}",
-        )
+        self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
 
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
@@ -1726,9 +1635,7 @@ def features_select(
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             min_mz, max_mz = mz
-            filter_conditions.append(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
 
@@ -1736,9 +1643,7 @@ def features_select(
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
             min_rt, max_rt = rt
-            filter_conditions.append(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
 
@@ -1746,9 +1651,7 @@ def features_select(
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
             min_inty, max_inty = inty
-            filter_conditions.append(
-                (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
-            )
+            filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
 
@@ -1758,10 +1661,7 @@ def features_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
-                (pl.col("sample_uid") >= min_uid)
-                & (pl.col("sample_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -1791,10 +1691,7 @@ def features_select(
         if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
             # Treat as range
             min_uid, max_uid = consensus_uid
-            filter_conditions.append(
-                (pl.col("consensus_uid") >= min_uid)
-                & (pl.col("consensus_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
@@ -1807,10 +1704,7 @@ def features_select(
         if len(feature_uid) == 2 and not isinstance(feature_uid, list):
             # Treat as range
             min_uid, max_uid = feature_uid
-            filter_conditions.append(
-                (pl.col("feature_uid") >= min_uid)
-                & (pl.col("feature_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
@@ -1832,10 +1726,7 @@ def features_select(
         if "quality" in available_columns:
             if isinstance(quality, tuple) and len(quality) == 2:
                 min_quality, max_quality = quality
-                filter_conditions.append(
-                    (pl.col("quality") >= min_quality)
-                    & (pl.col("quality") <= max_quality),
-                )
+                filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
             else:
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
@@ -1847,8 +1738,7 @@ def features_select(
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
                 filter_conditions.append(
-                    (pl.col("chrom_coherence") >= min_coherence)
-                    & (pl.col("chrom_coherence") <= max_coherence),
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1861,8 +1751,7 @@ def features_select(
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
                 filter_conditions.append(
-                    (pl.col("chrom_prominence") >= min_prominence)
-                    & (pl.col("chrom_prominence") <= max_prominence),
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1872,19 +1761,14 @@ def features_select(
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
-            if (
-                isinstance(chrom_prominence_scaled, tuple)
-                and len(chrom_prominence_scaled) == 2
-            ):
+            if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
                 filter_conditions.append(
                     (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
                 )
             else:
-                filter_conditions.append(
-                    pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
-                )
+                filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
 
@@ -1898,9 +1782,7 @@ def features_select(
                     & (pl.col("chrom_height_scaled") <= max_height_scaled),
                 )
             else:
-                filter_conditions.append(
-                    pl.col("chrom_height_scaled") >= chrom_height_scaled,
-                )
+                filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
 
@@ -1992,14 +1874,9 @@ def features_filter(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2012,9 +1889,7 @@ def features_filter(self, features):
             f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
-        self.logger.info(
-            f"Kept {final_count} features. Filtered out {removed_count} features.",
-        )
+        self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
 
 
 def features_delete(self, features):
@@ -2076,14 +1951,9 @@ def features_delete(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2096,9 +1966,7 @@ def features_delete(self, features):
             f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
-        self.logger.info(
-            f"Deleted {removed_count} features. Remaining features: {final_count}",
-        )
+        self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
 
 
 def consensus_select(
@@ -2171,9 +2039,7 @@ def consensus_select(
             else:
                 # Standard (min_mz, max_mz) format
                 min_mz, max_mz = mz
-                consensus = consensus.filter(
-                    (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-                )
+                consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
             default_mz_tol = getattr(self, "parameters", None)
@@ -2181,15 +2047,13 @@ def consensus_select(
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
-                from
+                from masster.study.defaults.align_def import align_defaults
 
                 default_mz_tol = align_defaults().mz_max_diff
 
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
-            consensus = consensus.filter(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
+            consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
 
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2209,9 +2073,7 @@ def consensus_select(
             else:
                 # Standard (min_rt, max_rt) format
                 min_rt, max_rt = rt
-                consensus = consensus.filter(
-                    (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-                )
+                consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
             default_rt_tol = getattr(self, "parameters", None)
@@ -2219,15 +2081,13 @@ def consensus_select(
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
-                from
+                from masster.study.defaults.align_def import align_defaults
 
                 default_rt_tol = align_defaults().rt_max_diff
 
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
-            consensus = consensus.filter(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
+            consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
 
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2238,9 +2098,7 @@ def consensus_select(
         consensus_len_before_filter = len(consensus)
         if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
             min_inty, max_inty = inty_mean
-            consensus = consensus.filter(
-                (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
-            )
+            consensus = consensus.filter((pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty))
         else:
             consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
         self.logger.debug(
@@ -2255,14 +2113,11 @@ def consensus_select(
                 # Treat as range
                 min_uid, max_uid = consensus_uid
                 consensus = consensus.filter(
-                    (pl.col("consensus_uid") >= min_uid)
-                    & (pl.col("consensus_uid") <= max_uid),
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
                 )
             else:
                 # Treat as list
-                consensus = consensus.filter(
-                    pl.col("consensus_uid").is_in(consensus_uid),
-                )
+                consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
         self.logger.debug(
@@ -2286,8 +2141,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples)
-                & (pl.col("number_samples") <= max_samples),
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2301,10 +2155,7 @@ def consensus_select(
         if "number_ms2" in consensus.columns:
             if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
                 min_ms2, max_ms2 = number_ms2
-                consensus = consensus.filter(
-                    (pl.col("number_ms2") >= min_ms2)
-                    & (pl.col("number_ms2") <= max_ms2),
-                )
+                consensus = consensus.filter((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
             else:
                 consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
         else:
@@ -2318,9 +2169,7 @@ def consensus_select(
         consensus_len_before_filter = len(consensus)
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            consensus = consensus.filter(
-                (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
-            )
+            consensus = consensus.filter((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
         else:
             consensus = consensus.filter(pl.col("quality") >= quality)
         self.logger.debug(
@@ -2333,9 +2182,7 @@ def consensus_select(
         if "bl" in consensus.columns:
             if isinstance(bl, tuple) and len(bl) == 2:
                 min_bl, max_bl = bl
-                consensus = consensus.filter(
-                    (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
-                )
+                consensus = consensus.filter((pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl))
             else:
                 consensus = consensus.filter(pl.col("bl") >= bl)
         else:
@@ -2348,23 +2195,16 @@ def consensus_select(
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_coherence_mean, tuple)
-                and len(chrom_coherence_mean) == 2
-            ):
+            if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                 min_coherence, max_coherence = chrom_coherence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_coherence_mean") >= min_coherence)
                     & (pl.col("chrom_coherence_mean") <= max_coherence),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
         else:
-            self.logger.warning(
-                "'chrom_coherence_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2373,23 +2213,16 @@ def consensus_select(
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_mean, tuple)
-                and len(chrom_prominence_mean) == 2
-            ):
+            if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                 min_prominence, max_prominence = chrom_prominence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_mean") >= min_prominence)
                     & (pl.col("chrom_prominence_mean") <= max_prominence),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
         else:
-            self.logger.warning(
-                "'chrom_prominence_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2398,26 +2231,16 @@ def consensus_select(
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_scaled_mean, tuple)
-                and len(chrom_prominence_scaled_mean) == 2
-            ):
-                min_prominence_scaled, max_prominence_scaled = (
-                    chrom_prominence_scaled_mean
-                )
+            if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
+                min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_scaled_mean")
-                    >= chrom_prominence_scaled_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
         else:
-            self.logger.warning(
-                "'chrom_prominence_scaled_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2426,23 +2249,16 @@ def consensus_select(
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_height_scaled_mean, tuple)
-                and len(chrom_height_scaled_mean) == 2
-            ):
+            if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
                     & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
                 )
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
-                )
+                consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
         else:
-            self.logger.warning(
-                "'chrom_height_scaled_mean' column not found in consensus_df",
-            )
+            self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
         self.logger.debug(
             f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2454,8 +2270,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta)
-                & (pl.col("rt_delta_mean") <= max_rt_delta),
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2466,13 +2281,9 @@ def consensus_select(
         )
 
     if len(consensus) == 0:
-        self.logger.warning(
-            "No consensus features remaining after applying selection criteria.",
-        )
+        self.logger.warning("No consensus features remaining after applying selection criteria.")
     else:
-        self.logger.info(
-            f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
-        )
+        self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
 
     # Sort the results if sortby is specified
     if sortby is not None:
@@ -2481,25 +2292,19 @@ def consensus_select(
         if sortby in consensus.columns:
             consensus = consensus.sort(sortby, descending=descending)
         else:
-            self.logger.warning(
-                f"Sort column '{sortby}' not found in consensus DataFrame",
-            )
+            self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
     elif isinstance(sortby, (list, tuple)):
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
 
         if invalid_columns:
-            self.logger.warning(
-                f"Sort columns not found in consensus DataFrame: {invalid_columns}",
-            )
+            self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
 
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
     else:
-        self.logger.warning(
-            f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
-        )
+        self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
 
     return consensus
 
@@ -2544,10 +2349,7 @@ def consensus_filter(self, consensus):
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         feature_uids_to_remove = self.consensus_mapping_df.filter(
             pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
@@ -2558,42 +2360,27 @@ def consensus_filter(self, consensus):
     )
 
     # Remove from consensus_mapping_df
-    if (
-        self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
-            self.logger.debug(
-                f"Removed {removed_mapping_count} entries from consensus_mapping_df",
-            )
+            self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
 
     # Remove corresponding features from features_df
-    if (
-        feature_uids_to_remove
-        and self.features_df is not None
-        and not self.features_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
-            self.logger.debug(
-                f"Removed {removed_features_count} entries from features_df",
-            )
+            self.logger.debug(f"Removed {removed_features_count} entries from features_df")
 
     # Remove from consensus_ms2 if it exists
-    if (
-        hasattr(self, "consensus_ms2")
-        and self.consensus_ms2 is not None
-        and not self.consensus_ms2.is_empty()
-    ):
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -2693,10 +2480,7 @@ def samples_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
-                (pl.col("sample_uid") >= min_uid)
-                & (pl.col("sample_uid") <= max_uid),
-            )
+            filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -2738,8 +2522,7 @@ def samples_select(
             # Treat as range
             min_batch, max_batch = sample_batch
             filter_conditions.append(
-                (pl.col("sample_batch") >= min_batch)
-                & (pl.col("sample_batch") <= max_batch),
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
             )
         else:
             # Treat as list
@@ -2757,14 +2540,11 @@ def samples_select(
                 # Treat as range
                 min_seq, max_seq = sample_sequence
                 filter_conditions.append(
-                    (pl.col("sample_sequence") >= min_seq)
-                    & (pl.col("sample_sequence") <= max_seq),
+                    (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
                 )
             else:
                 # Treat as list
-                filter_conditions.append(
-                    pl.col("sample_sequence").is_in(sample_sequence),
-                )
+                filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
         else:
             filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
     else:
@@ -2776,8 +2556,7 @@ def samples_select(
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
             filter_conditions.append(
-                (pl.col("num_features") >= min_features)
-                & (pl.col("num_features") <= max_features),
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
             )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
@@ -2789,9 +2568,7 @@ def samples_select(
         if "num_ms1" in available_columns:
             if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
                 min_ms1, max_ms1 = num_ms1
-                filter_conditions.append(
-                    (pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1),
-                )
+                filter_conditions.append((pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1))
            else:
                 filter_conditions.append(pl.col("num_ms1") >= num_ms1)
         else:
@@ -2802,9 +2579,7 @@ def samples_select(
         if "num_ms2" in available_columns:
             if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
                 min_ms2, max_ms2 = num_ms2
-                filter_conditions.append(
-                    (pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2),
-                )
+                filter_conditions.append((pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2))
             else:
                 filter_conditions.append(pl.col("num_ms2") >= num_ms2)
         else:
@@ -2896,9 +2671,7 @@ def samples_delete(self, samples):
|
|
|
2896
2671
|
if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
|
|
2897
2672
|
sample_uids_to_remove = list(sample_uids_set)
|
|
2898
2673
|
|
|
2899
|
-
self.logger.info(
|
|
2900
|
-
f"Deleting {len(sample_uids_to_remove)} samples and all related data...",
|
|
2901
|
-
)
|
|
2674
|
+
self.logger.info(f"Deleting {len(sample_uids_to_remove)} samples and all related data...")
|
|
2902
2675
|
|
|
2903
2676
|
# Get feature_uids that need to be removed from features_df
|
|
2904
2677
|
feature_uids_to_remove = []
|
|
@@ -2926,11 +2699,7 @@ def samples_delete(self, samples):

    # 2. Remove corresponding features from features_df
    removed_features_count = 0
-    if (
-        feature_uids_to_remove
-        and self.features_df is not None
-        and not self.features_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
        self.features_df = self.features_df.filter(
            ~pl.col("sample_uid").is_in(sample_uids_to_remove),
        )
@@ -2938,11 +2707,7 @@ def samples_delete(self, samples):

    # 3. Remove from consensus_mapping_df
    removed_mapping_count = 0
-    if (
-        feature_uids_to_remove
-        and self.consensus_mapping_df is not None
-        and not self.consensus_mapping_df.is_empty()
-    ):
+    if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
        initial_mapping_count = len(self.consensus_mapping_df)
        self.consensus_mapping_df = self.consensus_mapping_df.filter(
            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -2951,11 +2716,7 @@ def samples_delete(self, samples):

    # 4. Remove from consensus_ms2 if it exists
    removed_ms2_count = 0
-    if (
-        hasattr(self, "consensus_ms2")
-        and self.consensus_ms2 is not None
-        and not self.consensus_ms2.is_empty()
-    ):
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
        initial_ms2_count = len(self.consensus_ms2)
        self.consensus_ms2 = self.consensus_ms2.filter(
            ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -2964,11 +2725,7 @@ def samples_delete(self, samples):

    # 5. Remove from feature_maps and update map_id
    removed_maps_count = 0
-    if (
-        hasattr(self, "feature_maps")
-        and self.feature_maps is not None
-        and map_ids_to_remove
-    ):
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
        # Remove feature maps in reverse order to maintain indices
        for map_id in sorted(map_ids_to_remove, reverse=True):
            if 0 <= map_id < len(self.feature_maps):
@@ -3009,9 +2766,7 @@ def samples_delete(self, samples):

    # Update map_id indices if needed
    if removed_maps_count > 0 and final_sample_count > 0:
-        self.logger.debug(
-            f"Updated map_id values to range from 0 to {final_sample_count - 1}",
-        )
+        self.logger.debug(f"Updated map_id values to range from 0 to {final_sample_count - 1}")


# =====================================================================================
@@ -3182,9 +2937,7 @@ def sample_color(self, by=None, palette="Turbo256"):
    )

    if isinstance(by, list):
-        self.logger.debug(
-            f"Set sample colors using provided color list ({len(by)} colors)",
-        )
+        self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
    elif by is None:
        self.logger.debug(f"Set sequential sample colors using {palette} palette")
    else:
@@ -3219,9 +2972,7 @@ def sample_color_reset(self):
        # Distribute samples evenly across the full colormap range
        for i in range(n_samples):
            # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-            normalized_value = (
-                i + 0.5
-            ) / n_samples  # +0.5 to center samples in their bins
+            normalized_value = (i + 0.5) / n_samples  # +0.5 to center samples in their bins
            # Optionally, map to a subset of colormap to avoid extreme colors
            # Use 10% to 90% of colormap range for better color diversity
            normalized_value = 0.1 + (normalized_value * 0.8)
@@ -3242,14 +2993,10 @@ def sample_color_reset(self):
            pl.Series("sample_color", colors).alias("sample_color"),
        )

-        self.logger.debug(
-            f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
-        )
+        self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")

    except ImportError:
-        self.logger.error(
-            "cmap library is required for sample color reset. Install with: uv add cmap",
-        )
+        self.logger.error("cmap library is required for sample color reset. Install with: uv add cmap")
    except Exception as e:
        self.logger.error(f"Failed to reset sample colors: {e}")

@@ -3270,9 +3017,7 @@ def _get_color_palette(palette_name):
    try:
        from cmap import Colormap
    except ImportError:
-        raise ValueError(
-            "cmap library is required for color palettes. Install with: pip install cmap",
-        )
+        raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")

    # Map common palette names to cmap names
    palette_mapping = {
@@ -3367,9 +3112,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
    try:
        from cmap import Colormap
    except ImportError:
-        raise ValueError(
-            "cmap library is required for color palettes. Install with: pip install cmap",
-        )
+        raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")

    # Map common palette names to cmap names (same as _get_color_palette)
    palette_mapping = {
@@ -3407,9 +3150,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
    # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
    for i in range(n_colors):
        # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-        normalized_value = (
-            i + 0.5
-        ) / n_colors  # +0.5 to center samples in their bins
+        normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
        # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
        normalized_value = 0.1 + (normalized_value * 0.8)

@@ -3454,7 +3195,7 @@ def _ensure_features_df_schema_order(self):
    try:
        import os
        import json
-        from
+        from masster.study.h5 import _reorder_columns_by_schema

        # Load schema
        schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
@@ -3462,11 +3203,7 @@ def _ensure_features_df_schema_order(self):
            schema = json.load(f)

        # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(
-            self.features_df,
-            schema,
-            "features_df",
-        )
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")

    except Exception as e:
        self.logger.warning(f"Failed to reorder features_df columns: {e}")
@@ -3508,19 +3245,17 @@ def migrate_map_id_to_index(self):
        # Ensure the column is Int64 type
        self.samples_df = self.samples_df.cast({"map_id": pl.Int64})

-        self.logger.info(
-            f"Successfully migrated {sample_count} samples to indexed map_id format",
-        )
+        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
        self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")


def restore_ms2(self, samples=None, **kwargs):
    """
    Restore MS2 data by re-running find_ms2 on specified samples.
-
+
    This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
    from the original sample files. Use this to reverse the effects of compress_ms2().
-
+
    Parameters:
        samples (list, optional): List of sample_uids or sample_names to process.
                                  If None, processes all samples.
@@ -3530,37 +3265,31 @@ def restore_ms2(self, samples=None, **kwargs):
    if self.features_df is None or self.features_df.is_empty():
        self.logger.error("No features_df found in study.")
        return
-
+
    if self.samples_df is None or self.samples_df.is_empty():
        self.logger.error("No samples_df found in study.")
        return
-
-    # Get sample_uids to process
+
+    # Get sample_uids to process
    sample_uids = self._get_sample_uids(samples)
    if not sample_uids:
        self.logger.warning("No valid samples specified.")
        return
-
+
    self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
-
+
    # Clear existing consensus_ms2 to rebuild from scratch
-    initial_ms2_count = (
-        len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-    )
+    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
    self.consensus_ms2 = pl.DataFrame()
-
+
    # Re-run find_ms2 which will rebuild consensus_ms2
    try:
        self.find_ms2(**kwargs)
-
-        final_ms2_count = (
-            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-        )
-
-        self.logger.info(
-            f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
-        )
-
+
+        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+
+        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
+
    except Exception as e:
        self.logger.error(f"Failed to restore MS2 data: {e}")
        raise
@@ -3569,51 +3298,51 @@ def restore_ms2(self, samples=None, **kwargs):
def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
    """
    Reverse any compression effects by restoring compressed data adaptively.
-
+
    This function restores data that was compressed using compress(), compress_features(),
    compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
    decompression process for speed by only processing what actually needs restoration.
-
+
    Parameters:
        features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
-        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
        chrom (bool): Restore chromatogram objects
        samples (list, optional): List of sample_uids or sample_names to process.
                                  If None, processes all samples.
        **kwargs: Additional keyword arguments for restoration functions:
            - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
-
+
    Performance Optimizations:
        - Adaptive processing: Only restores what actually needs restoration
        - Processes features and chromatograms together when possible (shared file I/O)
        - Uses cached sample instances to avoid repeated file loading
        - Processes MS2 restoration last as it's the most computationally expensive
        - Provides detailed progress information for long-running operations
-
+
    Example:
        # Restore everything (but only what needs restoration)
        study.decompress()
-
+
        # Restore only chromatograms with custom tolerances
        study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
-
+
        # Restore specific samples only
        study.decompress(samples=["sample1", "sample2"])
    """
    if not any([features, ms2, chrom]):
        self.logger.warning("No decompression operations specified.")
        return
-
+
    # Get sample_uids to process
    sample_uids = self._get_sample_uids(samples)
    if not sample_uids:
        self.logger.warning("No valid samples specified.")
        return
-
+
    # Adaptively check what actually needs to be done
    import polars as pl
-
+
    # Check if features need restoration (more sophisticated logic)
    features_need_restoration = False
    if features and not self.features_df.is_empty():
@@ -3622,7 +3351,7 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
        for col in ["ms2_scans", "ms2_specs"]:
            if col not in self.features_df.columns:
                missing_cols.append(col)
-
+
        # If columns are missing entirely, we likely need restoration
        if missing_cols:
            features_need_restoration = True
@@ -3631,15 +3360,13 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
        # But be smart about it - only check if we have consensus features with MS2
        if not self.consensus_ms2.is_empty():
            # We have MS2 data, so ms2_specs should have some content
-            null_ms2_specs = self.features_df.filter(
-                pl.col("ms2_specs").is_null(),
-            ).height
+            null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
            total_features = len(self.features_df)
            # If more than 90% are null but we have MS2 data, likely compressed
            if null_ms2_specs > (total_features * 0.9):
                features_need_restoration = True
-
-    # Check if chromatograms need restoration
+
+    # Check if chromatograms need restoration
    chrom_need_restoration = False
    if chrom and not self.features_df.is_empty():
        if "chrom" not in self.features_df.columns:
@@ -3650,26 +3377,22 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
            total_features = len(self.features_df)
            # If more than 50% are null, likely need restoration
            chrom_need_restoration = null_chroms > (total_features * 0.5)
-
+
    # Check if MS2 data might need restoration (compare expected vs actual)
    ms2_need_restoration = False
    if ms2:
-        current_ms2_count = (
-            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-        )
-        consensus_count = (
-            len(self.consensus_df) if not self.consensus_df.is_empty() else 0
-        )
-
+        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+
        if consensus_count > 0:
            # Calculate expected MS2 count based on consensus features with MS2 potential
            # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
            expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
            expected_ms2 = consensus_count * expected_ratio
-
+
            if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
                ms2_need_restoration = True
-
+
    # Build list of operations that actually need to be done
    operations_needed = []
    if features and features_need_restoration:
@@ -3678,75 +3401,59 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
        operations_needed.append("chromatograms")
    if ms2 and ms2_need_restoration:
        operations_needed.append("MS2 spectra")
-
+
    # Early exit if nothing needs to be done
    if not operations_needed:
-        self.logger.info(
-            "All data appears to be already decompressed. No operations needed.",
-        )
+        self.logger.info("All data appears to be already decompressed. No operations needed.")
        return
-
-    self.logger.info(
-        f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples",
-    )
-
+
+    self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
+
    try:
        # Phase 1: Restore features and chromatograms together (shared file I/O)
-        if "features" in operations_needed and "chromatograms" in operations_needed:
-            self.logger.info(
-                "Phase 1: Restoring features and chromatograms together...",
-            )
-
+        if ("features" in operations_needed and "chromatograms" in operations_needed):
+            self.logger.info("Phase 1: Restoring features and chromatograms together...")
+
            # Extract relevant kwargs for restore_features and restore_chrom
            restore_kwargs = {}
-            if "mz_tol" in kwargs:
-                restore_kwargs["mz_tol"] = kwargs["mz_tol"]
-            if "rt_tol" in kwargs:
-                restore_kwargs["rt_tol"] = kwargs["rt_tol"]
-
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+
            # Restore features first (includes chrom column)
            self.restore_features(samples=samples)
-
+
            # Then do additional chrom gap-filling if needed
            self.restore_chrom(samples=samples, **restore_kwargs)
-
-        elif (
-            "features" in operations_needed and "chromatograms" not in operations_needed
-        ):
+
+        elif ("features" in operations_needed and "chromatograms" not in operations_needed):
            self.logger.info("Phase 1: Restoring features data...")
            self.restore_features(samples=samples)
-
-        elif (
-            "chromatograms" in operations_needed and "features" not in operations_needed
-        ):
+
+        elif ("chromatograms" in operations_needed and "features" not in operations_needed):
            self.logger.info("Phase 1: Restoring chromatograms...")
            restore_kwargs = {}
-            if "mz_tol" in kwargs:
-                restore_kwargs["mz_tol"] = kwargs["mz_tol"]
-            if "rt_tol" in kwargs:
-                restore_kwargs["rt_tol"] = kwargs["rt_tol"]
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
            self.restore_chrom(samples=samples, **restore_kwargs)
-
+
        # Phase 2: Restore MS2 data (most computationally expensive, done last)
        if "MS2 spectra" in operations_needed:
            self.logger.info("Phase 2: Restoring MS2 spectra...")
-
+
            # Extract MS2-specific kwargs
            ms2_kwargs = {}
            for key, value in kwargs.items():
-                if key in [
-                    "mz_tol",
-                    "centroid",
-                    "deisotope",
-                    "dia_stats",
-                    "feature_uid",
-                ]:
+                if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
                    ms2_kwargs[key] = value
-
+
            self.restore_ms2(samples=samples, **ms2_kwargs)
-
+
        self.logger.info("Adaptive decompression completed successfully")
-
+
    except Exception as e:
        self.logger.error(f"Decompression failed: {e}")
        raise