masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +1 -1
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
- masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
- masster/data/libs/ccm.csv +120 -0
- masster/data/libs/urine.csv +4693 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- masster/logger.py +11 -11
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +338 -264
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +561 -282
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +4 -4
- masster/sample/load.py +31 -17
- masster/sample/parameters.py +1 -1
- masster/sample/plot.py +7 -7
- masster/sample/processing.py +117 -87
- masster/sample/sample.py +103 -90
- masster/sample/sample5_schema.json +196 -0
- masster/sample/save.py +35 -12
- masster/spectrum.py +1 -1
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +360 -210
- masster/study/h5.py +560 -158
- masster/study/helpers.py +496 -203
- masster/study/helpers_optimized.py +1 -1
- masster/study/id.py +538 -349
- masster/study/load.py +233 -143
- masster/study/plot.py +71 -71
- masster/study/processing.py +456 -254
- masster/study/save.py +15 -5
- masster/study/study.py +213 -131
- masster/study/study5_schema.json +360 -0
- masster-0.4.5.dist-info/METADATA +131 -0
- masster-0.4.5.dist-info/RECORD +71 -0
- masster-0.4.3.dist-info/METADATA +0 -791
- masster-0.4.3.dist-info/RECORD +0 -56
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/helpers.py
CHANGED
@@ -22,7 +22,7 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
-from
+from masster.chromatogram import Chromatogram
 
 
 # =====================================================================================
@@ -71,7 +71,12 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # fallback to pandas
     try:
         bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
-        bpc_pd =
+        bpc_pd = (
+            bpc_pd.groupby("rt")
+            .agg({"inty": "max"})
+            .reset_index()
+            .sort_values("rt")
+        )
     except Exception:
         raise
 
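The reflowed fallback above reduces raw MS1 points to a base peak chromatogram: one maximum intensity per retention time. A minimal standalone sketch of the same pandas pattern; the rt/inty column names match the diff, and the values are invented toy data:

import pandas as pd

# Toy MS1 table: several intensity readings per retention time (invented values).
ms1 = pd.DataFrame({
    "rt":   [0.1, 0.1, 0.2, 0.2, 0.3],
    "inty": [100.0, 250.0, 80.0, 90.0, 300.0],
})

# Base peak chromatogram: keep the maximum intensity at each rt, restore rt
# as a column, and sort so the trace runs monotonically in time.
bpc = (
    ms1.groupby("rt")
    .agg({"inty": "max"})
    .reset_index()
    .sort_values("rt")
)
print(bpc)  # rt 0.1 -> 250.0, rt 0.2 -> 90.0, rt 0.3 -> 300.0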
@@ -113,11 +118,16 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     mapping_rows = pl.DataFrame()
 
     # If we still have no sample selector, try to infer sample from the Sample object s
-    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
+    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
+        s,
+        "sample_path",
+    ):
         # attempt to match by sample_path or file name
         try:
             # find row where sample_path matches
-            mapping_rows = feats.filter(
+            mapping_rows = feats.filter(
+                pl.col("sample_path") == getattr(s, "file", None),
+            )
         except Exception:
             mapping_rows = pl.DataFrame()
 
@@ -204,7 +214,9 @@ def get_tic(owner, sample=None, label=None):
     except Exception:
         raise
     else:
-        raise ValueError(
+        raise ValueError(
+            "Neither ms1_df nor scans_df available for TIC computation",
+        )
 
     if tic_pd.empty:
         raise ValueError("Computed TIC is empty")
@@ -367,14 +379,17 @@ def get_chrom(self, uids=None, samples=None):
     )
     # Pre-filter features_df to only relevant features and samples
     filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
-
-
-
-
-
-
-
+        pl.col("feature_uid").is_in(relevant_feature_uids)
+        & pl.col("sample_uid").is_in(sample_uids),
+    ).select(
+        [
+            "feature_uid",
+            "chrom",
+            "rt",
+            "rt_original",
+            "sample_uid",
+        ],
+    )
 
     # Pre-filter samples_df
     filtered_samples = self.samples_df.filter(
@@ -409,11 +424,13 @@ def get_chrom(self, uids=None, samples=None):
     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
     chrom_lookup = {}
-    for row in df_combined.select(
-
-
-
-
+    for row in df_combined.select(
+        [
+            "consensus_uid",
+            "sample_name",
+            "chrom",
+        ],
+    ).iter_rows():
         key = (row[0], row[1])  # (consensus_uid, sample_name)
         chrom_lookup[key] = row[2]  # chrom object
 
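The two hunks above pair one combined filter/select pass over features_df with a tuple-keyed dictionary built from iter_rows(), trading a per-query O(n) scan for O(1) lookups. A self-contained sketch of that pattern; the frame is toy data and strings stand in for Chromatogram objects:

import polars as pl

# Toy feature table; strings stand in for Chromatogram objects.
features = pl.DataFrame({
    "feature_uid": [1, 2, 3, 4],
    "sample_uid":  [10, 10, 11, 12],
    "chrom":       ["c1", "c2", "c3", "c4"],
})

relevant_feature_uids = [1, 2, 3]
sample_uids = [10, 11]

# One combined predicate plus a narrow select, as in the reflowed call above.
filtered = features.filter(
    pl.col("feature_uid").is_in(relevant_feature_uids)
    & pl.col("sample_uid").is_in(sample_uids),
).select(["feature_uid", "sample_uid", "chrom"])

# Tuple-keyed dict: O(1) lookups instead of filtering the frame per query.
chrom_lookup = {(row[0], row[1]): row[2] for row in filtered.iter_rows()}
print(chrom_lookup[(2, 10)])  # "c2"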
@@ -532,7 +549,9 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Build consensus matrix directly using the consensus_mapping_df
     matrix_dict = {}
-    sample_mapping = dict(
+    sample_mapping = dict(
+        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
+    )
 
     for row in self.consensus_mapping_df.iter_rows(named=True):
         consensus_uid = row["consensus_uid"]
@@ -550,7 +569,10 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Take max if multiple features map to same consensus/sample combination
     if sample_name in matrix_dict[consensus_uid]:
-        matrix_dict[consensus_uid][sample_name] = max(
+        matrix_dict[consensus_uid][sample_name] = max(
+            matrix_dict[consensus_uid][sample_name],
+            value,
+        )
     else:
         matrix_dict[consensus_uid][sample_name] = value
 
@@ -569,10 +591,12 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Fill null values with 0 and round numeric columns
     numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
-    df2 = df2.with_columns(
-
-
-
+    df2 = df2.with_columns(
+        [
+            pl.col("consensus_uid").cast(pl.UInt64),
+            *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
+        ],
+    )
 
     return df2
 
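The last hunk cleans the whole matrix in a single with_columns call: cast the key column, then null-fill and round every quantity column by splatting generated expressions into one list. A runnable sketch on invented data:

import polars as pl

df2 = pl.DataFrame({
    "consensus_uid": [1, 2, 3],
    "sampleA": [10.4, None, 7.6],
    "sampleB": [None, 3.2, 8.9],
})

numeric_cols = [col for col in df2.columns if col != "consensus_uid"]

# Single with_columns pass: cast the id column, then null-fill and round
# every quantity column, splatting the generated expressions into the list.
df2 = df2.with_columns(
    [
        pl.col("consensus_uid").cast(pl.UInt64),
        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
    ],
)
print(df2)  # nulls become 0.0; values rounded to whole numbers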
@@ -792,7 +816,7 @@ def get_sample(self, sample):
 
     This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
     """
-    from
+    from masster.sample.sample import Sample
 
     if isinstance(sample, Sample):
         return sample
@@ -802,7 +826,9 @@ def get_sample(self, sample):
     elif isinstance(sample, str):
         rows = self.samples_df.filter(pl.col("sample_name") == sample)
     else:
-        raise ValueError(
+        raise ValueError(
+            "sample must be an int (sample_uid), str (sample_name) or a Sample instance",
+        )
 
     if rows.is_empty():
         raise KeyError(f"Sample not found: {sample}")
@@ -836,7 +862,9 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(
+        ~self.features_df["feature_uid"].is_in(
+            self.consensus_mapping_df["feature_uid"].to_list(),
+        ),
     )
     return not_in_consensus
 
@@ -914,7 +942,7 @@ def restore_features(self, samples=None, maps=False):
     maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
-    from
+    from masster.sample.sample import Sample
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -934,7 +962,9 @@ def restore_features(self, samples=None, maps=False):
     # Columns to update from sample data
     columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
 
-    self.logger.info(
+    self.logger.info(
+        f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...",
+    )
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
     study_feature_mapping = {}
@@ -954,7 +984,9 @@ def restore_features(self, samples=None, maps=False):
     # Get sample info
     sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
     if sample_row.is_empty():
-        self.logger.warning(
+        self.logger.warning(
+            f"Sample with uid {sample_uid} not found in samples_df.",
+        )
         continue
 
     sample_info = sample_row.row(0, named=True)
@@ -962,7 +994,9 @@ def restore_features(self, samples=None, maps=False):
     sample_name = sample_info.get("sample_name")
 
     if not sample_path or not os.path.exists(sample_path):
-        self.logger.warning(
+        self.logger.warning(
+            f"Sample file not found for {sample_name}: {sample_path}",
+        )
         continue
 
     try:
@@ -978,7 +1012,9 @@ def restore_features(self, samples=None, maps=False):
         continue
 
     # Check which columns are actually available in the sample
-    available_columns = [
+    available_columns = [
+        col for col in columns_to_update if col in sample.features_df.columns
+    ]
     if not available_columns:
         self.logger.debug(f"No target columns found in sample {sample_name}")
         continue
@@ -1001,13 +1037,21 @@ def restore_features(self, samples=None, maps=False):
     original_dtype = self.features_df[col].dtype
 
     # Update the specific row and column, preserving dtype
-    mask = (pl.col("feature_uid") == feature_uid) & (
+    mask = (pl.col("feature_uid") == feature_uid) & (
+        pl.col("sample_uid") == sample_uid
+    )
 
     # Handle object columns (like Chromatogram) differently
     if original_dtype == pl.Object:
         self.features_df = self.features_df.with_columns(
             pl.when(mask)
-            .then(
+            .then(
+                pl.lit(
+                    row[col],
+                    dtype=original_dtype,
+                    allow_object=True,
+                ),
+            )
             .otherwise(pl.col(col))
             .alias(col),
         )
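The hunk above rewrites one cell of an Object-typed column via pl.when(mask).then(pl.lit(..., allow_object=True)).otherwise(...). A minimal sketch of the same call pattern, using a plain dict as a stand-in for a Chromatogram; Object-dtype handling like this assumes a reasonably recent polars:

import polars as pl

df = pl.DataFrame({"feature_uid": [1, 2], "sample_uid": [10, 10]})
# Object column holding arbitrary Python values (None placeholders here).
df = df.with_columns(pl.Series("chrom", [None, None], dtype=pl.Object))

new_chrom = {"rt": [0.1, 0.2], "inty": [5.0, 9.0]}  # stand-in for a Chromatogram

mask = (pl.col("feature_uid") == 1) & (pl.col("sample_uid") == 10)

# allow_object=True lets pl.lit wrap a non-primitive Python value, so the
# conditional update keeps the column's Object dtype intact.
df = df.with_columns(
    pl.when(mask)
    .then(pl.lit(new_chrom, dtype=pl.Object, allow_object=True))
    .otherwise(pl.col("chrom"))
    .alias("chrom"),
)
print(df)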
@@ -1021,7 +1065,9 @@ def restore_features(self, samples=None, maps=False):
             updates_made += 1
 
     if updates_made > 0:
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated {updates_made} features from sample {sample_name}",
+        )
 
     # If maps is True, load featureXML data
     if maps:
@@ -1032,7 +1078,9 @@ def restore_features(self, samples=None, maps=False):
         self.logger.error(f"Failed to load sample {sample_name}: {e}")
         continue
 
-    self.logger.info(
+    self.logger.info(
+        f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
+    )
 
 
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
@@ -1052,8 +1100,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     import datetime
    import numpy as np
-    from
-    from
+    from masster.sample.sample import Sample
+    from masster.chromatogram import Chromatogram
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -1129,7 +1177,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     feature_uid = study_feature_mapping[key]
 
     # Update only the chrom column
-    mask = (pl.col("feature_uid") == feature_uid) & (
+    mask = (pl.col("feature_uid") == feature_uid) & (
+        pl.col("sample_uid") == sample_uid
+    )
     self.features_df = self.features_df.with_columns(
         pl.when(mask)
         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1142,7 +1192,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         self.logger.error(f"Failed to load sample {sample_name}: {e}")
         continue
 
-    self.logger.info(
+    self.logger.info(
+        f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files",
+    )
 
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
@@ -1156,7 +1208,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     )
 
     if empty_chroms == 0:
-        self.logger.info(
+        self.logger.info(
+            "All chromatograms restored from .sample5 files. No gap-filling needed.",
+        )
         return
 
     # Get consensus info for gap filling
@@ -1200,7 +1254,11 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     sample = Sample(log_level="ERROR")
     sample._load_sample5(sample_path, map=False)
 
-    if
+    if (
+        not hasattr(sample, "ms1_df")
+        or sample.ms1_df is None
+        or sample.ms1_df.is_empty()
+    ):
         continue
 
     # Process each missing feature
@@ -1285,7 +1343,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     self.logger.info(
         f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
-    self.logger.info(
+    self.logger.info(
+        f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}",
+    )
 
 
 def compress_ms2(self, max_replicates=5):
@@ -1305,17 +1365,28 @@ def compress_ms2(self, max_replicates=5):
 
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
-    self.consensus_ms2 = self.consensus_ms2.with_columns(
-
-
+    self.consensus_ms2 = self.consensus_ms2.with_columns(
+        [
+            (
+                pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
+            ).alias("ranking_score"),
+        ],
+    )
 
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2.with_row_count(
-
-
-
-
+        self.consensus_ms2.with_row_count(
+            "row_id",
+        )  # Add row numbers for stable sorting
+        .sort(
+            ["consensus_uid", "energy", "ranking_score", "row_id"],
+            descending=[False, False, True, False],
+        )
+        .with_columns(
+            [
+                pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
+            ],
+        )
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
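compress_ms2 above is a top-k-per-group pattern: score, sort with a stable tie-break, number the rows inside each (consensus_uid, energy) group, and keep ranks below max_replicates. A standalone sketch on toy data; it assumes a polars version with pl.int_range/pl.len, and with_row_count is the older spelling of with_row_index, as in the diff:

import polars as pl

ms2 = pl.DataFrame({
    "consensus_uid": [1, 1, 1, 2, 2],
    "energy":        [20, 20, 20, 20, 20],
    "number_frags":  [5, None, 12, 3, 8],
    "prec_inty":     [1e4, 2e4, 5e3, None, 1e3],
})
max_replicates = 2

compressed = (
    ms2.with_columns(
        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0))
        .alias("ranking_score"),
    )
    .with_row_count("row_id")  # stable tie-break for equal scores
    .sort(
        ["consensus_uid", "energy", "ranking_score", "row_id"],
        descending=[False, False, True, False],
    )
    # 0-based position within each sorted group is the rank.
    .with_columns(
        pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
    )
    .filter(pl.col("rank") < max_replicates)
    .drop(["ranking_score", "row_id", "rank"])
)
print(compressed)  # at most 2 rows per (consensus_uid, energy)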
@@ -1351,7 +1422,9 @@ def compress_chrom(self):
         pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
 
-    self.logger.info(
+    self.logger.info(
+        f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df",
+    )
 
 
 # =====================================================================================
@@ -1402,7 +1475,9 @@ def sample_name_replace(self, replace_dict):
         if name in replace_dict:
             new_names.append(replace_dict[name])
             replaced_count += 1
-            self.logger.debug(
+            self.logger.debug(
+                f"Replacing sample name: '{name}' -> '{replace_dict[name]}'",
+            )
         else:
             new_names.append(name)
 
@@ -1415,7 +1490,9 @@ def sample_name_replace(self, replace_dict):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
+        raise ValueError(
+            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
+        )
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
@@ -1464,7 +1541,9 @@ def sample_name_reset(self):
             name_without_ext = os.path.splitext(name_without_ext)[0]
 
         new_names.append(name_without_ext)
-        self.logger.debug(
+        self.logger.debug(
+            f"Resetting sample name from path: '{path}' -> '{name_without_ext}'",
+        )
 
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
@@ -1475,14 +1554,18 @@ def sample_name_reset(self):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
+        raise ValueError(
+            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
+        )
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
 
-    self.logger.info(
+    self.logger.info(
+        f"Successfully reset {len(new_names)} sample names from sample paths",
+    )
 
 
 def set_source(self, filename):
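Both renaming helpers above collect every would-be duplicate before raising, so the error names all offenders rather than just the first. The validation on its own, as a hypothetical standalone function:

def check_unique(new_names):
    # Collect every name that occurs more than once, in first-seen order.
    seen = set()
    duplicates = []
    for name in new_names:
        if name in seen:
            duplicates.append(name)
        else:
            seen.add(name)
    if duplicates:
        raise ValueError(
            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
        )

check_unique(["QC_1", "QC_2", "blank"])   # passes silently
# check_unique(["QC_1", "QC_2", "QC_1"])  # would raise, naming ['QC_1']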
@@ -1512,11 +1595,15 @@ def set_source(self, filename):
 
     new_sources = []
 
-    for i, (current_source, sample_name) in enumerate(
+    for i, (current_source, sample_name) in enumerate(
+        zip(current_sources, sample_names),
+    ):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
             if current_source is None or current_source == "":
-                self.logger.warning(
+                self.logger.warning(
+                    f"Cannot build path for sample '{sample_name}': no current file_source available",
+                )
                 new_sources.append(current_source)
                 failed_count += 1
                 continue
@@ -1531,7 +1618,9 @@ def set_source(self, filename):
 
         # Check if the new file exists
         if not os.path.exists(new_file_path):
-            self.logger.warning(
+            self.logger.warning(
+                f"File does not exist for sample '{sample_name}': {new_file_path}",
+            )
             new_sources.append(current_source)
             failed_count += 1
             continue
@@ -1541,7 +1630,9 @@ def set_source(self, filename):
         updated_count += 1
 
         # Log individual updates at debug level
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}",
+        )
 
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
@@ -1636,7 +1727,9 @@ def features_select(
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             min_mz, max_mz = mz
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+            )
         else:
             filter_conditions.append(pl.col("mz") >= mz)
 
@@ -1644,7 +1737,9 @@ def features_select(
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
             min_rt, max_rt = rt
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+            )
         else:
             filter_conditions.append(pl.col("rt") >= rt)
 
@@ -1652,7 +1747,9 @@ def features_select(
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
             min_inty, max_inty = inty
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
+            )
         else:
             filter_conditions.append(pl.col("inty") >= inty)
 
@@ -1662,7 +1759,10 @@ def features_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_uid") >= min_uid)
+                & (pl.col("sample_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -1692,7 +1792,10 @@ def features_select(
         if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
             # Treat as range
             min_uid, max_uid = consensus_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("consensus_uid") >= min_uid)
+                & (pl.col("consensus_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
@@ -1705,7 +1808,10 @@ def features_select(
         if len(feature_uid) == 2 and not isinstance(feature_uid, list):
             # Treat as range
             min_uid, max_uid = feature_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("feature_uid") >= min_uid)
+                & (pl.col("feature_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
@@ -1727,7 +1833,10 @@ def features_select(
         if "quality" in available_columns:
             if isinstance(quality, tuple) and len(quality) == 2:
                 min_quality, max_quality = quality
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("quality") >= min_quality)
+                    & (pl.col("quality") <= max_quality),
+                )
             else:
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
@@ -1739,7 +1848,8 @@ def features_select(
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
                 filter_conditions.append(
-                    (pl.col("chrom_coherence") >= min_coherence)
+                    (pl.col("chrom_coherence") >= min_coherence)
+                    & (pl.col("chrom_coherence") <= max_coherence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1752,7 +1862,8 @@ def features_select(
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
                 filter_conditions.append(
-                    (pl.col("chrom_prominence") >= min_prominence)
+                    (pl.col("chrom_prominence") >= min_prominence)
+                    & (pl.col("chrom_prominence") <= max_prominence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1762,14 +1873,19 @@ def features_select(
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
-            if
+            if (
+                isinstance(chrom_prominence_scaled, tuple)
+                and len(chrom_prominence_scaled) == 2
+            ):
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
                 filter_conditions.append(
                     (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
                 )
             else:
-                filter_conditions.append(
+                filter_conditions.append(
+                    pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
+                )
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
 
@@ -1783,7 +1899,9 @@ def features_select(
                     & (pl.col("chrom_height_scaled") <= max_height_scaled),
                 )
             else:
-                filter_conditions.append(
+                filter_conditions.append(
+                    pl.col("chrom_height_scaled") >= chrom_height_scaled,
+                )
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
 
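Every filter above repeats one convention: a 2-tuple means an inclusive range, a scalar means a minimum. A hypothetical helper expressing that convention once (masster itself keeps the branches inline, as shown):

import polars as pl

def range_or_min(col, value):
    """2-tuple -> inclusive [lo, hi] range; scalar -> minimum threshold."""
    if isinstance(value, tuple) and len(value) == 2:
        lo, hi = value
        return (pl.col(col) >= lo) & (pl.col(col) <= hi)
    return pl.col(col) >= value

df = pl.DataFrame({"mz": [100.0, 250.0, 400.0], "rt": [5.0, 60.0, 300.0]})
# Range for mz, plain minimum for rt; conditions combine with &.
print(df.filter(range_or_min("mz", (200.0, 500.0)) & range_or_min("rt", 30.0)))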
@@ -1896,7 +2014,7 @@ def monkey_patch_study():
     as `features_select_original` if not already set, then replaces Study.features_select
     with the optimized `features_select` defined above. This function is idempotent.
     """
-    from
+    from masster.study.study import Study
 
     # Only set original if it doesn't exist yet
     if not hasattr(Study, "features_select_original"):
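monkey_patch_study stores the original method only on the first call, which is what makes the patch idempotent. A self-contained sketch with a dummy class standing in for Study:

class Study:
    def features_select(self):
        return "original"

def features_select(self):
    return "optimized"

def monkey_patch_study():
    # Save a handle to the original only on the first call, so repeated
    # patching can never overwrite it with the optimized replacement.
    if not hasattr(Study, "features_select_original"):
        Study.features_select_original = Study.features_select
    Study.features_select = features_select

monkey_patch_study()
monkey_patch_study()  # idempotent: the saved original survives
print(Study().features_select())           # optimized
print(Study().features_select_original())  # original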
@@ -1969,9 +2087,14 @@ def features_filter(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df =
+        self.consensus_mapping_df = (
+            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
+        )
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -1984,7 +2107,9 @@ def features_filter(self, features):
             f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Kept {final_count} features. Filtered out {removed_count} features.",
+        )
 
 
 def features_delete(self, features):
@@ -2046,9 +2171,14 @@ def features_delete(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df =
+        self.consensus_mapping_df = (
+            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
+        )
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2061,7 +2191,9 @@ def features_delete(self, features):
             f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Deleted {removed_count} features. Remaining features: {final_count}",
+        )
 
 
 def consensus_select(
@@ -2134,7 +2266,9 @@ def consensus_select(
         else:
             # Standard (min_mz, max_mz) format
             min_mz, max_mz = mz
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+            )
     else:
         # Single float value - use default mz tolerance from study parameters
         default_mz_tol = getattr(self, "parameters", None)
@@ -2142,13 +2276,15 @@ def consensus_select(
             default_mz_tol = default_mz_tol.eic_mz_tol
         else:
             # Fallback to align_defaults if study parameters not available
-            from
+            from masster.study.defaults.align_def import align_defaults
 
             default_mz_tol = align_defaults().mz_max_diff
 
         min_mz = mz - default_mz_tol
         max_mz = mz + default_mz_tol
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+        )
 
     self.logger.debug(
         f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2168,7 +2304,9 @@ def consensus_select(
         else:
             # Standard (min_rt, max_rt) format
             min_rt, max_rt = rt
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+            )
     else:
         # Single float value - use default rt tolerance from study parameters
         default_rt_tol = getattr(self, "parameters", None)
@@ -2176,13 +2314,15 @@ def consensus_select(
             default_rt_tol = default_rt_tol.eic_rt_tol
         else:
             # Fallback to align_defaults if study parameters not available
-            from
+            from masster.study.defaults.align_def import align_defaults
 
             default_rt_tol = align_defaults().rt_tol
 
         min_rt = rt - default_rt_tol
         max_rt = rt + default_rt_tol
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+        )
 
     self.logger.debug(
         f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
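When mz or rt arrives as a single value, consensus_select widens it into an inclusive window using the study tolerance (or the align_defaults fallback). The arithmetic on toy data; the 0.01 tolerance is an invented stand-in for parameters.eic_mz_tol / align_defaults().mz_max_diff:

import polars as pl

consensus = pl.DataFrame({"mz": [150.001, 150.020, 151.500]})

mz = 150.0
default_mz_tol = 0.01  # invented stand-in for the study/default tolerance

# Single value -> inclusive [mz - tol, mz + tol] window.
min_mz = mz - default_mz_tol
max_mz = mz + default_mz_tol
hits = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
print(hits)  # keeps only 150.001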
@@ -2193,7 +2333,9 @@ def consensus_select(
     consensus_len_before_filter = len(consensus)
     if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
         min_inty, max_inty = inty_mean
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
+        )
     else:
         consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
     self.logger.debug(
@@ -2208,11 +2350,14 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid)
+                (pl.col("consensus_uid") >= min_uid)
+                & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                pl.col("consensus_uid").is_in(consensus_uid),
+            )
     else:
         consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
     self.logger.debug(
@@ -2236,7 +2381,8 @@ def consensus_select(
     if isinstance(number_samples, tuple) and len(number_samples) == 2:
         min_samples, max_samples = number_samples
         consensus = consensus.filter(
-            (pl.col("number_samples") >= min_samples)
+            (pl.col("number_samples") >= min_samples)
+            & (pl.col("number_samples") <= max_samples),
         )
     else:
         consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2250,7 +2396,10 @@ def consensus_select(
     if "number_ms2" in consensus.columns:
         if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
             min_ms2, max_ms2 = number_ms2
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("number_ms2") >= min_ms2)
+                & (pl.col("number_ms2") <= max_ms2),
+            )
         else:
             consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
     else:
@@ -2264,7 +2413,9 @@ def consensus_select(
     consensus_len_before_filter = len(consensus)
     if isinstance(quality, tuple) and len(quality) == 2:
         min_quality, max_quality = quality
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
+        )
     else:
         consensus = consensus.filter(pl.col("quality") >= quality)
     self.logger.debug(
@@ -2277,7 +2428,9 @@ def consensus_select(
     if "bl" in consensus.columns:
         if isinstance(bl, tuple) and len(bl) == 2:
             min_bl, max_bl = bl
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
+            )
         else:
             consensus = consensus.filter(pl.col("bl") >= bl)
     else:
@@ -2290,16 +2443,23 @@ def consensus_select(
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_coherence_mean, tuple)
+                and len(chrom_coherence_mean) == 2
+            ):
                 min_coherence, max_coherence = chrom_coherence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_coherence_mean") >= min_coherence)
                     & (pl.col("chrom_coherence_mean") <= max_coherence),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_coherence_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2308,16 +2468,23 @@ def consensus_select(
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_prominence_mean, tuple)
+                and len(chrom_prominence_mean) == 2
+            ):
                 min_prominence, max_prominence = chrom_prominence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_mean") >= min_prominence)
                     & (pl.col("chrom_prominence_mean") <= max_prominence),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_prominence_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2326,16 +2493,26 @@ def consensus_select(
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
-            if
-
+            if (
+                isinstance(chrom_prominence_scaled_mean, tuple)
+                and len(chrom_prominence_scaled_mean) == 2
+            ):
+                min_prominence_scaled, max_prominence_scaled = (
+                    chrom_prominence_scaled_mean
+                )
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_prominence_scaled_mean")
+                    >= chrom_prominence_scaled_mean,
+                )
        else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_prominence_scaled_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2344,16 +2521,23 @@ def consensus_select(
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_height_scaled_mean, tuple)
+                and len(chrom_height_scaled_mean) == 2
+            ):
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
                     & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_height_scaled_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2365,7 +2549,8 @@ def consensus_select(
     if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
         min_rt_delta, max_rt_delta = rt_delta_mean
         consensus = consensus.filter(
-            (pl.col("rt_delta_mean") >= min_rt_delta)
+            (pl.col("rt_delta_mean") >= min_rt_delta)
+            & (pl.col("rt_delta_mean") <= max_rt_delta),
         )
     else:
         consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2376,9 +2561,13 @@ def consensus_select(
         )
 
     if len(consensus) == 0:
-        self.logger.warning(
+        self.logger.warning(
+            "No consensus features remaining after applying selection criteria.",
+        )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
+        )
 
     # Sort the results if sortby is specified
     if sortby is not None:
@@ -2387,19 +2576,25 @@ def consensus_select(
         if sortby in consensus.columns:
             consensus = consensus.sort(sortby, descending=descending)
         else:
-            self.logger.warning(
+            self.logger.warning(
+                f"Sort column '{sortby}' not found in consensus DataFrame",
+            )
     elif isinstance(sortby, (list, tuple)):
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
 
         if invalid_columns:
-            self.logger.warning(
+            self.logger.warning(
+                f"Sort columns not found in consensus DataFrame: {invalid_columns}",
+            )
 
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
     else:
-        self.logger.warning(
+        self.logger.warning(
+            f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
+        )
 
     return consensus
 
@@ -2444,7 +2639,10 @@ def consensus_filter(self, consensus):
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         feature_uids_to_remove = self.consensus_mapping_df.filter(
             pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
@@ -2455,27 +2653,42 @@ def consensus_filter(self, consensus):
     )
 
     # Remove from consensus_mapping_df
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
-            self.logger.debug(
+            self.logger.debug(
+                f"Removed {removed_mapping_count} entries from consensus_mapping_df",
+            )
 
     # Remove corresponding features from features_df
-    if
+    if (
+        feature_uids_to_remove
+        and self.features_df is not None
+        and not self.features_df.is_empty()
+    ):
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
-            self.logger.debug(
+            self.logger.debug(
+                f"Removed {removed_features_count} entries from features_df",
+            )
 
     # Remove from consensus_ms2 if it exists
-    if
+    if (
+        hasattr(self, "consensus_ms2")
+        and self.consensus_ms2 is not None
+        and not self.consensus_ms2.is_empty()
+    ):
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
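consensus_filter cascades one removal through the linked frames: resolve the affected feature_uids via the mapping table, then anti-filter each DataFrame. A toy sketch of the cascade across a mapping and a feature table:

import polars as pl

consensus_mapping = pl.DataFrame(
    {"consensus_uid": [1, 1, 2, 3], "feature_uid": [11, 12, 21, 31]},
)
features = pl.DataFrame({"feature_uid": [11, 12, 21, 31, 99]})

consensus_uids_to_remove = [1, 3]

# Resolve which features are reachable through the removed consensus rows.
feature_uids_to_remove = consensus_mapping.filter(
    pl.col("consensus_uid").is_in(consensus_uids_to_remove),
)["feature_uid"].to_list()

# Cascade: drop the mapping rows, then drop the now-orphaned features.
consensus_mapping = consensus_mapping.filter(
    ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
)
features = features.filter(~pl.col("feature_uid").is_in(feature_uids_to_remove))

print(consensus_mapping)  # only consensus_uid 2 remains
print(features)           # feature_uids 21 and 99 remain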
@@ -2575,7 +2788,10 @@ def samples_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_uid") >= min_uid)
+                & (pl.col("sample_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -2617,7 +2833,8 @@ def samples_select(
             # Treat as range
             min_batch, max_batch = sample_batch
             filter_conditions.append(
-                (pl.col("sample_batch") >= min_batch)
+                (pl.col("sample_batch") >= min_batch)
+                & (pl.col("sample_batch") <= max_batch),
             )
         else:
             # Treat as list
@@ -2635,11 +2852,14 @@ def samples_select(
             # Treat as range
             min_seq, max_seq = sample_sequence
             filter_conditions.append(
-                (pl.col("sample_sequence") >= min_seq)
+                (pl.col("sample_sequence") >= min_seq)
+                & (pl.col("sample_sequence") <= max_seq),
             )
         else:
             # Treat as list
-            filter_conditions.append(
+            filter_conditions.append(
+                pl.col("sample_sequence").is_in(sample_sequence),
+            )
     else:
         filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
 else:
@@ -2651,7 +2871,8 @@ def samples_select(
     if isinstance(num_features, tuple) and len(num_features) == 2:
         min_features, max_features = num_features
         filter_conditions.append(
-            (pl.col("num_features") >= min_features)
+            (pl.col("num_features") >= min_features)
+            & (pl.col("num_features") <= max_features),
         )
     else:
         filter_conditions.append(pl.col("num_features") >= num_features)
@@ -2663,7 +2884,9 @@ def samples_select(
     if "num_ms1" in available_columns:
         if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
             min_ms1, max_ms1 = num_ms1
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1),
+            )
         else:
             filter_conditions.append(pl.col("num_ms1") >= num_ms1)
     else:
@@ -2674,7 +2897,9 @@ def samples_select(
     if "num_ms2" in available_columns:
         if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
             min_ms2, max_ms2 = num_ms2
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2),
+            )
         else:
             filter_conditions.append(pl.col("num_ms2") >= num_ms2)
     else:
@@ -2766,7 +2991,9 @@ def samples_delete(self, samples):
     if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
         sample_uids_to_remove = list(sample_uids_set)
 
-    self.logger.info(
+    self.logger.info(
+        f"Deleting {len(sample_uids_to_remove)} samples and all related data...",
+    )
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
@@ -2794,7 +3021,11 @@ def samples_delete(self, samples):
 
     # 2. Remove corresponding features from features_df
     removed_features_count = 0
-    if
+    if (
+        feature_uids_to_remove
+        and self.features_df is not None
+        and not self.features_df.is_empty()
+    ):
         self.features_df = self.features_df.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
         )
@@ -2802,7 +3033,11 @@ def samples_delete(self, samples):
 
     # 3. Remove from consensus_mapping_df
     removed_mapping_count = 0
-    if
+    if (
+        feature_uids_to_remove
+        and self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -2811,7 +3046,11 @@ def samples_delete(self, samples):
 
     # 4. Remove from consensus_ms2 if it exists
     removed_ms2_count = 0
-    if
+    if (
+        hasattr(self, "consensus_ms2")
+        and self.consensus_ms2 is not None
+        and not self.consensus_ms2.is_empty()
+    ):
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -2820,7 +3059,11 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if
+    if (
+        hasattr(self, "feature_maps")
+        and self.feature_maps is not None
+        and map_ids_to_remove
+    ):
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2861,7 +3104,9 @@ def samples_delete(self, samples):
 
     # Update map_id indices if needed
     if removed_maps_count > 0 and final_sample_count > 0:
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated map_id values to range from 0 to {final_sample_count - 1}",
+        )
 
 
 # =====================================================================================
@@ -3032,7 +3277,9 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
3032
3277
|
)
|
|
3033
3278
|
|
|
3034
3279
|
if isinstance(by, list):
|
|
3035
|
-
self.logger.debug(
|
|
3280
|
+
self.logger.debug(
|
|
3281
|
+
f"Set sample colors using provided color list ({len(by)} colors)",
|
|
3282
|
+
)
|
|
3036
3283
|
elif by is None:
|
|
3037
3284
|
self.logger.debug(f"Set sequential sample colors using {palette} palette")
|
|
3038
3285
|
else:
|
|
@@ -3067,7 +3314,9 @@ def sample_color_reset(self):
|
|
|
3067
3314
|
# Distribute samples evenly across the full colormap range
|
|
3068
3315
|
for i in range(n_samples):
|
|
3069
3316
|
# Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
|
|
3070
|
-
normalized_value = (
|
|
3317
|
+
normalized_value = (
|
|
3318
|
+
i + 0.5
|
|
3319
|
+
) / n_samples # +0.5 to center samples in their bins
|
|
3071
3320
|
# Optionally, map to a subset of colormap to avoid extreme colors
|
|
3072
3321
|
# Use 10% to 90% of colormap range for better color diversity
|
|
3073
3322
|
normalized_value = 0.1 + (normalized_value * 0.8)
|
|
@@ -3088,10 +3337,14 @@ def sample_color_reset(self):
|
|
|
3088
3337
|
pl.Series("sample_color", colors).alias("sample_color"),
|
|
3089
3338
|
)
|
|
3090
3339
|
|
|
3091
|
-
self.logger.debug(
|
|
3340
|
+
self.logger.debug(
|
|
3341
|
+
f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
|
|
3342
|
+
)
|
|
3092
3343
|
|
|
3093
3344
|
except ImportError:
|
|
3094
|
-
self.logger.error(
|
|
3345
|
+
self.logger.error(
|
|
3346
|
+
"cmap library is required for sample color reset. Install with: uv add cmap",
|
|
3347
|
+
)
|
|
3095
3348
|
except Exception as e:
|
|
3096
3349
|
self.logger.error(f"Failed to reset sample colors: {e}")
|
|
3097
3350
|
|
|
@@ -3112,7 +3365,9 @@ def _get_color_palette(palette_name):
|
|
|
3112
3365
|
try:
|
|
3113
3366
|
from cmap import Colormap
|
|
3114
3367
|
except ImportError:
|
|
3115
|
-
raise ValueError(
|
|
3368
|
+
raise ValueError(
|
|
3369
|
+
"cmap library is required for color palettes. Install with: pip install cmap",
|
|
3370
|
+
)
|
|
3116
3371
|
|
|
3117
3372
|
# Map common palette names to cmap names
|
|
3118
3373
|
palette_mapping = {
|
|
@@ -3207,7 +3462,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
     try:
         from cmap import Colormap
     except ImportError:
-        raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
+        raise ValueError(
+            "cmap library is required for color palettes. Install with: pip install cmap",
+        )
 
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
@@ -3245,7 +3502,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
     # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
     for i in range(n_colors):
         # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-        normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
+        normalized_value = (
+            i + 0.5
+        ) / n_colors  # +0.5 to center samples in their bins
         # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
         normalized_value = 0.1 + (normalized_value * 0.8)
 
@@ -3290,7 +3549,7 @@ def _ensure_features_df_schema_order(self):
     try:
         import os
         import json
-        from 
+        from masster.study.h5 import _reorder_columns_by_schema
 
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
@@ -3298,7 +3557,11 @@ def _ensure_features_df_schema_order(self):
             schema = json.load(f)
 
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+        self.features_df = _reorder_columns_by_schema(
+            self.features_df,
+            schema,
+            "features_df",
+        )
 
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
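`_reorder_columns_by_schema` lives in masster/study/h5.py and is not shown in this diff; the following is a hedged sketch of what a schema-driven reorder plausibly looks like in polars. The schema layout and the helper's exact signature here are assumptions, not the package's actual ones:

import polars as pl

def reorder_columns_by_schema(df, schema, table_name):
    # Hypothetical: schema[table_name]["columns"] lists the canonical order;
    # known columns go first in that order, unknown columns stay at the end.
    wanted = [c for c in schema.get(table_name, {}).get("columns", []) if c in df.columns]
    extras = [c for c in df.columns if c not in wanted]
    return df.select(wanted + extras)

df = pl.DataFrame({"rt": [1.0], "mz": [100.0], "feature_uid": ["f1"]})
schema = {"features_df": {"columns": ["feature_uid", "mz", "rt"]}}
print(reorder_columns_by_schema(df, schema, "features_df").columns)  # ['feature_uid', 'mz', 'rt']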
@@ -3340,17 +3603,19 @@ def migrate_map_id_to_index(self):
         # Ensure the column is Int64 type
         self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
 
-        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
+        self.logger.info(
+            f"Successfully migrated {sample_count} samples to indexed map_id format",
+        )
         self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
 
 
 def restore_ms2(self, samples=None, **kwargs):
     """
     Restore MS2 data by re-running find_ms2 on specified samples.
-
+
     This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
     from the original sample files. Use this to reverse the effects of compress_ms2().
-
+
     Parameters:
         samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
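The migration above reports map_id running from 0 to sample_count - 1. A minimal sketch of producing such an indexed Int64 column with polars (illustrative only; masster's actual migration logic is in the surrounding function):

import polars as pl

samples_df = pl.DataFrame({"sample_name": ["s1", "s2", "s3"]})
samples_df = samples_df.with_columns(
    pl.int_range(0, pl.len()).cast(pl.Int64).alias("map_id"),
)
print(samples_df["map_id"].to_list())  # [0, 1, 2]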
@@ -3360,31 +3625,37 @@ def restore_ms2(self, samples=None, **kwargs):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
-
+
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.error("No samples_df found in study.")
         return
-
-    # Get sample_uids to process
+
+    # Get sample_uids to process
     sample_uids = self._get_sample_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
-
+
     self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
-
+
     # Clear existing consensus_ms2 to rebuild from scratch
-    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    initial_ms2_count = (
+        len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    )
     self.consensus_ms2 = pl.DataFrame()
-
+
     # Re-run find_ms2 which will rebuild consensus_ms2
     try:
         self.find_ms2(**kwargs)
-
-        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-
-        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
-
+
+        final_ms2_count = (
+            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        )
+
+        self.logger.info(
+            f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
+        )
+
     except Exception as e:
         self.logger.error(f"Failed to restore MS2 data: {e}")
         raise
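Usage sketch for the function above (assuming a loaded Study instance named study; keyword names taken from the docstrings in this file):

# Rebuild consensus_ms2 for two samples, forwarding find_ms2 options.
study.restore_ms2(samples=["sample1", "sample2"], mz_tol=0.01)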
@@ -3393,51 +3664,51 @@ def restore_ms2(self, samples=None, **kwargs):
 def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
     """
     Reverse any compression effects by restoring compressed data adaptively.
-
+
     This function restores data that was compressed using compress(), compress_features(),
     compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
     decompression process for speed by only processing what actually needs restoration.
-
+
     Parameters:
         features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
-        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
         chrom (bool): Restore chromatogram objects
         samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
         **kwargs: Additional keyword arguments for restoration functions:
             - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
-
+
     Performance Optimizations:
         - Adaptive processing: Only restores what actually needs restoration
         - Processes features and chromatograms together when possible (shared file I/O)
         - Uses cached sample instances to avoid repeated file loading
        - Processes MS2 restoration last as it's the most computationally expensive
        - Provides detailed progress information for long-running operations
-
+
     Example:
         # Restore everything (but only what needs restoration)
         study.decompress()
-
+
         # Restore only chromatograms with custom tolerances
         study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
-
+
         # Restore specific samples only
         study.decompress(samples=["sample1", "sample2"])
     """
     if not any([features, ms2, chrom]):
         self.logger.warning("No decompression operations specified.")
         return
-
+
     # Get sample_uids to process
     sample_uids = self._get_sample_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
-
+
     # Adaptively check what actually needs to be done
     import polars as pl
-
+
     # Check if features need restoration (more sophisticated logic)
     features_need_restoration = False
     if features and not self.features_df.is_empty():
@@ -3446,7 +3717,7 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
         for col in ["ms2_scans", "ms2_specs"]:
             if col not in self.features_df.columns:
                 missing_cols.append(col)
-
+
         # If columns are missing entirely, we likely need restoration
         if missing_cols:
             features_need_restoration = True
@@ -3455,13 +3726,15 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
         # But be smart about it - only check if we have consensus features with MS2
         if not self.consensus_ms2.is_empty():
             # We have MS2 data, so ms2_specs should have some content
-            null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+            null_ms2_specs = self.features_df.filter(
+                pl.col("ms2_specs").is_null(),
+            ).height
             total_features = len(self.features_df)
             # If more than 90% are null but we have MS2 data, likely compressed
             if null_ms2_specs > (total_features * 0.9):
                 features_need_restoration = True
-
-    # Check if chromatograms need restoration
+
+    # Check if chromatograms need restoration
     chrom_need_restoration = False
     if chrom and not self.features_df.is_empty():
         if "chrom" not in self.features_df.columns:
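The 90%-null heuristic above can be checked in isolation; a minimal sketch:

import polars as pl

features_df = pl.DataFrame({"ms2_specs": [None] * 9 + ["spec"]})
null_ms2_specs = features_df.filter(pl.col("ms2_specs").is_null()).height
total_features = len(features_df)
print(null_ms2_specs > total_features * 0.9)  # False: 9 nulls of 10 is not strictly > 90%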
@@ -3472,22 +3745,26 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
             total_features = len(self.features_df)
             # If more than 50% are null, likely need restoration
             chrom_need_restoration = null_chroms > (total_features * 0.5)
-
+
     # Check if MS2 data might need restoration (compare expected vs actual)
     ms2_need_restoration = False
     if ms2:
-        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
-
+        current_ms2_count = (
+            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        )
+        consensus_count = (
+            len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+        )
+
         if consensus_count > 0:
             # Calculate expected MS2 count based on consensus features with MS2 potential
             # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
             expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
             expected_ms2 = consensus_count * expected_ratio
-
+
             if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
                 ms2_need_restoration = True
-
+
     # Build list of operations that actually need to be done
     operations_needed = []
     if features and features_need_restoration:
|
|
|
3496
3773
|
operations_needed.append("chromatograms")
|
|
3497
3774
|
if ms2 and ms2_need_restoration:
|
|
3498
3775
|
operations_needed.append("MS2 spectra")
|
|
3499
|
-
|
|
3776
|
+
|
|
3500
3777
|
# Early exit if nothing needs to be done
|
|
3501
3778
|
if not operations_needed:
|
|
3502
|
-
self.logger.info(
|
|
3779
|
+
self.logger.info(
|
|
3780
|
+
"All data appears to be already decompressed. No operations needed.",
|
|
3781
|
+
)
|
|
3503
3782
|
return
|
|
3504
|
-
|
|
3505
|
-
self.logger.info(
|
|
3506
|
-
|
|
3783
|
+
|
|
3784
|
+
self.logger.info(
|
|
3785
|
+
f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples",
|
|
3786
|
+
)
|
|
3787
|
+
|
|
3507
3788
|
try:
|
|
3508
3789
|
# Phase 1: Restore features and chromatograms together (shared file I/O)
|
|
3509
|
-
if
|
|
3510
|
-
self.logger.info(
|
|
3511
|
-
|
|
3790
|
+
if "features" in operations_needed and "chromatograms" in operations_needed:
|
|
3791
|
+
self.logger.info(
|
|
3792
|
+
"Phase 1: Restoring features and chromatograms together...",
|
|
3793
|
+
)
|
|
3794
|
+
|
|
3512
3795
|
# Extract relevant kwargs for restore_features and restore_chrom
|
|
3513
3796
|
restore_kwargs = {}
|
|
3514
|
-
if
|
|
3515
|
-
restore_kwargs[
|
|
3516
|
-
if
|
|
3517
|
-
restore_kwargs[
|
|
3518
|
-
|
|
3797
|
+
if "mz_tol" in kwargs:
|
|
3798
|
+
restore_kwargs["mz_tol"] = kwargs["mz_tol"]
|
|
3799
|
+
if "rt_tol" in kwargs:
|
|
3800
|
+
restore_kwargs["rt_tol"] = kwargs["rt_tol"]
|
|
3801
|
+
|
|
3519
3802
|
# Restore features first (includes chrom column)
|
|
3520
3803
|
self.restore_features(samples=samples)
|
|
3521
|
-
|
|
3804
|
+
|
|
3522
3805
|
# Then do additional chrom gap-filling if needed
|
|
3523
3806
|
self.restore_chrom(samples=samples, **restore_kwargs)
|
|
3524
|
-
|
|
3525
|
-
elif (
|
|
3807
|
+
|
|
3808
|
+
elif (
|
|
3809
|
+
"features" in operations_needed and "chromatograms" not in operations_needed
|
|
3810
|
+
):
|
|
3526
3811
|
self.logger.info("Phase 1: Restoring features data...")
|
|
3527
3812
|
self.restore_features(samples=samples)
|
|
3528
|
-
|
|
3529
|
-
elif (
|
|
3813
|
+
|
|
3814
|
+
elif (
|
|
3815
|
+
"chromatograms" in operations_needed and "features" not in operations_needed
|
|
3816
|
+
):
|
|
3530
3817
|
self.logger.info("Phase 1: Restoring chromatograms...")
|
|
3531
3818
|
restore_kwargs = {}
|
|
3532
|
-
if
|
|
3533
|
-
restore_kwargs[
|
|
3534
|
-
if
|
|
3535
|
-
restore_kwargs[
|
|
3819
|
+
if "mz_tol" in kwargs:
|
|
3820
|
+
restore_kwargs["mz_tol"] = kwargs["mz_tol"]
|
|
3821
|
+
if "rt_tol" in kwargs:
|
|
3822
|
+
restore_kwargs["rt_tol"] = kwargs["rt_tol"]
|
|
3536
3823
|
self.restore_chrom(samples=samples, **restore_kwargs)
|
|
3537
|
-
|
|
3824
|
+
|
|
3538
3825
|
# Phase 2: Restore MS2 data (most computationally expensive, done last)
|
|
3539
3826
|
if "MS2 spectra" in operations_needed:
|
|
3540
3827
|
self.logger.info("Phase 2: Restoring MS2 spectra...")
|
|
3541
|
-
|
|
3828
|
+
|
|
3542
3829
|
# Extract MS2-specific kwargs
|
|
3543
3830
|
ms2_kwargs = {}
|
|
3544
3831
|
for key, value in kwargs.items():
|
|
3545
|
-
if key in [
|
|
3832
|
+
if key in [
|
|
3833
|
+
"mz_tol",
|
|
3834
|
+
"centroid",
|
|
3835
|
+
"deisotope",
|
|
3836
|
+
"dia_stats",
|
|
3837
|
+
"feature_uid",
|
|
3838
|
+
]:
|
|
3546
3839
|
ms2_kwargs[key] = value
|
|
3547
|
-
|
|
3840
|
+
|
|
3548
3841
|
self.restore_ms2(samples=samples, **ms2_kwargs)
|
|
3549
|
-
|
|
3842
|
+
|
|
3550
3843
|
self.logger.info("Adaptive decompression completed successfully")
|
|
3551
|
-
|
|
3844
|
+
|
|
3552
3845
|
except Exception as e:
|
|
3553
3846
|
self.logger.error(f"Decompression failed: {e}")
|
|
3554
3847
|
raise
|
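The kwargs routing in Phase 2 above is a plain allow-list filter; the same thing as a one-line dict comprehension (equivalent sketch, key list copied from the diff):

kwargs = {"mz_tol": 0.01, "rt_tol": 10.0, "verbose": True}
allowed = ["mz_tol", "centroid", "deisotope", "dia_stats", "feature_uid"]
ms2_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
print(ms2_kwargs)  # {'mz_tol': 0.01}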