masster 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/data/libs/README.md +17 -0
- masster/data/libs/ccm.py +533 -0
- masster/data/libs/central_carbon_README.md +17 -0
- masster/data/libs/central_carbon_metabolites.csv +120 -0
- masster/data/libs/urine.py +333 -0
- masster/data/libs/urine_metabolites.csv +51 -0
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/lib.py +32 -25
- masster/sample/load.py +9 -3
- masster/sample/plot.py +113 -27
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +450 -196
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +691 -277
- masster/study/processing.py +9 -5
- masster/study/study.py +6 -6
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/RECORD +31 -25
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
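Note: both `get_bpc` and `get_tic` wrap the `Chromatogram` construction in a try/except that prefers `Series.to_numpy()` and falls back to the legacy `.values` accessor. A minimal standalone sketch of that compatibility pattern (the `series` variable is illustrative):

```python
import pandas as pd

series = pd.Series([1.0, 2.0, 3.0])
try:
    arr = series.to_numpy()  # preferred API since pandas 0.24
except Exception:
    arr = series.values  # legacy accessor kept as a fallback
```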
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
    if mz_tol is None:
-        if hasattr(owner,
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
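Note: based on the signature and the tolerance fallback shown in this hunk, a hedged usage sketch of `get_eic` (the `study` object and values are illustrative, and the import path assumes the helper is importable from `masster.study.helpers`):

```python
from masster.study.helpers import get_eic

# mz_tol=None falls back to study.parameters.eic_mz_tol, then to 0.01
chrom = get_eic(study, sample="QC_01", mz=180.0634, rt_unit="s")
```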
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
|
|
|
267
285
|
mz_min = mz - mz_tol
|
|
268
286
|
mz_max = mz + mz_tol
|
|
269
287
|
eic_data = s.ms1_df.filter(
|
|
270
|
-
(pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
|
|
288
|
+
(pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
|
|
271
289
|
)
|
|
272
290
|
|
|
273
291
|
if eic_data.is_empty():
|
|
274
292
|
# Return empty chromatogram if no data found
|
|
275
293
|
import numpy as _np
|
|
294
|
+
|
|
276
295
|
return Chromatogram(
|
|
277
|
-
rt=_np.array([0.0]),
|
|
278
|
-
inty=_np.array([0.0]),
|
|
296
|
+
rt=_np.array([0.0]),
|
|
297
|
+
inty=_np.array([0.0]),
|
|
279
298
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
|
|
280
|
-
rt_unit=rt_unit
|
|
299
|
+
rt_unit=rt_unit,
|
|
281
300
|
)
|
|
282
301
|
|
|
283
302
|
# Aggregate intensities per retention time (sum in case of multiple points per rt)
|
|
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
|
|
|
290
309
|
if eic_pd.empty:
|
|
291
310
|
# Return empty chromatogram if no data found
|
|
292
311
|
import numpy as _np
|
|
312
|
+
|
|
293
313
|
return Chromatogram(
|
|
294
|
-
rt=_np.array([0.0]),
|
|
295
|
-
inty=_np.array([0.0]),
|
|
314
|
+
rt=_np.array([0.0]),
|
|
315
|
+
inty=_np.array([0.0]),
|
|
296
316
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
|
|
297
|
-
rt_unit=rt_unit
|
|
317
|
+
rt_unit=rt_unit,
|
|
298
318
|
)
|
|
299
319
|
|
|
300
320
|
# build Chromatogram
|
|
301
321
|
try:
|
|
302
322
|
chrom = Chromatogram(
|
|
303
|
-
rt=eic_pd["rt"].to_numpy(),
|
|
304
|
-
inty=eic_pd["inty"].to_numpy(),
|
|
323
|
+
rt=eic_pd["rt"].to_numpy(),
|
|
324
|
+
inty=eic_pd["inty"].to_numpy(),
|
|
305
325
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
|
|
306
|
-
rt_unit=rt_unit
|
|
326
|
+
rt_unit=rt_unit,
|
|
307
327
|
)
|
|
308
328
|
except Exception:
|
|
309
329
|
chrom = Chromatogram(
|
|
310
|
-
rt=eic_pd["rt"].values,
|
|
311
|
-
inty=eic_pd["inty"].values,
|
|
330
|
+
rt=eic_pd["rt"].values,
|
|
331
|
+
inty=eic_pd["inty"].values,
|
|
312
332
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
|
|
313
|
-
rt_unit=rt_unit
|
|
333
|
+
rt_unit=rt_unit,
|
|
314
334
|
)
|
|
315
335
|
|
|
316
336
|
return chrom
|
|
317
337
|
|
|
318
338
|
|
|
319
339
|
# =====================================================================================
|
|
320
|
-
# DATA RETRIEVAL AND MATRIX FUNCTIONS
|
|
340
|
+
# DATA RETRIEVAL AND MATRIX FUNCTIONS
|
|
321
341
|
# =====================================================================================
|
|
322
342
|
|
|
323
343
|
|
|
@@ -451,9 +471,9 @@ def align_reset(self):
|
|
|
451
471
|
self.alignment_ref_index = None
|
|
452
472
|
# in self.features_df, set rt equal to rt_original
|
|
453
473
|
self.features_df = self.features_df.with_columns(
|
|
454
|
-
pl.col("rt_original").alias("rt")
|
|
474
|
+
pl.col("rt_original").alias("rt"),
|
|
455
475
|
)
|
|
456
|
-
|
|
476
|
+
|
|
457
477
|
# Ensure column order is maintained after with_columns operation
|
|
458
478
|
self._ensure_features_df_schema_order()
|
|
459
479
|
|
|
@@ -614,7 +634,7 @@ def get_consensus_matches(self, uids=None):
|
|
|
614
634
|
return matches
|
|
615
635
|
|
|
616
636
|
|
|
617
|
-
# =====================================================================================
|
|
637
|
+
# =====================================================================================
|
|
618
638
|
# UID HELPER FUNCTIONS
|
|
619
639
|
# =====================================================================================
|
|
620
640
|
|
|
@@ -796,7 +816,7 @@ def get_sample(self, sample):
|
|
|
796
816
|
return cache[sample_uid]
|
|
797
817
|
|
|
798
818
|
sample_path = row.get("sample_path", None)
|
|
799
|
-
s = Sample(log_level=
|
|
819
|
+
s = Sample(log_level="ERROR")
|
|
800
820
|
try:
|
|
801
821
|
if sample_path:
|
|
802
822
|
try:
|
|
@@ -816,13 +836,13 @@ def get_orphans(self):
|
|
|
816
836
|
Get all features that are not in the consensus mapping.
|
|
817
837
|
"""
|
|
818
838
|
not_in_consensus = self.features_df.filter(
|
|
819
|
-
~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
|
|
839
|
+
~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
|
|
820
840
|
)
|
|
821
841
|
return not_in_consensus
|
|
822
842
|
|
|
823
843
|
|
|
824
844
|
# =====================================================================================
|
|
825
|
-
# DATA COMPRESSION AND RESTORATION FUNCTIONS
|
|
845
|
+
# DATA COMPRESSION AND RESTORATION FUNCTIONS
|
|
826
846
|
# =====================================================================================
|
|
827
847
|
|
|
828
848
|
|
|
@@ -878,7 +898,7 @@ def compress_features(self):
|
|
|
878
898
|
|
|
879
899
|
removed_count = initial_count - len(self.features_df)
|
|
880
900
|
self.logger.info(
|
|
881
|
-
f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
|
|
901
|
+
f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
|
|
882
902
|
)
|
|
883
903
|
|
|
884
904
|
|
|
@@ -949,13 +969,20 @@ def restore_features(self, samples=None, maps=False):
|
|
|
949
969
|
# Load sample to get its features_df
|
|
950
970
|
# Use a direct load call with map=False to prevent feature synchronization
|
|
951
971
|
# which would remove filled features that don't exist in the original FeatureMap
|
|
952
|
-
|
|
972
|
+
# Use ERROR log level to suppress info messages
|
|
973
|
+
sample = Sample(log_level="ERROR")
|
|
953
974
|
sample._load_sample5(sample_path, map=False)
|
|
954
975
|
|
|
955
976
|
if sample.features_df is None or sample.features_df.is_empty():
|
|
956
977
|
self.logger.warning(f"No features found in sample {sample_name}")
|
|
957
978
|
continue
|
|
958
979
|
|
|
980
|
+
# Check which columns are actually available in the sample
|
|
981
|
+
available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
|
|
982
|
+
if not available_columns:
|
|
983
|
+
self.logger.debug(f"No target columns found in sample {sample_name}")
|
|
984
|
+
continue
|
|
985
|
+
|
|
959
986
|
# Create update data for this sample
|
|
960
987
|
updates_made = 0
|
|
961
988
|
for row in sample.features_df.iter_rows(named=True):
|
|
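Note: the new `available_columns` guard in `restore_features` only restores columns that actually exist in the reloaded sample. The same defensive pattern, reduced to a standalone sketch with hypothetical column names:

```python
import polars as pl

columns_to_update = ["chrom", "chrom_coherence"]  # hypothetical targets
sample_df = pl.DataFrame({"feature_id": [1, 2], "chrom": [None, None]})

# Keep only the target columns this sample actually carries.
available_columns = [c for c in columns_to_update if c in sample_df.columns]
if not available_columns:
    print("nothing to restore from this sample")  # mirrors the debug log above
```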
@@ -967,8 +994,8 @@ def restore_features(self, samples=None, maps=False):
             if key in study_feature_mapping:
                 feature_uid = study_feature_mapping[key]
 
-                # Update the
-                for col in
+                # Update only the available columns in study.features_df
+                for col in available_columns:
                     if col in row and col in self.features_df.columns:
                         # Get the original column dtype to preserve it
                         original_dtype = self.features_df[col].dtype
@@ -993,7 +1020,8 @@ def restore_features(self, samples=None, maps=False):
                         )
                         updates_made += 1
 
-
+        if updates_made > 0:
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
 
     # If maps is True, load featureXML data
     if maps:
@@ -1076,13 +1104,18 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample (with map=False to prevent feature synchronization)
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
                 self.logger.warning(f"No features found in sample {sample_name}")
                 continue
 
+            # Check if chrom column exists in sample
+            if "chrom" not in sample.features_df.columns:
+                continue
+
             # Update chromatograms from this sample
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
@@ -1119,7 +1152,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1163,7 +1196,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample for MS1 data extraction
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
@@ -1249,7 +1283,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1290,7 +1324,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1328,14 +1362,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-                             All keys found in sample names will be replaced with their
+                             All keys found in sample names will be replaced with their
                              corresponding values.
                              e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
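Note: a minimal usage sketch taken directly from the docstring above (names are illustrative); the call raises ValueError if the resulting names are not unique:

```python
study.sample_name_replace({"old_name1": "new_name1", "old_name2": "new_name2"})
```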
@@ -1348,22 +1382,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1371,7 +1405,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1382,19 +1416,19 @@ def sample_name_replace(self, replace_dict):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1407,31 +1441,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
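Note: the `while "." in name` loop strips every suffix, so a multi-extension file such as `run1.sample5.gz` reduces to `run1`. A standalone sketch (the path is hypothetical):

```python
import os

path = "/data/run1.sample5.gz"
name = os.path.basename(path)
while "." in name:
    name = os.path.splitext(name)[0]
print(name)  # -> "run1"
```

One caveat of this approach: for a dot-leading basename such as `.hidden`, `os.path.splitext` returns the name unchanged, so the loop would not terminate; sample paths here are expected to have ordinary basenames.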
@@ -1442,12 +1476,12 @@ def sample_name_reset(self):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1704,7 +1738,7 @@ def features_select(
         if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
             min_coherence, max_coherence = chrom_coherence
             filter_conditions.append(
-                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
             )
         else:
             filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1717,7 +1751,7 @@ def features_select(
         if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
             min_prominence, max_prominence = chrom_prominence
             filter_conditions.append(
-                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1731,7 +1765,7 @@ def features_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
             filter_conditions.append(
                 (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1745,7 +1779,7 @@ def features_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled
             filter_conditions.append(
                 (pl.col("chrom_height_scaled") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled") <= max_height_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1852,7 +1886,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
        )
    else:
        self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1929,7 +1963,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1994,7 +2028,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -2008,18 +2042,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self,
-            if default_mz_tol and hasattr(default_mz_tol,
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
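Note: the tolerance resolution above walks a fallback chain — explicit argument, then `study.parameters`, then `align_defaults()`. The same `getattr`/`hasattr` pattern in isolation (the `study` object is illustrative):

```python
# Resolve an m/z tolerance from optional study parameters, else a default.
params = getattr(study, "parameters", None)
if params is not None and hasattr(params, "eic_mz_tol"):
    mz_tol = params.eic_mz_tol
else:
    mz_tol = 0.01  # stand-in for the align_defaults() fallback
```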
@@ -2027,7 +2062,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2041,18 +2076,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self,
-            if default_rt_tol and hasattr(default_rt_tol,
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2077,7 +2113,7 @@ def consensus_select(
                 # Treat as range
                 min_uid, max_uid = consensus_uid
                 consensus = consensus.filter(
-                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
                 )
             else:
                 # Treat as list
@@ -2105,7 +2141,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2163,7 +2199,7 @@ def consensus_select(
             min_coherence, max_coherence = chrom_coherence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_coherence_mean") >= min_coherence)
-                & (pl.col("chrom_coherence_mean") <= max_coherence)
+                & (pl.col("chrom_coherence_mean") <= max_coherence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2181,7 +2217,7 @@ def consensus_select(
             min_prominence, max_prominence = chrom_prominence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_mean") >= min_prominence)
-                & (pl.col("chrom_prominence_mean") <= max_prominence)
+                & (pl.col("chrom_prominence_mean") <= max_prominence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2199,7 +2235,7 @@ def consensus_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2217,7 +2253,7 @@ def consensus_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2234,7 +2270,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2261,10 +2297,10 @@ def consensus_select(
             # Multiple columns
             valid_columns = [col for col in sortby if col in consensus.columns]
             invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
             if invalid_columns:
                 self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
             if valid_columns:
                 consensus = consensus.sort(valid_columns, descending=descending)
             else:
@@ -2355,7 +2391,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
@@ -2485,7 +2521,9 @@ def samples_select(
         if len(sample_batch) == 2 and not isinstance(sample_batch, list):
             # Treat as range
             min_batch, max_batch = sample_batch
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2501,7 +2539,9 @@ def samples_select(
         if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
             # Treat as range
             min_seq, max_seq = sample_sequence
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2515,7 +2555,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
@@ -2572,15 +2614,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2642,10 +2684,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2683,7 +2725,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2694,7 +2736,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2705,16 +2747,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2735,14 +2777,14 @@ def samples_delete(self, samples):
|
|
|
2735
2777
|
def sample_color(self, by=None, palette="Turbo256"):
|
|
2736
2778
|
"""
|
|
2737
2779
|
Set sample colors in the sample_color column of samples_df.
|
|
2738
|
-
|
|
2780
|
+
|
|
2739
2781
|
When a new sample is added, this function resets all colors picking from the specified palette.
|
|
2740
2782
|
The default palette is Turbo256.
|
|
2741
2783
|
|
|
2742
2784
|
Parameters:
|
|
2743
2785
|
by (str or list, optional): Property to base colors on. Options:
|
|
2744
2786
|
- 'sample_uid': Use sample_uid values to assign colors
|
|
2745
|
-
- 'sample_index': Use sample index (position) to assign colors
|
|
2787
|
+
- 'sample_index': Use sample index (position) to assign colors
|
|
2746
2788
|
- 'sample_type': Use sample_type values to assign colors
|
|
2747
2789
|
- 'sample_name': Use sample_name values to assign colors
|
|
2748
2790
|
- list of colors: Use provided list of hex color codes
|
|
@@ -2755,7 +2797,7 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2755
2797
|
- 'Magma256': Magma colormap (256 colors, perceptually uniform)
|
|
2756
2798
|
- 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
|
|
2757
2799
|
- 'Set1': Qualitative palette (9 distinct colors)
|
|
2758
|
-
- 'Set2': Qualitative palette (8 distinct colors)
|
|
2800
|
+
- 'Set2': Qualitative palette (8 distinct colors)
|
|
2759
2801
|
- 'Set3': Qualitative palette (12 distinct colors)
|
|
2760
2802
|
- 'Tab10': Tableau 10 palette (10 distinct colors)
|
|
2761
2803
|
- 'Tab20': Tableau 20 palette (20 distinct colors)
|
|
@@ -2766,7 +2808,7 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2766
2808
|
- 'Coolwarm': Cool-warm diverging colormap
|
|
2767
2809
|
- 'Seismic': Seismic diverging colormap
|
|
2768
2810
|
- Any other colormap name supported by the cmap library
|
|
2769
|
-
|
|
2811
|
+
|
|
2770
2812
|
For a complete catalog of available colormaps, see:
|
|
2771
2813
|
https://cmap-docs.readthedocs.io/en/latest/catalog/
|
|
2772
2814
|
|
|
@@ -2776,10 +2818,10 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2776
2818
|
Example:
|
|
2777
2819
|
# Set colors based on sample type
|
|
2778
2820
|
study.sample_color(by='sample_type', palette='Set1')
|
|
2779
|
-
|
|
2821
|
+
|
|
2780
2822
|
# Set colors using a custom color list
|
|
2781
2823
|
study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
|
|
2782
|
-
|
|
2824
|
+
|
|
2783
2825
|
# Reset to default Turbo256 sequential colors
|
|
2784
2826
|
study.sample_color()
|
|
2785
2827
|
"""
|
|
@@ -2788,11 +2830,13 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2788
2830
|
return
|
|
2789
2831
|
|
|
2790
2832
|
sample_count = len(self.samples_df)
|
|
2791
|
-
|
|
2833
|
+
|
|
2792
2834
|
# Handle custom color list
|
|
2793
2835
|
if isinstance(by, list):
|
|
2794
2836
|
if len(by) < sample_count:
|
|
2795
|
-
self.logger.warning(
|
|
2837
|
+
self.logger.warning(
|
|
2838
|
+
f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
|
|
2839
|
+
)
|
|
2796
2840
|
# Cycle through the provided colors if there aren't enough
|
|
2797
2841
|
colors = []
|
|
2798
2842
|
for i in range(sample_count):
|
|
@@ -2808,10 +2852,10 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2808
2852
|
except ValueError as e:
|
|
2809
2853
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2810
2854
|
return
|
|
2811
|
-
|
|
2812
|
-
elif by ==
|
|
2855
|
+
|
|
2856
|
+
elif by == "sample_uid":
|
|
2813
2857
|
# Use sample_uid to determine position in evenly sampled colormap
|
|
2814
|
-
sample_uids = self.samples_df[
|
|
2858
|
+
sample_uids = self.samples_df["sample_uid"].to_list()
|
|
2815
2859
|
try:
|
|
2816
2860
|
# Sample colors evenly for the number of samples
|
|
2817
2861
|
palette_colors = _sample_colors_from_colormap(palette, sample_count)
|
|
@@ -2823,29 +2867,29 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2823
2867
|
except ValueError as e:
|
|
2824
2868
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2825
2869
|
return
|
|
2826
|
-
|
|
2827
|
-
elif by ==
|
|
2870
|
+
|
|
2871
|
+
elif by == "sample_index":
|
|
2828
2872
|
# Use sample index (position in DataFrame) with evenly sampled colors
|
|
2829
2873
|
try:
|
|
2830
2874
|
colors = _sample_colors_from_colormap(palette, sample_count)
|
|
2831
2875
|
except ValueError as e:
|
|
2832
2876
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2833
2877
|
return
|
|
2834
|
-
|
|
2835
|
-
elif by ==
|
|
2878
|
+
|
|
2879
|
+
elif by == "sample_type":
|
|
2836
2880
|
# Use sample_type to assign colors - same type gets same color
|
|
2837
2881
|
# Sample colors evenly across colormap for unique types
|
|
2838
|
-
sample_types = self.samples_df[
|
|
2839
|
-
unique_types = list(
|
|
2840
|
-
|
|
2882
|
+
sample_types = self.samples_df["sample_type"].to_list()
|
|
2883
|
+
unique_types = list({t for t in sample_types if t is not None})
|
|
2884
|
+
|
|
2841
2885
|
try:
|
|
2842
2886
|
# Sample colors evenly for unique types
|
|
2843
2887
|
type_colors = _sample_colors_from_colormap(palette, len(unique_types))
|
|
2844
2888
|
type_to_color = {}
|
|
2845
|
-
|
|
2889
|
+
|
|
2846
2890
|
for i, sample_type in enumerate(unique_types):
|
|
2847
2891
|
type_to_color[sample_type] = type_colors[i]
|
|
2848
|
-
|
|
2892
|
+
|
|
2849
2893
|
colors = []
|
|
2850
2894
|
for sample_type in sample_types:
|
|
2851
2895
|
if sample_type is None:
|
|
@@ -2856,21 +2900,21 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2856
2900
|
except ValueError as e:
|
|
2857
2901
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2858
2902
|
return
|
|
2859
|
-
|
|
2860
|
-
elif by ==
|
|
2903
|
+
|
|
2904
|
+
elif by == "sample_name":
|
|
2861
2905
|
# Use sample_name to assign colors - same name gets same color (unlikely but possible)
|
|
2862
2906
|
# Sample colors evenly across colormap for unique names
|
|
2863
|
-
sample_names = self.samples_df[
|
|
2864
|
-
unique_names = list(
|
|
2865
|
-
|
|
2907
|
+
sample_names = self.samples_df["sample_name"].to_list()
|
|
2908
|
+
unique_names = list({n for n in sample_names if n is not None})
|
|
2909
|
+
|
|
2866
2910
|
try:
|
|
2867
2911
|
# Sample colors evenly for unique names
|
|
2868
2912
|
name_colors = _sample_colors_from_colormap(palette, len(unique_names))
|
|
2869
2913
|
name_to_color = {}
|
|
2870
|
-
|
|
2914
|
+
|
|
2871
2915
|
for i, sample_name in enumerate(unique_names):
|
|
2872
2916
|
name_to_color[sample_name] = name_colors[i]
|
|
2873
|
-
|
|
2917
|
+
|
|
2874
2918
|
colors = []
|
|
2875
2919
|
for sample_name in sample_names:
|
|
2876
2920
|
if sample_name is None:
|
|
@@ -2882,14 +2926,16 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2882
2926
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2883
2927
|
return
|
|
2884
2928
|
else:
|
|
2885
|
-
self.logger.error(
|
|
2929
|
+
self.logger.error(
|
|
2930
|
+
f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
|
|
2931
|
+
)
|
|
2886
2932
|
return
|
|
2887
2933
|
|
|
2888
2934
|
# Update the sample_color column
|
|
2889
2935
|
self.samples_df = self.samples_df.with_columns(
|
|
2890
|
-
pl.Series("sample_color", colors).alias("sample_color")
|
|
2936
|
+
pl.Series("sample_color", colors).alias("sample_color"),
|
|
2891
2937
|
)
|
|
2892
|
-
|
|
2938
|
+
|
|
2893
2939
|
if isinstance(by, list):
|
|
2894
2940
|
self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
|
|
2895
2941
|
elif by is None:
|
|
@@ -2901,28 +2947,28 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2901
2947
|
def sample_color_reset(self):
|
|
2902
2948
|
"""
|
|
2903
2949
|
Reset sample colors to default coloring using the 'turbo' colormap.
|
|
2904
|
-
|
|
2950
|
+
|
|
2905
2951
|
This function assigns colors by distributing samples evenly across the full
|
|
2906
2952
|
turbo colormap range, ensuring maximum color diversity and visual distinction
|
|
2907
2953
|
between samples.
|
|
2908
|
-
|
|
2954
|
+
|
|
2909
2955
|
Returns:
|
|
2910
2956
|
None (modifies self.samples_df in place)
|
|
2911
2957
|
"""
|
|
2912
2958
|
if self.samples_df is None or len(self.samples_df) == 0:
|
|
2913
2959
|
self.logger.warning("No samples found in study.")
|
|
2914
2960
|
return
|
|
2915
|
-
|
|
2961
|
+
|
|
2916
2962
|
try:
|
|
2917
2963
|
from cmap import Colormap
|
|
2918
|
-
|
|
2964
|
+
|
|
2919
2965
|
# Use turbo colormap
|
|
2920
|
-
cm = Colormap(
|
|
2921
|
-
|
|
2966
|
+
cm = Colormap("turbo")
|
|
2967
|
+
|
|
2922
2968
|
# Get sample count and assign colors evenly distributed across colormap
|
|
2923
2969
|
n_samples = len(self.samples_df)
|
|
2924
2970
|
colors = []
|
|
2925
|
-
|
|
2971
|
+
|
|
2926
2972
|
# Distribute samples evenly across the full colormap range
|
|
2927
2973
|
for i in range(n_samples):
|
|
2928
2974
|
# Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
|
|
@@ -2930,9 +2976,9 @@ def sample_color_reset(self):
|
|
|
2930
2976
|
# Optionally, map to a subset of colormap to avoid extreme colors
|
|
2931
2977
|
# Use 10% to 90% of colormap range for better color diversity
|
|
2932
2978
|
normalized_value = 0.1 + (normalized_value * 0.8)
|
|
2933
|
-
|
|
2979
|
+
|
|
2934
2980
|
color_rgba = cm(normalized_value)
|
|
2935
|
-
|
|
2981
|
+
|
|
2936
2982
|
# Convert RGBA to hex
|
|
2937
2983
|
if len(color_rgba) >= 3:
|
|
2938
2984
|
r, g, b = color_rgba[:3]
|
|
@@ -2941,14 +2987,14 @@ def sample_color_reset(self):
|
|
|
2941
2987
|
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
|
2942
2988
|
hex_color = f"#{r:02x}{g:02x}{b:02x}"
|
|
2943
2989
|
colors.append(hex_color)
|
|
2944
|
-
|
|
2990
|
+
|
|
2945
2991
|
# Update the sample_color column
|
|
2946
2992
|
self.samples_df = self.samples_df.with_columns(
|
|
2947
|
-
pl.Series("sample_color", colors).alias("sample_color")
|
|
2993
|
+
pl.Series("sample_color", colors).alias("sample_color"),
|
|
2948
2994
|
)
|
|
2949
|
-
|
|
2995
|
+
|
|
2950
2996
|
self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
|
|
2951
|
-
|
|
2997
|
+
|
|
2952
2998
|
except ImportError:
|
|
2953
2999
|
self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
|
|
2954
3000
|
except Exception as e:
|
|
@@ -2958,13 +3004,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2972,40 +3018,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3021,7 +3065,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3029,11 +3073,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3042,25 +3086,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3068,51 +3113,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3121,12 +3164,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
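Note: `_sample_colors_from_colormap` centers each of the n samples in its own bin of [0, 1] and then compresses the positions into the 10%-90% band to avoid near-white/near-black endpoints. The same arithmetic and RGBA-to-hex conversion as a self-contained sketch, using a grayscale ramp in place of the cmap dependency:

```python
def sample_positions(n_colors):
    # Center each sample in its bin, then map into the 10%-90% band.
    return [0.1 + ((i + 0.5) / n_colors) * 0.8 for i in range(n_colors)]

def to_hex(rgba):
    # Scale 0-1 floats to 0-255 ints and format as #rrggbb.
    r, g, b = (int(c * 255) for c in rgba[:3])
    return f"#{r:02x}{g:02x}{b:02x}"

gray = lambda v: (v, v, v, 1.0)  # stand-in for cmap's Colormap(name)
print([to_hex(gray(v)) for v in sample_positions(4)])
# four evenly spaced grays drawn from the 10%-90% band
```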
@@ -3135,32 +3179,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema,
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
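`_ensure_features_df_schema_order` delegates the actual ordering to `_reorder_columns_by_schema` from masster.study.h5, which this diff does not show. As a rough, self-contained illustration of schema-driven column ordering in polars (the function name and behavior here are assumptions for illustration, not masster's implementation):

```python
# Hypothetical sketch of schema-driven column reordering in polars.
import polars as pl

def reorder_columns_by_schema(df: pl.DataFrame, schema_cols: list[str]) -> pl.DataFrame:
    # Schema-listed columns come first, in schema order; any extra columns
    # keep their current relative order at the end.
    ordered = [c for c in schema_cols if c in df.columns]
    extras = [c for c in df.columns if c not in schema_cols]
    return df.select(ordered + extras)

df = pl.DataFrame({"rt": [12.3], "feature_uid": [7], "mz": [150.1]})
print(reorder_columns_by_schema(df, ["feature_uid", "mz", "rt"]).columns)
# ['feature_uid', 'mz', 'rt']
```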
@@ -3168,38 +3212,248 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df[
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
+
+
+def restore_ms2(self, samples=None, **kwargs):
+    """
+    Restore MS2 data by re-running find_ms2 on specified samples.
+
+    This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
+    from the original sample files. Use this to reverse the effects of compress_ms2().
+
+    Parameters:
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments passed to find_ms2()
+            (e.g., mz_tol, centroid, deisotope, etc.)
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found in study.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples_df found in study.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
+
+    # Clear existing consensus_ms2 to rebuild from scratch
+    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    self.consensus_ms2 = pl.DataFrame()
+
+    # Re-run find_ms2 which will rebuild consensus_ms2
+    try:
+        self.find_ms2(**kwargs)
+
+        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+
+        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
+
+    except Exception as e:
+        self.logger.error(f"Failed to restore MS2 data: {e}")
+        raise
+
+
+def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
+    """
+    Reverse any compression effects by restoring compressed data adaptively.
+
+    This function restores data that was compressed using compress(), compress_features(),
+    compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
+    decompression process for speed by only processing what actually needs restoration.
+
+    Parameters:
+        features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        chrom (bool): Restore chromatogram objects
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments for restoration functions:
+            - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
+            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
+
+    Performance Optimizations:
+        - Adaptive processing: Only restores what actually needs restoration
+        - Processes features and chromatograms together when possible (shared file I/O)
+        - Uses cached sample instances to avoid repeated file loading
+        - Processes MS2 restoration last as it's the most computationally expensive
+        - Provides detailed progress information for long-running operations
+
+    Example:
+        # Restore everything (but only what needs restoration)
+        study.decompress()
+
+        # Restore only chromatograms with custom tolerances
+        study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
+
+        # Restore specific samples only
+        study.decompress(samples=["sample1", "sample2"])
+    """
+    if not any([features, ms2, chrom]):
+        self.logger.warning("No decompression operations specified.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    # Adaptively check what actually needs to be done
+    import polars as pl
+
+    # Check if features need restoration (more sophisticated logic)
+    features_need_restoration = False
+    if features and not self.features_df.is_empty():
+        # Check for completely missing columns that should exist after feature processing
+        missing_cols = []
+        for col in ["ms2_scans", "ms2_specs"]:
+            if col not in self.features_df.columns:
+                missing_cols.append(col)
+
+        # If columns are missing entirely, we likely need restoration
+        if missing_cols:
+            features_need_restoration = True
+        else:
+            # If columns exist, check if they're mostly null (indicating compression)
+            # But be smart about it - only check if we have consensus features with MS2
+            if not self.consensus_ms2.is_empty():
+                # We have MS2 data, so ms2_specs should have some content
+                null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+                total_features = len(self.features_df)
+                # If more than 90% are null but we have MS2 data, likely compressed
+                if null_ms2_specs > (total_features * 0.9):
+                    features_need_restoration = True
+
+    # Check if chromatograms need restoration
+    chrom_need_restoration = False
+    if chrom and not self.features_df.is_empty():
+        if "chrom" not in self.features_df.columns:
+            # Column completely missing
+            chrom_need_restoration = True
+        else:
+            null_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
+            total_features = len(self.features_df)
+            # If more than 50% are null, likely need restoration
+            chrom_need_restoration = null_chroms > (total_features * 0.5)
+
+    # Check if MS2 data might need restoration (compare expected vs actual)
+    ms2_need_restoration = False
+    if ms2:
+        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+
+        if consensus_count > 0:
+            # Calculate expected MS2 count based on consensus features with MS2 potential
+            # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
+            expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
+            expected_ms2 = consensus_count * expected_ratio
+
+            if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
+                ms2_need_restoration = True
+
+    # Build list of operations that actually need to be done
+    operations_needed = []
+    if features and features_need_restoration:
+        operations_needed.append("features")
+    if chrom and chrom_need_restoration:
+        operations_needed.append("chromatograms")
+    if ms2 and ms2_need_restoration:
+        operations_needed.append("MS2 spectra")
+
+    # Early exit if nothing needs to be done
+    if not operations_needed:
+        self.logger.info("All data appears to be already decompressed. No operations needed.")
+        return
+
+    self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
+
+    try:
+        # Phase 1: Restore features and chromatograms together (shared file I/O)
+        if ("features" in operations_needed and "chromatograms" in operations_needed):
+            self.logger.info("Phase 1: Restoring features and chromatograms together...")
+
+            # Extract relevant kwargs for restore_features and restore_chrom
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+
+            # Restore features first (includes chrom column)
+            self.restore_features(samples=samples)
+
+            # Then do additional chrom gap-filling if needed
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        elif ("features" in operations_needed and "chromatograms" not in operations_needed):
+            self.logger.info("Phase 1: Restoring features data...")
+            self.restore_features(samples=samples)
+
+        elif ("chromatograms" in operations_needed and "features" not in operations_needed):
+            self.logger.info("Phase 1: Restoring chromatograms...")
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        # Phase 2: Restore MS2 data (most computationally expensive, done last)
+        if "MS2 spectra" in operations_needed:
+            self.logger.info("Phase 2: Restoring MS2 spectra...")
+
+            # Extract MS2-specific kwargs
+            ms2_kwargs = {}
+            for key, value in kwargs.items():
+                if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
+                    ms2_kwargs[key] = value
+
+            self.restore_ms2(samples=samples, **ms2_kwargs)
+
+        self.logger.info("Adaptive decompression completed successfully")
+
+    except Exception as e:
+        self.logger.error(f"Decompression failed: {e}")
+        raise