masster-0.3.18-py3-none-any.whl → masster-0.3.19-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/load.py +2 -2
- masster/sample/plot.py +2 -1
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +220 -190
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +240 -101
- masster/study/processing.py +9 -5
- masster/study/study.py +2 -6
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/RECORD +18 -18
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
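Most of the get_bpc hunk above (and the get_tic hunk below) is mechanical formatting: a single long `Chromatogram(...)` call, truncated in this view, is expanded across lines with trailing commas. The try/except pair is the substantive pattern: it prefers `Series.to_numpy()` and falls back to the legacy `.values` accessor. A minimal sketch of that fallback, with a plain helper standing in for the `Chromatogram` constructor (names here are illustrative, not masster API):

    import pandas as pd

    def series_to_array(s: pd.Series):
        # Prefer the modern accessor; fall back to .values for objects
        # that predate to_numpy().
        try:
            return s.to_numpy()
        except AttributeError:
            return s.values

    bpc_pd = pd.DataFrame({"rt": [0.1, 0.2], "inty": [10.0, 20.0]})
    print(series_to_array(bpc_pd["rt"]))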
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
     if mz_tol is None:
-        if hasattr(owner,
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
         else:
             mz_tol = 0.01  # fallback default
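The completed condition above documents the tolerance lookup order in `get_eic`: an explicit `mz_tol` wins, then `owner.parameters.eic_mz_tol`, then a hard-coded 0.01. A self-contained sketch of that resolution logic (the dummy classes are stand-ins for a masster Study):

    class _Params:
        eic_mz_tol = 0.005

    class _Owner:
        parameters = _Params()

    def resolve_mz_tol(owner, mz_tol=None):
        # Mirrors get_eic: explicit value > study parameter > 0.01 default.
        if mz_tol is None:
            if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
                return owner.parameters.eic_mz_tol
            return 0.01  # fallback default
        return mz_tol

    assert resolve_mz_tol(_Owner()) == 0.005
    assert resolve_mz_tol(object()) == 0.01
    assert resolve_mz_tol(_Owner(), mz_tol=0.02) == 0.02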
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
         mz_min = mz - mz_tol
         mz_max = mz + mz_tol
         eic_data = s.ms1_df.filter(
-            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
         )
 
         if eic_data.is_empty():
             # Return empty chromatogram if no data found
             import numpy as _np
+
             return Chromatogram(
-                rt=_np.array([0.0]),
-                inty=_np.array([0.0]),
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
                 label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-                rt_unit=rt_unit
+                rt_unit=rt_unit,
             )
 
     # Aggregate intensities per retention time (sum in case of multiple points per rt)
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     if eic_pd.empty:
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # build Chromatogram
     try:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].to_numpy(),
-            inty=eic_pd["inty"].to_numpy(),
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
     except Exception:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].values,
-            inty=eic_pd["inty"].values,
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
        )
 
     return chrom
 
 
 # =====================================================================================
-# DATA RETRIEVAL AND MATRIX FUNCTIONS
+# DATA RETRIEVAL AND MATRIX FUNCTIONS
 # =====================================================================================
 
 
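Functionally these get_eic changes are cosmetic (trailing commas, blank lines after local imports), so the extraction logic is unchanged: filter MS1 points to an m/z window, then sum intensities per retention time, as the comment in the hunk states. A standalone polars sketch of that pipeline on toy data (masster's `ms1_df` carries the same three columns among others):

    import polars as pl

    ms1_df = pl.DataFrame({
        "rt":   [1.0, 1.0, 2.0, 3.0],
        "mz":   [100.001, 100.002, 100.001, 250.0],
        "inty": [5.0, 7.0, 3.0, 9.0],
    })
    mz, mz_tol = 100.0, 0.01
    eic = (
        ms1_df
        .filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
        .group_by("rt")                     # sum duplicates at the same rt
        .agg(pl.col("inty").sum())
        .sort("rt")
    )
    print(eic)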
@@ -451,9 +471,9 @@ def align_reset(self):
     self.alignment_ref_index = None
     # in self.features_df, set rt equal to rt_original
     self.features_df = self.features_df.with_columns(
-        pl.col("rt_original").alias("rt")
+        pl.col("rt_original").alias("rt"),
     )
-
+
     # Ensure column order is maintained after with_columns operation
     self._ensure_features_df_schema_order()
 
@@ -614,7 +634,7 @@ def get_consensus_matches(self, uids=None):
     return matches
 
 
-# =====================================================================================
+# =====================================================================================
 # UID HELPER FUNCTIONS
 # =====================================================================================
 
@@ -796,7 +816,7 @@ def get_sample(self, sample):
         return cache[sample_uid]
 
     sample_path = row.get("sample_path", None)
-    s = Sample(log_level=
+    s = Sample(log_level="ERROR")
     try:
         if sample_path:
             try:
@@ -816,13 +836,13 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
 
 # =====================================================================================
-# DATA COMPRESSION AND RESTORATION FUNCTIONS
+# DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
 
 
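`get_orphans` keeps every feature whose `feature_uid` is absent from the consensus mapping. The `~is_in(...)` filter shown above is equivalent to a polars anti-join; a small sketch of both forms:

    import polars as pl

    features_df = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})
    consensus_mapping_df = pl.DataFrame({"feature_uid": [1, 3]})

    # Form used in get_orphans: negated membership test.
    orphans = features_df.filter(
        ~pl.col("feature_uid").is_in(consensus_mapping_df["feature_uid"].to_list()),
    )
    # Equivalent anti-join formulation.
    orphans_join = features_df.join(consensus_mapping_df, on="feature_uid", how="anti")

    assert orphans["feature_uid"].to_list() == [2, 4]
    assert sorted(orphans_join["feature_uid"].to_list()) == [2, 4]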
@@ -878,7 +898,7 @@ def compress_features(self):
 
     removed_count = initial_count - len(self.features_df)
     self.logger.info(
-        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
     )
 
 
@@ -1119,7 +1139,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1249,7 +1269,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1290,7 +1310,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1328,14 +1348,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-            All keys found in sample names will be replaced with their
+            All keys found in sample names will be replaced with their
             corresponding values.
             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
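These docstring hunks are whitespace-only, but the behavior they describe is easy to pin down: map names through the dict, then refuse to commit unless the results are unique. A pure-Python sketch of that core (the real method also logs and writes samples_df back):

    def replace_names(current_names, replace_dict):
        # Map each name through the dict, leaving unmatched names as-is,
        # then require uniqueness before committing.
        new_names = [replace_dict.get(n, n) for n in current_names]
        if len(set(new_names)) != len(new_names):
            raise ValueError("Resulting sample names are not unique.")
        return new_names

    assert replace_names(["old_name1", "keep"], {"old_name1": "new_name1"}) == ["new_name1", "keep"]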
@@ -1348,22 +1368,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1371,7 +1391,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1382,19 +1402,19 @@ def sample_name_replace(self, replace_dict):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1407,31 +1427,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
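The completed `while` condition above is the whole trick of `sample_name_reset`: peel extensions until none remain, so multi-suffix files like `.sample5.gz` reduce to a clean stem. A runnable sketch (note it also strips any dots that are part of the stem itself):

    import os

    def strip_all_extensions(path):
        # Mirrors sample_name_reset: basename first, then peel
        # extensions one at a time (.tar.gz, .sample5.gz, ...).
        name = os.path.basename(path)
        while "." in name:
            name = os.path.splitext(name)[0]
        return name

    assert strip_all_extensions("/data/run01.sample5.gz") == "run01"
    assert strip_all_extensions("blank.mzML") == "blank"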
@@ -1442,12 +1462,12 @@ def sample_name_reset(self):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1704,7 +1724,7 @@ def features_select(
     if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
         min_coherence, max_coherence = chrom_coherence
         filter_conditions.append(
-            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
         )
     else:
         filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1717,7 +1737,7 @@ def features_select(
     if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
         min_prominence, max_prominence = chrom_prominence
         filter_conditions.append(
-            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1731,7 +1751,7 @@ def features_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
         filter_conditions.append(
             (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1745,7 +1765,7 @@ def features_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled
         filter_conditions.append(
             (pl.col("chrom_height_scaled") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled") <= max_height_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1852,7 +1872,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1929,7 +1949,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1994,7 +2014,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -2008,18 +2028,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self,
-            if default_mz_tol and hasattr(default_mz_tol,
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
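The restored lines make the m/z fallback chain in `consensus_select` explicit: `self.parameters.eic_mz_tol` when present, otherwise `align_defaults().mz_max_diff`. The tuple convention in this branch is also worth a sketch, since it is easy to misread: a 2-tuple whose second element is smaller than its first means (center, tolerance), otherwise it is a (min, max) range:

    def mz_window(mz, default_tol=0.01):
        # Sketch of consensus_select's m/z interpretation (default_tol
        # stands in for the study-parameter/align_defaults fallback).
        if isinstance(mz, tuple) and len(mz) == 2:
            if mz[1] < mz[0]:                 # (mz, mz_tol) form
                return mz[0] - mz[1], mz[0] + mz[1]
            return mz                          # (min_mz, max_mz) form
        return mz - default_tol, mz + default_tol  # bare float

    assert mz_window((150.0, 0.5)) == (149.5, 150.5)
    assert mz_window((100.0, 200.0)) == (100.0, 200.0)
    assert mz_window(300.0, default_tol=1.0) == (299.0, 301.0)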
@@ -2027,7 +2048,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2041,18 +2062,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self,
-            if default_rt_tol and hasattr(default_rt_tol,
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2077,7 +2099,7 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
@@ -2105,7 +2127,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2163,7 +2185,7 @@ def consensus_select(
             min_coherence, max_coherence = chrom_coherence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_coherence_mean") >= min_coherence)
-                & (pl.col("chrom_coherence_mean") <= max_coherence)
+                & (pl.col("chrom_coherence_mean") <= max_coherence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2181,7 +2203,7 @@ def consensus_select(
             min_prominence, max_prominence = chrom_prominence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_mean") >= min_prominence)
-                & (pl.col("chrom_prominence_mean") <= max_prominence)
+                & (pl.col("chrom_prominence_mean") <= max_prominence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2199,7 +2221,7 @@ def consensus_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2217,7 +2239,7 @@ def consensus_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2234,7 +2256,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2261,10 +2283,10 @@ def consensus_select(
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
         if invalid_columns:
             self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
         else:
@@ -2355,7 +2377,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
 
@@ -2485,7 +2507,9 @@ def samples_select(
         if len(sample_batch) == 2 and not isinstance(sample_batch, list):
             # Treat as range
             min_batch, max_batch = sample_batch
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2501,7 +2525,9 @@ def samples_select(
         if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
             # Treat as range
             min_seq, max_seq = sample_sequence
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2515,7 +2541,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
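The three `samples_select` hunks above restore range conditions that were previously single long lines (truncated in this view); each appends a polars expression to `filter_conditions`. The combination step is outside these hunks, but such a list is conventionally AND-reduced into one filter; a sketch under that assumption:

    from functools import reduce
    import polars as pl

    df = pl.DataFrame({"sample_batch": [1, 2, 3], "num_features": [10, 50, 5]})
    filter_conditions = [
        (pl.col("sample_batch") >= 1) & (pl.col("sample_batch") <= 2),
        pl.col("num_features") >= 10,
    ]
    # AND-combine all accumulated conditions in one filter call.
    selected = df.filter(reduce(lambda a, b: a & b, filter_conditions))
    print(selected)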
@@ -2572,15 +2600,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2642,10 +2670,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2683,7 +2711,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2694,7 +2722,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2705,16 +2733,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2735,14 +2763,14 @@ def samples_delete(self, samples):
 def sample_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
-
+
     When a new sample is added, this function resets all colors picking from the specified palette.
     The default palette is Turbo256.
 
     Parameters:
         by (str or list, optional): Property to base colors on. Options:
             - 'sample_uid': Use sample_uid values to assign colors
-            - 'sample_index': Use sample index (position) to assign colors
+            - 'sample_index': Use sample index (position) to assign colors
             - 'sample_type': Use sample_type values to assign colors
             - 'sample_name': Use sample_name values to assign colors
             - list of colors: Use provided list of hex color codes
@@ -2755,7 +2783,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Magma256': Magma colormap (256 colors, perceptually uniform)
             - 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
             - 'Set1': Qualitative palette (9 distinct colors)
-            - 'Set2': Qualitative palette (8 distinct colors)
+            - 'Set2': Qualitative palette (8 distinct colors)
             - 'Set3': Qualitative palette (12 distinct colors)
             - 'Tab10': Tableau 10 palette (10 distinct colors)
             - 'Tab20': Tableau 20 palette (20 distinct colors)
@@ -2766,7 +2794,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Coolwarm': Cool-warm diverging colormap
             - 'Seismic': Seismic diverging colormap
             - Any other colormap name supported by the cmap library
-
+
             For a complete catalog of available colormaps, see:
             https://cmap-docs.readthedocs.io/en/latest/catalog/
 
@@ -2776,10 +2804,10 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
        study.sample_color(by='sample_type', palette='Set1')
-
+
         # Set colors using a custom color list
         study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
-
+
         # Reset to default Turbo256 sequential colors
         study.sample_color()
     """
@@ -2788,11 +2816,13 @@ def sample_color(self, by=None, palette="Turbo256"):
         return
 
     sample_count = len(self.samples_df)
-
+
     # Handle custom color list
     if isinstance(by, list):
         if len(by) < sample_count:
-            self.logger.warning(
+            self.logger.warning(
+                f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
+            )
         # Cycle through the provided colors if there aren't enough
         colors = []
         for i in range(sample_count):
@@ -2808,10 +2838,10 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_uid":
         # Use sample_uid to determine position in evenly sampled colormap
-        sample_uids = self.samples_df[
+        sample_uids = self.samples_df["sample_uid"].to_list()
         try:
             # Sample colors evenly for the number of samples
             palette_colors = _sample_colors_from_colormap(palette, sample_count)
@@ -2823,29 +2853,29 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_index":
         # Use sample index (position in DataFrame) with evenly sampled colors
         try:
             colors = _sample_colors_from_colormap(palette, sample_count)
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_type":
         # Use sample_type to assign colors - same type gets same color
         # Sample colors evenly across colormap for unique types
-        sample_types = self.samples_df[
-        unique_types = list(
-
+        sample_types = self.samples_df["sample_type"].to_list()
+        unique_types = list({t for t in sample_types if t is not None})
+
         try:
             # Sample colors evenly for unique types
             type_colors = _sample_colors_from_colormap(palette, len(unique_types))
             type_to_color = {}
-
+
             for i, sample_type in enumerate(unique_types):
                 type_to_color[sample_type] = type_colors[i]
-
+
             colors = []
             for sample_type in sample_types:
                 if sample_type is None:
@@ -2856,21 +2886,21 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_name":
         # Use sample_name to assign colors - same name gets same color (unlikely but possible)
         # Sample colors evenly across colormap for unique names
-        sample_names = self.samples_df[
-        unique_names = list(
-
+        sample_names = self.samples_df["sample_name"].to_list()
+        unique_names = list({n for n in sample_names if n is not None})
+
         try:
             # Sample colors evenly for unique names
             name_colors = _sample_colors_from_colormap(palette, len(unique_names))
             name_to_color = {}
-
+
             for i, sample_name in enumerate(unique_names):
                 name_to_color[sample_name] = name_colors[i]
-
+
             colors = []
             for sample_name in sample_names:
                 if sample_name is None:
@@ -2882,14 +2912,16 @@ def sample_color(self, by=None, palette="Turbo256"):
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
     else:
-        self.logger.error(
+        self.logger.error(
+            f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
+        )
         return
 
     # Update the sample_color column
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     if isinstance(by, list):
         self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
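The restored `elif` branches all share one pattern: collect the column, take its distinct non-None values, assign each a palette color, then map rows through that lookup. A compact sketch of the per-category assignment (set iteration order is arbitrary, as in the restored code; the modulo cycling and the None placeholder are simplifications of the real method's handling):

    def colors_for_categories(values, palette_colors):
        # One color per distinct non-None value; rows look up by value.
        unique = list({v for v in values if v is not None})
        lookup = {v: palette_colors[i % len(palette_colors)] for i, v in enumerate(unique)}
        return [lookup.get(v) for v in values]

    print(colors_for_categories(["QC", "blank", "QC", None], ["#ff0000", "#00ff00"]))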
@@ -2901,28 +2933,28 @@ def sample_color(self, by=None, palette="Turbo256"):
 def sample_color_reset(self):
     """
     Reset sample colors to default coloring using the 'turbo' colormap.
-
+
     This function assigns colors by distributing samples evenly across the full
     turbo colormap range, ensuring maximum color diversity and visual distinction
     between samples.
-
+
     Returns:
         None (modifies self.samples_df in place)
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     try:
         from cmap import Colormap
-
+
         # Use turbo colormap
-        cm = Colormap(
-
+        cm = Colormap("turbo")
+
         # Get sample count and assign colors evenly distributed across colormap
         n_samples = len(self.samples_df)
         colors = []
-
+
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
@@ -2930,9 +2962,9 @@ def sample_color_reset(self):
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -2941,14 +2973,14 @@ def sample_color_reset(self):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         # Update the sample_color column
         self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color")
+            pl.Series("sample_color", colors).alias("sample_color"),
         )
-
+
         self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
-
+
     except ImportError:
         self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
     except Exception as e:
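`sample_color_reset` spaces samples at bin centers of the turbo colormap and then squeezes them into the 10%-90% range so no sample lands on a near-black or near-white endpoint; the restored lines show the same RGBA-to-hex conversion used throughout this file. A dependency-free sketch (any callable mapping [0, 1] to RGBA floats can stand in for `cmap.Colormap`):

    def sample_hex_colors(cmap_fn, n):
        colors = []
        for i in range(n):
            t = (i + 0.5) / n          # center each sample in its bin
            t = 0.1 + t * 0.8          # avoid colormap endpoints
            r, g, b = cmap_fn(t)[:3]   # RGBA floats in [0, 1]
            colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")
        return colors

    # Toy linear "colormap" so the sketch runs without the cmap package.
    print(sample_hex_colors(lambda t: (t, 0.5, 1.0 - t, 1.0), 3))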
@@ -2958,13 +2990,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2972,40 +3004,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3021,7 +3051,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3029,11 +3059,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3042,25 +3072,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3068,51 +3099,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3121,12 +3150,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _matplotlib_to_hex(color_dict):
@@ -3135,32 +3165,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema,
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
@@ -3168,38 +3198,38 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df[
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")