masster 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/load.py +2 -2
- masster/sample/plot.py +2 -1
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +275 -225
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +148 -121
- masster/study/plot.py +306 -106
- masster/study/processing.py +9 -5
- masster/study/study.py +2 -6
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/METADATA +1 -1
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/RECORD +18 -18
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/WHEEL +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/entry_points.txt +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(…
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(…
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
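Both get_bpc and get_tic (next hunk) keep a try/except around the Chromatogram construction: Series.to_numpy() is the modern pandas accessor, and the legacy .values attribute remains as a fallback for very old pandas versions. A minimal standalone sketch of that pattern, with made-up data:

    import pandas as pd

    bpc_pd = pd.DataFrame({"rt": [0.1, 0.2, 0.3], "inty": [10.0, 55.0, 12.0]})

    try:
        rt = bpc_pd["rt"].to_numpy()   # preferred accessor in modern pandas (>= 0.24)
    except Exception:
        rt = bpc_pd["rt"].values       # legacy attribute, kept as a fallback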
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(…
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(…
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
     if mz_tol is None:
-        if hasattr(owner,…
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
         else:
             mz_tol = 0.01  # fallback default
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     mz_min = mz - mz_tol
     mz_max = mz + mz_tol
     eic_data = s.ms1_df.filter(
-        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
     )
 
     if eic_data.is_empty():
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # Aggregate intensities per retention time (sum in case of multiple points per rt)
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     if eic_pd.empty:
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # build Chromatogram
     try:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].to_numpy(),
-            inty=eic_pd["inty"].to_numpy(),
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
     except Exception:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].values,
-            inty=eic_pd["inty"].values,
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     return chrom
 
 
 # =====================================================================================
-# DATA RETRIEVAL AND MATRIX FUNCTIONS
+# DATA RETRIEVAL AND MATRIX FUNCTIONS
 # =====================================================================================
 
 
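Putting the three extractors together, a usage sketch based only on the signatures shown above; the sample name and m/z values are hypothetical, and study is assumed to be an already loaded Study instance:

    from masster.study.helpers import get_bpc, get_eic, get_tic

    bpc = get_bpc(study, sample="QC_01")
    tic = get_tic(study, sample="QC_01")
    # With mz_tol=None, get_eic falls back to study.parameters.eic_mz_tol,
    # or to 0.01 if that attribute is missing.
    eic = get_eic(study, sample="QC_01", mz=445.1200, mz_tol=0.005, rt_unit="s")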
@@ -451,9 +471,9 @@ def align_reset(self):
     self.alignment_ref_index = None
     # in self.features_df, set rt equal to rt_original
     self.features_df = self.features_df.with_columns(
-        pl.col("rt_original").alias("rt")
+        pl.col("rt_original").alias("rt"),
     )
-
+
     # Ensure column order is maintained after with_columns operation
     self._ensure_features_df_schema_order()
 
@@ -479,7 +499,9 @@ def get_consensus(self, quant="chrom_area"):
     # sort by consensus_id
     df1 = df1.sort_index()
 
-    … (one removed line, not rendered in the diff view)
+    df2_polars = self.get_consensus_matrix(quant=quant)
+    # Convert to pandas for merging (since the result is used for export)
+    df2 = df2_polars.to_pandas().set_index("consensus_uid")
     # sort df2 row by consensus_id
     df2 = df2.sort_index()
     # merge df and df2 on consensus_id
@@ -492,6 +514,7 @@ def get_consensus(self, quant="chrom_area"):
 def get_consensus_matrix(self, quant="chrom_area"):
     """
     Get a matrix of consensus features with samples as columns and consensus features as rows.
+    Optimized implementation that avoids expensive join operations.
     """
     if quant not in self.features_df.columns:
         self.logger.error(
@@ -499,41 +522,58 @@ def get_consensus_matrix(self, quant="chrom_area"):
         )
         return None
 
-    # … (28 removed lines of the old join-based implementation, not rendered in the diff view)
+    # Create a lookup dictionary from features_df for O(1) value access
+    feature_values = {}
+    for row in self.features_df.iter_rows(named=True):
+        feature_uid = row['feature_uid']
+        sample_uid = row['sample_uid']
+        value = row[quant] if row[quant] is not None else 0
+        feature_values[(feature_uid, sample_uid)] = value
+
+    # Build consensus matrix directly using the consensus_mapping_df
+    matrix_dict = {}
+    sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
+
+    for row in self.consensus_mapping_df.iter_rows(named=True):
+        consensus_uid = row['consensus_uid']
+        sample_uid = row['sample_uid']
+        feature_uid = row['feature_uid']
+
+        # Look up the quantification value
+        key = (feature_uid, sample_uid)
+        value = feature_values.get(key, 0)
+
+        if consensus_uid not in matrix_dict:
+            matrix_dict[consensus_uid] = {}
+
+        sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
+
+        # Take max if multiple features map to same consensus/sample combination
+        if sample_name in matrix_dict[consensus_uid]:
+            matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
+        else:
+            matrix_dict[consensus_uid][sample_name] = value
 
-    # … (6 removed lines of the old conversion code, not rendered in the diff view)
+    # Convert to Polars DataFrame with proper formatting
+    import polars as pl
+
+    # Convert matrix_dict to list of records for Polars
+    records = []
+    for consensus_uid, sample_values in matrix_dict.items():
+        record = {"consensus_uid": consensus_uid}
+        record.update(sample_values)
+        records.append(record)
+
+    # Create Polars DataFrame and set proper data types
+    df2 = pl.DataFrame(records)
+
+    # Fill null values with 0 and round numeric columns
+    numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
+    df2 = df2.with_columns([
+        pl.col("consensus_uid").cast(pl.UInt64),
+        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols]
+    ])
+
     return df2
 
 
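The rewrite above replaces join-based matrix assembly with two dictionary passes. The core pattern, isolated from the class and run on toy uids and values (all numbers hypothetical):

    import polars as pl

    # (feature_uid, sample_uid) -> quantification value
    feature_values = {(101, 1): 2500.0, (102, 2): 1800.0, (103, 2): 2100.0}
    # consensus mapping rows: (consensus_uid, sample_uid, feature_uid)
    mapping = [(7, 1, 101), (7, 2, 102), (7, 2, 103)]
    sample_names = {1: "S1", 2: "S2"}

    matrix = {}
    for consensus_uid, sample_uid, feature_uid in mapping:
        value = feature_values.get((feature_uid, sample_uid), 0)
        row = matrix.setdefault(consensus_uid, {})
        name = sample_names.get(sample_uid, f"sample_{sample_uid}")
        # keep the max when several features land in the same consensus/sample cell
        row[name] = max(row.get(name, value), value)

    df2 = pl.DataFrame([{"consensus_uid": uid, **cells} for uid, cells in matrix.items()])
    print(df2)  # one row per consensus feature, one column per sample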
@@ -594,7 +634,7 @@ def get_consensus_matches(self, uids=None):
     return matches
 
 
-# =====================================================================================
+# =====================================================================================
 # UID HELPER FUNCTIONS
 # =====================================================================================
 
@@ -776,7 +816,7 @@ def get_sample(self, sample):
         return cache[sample_uid]
 
     sample_path = row.get("sample_path", None)
-    s = Sample(log_level=…
+    s = Sample(log_level="ERROR")
     try:
         if sample_path:
             try:
@@ -796,13 +836,13 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
 
 # =====================================================================================
-# DATA COMPRESSION AND RESTORATION FUNCTIONS
+# DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
 
 
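The negated is_in filter in get_orphans is equivalent to a Polars anti-join; a toy sketch of both forms:

    import polars as pl

    features = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})
    mapping = pl.DataFrame({"feature_uid": [2, 4]})

    # Filter form, as used in get_orphans
    orphans = features.filter(~pl.col("feature_uid").is_in(mapping["feature_uid"].to_list()))

    # Equivalent anti-join form
    orphans_aj = features.join(mapping, on="feature_uid", how="anti")

    print(orphans["feature_uid"].to_list())     # [1, 3]
    print(orphans_aj["feature_uid"].to_list())  # [1, 3]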
@@ -858,7 +898,7 @@ def compress_features(self):
 
     removed_count = initial_count - len(self.features_df)
     self.logger.info(
-        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
     )
 
 
@@ -1099,7 +1139,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1229,7 +1269,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1270,7 +1310,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1308,14 +1348,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-            All keys found in sample names will be replaced with their
+            All keys found in sample names will be replaced with their
             corresponding values.
             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
@@ -1328,22 +1368,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1351,7 +1391,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
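A usage sketch for sample_name_replace; the names are hypothetical and the keys must match entries in samples_df exactly:

    study.sample_name_replace({
        "20240101_batch1_pos_01": "QC_01",
        "20240101_batch1_pos_02": "blank_01",
    })
    # Raises ValueError if the resulting names are not unique.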
@@ -1362,19 +1402,19 @@ def sample_name_replace(self, replace_dict):
             else:
                 seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1387,31 +1427,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while…
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
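The restored while loop strips every dotted suffix, so multi-part extensions such as .sample5.gz reduce to the bare stem; note that any dot inside the stem itself gets stripped as well. Standalone sketch with a hypothetical path:

    import os

    path = "/data/run3/QC_01.sample5.gz"
    name = os.path.basename(path)   # "QC_01.sample5.gz"
    while "." in name:
        name = os.path.splitext(name)[0]
    print(name)  # "QC_01"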
@@ -1422,12 +1462,12 @@ def sample_name_reset(self):
             else:
                 seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1684,7 +1724,7 @@ def features_select(
     if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
         min_coherence, max_coherence = chrom_coherence
         filter_conditions.append(
-            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
         )
     else:
         filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1697,7 +1737,7 @@ def features_select(
     if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
         min_prominence, max_prominence = chrom_prominence
         filter_conditions.append(
-            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1711,7 +1751,7 @@ def features_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
         filter_conditions.append(
             (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1725,7 +1765,7 @@ def features_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled
         filter_conditions.append(
             (pl.col("chrom_height_scaled") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled") <= max_height_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1832,7 +1872,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1909,7 +1949,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1974,7 +2014,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -1988,18 +2028,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self,…
-            if default_mz_tol and hasattr(default_mz_tol,…
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
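The mz argument handled above accepts either a (min, max) range or an (mz, mz_tol) pair, disambiguated by whether the second element is smaller than the first; a scalar falls back to the study parameters or to align_defaults(). Usage sketch with hypothetical values, assuming consensus_select is exposed as a Study method as the surrounding code suggests:

    study.consensus_select(mz=(400.0, 500.0))  # second > first: inclusive range
    study.consensus_select(mz=(445.12, 0.01))  # second < first: center ± tolerance
    study.consensus_select(mz=445.12)          # scalar: default tolerance applies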
@@ -2007,7 +2048,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2021,18 +2062,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self,…
-            if default_rt_tol and hasattr(default_rt_tol,…
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2057,7 +2099,7 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
@@ -2085,7 +2127,7 @@ def consensus_select(
     if isinstance(number_samples, tuple) and len(number_samples) == 2:
         min_samples, max_samples = number_samples
         consensus = consensus.filter(
-            (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+            (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
         )
     else:
         consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2143,7 +2185,7 @@ def consensus_select(
         min_coherence, max_coherence = chrom_coherence_mean
         consensus = consensus.filter(
             (pl.col("chrom_coherence_mean") >= min_coherence)
-            & (pl.col("chrom_coherence_mean") <= max_coherence)
+            & (pl.col("chrom_coherence_mean") <= max_coherence),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2161,7 +2203,7 @@ def consensus_select(
         min_prominence, max_prominence = chrom_prominence_mean
         consensus = consensus.filter(
             (pl.col("chrom_prominence_mean") >= min_prominence)
-            & (pl.col("chrom_prominence_mean") <= max_prominence)
+            & (pl.col("chrom_prominence_mean") <= max_prominence),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2179,7 +2221,7 @@ def consensus_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
         consensus = consensus.filter(
             (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2197,7 +2239,7 @@ def consensus_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled_mean
         consensus = consensus.filter(
             (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2214,7 +2256,7 @@ def consensus_select(
     if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
         min_rt_delta, max_rt_delta = rt_delta_mean
         consensus = consensus.filter(
-            (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+            (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
        )
     else:
         consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2241,10 +2283,10 @@ def consensus_select(
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
         if invalid_columns:
             self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
         else:
@@ -2335,7 +2377,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
 
@@ -2465,7 +2507,9 @@ def samples_select(
     if len(sample_batch) == 2 and not isinstance(sample_batch, list):
         # Treat as range
         min_batch, max_batch = sample_batch
-        filter_conditions.append(…
+        filter_conditions.append(
+            (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+        )
     else:
         # Treat as list
         filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2481,7 +2525,9 @@ def samples_select(
     if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
         # Treat as range
         min_seq, max_seq = sample_sequence
-        filter_conditions.append(…
+        filter_conditions.append(
+            (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+        )
     else:
         # Treat as list
         filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2495,7 +2541,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append(…
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
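All three samples_select hunks follow the same convention: a 2-element tuple is an inclusive range, a list is explicit membership. A toy sketch of the two filter conditions being appended:

    import polars as pl

    samples = pl.DataFrame({"sample_batch": [1, 2, 3, 4, 5]})

    # Tuple form -> inclusive range
    in_range = samples.filter((pl.col("sample_batch") >= 2) & (pl.col("sample_batch") <= 4))

    # List form -> explicit membership
    in_list = samples.filter(pl.col("sample_batch").is_in([1, 5]))

    print(in_range["sample_batch"].to_list())  # [2, 3, 4]
    print(in_list["sample_batch"].to_list())   # [1, 5]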
@@ -2552,15 +2600,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2622,10 +2670,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self,…
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2663,7 +2711,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self,…
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2674,7 +2722,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2685,16 +2733,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2715,14 +2763,14 @@ def samples_delete(self, samples):
 def sample_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
-
+
     When a new sample is added, this function resets all colors picking from the specified palette.
     The default palette is Turbo256.
 
     Parameters:
         by (str or list, optional): Property to base colors on. Options:
             - 'sample_uid': Use sample_uid values to assign colors
-            - 'sample_index': Use sample index (position) to assign colors
+            - 'sample_index': Use sample index (position) to assign colors
             - 'sample_type': Use sample_type values to assign colors
             - 'sample_name': Use sample_name values to assign colors
             - list of colors: Use provided list of hex color codes
@@ -2735,7 +2783,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Magma256': Magma colormap (256 colors, perceptually uniform)
             - 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
             - 'Set1': Qualitative palette (9 distinct colors)
-            - 'Set2': Qualitative palette (8 distinct colors)
+            - 'Set2': Qualitative palette (8 distinct colors)
             - 'Set3': Qualitative palette (12 distinct colors)
             - 'Tab10': Tableau 10 palette (10 distinct colors)
             - 'Tab20': Tableau 20 palette (20 distinct colors)
@@ -2746,7 +2794,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Coolwarm': Cool-warm diverging colormap
             - 'Seismic': Seismic diverging colormap
             - Any other colormap name supported by the cmap library
-
+
     For a complete catalog of available colormaps, see:
     https://cmap-docs.readthedocs.io/en/latest/catalog/
 
@@ -2756,10 +2804,10 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
         study.sample_color(by='sample_type', palette='Set1')
-
+
         # Set colors using a custom color list
         study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
-
+
         # Reset to default Turbo256 sequential colors
         study.sample_color()
     """
@@ -2768,11 +2816,13 @@ def sample_color(self, by=None, palette="Turbo256"):
         return
 
     sample_count = len(self.samples_df)
-
+
     # Handle custom color list
     if isinstance(by, list):
         if len(by) < sample_count:
-            self.logger.warning(…
+            self.logger.warning(
+                f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
+            )
             # Cycle through the provided colors if there aren't enough
             colors = []
             for i in range(sample_count):
@@ -2788,10 +2838,10 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==…
+
+    elif by == "sample_uid":
         # Use sample_uid to determine position in evenly sampled colormap
-        sample_uids = self.samples_df[…
+        sample_uids = self.samples_df["sample_uid"].to_list()
         try:
             # Sample colors evenly for the number of samples
             palette_colors = _sample_colors_from_colormap(palette, sample_count)
@@ -2803,29 +2853,29 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==…
+
+    elif by == "sample_index":
         # Use sample index (position in DataFrame) with evenly sampled colors
         try:
             colors = _sample_colors_from_colormap(palette, sample_count)
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==…
+
+    elif by == "sample_type":
         # Use sample_type to assign colors - same type gets same color
         # Sample colors evenly across colormap for unique types
-        sample_types = self.samples_df[…
-        unique_types = list(…
-
+        sample_types = self.samples_df["sample_type"].to_list()
+        unique_types = list({t for t in sample_types if t is not None})
+
         try:
             # Sample colors evenly for unique types
             type_colors = _sample_colors_from_colormap(palette, len(unique_types))
             type_to_color = {}
-
+
             for i, sample_type in enumerate(unique_types):
                 type_to_color[sample_type] = type_colors[i]
-
+
             colors = []
             for sample_type in sample_types:
                 if sample_type is None:
@@ -2836,21 +2886,21 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==…
+
+    elif by == "sample_name":
         # Use sample_name to assign colors - same name gets same color (unlikely but possible)
         # Sample colors evenly across colormap for unique names
-        sample_names = self.samples_df[…
-        unique_names = list(…
-
+        sample_names = self.samples_df["sample_name"].to_list()
+        unique_names = list({n for n in sample_names if n is not None})
+
         try:
             # Sample colors evenly for unique names
             name_colors = _sample_colors_from_colormap(palette, len(unique_names))
             name_to_color = {}
-
+
             for i, sample_name in enumerate(unique_names):
                 name_to_color[sample_name] = name_colors[i]
-
+
             colors = []
             for sample_name in sample_names:
                 if sample_name is None:
@@ -2862,14 +2912,16 @@ def sample_color(self, by=None, palette="Turbo256"):
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
     else:
-        self.logger.error(…
+        self.logger.error(
+            f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
+        )
         return
 
     # Update the sample_color column
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     if isinstance(by, list):
         self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
@@ -2881,28 +2933,28 @@ def sample_color(self, by=None, palette="Turbo256"):
 def sample_color_reset(self):
     """
     Reset sample colors to default coloring using the 'turbo' colormap.
-
+
     This function assigns colors by distributing samples evenly across the full
     turbo colormap range, ensuring maximum color diversity and visual distinction
     between samples.
-
+
     Returns:
         None (modifies self.samples_df in place)
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     try:
         from cmap import Colormap
-
+
         # Use turbo colormap
-        cm = Colormap(…
-
+        cm = Colormap("turbo")
+
         # Get sample count and assign colors evenly distributed across colormap
         n_samples = len(self.samples_df)
         colors = []
-
+
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
@@ -2910,9 +2962,9 @@ def sample_color_reset(self):
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -2921,14 +2973,14 @@ def sample_color_reset(self):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         # Update the sample_color column
         self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color")
+            pl.Series("sample_color", colors).alias("sample_color"),
        )
-
+
         self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
-
+
     except ImportError:
         self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
     except Exception as e:
@@ -2938,13 +2990,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2952,40 +3004,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3001,7 +3051,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3009,11 +3059,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3022,25 +3072,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(…
-        … (line not rendered in the diff view)
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3048,51 +3099,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
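The two-step normalization above first centers each sample in its bin, then compresses everything into the 10% to 90% band of the colormap to avoid near-white and near-black endpoints. For n_colors = 4 the sampled positions work out as follows:

    n_colors = 4
    for i in range(n_colors):
        v = (i + 0.5) / n_colors  # 0.125, 0.375, 0.625, 0.875
        v = 0.1 + (v * 0.8)       # 0.200, 0.400, 0.600, 0.800
        print(round(v, 3))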
@@ -3101,12 +3150,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(…
-        … (line not rendered in the diff view)
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _matplotlib_to_hex(color_dict):
@@ -3115,32 +3165,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path…
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema,…
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
@@ -3148,38 +3198,38 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df[…
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")