masster 0.3.17-py3-none-any.whl → 0.3.19-py3-none-any.whl

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of masster might be problematic.

masster/study/helpers.py CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
    if mz_tol is None:
-        if hasattr(owner, 'parameters') and hasattr(owner.parameters, 'eic_mz_tol'):
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     mz_min = mz - mz_tol
     mz_max = mz + mz_tol
     eic_data = s.ms1_df.filter(
-        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
     )
 
     if eic_data.is_empty():
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # Aggregate intensities per retention time (sum in case of multiple points per rt)
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     if eic_pd.empty:
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # build Chromatogram
     try:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].to_numpy(),
-            inty=eic_pd["inty"].to_numpy(),
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
     except Exception:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].values,
-            inty=eic_pd["inty"].values,
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
        )
 
     return chrom
 
 
 # =====================================================================================
-# DATA RETRIEVAL AND MATRIX FUNCTIONS
+# DATA RETRIEVAL AND MATRIX FUNCTIONS
 # =====================================================================================
 
 
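The three extraction helpers above share one calling convention: pass a Study or Sample as owner, plus a sample identifier when owner is a Study. A minimal usage sketch (the `study` object is a hypothetical loaded Study; only the signatures visible in this diff are assumed):

    # Hypothetical usage of the helpers changed above
    bpc = get_bpc(study, sample="sample_01", rt_unit="s")
    tic = get_tic(study, sample="sample_01")
    # Explicit +/- 0.01 m/z window; omitting mz_tol falls back to
    # study.parameters.eic_mz_tol, or 0.01 if that attribute is absent.
    eic = get_eic(study, sample="sample_01", mz=301.1410, mz_tol=0.01)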
@@ -451,9 +471,9 @@ def align_reset(self):
     self.alignment_ref_index = None
     # in self.features_df, set rt equal to rt_original
     self.features_df = self.features_df.with_columns(
-        pl.col("rt_original").alias("rt")
+        pl.col("rt_original").alias("rt"),
     )
-
+
     # Ensure column order is maintained after with_columns operation
     self._ensure_features_df_schema_order()
 
@@ -479,7 +499,9 @@ def get_consensus(self, quant="chrom_area"):
     # sort by consensus_id
     df1 = df1.sort_index()
 
-    df2 = self.get_consensus_matrix(quant=quant)
+    df2_polars = self.get_consensus_matrix(quant=quant)
+    # Convert to pandas for merging (since the result is used for export)
+    df2 = df2_polars.to_pandas().set_index("consensus_uid")
     # sort df2 row by consensus_id
     df2 = df2.sort_index()
     # merge df and df2 on consensus_id
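The new get_consensus keeps the merge in pandas by converting the Polars matrix first. A standalone sketch of that round trip (toy data; assumes polars with pandas/pyarrow installed):

    import polars as pl

    # Shaped like the new get_consensus_matrix return value
    df2_polars = pl.DataFrame({"consensus_uid": [2, 1], "sample_A": [10.0, 20.0]})
    # Same conversion as in get_consensus above
    df2 = df2_polars.to_pandas().set_index("consensus_uid").sort_index()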
@@ -492,6 +514,7 @@ def get_consensus(self, quant="chrom_area"):
 def get_consensus_matrix(self, quant="chrom_area"):
     """
     Get a matrix of consensus features with samples as columns and consensus features as rows.
+    Optimized implementation that avoids expensive join operations.
     """
     if quant not in self.features_df.columns:
         self.logger.error(
@@ -499,41 +522,58 @@ def get_consensus_matrix(self, quant="chrom_area"):
         )
         return None
 
-    # Use Polars join instead of pandas merge
-    features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
-    consensus_mapping_subset = self.consensus_mapping_df.select([
-        "consensus_uid",
-        "feature_uid",
-    ])
-
-    df1 = features_subset.join(
-        consensus_mapping_subset,
-        on="feature_uid",
-        how="left",
-    )
-
-    # Convert to pandas for pivot operation (Polars pivot is still evolving)
-    df1_pd = df1.to_pandas()
-    df2 = df1_pd.pivot_table(
-        index="consensus_uid",
-        columns="sample_uid",
-        values=quant,
-        aggfunc="max",
-    )
-
-    # Create sample_uid to sample_name mapping using Polars
-    sample_mapping = dict(
-        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
-    )
-    # replace sample_uid with sample_name in df2
-    df2 = df2.rename(columns=sample_mapping)
+    # Create a lookup dictionary from features_df for O(1) value access
+    feature_values = {}
+    for row in self.features_df.iter_rows(named=True):
+        feature_uid = row['feature_uid']
+        sample_uid = row['sample_uid']
+        value = row[quant] if row[quant] is not None else 0
+        feature_values[(feature_uid, sample_uid)] = value
+
+    # Build consensus matrix directly using the consensus_mapping_df
+    matrix_dict = {}
+    sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
+
+    for row in self.consensus_mapping_df.iter_rows(named=True):
+        consensus_uid = row['consensus_uid']
+        sample_uid = row['sample_uid']
+        feature_uid = row['feature_uid']
+
+        # Look up the quantification value
+        key = (feature_uid, sample_uid)
+        value = feature_values.get(key, 0)
+
+        if consensus_uid not in matrix_dict:
+            matrix_dict[consensus_uid] = {}
+
+        sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
+
+        # Take max if multiple features map to same consensus/sample combination
+        if sample_name in matrix_dict[consensus_uid]:
+            matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
+        else:
+            matrix_dict[consensus_uid][sample_name] = value
 
-    # round to integer
-    df2 = df2.round()
-    # set consensus_id as uint64
-    df2.index = df2.index.astype("uint64")
-    # set index to consensus_id
-    df2.index.name = "consensus_uid"
+    # Convert to Polars DataFrame with proper formatting
+    import polars as pl
+
+    # Convert matrix_dict to list of records for Polars
+    records = []
+    for consensus_uid, sample_values in matrix_dict.items():
+        record = {"consensus_uid": consensus_uid}
+        record.update(sample_values)
+        records.append(record)
+
+    # Create Polars DataFrame and set proper data types
+    df2 = pl.DataFrame(records)
+
+    # Fill null values with 0 and round numeric columns
+    numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
+    df2 = df2.with_columns([
+        pl.col("consensus_uid").cast(pl.UInt64),
+        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols]
+    ])
+
     return df2
 
 
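The rewritten get_consensus_matrix replaces the join plus pivot_table with two dictionary passes: one builds a (feature_uid, sample_uid) -> value lookup, the other folds the mappings into per-consensus rows with max-aggregation. A standalone sketch of that pattern (toy data, not masster code):

    # (feature_uid, sample_uid) -> quantification value
    feature_values = {("f1", "s1"): 100.0, ("f2", "s1"): 250.0}
    mappings = [
        ("c1", "s1", "f1"),
        ("c1", "s1", "f2"),  # second feature in the same cell: max wins
    ]
    matrix = {}
    for consensus_uid, sample_uid, feature_uid in mappings:
        value = feature_values.get((feature_uid, sample_uid), 0)
        cell = matrix.setdefault(consensus_uid, {})
        cell[sample_uid] = max(cell.get(sample_uid, 0), value)
    assert matrix == {"c1": {"s1": 250.0}}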
@@ -594,7 +634,7 @@ def get_consensus_matches(self, uids=None):
     return matches
 
 
-# =====================================================================================
+# =====================================================================================
 # UID HELPER FUNCTIONS
 # =====================================================================================
 
@@ -776,7 +816,7 @@ def get_sample(self, sample):
         return cache[sample_uid]
 
     sample_path = row.get("sample_path", None)
-    s = Sample(log_level='ERROR')
+    s = Sample(log_level="ERROR")
     try:
         if sample_path:
             try:
@@ -796,13 +836,13 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
 
 # =====================================================================================
-# DATA COMPRESSION AND RESTORATION FUNCTIONS
+# DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
 
 
@@ -858,7 +898,7 @@ def compress_features(self):
 
     removed_count = initial_count - len(self.features_df)
     self.logger.info(
-        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
     )
 
 
@@ -1099,7 +1139,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1229,7 +1269,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1270,7 +1310,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1308,14 +1348,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-                             All keys found in sample names will be replaced with their
+                             All keys found in sample names will be replaced with their
                              corresponding values.
                              e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
@@ -1328,22 +1368,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1351,7 +1391,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1362,19 +1402,19 @@ def sample_name_replace(self, replace_dict):
             else:
                 seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1387,31 +1427,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while '.' in name_without_ext:
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1422,12 +1462,12 @@ def sample_name_reset(self):
             else:
                 seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
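The extension-stripping loop in sample_name_reset applies os.path.splitext until no dot remains, so multi-part extensions collapse completely; note that any dot in the basename is treated as an extension separator. For example:

    import os

    # Mirrors the loop above
    name = os.path.basename("/data/run1.sample5.gz")  # "run1.sample5.gz"
    while "." in name:
        name = os.path.splitext(name)[0]
    assert name == "run1"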
@@ -1684,7 +1724,7 @@ def features_select(
         if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
             min_coherence, max_coherence = chrom_coherence
             filter_conditions.append(
-                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
             )
         else:
             filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1697,7 +1737,7 @@ def features_select(
         if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
             min_prominence, max_prominence = chrom_prominence
             filter_conditions.append(
-                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1711,7 +1751,7 @@ def features_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
             filter_conditions.append(
                 (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1725,7 +1765,7 @@ def features_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled
             filter_conditions.append(
                 (pl.col("chrom_height_scaled") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled") <= max_height_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1832,7 +1872,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1909,7 +1949,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1974,7 +2014,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -1988,18 +2028,19 @@ def consensus_select(
                 consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self, 'parameters', None)
-            if default_mz_tol and hasattr(default_mz_tol, 'eic_mz_tol'):
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2007,7 +2048,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2021,18 +2062,19 @@ def consensus_select(
                 consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self, 'parameters', None)
-            if default_rt_tol and hasattr(default_rt_tol, 'eic_rt_tol'):
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
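Both tolerance filters above accept three spellings: a (min, max) tuple, a (center, tolerance) tuple when the second element is smaller than the first, or a bare float that falls back to the study's default tolerance. Hypothetical calls against a loaded Study, per the logic above:

    study.consensus_select(mz=(300.0, 310.0))  # explicit (min, max) window
    study.consensus_select(mz=(301.14, 0.01))  # second < first: (center, tol)
    study.consensus_select(rt=120.0)           # bare float: default rt tolerance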
@@ -2057,7 +2099,7 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
@@ -2085,7 +2127,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2143,7 +2185,7 @@ def consensus_select(
             min_coherence, max_coherence = chrom_coherence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_coherence_mean") >= min_coherence)
-                & (pl.col("chrom_coherence_mean") <= max_coherence)
+                & (pl.col("chrom_coherence_mean") <= max_coherence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2161,7 +2203,7 @@ def consensus_select(
             min_prominence, max_prominence = chrom_prominence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_mean") >= min_prominence)
-                & (pl.col("chrom_prominence_mean") <= max_prominence)
+                & (pl.col("chrom_prominence_mean") <= max_prominence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2179,7 +2221,7 @@ def consensus_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2197,7 +2239,7 @@ def consensus_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2214,7 +2256,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2241,10 +2283,10 @@ def consensus_select(
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
         if invalid_columns:
             self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
         else:
@@ -2335,7 +2377,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
    )
 
 
@@ -2465,7 +2507,9 @@ def samples_select(
         if len(sample_batch) == 2 and not isinstance(sample_batch, list):
             # Treat as range
             min_batch, max_batch = sample_batch
-            filter_conditions.append((pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch))
+            filter_conditions.append(
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2481,7 +2525,9 @@ def samples_select(
         if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
             # Treat as range
             min_seq, max_seq = sample_sequence
-            filter_conditions.append((pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq))
+            filter_conditions.append(
+                (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2495,7 +2541,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append((pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features))
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
@@ -2552,15 +2600,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2622,10 +2670,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self, 'feature_maps') and self.feature_maps is not None:
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2663,7 +2711,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self, 'feature_maps') and self.feature_maps is not None and map_ids_to_remove:
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2674,7 +2722,7 @@ def samples_delete(self, samples):
 
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2685,16 +2733,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2715,14 +2763,14 @@ def samples_delete(self, samples):
 def sample_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
-
+
     When a new sample is added, this function resets all colors picking from the specified palette.
     The default palette is Turbo256.
 
     Parameters:
         by (str or list, optional): Property to base colors on. Options:
             - 'sample_uid': Use sample_uid values to assign colors
-            - 'sample_index': Use sample index (position) to assign colors
+            - 'sample_index': Use sample index (position) to assign colors
             - 'sample_type': Use sample_type values to assign colors
             - 'sample_name': Use sample_name values to assign colors
             - list of colors: Use provided list of hex color codes
@@ -2735,7 +2783,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Magma256': Magma colormap (256 colors, perceptually uniform)
             - 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
             - 'Set1': Qualitative palette (9 distinct colors)
-            - 'Set2': Qualitative palette (8 distinct colors)
+            - 'Set2': Qualitative palette (8 distinct colors)
             - 'Set3': Qualitative palette (12 distinct colors)
             - 'Tab10': Tableau 10 palette (10 distinct colors)
             - 'Tab20': Tableau 20 palette (20 distinct colors)
@@ -2746,7 +2794,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Coolwarm': Cool-warm diverging colormap
             - 'Seismic': Seismic diverging colormap
             - Any other colormap name supported by the cmap library
-
+
         For a complete catalog of available colormaps, see:
         https://cmap-docs.readthedocs.io/en/latest/catalog/
 
@@ -2756,10 +2804,10 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
         study.sample_color(by='sample_type', palette='Set1')
-
+
         # Set colors using a custom color list
         study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
-
+
         # Reset to default Turbo256 sequential colors
         study.sample_color()
     """
@@ -2768,11 +2816,13 @@ def sample_color(self, by=None, palette="Turbo256"):
         return
 
     sample_count = len(self.samples_df)
-
+
     # Handle custom color list
     if isinstance(by, list):
         if len(by) < sample_count:
-            self.logger.warning(f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.")
+            self.logger.warning(
+                f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
+            )
         # Cycle through the provided colors if there aren't enough
         colors = []
         for i in range(sample_count):
@@ -2788,10 +2838,10 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_uid':
+
+    elif by == "sample_uid":
         # Use sample_uid to determine position in evenly sampled colormap
-        sample_uids = self.samples_df['sample_uid'].to_list()
+        sample_uids = self.samples_df["sample_uid"].to_list()
         try:
             # Sample colors evenly for the number of samples
             palette_colors = _sample_colors_from_colormap(palette, sample_count)
@@ -2803,29 +2853,29 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_index':
+
+    elif by == "sample_index":
         # Use sample index (position in DataFrame) with evenly sampled colors
         try:
             colors = _sample_colors_from_colormap(palette, sample_count)
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_type':
+
+    elif by == "sample_type":
         # Use sample_type to assign colors - same type gets same color
         # Sample colors evenly across colormap for unique types
-        sample_types = self.samples_df['sample_type'].to_list()
-        unique_types = list(set([t for t in sample_types if t is not None]))
-
+        sample_types = self.samples_df["sample_type"].to_list()
+        unique_types = list({t for t in sample_types if t is not None})
+
         try:
             # Sample colors evenly for unique types
             type_colors = _sample_colors_from_colormap(palette, len(unique_types))
             type_to_color = {}
-
+
             for i, sample_type in enumerate(unique_types):
                 type_to_color[sample_type] = type_colors[i]
-
+
             colors = []
             for sample_type in sample_types:
                 if sample_type is None:
@@ -2836,21 +2886,21 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_name':
+
+    elif by == "sample_name":
         # Use sample_name to assign colors - same name gets same color (unlikely but possible)
         # Sample colors evenly across colormap for unique names
-        sample_names = self.samples_df['sample_name'].to_list()
-        unique_names = list(set([n for n in sample_names if n is not None]))
-
+        sample_names = self.samples_df["sample_name"].to_list()
+        unique_names = list({n for n in sample_names if n is not None})
+
         try:
             # Sample colors evenly for unique names
             name_colors = _sample_colors_from_colormap(palette, len(unique_names))
             name_to_color = {}
-
+
             for i, sample_name in enumerate(unique_names):
                 name_to_color[sample_name] = name_colors[i]
-
+
             colors = []
             for sample_name in sample_names:
                 if sample_name is None:
@@ -2862,14 +2912,16 @@ def sample_color(self, by=None, palette="Turbo256"):
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
     else:
-        self.logger.error(f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.")
+        self.logger.error(
+            f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
+        )
         return
 
     # Update the sample_color column
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     if isinstance(by, list):
         self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
@@ -2881,28 +2933,28 @@ def sample_color_reset(self):
 def sample_color_reset(self):
     """
     Reset sample colors to default coloring using the 'turbo' colormap.
-
+
     This function assigns colors by distributing samples evenly across the full
     turbo colormap range, ensuring maximum color diversity and visual distinction
     between samples.
-
+
     Returns:
         None (modifies self.samples_df in place)
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     try:
         from cmap import Colormap
-
+
         # Use turbo colormap
-        cm = Colormap('turbo')
-
+        cm = Colormap("turbo")
+
         # Get sample count and assign colors evenly distributed across colormap
         n_samples = len(self.samples_df)
         colors = []
-
+
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
@@ -2910,9 +2962,9 @@ def sample_color_reset(self):
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -2921,14 +2973,14 @@ def sample_color_reset(self):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             hex_color = f"#{r:02x}{g:02x}{b:02x}"
             colors.append(hex_color)
-
+
         # Update the sample_color column
         self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color")
+            pl.Series("sample_color", colors).alias("sample_color"),
        )
-
+
         self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
-
+
     except ImportError:
         self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
     except Exception as e:
@@ -2938,13 +2990,13 @@ def _get_color_palette(palette_name):
 
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
@@ -2952,40 +3004,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3001,7 +3051,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3009,11 +3059,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3022,25 +3072,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             hex_color = f"#{r:02x}{g:02x}{b:02x}"
             hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(f"Failed to create colormap '{cmap_name}': {e}. "
-                         f"Available palettes: {list(palette_mapping.keys())}")
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3048,51 +3099,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3101,12 +3150,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             hex_color = f"#{r:02x}{g:02x}{b:02x}"
             colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(f"Failed to create colormap '{cmap_name}': {e}. "
-                         f"Available palettes: {list(palette_mapping.keys())}")
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _matplotlib_to_hex(color_dict):
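The sampling formula above first centers each color in its bin with (i + 0.5) / n_colors, then compresses the result into the 10-90% band of the colormap so the endpoint colors are never used. Worked positions for n_colors = 4:

    n_colors = 4
    positions = [0.1 + 0.8 * ((i + 0.5) / n_colors) for i in range(n_colors)]
    # approximately [0.2, 0.4, 0.6, 0.8]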
@@ -3115,32 +3165,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path, 'r') as f:
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema, 'features_df')
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
@@ -3148,38 +3198,38 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df['map_id'].dtype
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")