masster-0.3.18-py3-none-any.whl → masster-0.3.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/helpers.py CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
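Note: the change above only reflows a long constructor call. For orientation, get_bpc is used roughly like this (a sketch; the study object and sample name are hypothetical, the signature is taken from the hunk header):

    # Sketch: build a base peak chromatogram for one sample of a loaded study.
    # "QC_01" is an invented sample name; study is a masster Study instance.
    from masster.study.helpers import get_bpc

    bpc = get_bpc(study, sample="QC_01", rt_unit="s")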
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
     if mz_tol is None:
-        if hasattr(owner, 'parameters') and hasattr(owner.parameters, 'eic_mz_tol'):
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     mz_min = mz - mz_tol
     mz_max = mz + mz_tol
     eic_data = s.ms1_df.filter(
-        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
     )
 
     if eic_data.is_empty():
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # Aggregate intensities per retention time (sum in case of multiple points per rt)
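Note: the empty-result branch above returns a one-point chromatogram rather than raising. A call sketch (m/z and tolerance values invented; signature from the hunk header):

    # Sketch: extract an EIC around a target m/z with an explicit tolerance.
    eic = get_eic(study, sample="QC_01", mz=301.1412, mz_tol=0.005, rt_unit="s")
    # If no MS1 points fall within mz ± mz_tol, the returned Chromatogram has
    # rt=[0.0], inty=[0.0] and a label ending in "(empty)", per the hunk above.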
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     if eic_pd.empty:
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # build Chromatogram
     try:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].to_numpy(),
-            inty=eic_pd["inty"].to_numpy(),
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
     except Exception:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].values,
-            inty=eic_pd["inty"].values,
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     return chrom
 
 
 # =====================================================================================
-# DATA RETRIEVAL AND MATRIX FUNCTIONS
+# DATA RETRIEVAL AND MATRIX FUNCTIONS
 # =====================================================================================
 
 
@@ -451,9 +471,9 @@ def align_reset(self):
     self.alignment_ref_index = None
     # in self.features_df, set rt equal to rt_original
     self.features_df = self.features_df.with_columns(
-        pl.col("rt_original").alias("rt")
+        pl.col("rt_original").alias("rt"),
     )
-
+
     # Ensure column order is maintained after with_columns operation
     self._ensure_features_df_schema_order()
 
@@ -614,7 +634,7 @@ def get_consensus_matches(self, uids=None):
     return matches
 
 
-# =====================================================================================
+# =====================================================================================
 # UID HELPER FUNCTIONS
 # =====================================================================================
 
@@ -796,7 +816,7 @@ def get_sample(self, sample):
         return cache[sample_uid]
 
     sample_path = row.get("sample_path", None)
-    s = Sample(log_level='ERROR')
+    s = Sample(log_level="ERROR")
     try:
         if sample_path:
             try:
@@ -816,13 +836,13 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
 
 # =====================================================================================
-# DATA COMPRESSION AND RESTORATION FUNCTIONS
+# DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
 
 
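Note: get_orphans is the standard polars negated is_in filter. A self-contained equivalent:

    import polars as pl

    features = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})
    mapping = pl.DataFrame({"feature_uid": [2, 4]})

    # Keep rows whose feature_uid does not appear in the consensus mapping
    orphans = features.filter(
        ~pl.col("feature_uid").is_in(mapping["feature_uid"].to_list()),
    )
    print(orphans["feature_uid"].to_list())  # [1, 3]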
@@ -878,7 +898,7 @@ def compress_features(self):
 
     removed_count = initial_count - len(self.features_df)
     self.logger.info(
-        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
     )
 
 
@@ -1119,7 +1139,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1249,7 +1269,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1290,7 +1310,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1328,14 +1348,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-                             All keys found in sample names will be replaced with their
+                             All keys found in sample names will be replaced with their
                              corresponding values.
                              e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
@@ -1348,22 +1368,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1371,7 +1391,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1382,19 +1402,19 @@ def sample_name_replace(self, replace_dict):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1407,31 +1427,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while '.' in name_without_ext:
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
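Note: the while loop above strips stacked extensions. Standalone:

    import os

    def strip_all_extensions(path):
        # "run_01.sample5.gz" -> "run_01.sample5" -> "run_01"
        name = os.path.basename(path)
        while "." in name:
            name = os.path.splitext(name)[0]
        return name

    print(strip_all_extensions("/data/run_01.sample5.gz"))  # run_01
    # Caveat (also true of the code above): dots inside the stem,
    # as in "v1.2_run", are stripped too.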
@@ -1442,12 +1462,12 @@ def sample_name_reset(self):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1704,7 +1724,7 @@ def features_select(
     if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
         min_coherence, max_coherence = chrom_coherence
         filter_conditions.append(
-            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
         )
     else:
         filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1717,7 +1737,7 @@ def features_select(
     if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
         min_prominence, max_prominence = chrom_prominence
         filter_conditions.append(
-            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1731,7 +1751,7 @@ def features_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
         filter_conditions.append(
             (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1745,7 +1765,7 @@ def features_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled
         filter_conditions.append(
             (pl.col("chrom_height_scaled") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled") <= max_height_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1852,7 +1872,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1929,7 +1949,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1994,7 +2014,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -2008,18 +2028,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self, 'parameters', None)
-            if default_mz_tol and hasattr(default_mz_tol, 'eic_mz_tol'):
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
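Note: consensus_select reads a 2-tuple two ways, by ordering. A sketch of just that rule (helper name invented; the convention comes from the hunk comments above):

    # (a, b) with b >= a -> explicit (min_mz, max_mz) window
    # (a, b) with b < a  -> (mz, mz_tol) centered window
    def mz_window(mz):
        if isinstance(mz, tuple) and len(mz) == 2:
            if mz[1] < mz[0]:
                center, tol = mz
                return center - tol, center + tol
            return mz
        raise TypeError("expected a 2-tuple")

    print(mz_window((300.0, 310.0)))    # (300.0, 310.0)
    print(mz_window((301.1412, 0.01)))  # (301.1312, 301.1512)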
@@ -2027,7 +2048,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2041,18 +2062,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self, 'parameters', None)
-            if default_rt_tol and hasattr(default_rt_tol, 'eic_rt_tol'):
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2077,7 +2099,7 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
@@ -2105,7 +2127,7 @@ def consensus_select(
     if isinstance(number_samples, tuple) and len(number_samples) == 2:
         min_samples, max_samples = number_samples
         consensus = consensus.filter(
-            (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+            (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
         )
     else:
         consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2163,7 +2185,7 @@ def consensus_select(
         min_coherence, max_coherence = chrom_coherence_mean
         consensus = consensus.filter(
             (pl.col("chrom_coherence_mean") >= min_coherence)
-            & (pl.col("chrom_coherence_mean") <= max_coherence)
+            & (pl.col("chrom_coherence_mean") <= max_coherence),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2181,7 +2203,7 @@ def consensus_select(
         min_prominence, max_prominence = chrom_prominence_mean
         consensus = consensus.filter(
             (pl.col("chrom_prominence_mean") >= min_prominence)
-            & (pl.col("chrom_prominence_mean") <= max_prominence)
+            & (pl.col("chrom_prominence_mean") <= max_prominence),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2199,7 +2221,7 @@ def consensus_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
         consensus = consensus.filter(
             (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2217,7 +2239,7 @@ def consensus_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled_mean
         consensus = consensus.filter(
             (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
         )
     else:
         consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2234,7 +2256,7 @@ def consensus_select(
     if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
         min_rt_delta, max_rt_delta = rt_delta_mean
         consensus = consensus.filter(
-            (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+            (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
        )
     else:
         consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2261,10 +2283,10 @@ def consensus_select(
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
         if invalid_columns:
             self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
         else:
@@ -2355,7 +2377,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
 
@@ -2485,7 +2507,9 @@ def samples_select(
     if len(sample_batch) == 2 and not isinstance(sample_batch, list):
         # Treat as range
         min_batch, max_batch = sample_batch
-        filter_conditions.append((pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch))
+        filter_conditions.append(
+            (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+        )
     else:
         # Treat as list
         filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2501,7 +2525,9 @@ def samples_select(
     if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
         # Treat as range
         min_seq, max_seq = sample_sequence
-        filter_conditions.append((pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq))
+        filter_conditions.append(
+            (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+        )
     else:
         # Treat as list
         filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2515,7 +2541,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append((pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features))
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
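Note: samples_select accumulates polars expressions in filter_conditions; outside the hunks shown they are presumably AND-combined before filtering. A sketch of that assumption:

    from functools import reduce
    import operator
    import polars as pl

    filter_conditions = [
        pl.col("num_features") >= 100,
        pl.col("sample_batch").is_in([1, 2]),
    ]
    # Assumption: the unshown tail of samples_select does something equivalent
    predicate = reduce(operator.and_, filter_conditions)
    # selected = samples_df.filter(predicate)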
@@ -2572,15 +2600,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2642,10 +2670,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self, 'feature_maps') and self.feature_maps is not None:
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2683,7 +2711,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self, 'feature_maps') and self.feature_maps is not None and map_ids_to_remove:
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2694,7 +2722,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2705,16 +2733,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2735,14 +2763,14 @@ def samples_delete(self, samples):
 def sample_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
-
+
     When a new sample is added, this function resets all colors picking from the specified palette.
     The default palette is Turbo256.
 
     Parameters:
         by (str or list, optional): Property to base colors on. Options:
             - 'sample_uid': Use sample_uid values to assign colors
-            - 'sample_index': Use sample index (position) to assign colors
+            - 'sample_index': Use sample index (position) to assign colors
             - 'sample_type': Use sample_type values to assign colors
             - 'sample_name': Use sample_name values to assign colors
             - list of colors: Use provided list of hex color codes
@@ -2755,7 +2783,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Magma256': Magma colormap (256 colors, perceptually uniform)
             - 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
             - 'Set1': Qualitative palette (9 distinct colors)
-            - 'Set2': Qualitative palette (8 distinct colors)
+            - 'Set2': Qualitative palette (8 distinct colors)
             - 'Set3': Qualitative palette (12 distinct colors)
             - 'Tab10': Tableau 10 palette (10 distinct colors)
             - 'Tab20': Tableau 20 palette (20 distinct colors)
@@ -2766,7 +2794,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Coolwarm': Cool-warm diverging colormap
             - 'Seismic': Seismic diverging colormap
             - Any other colormap name supported by the cmap library
-
+
         For a complete catalog of available colormaps, see:
         https://cmap-docs.readthedocs.io/en/latest/catalog/
 
@@ -2776,10 +2804,10 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
         study.sample_color(by='sample_type', palette='Set1')
-
+
         # Set colors using a custom color list
         study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
-
+
         # Reset to default Turbo256 sequential colors
         study.sample_color()
     """
@@ -2788,11 +2816,13 @@ def sample_color(self, by=None, palette="Turbo256"):
         return
 
     sample_count = len(self.samples_df)
-
+
     # Handle custom color list
     if isinstance(by, list):
         if len(by) < sample_count:
-            self.logger.warning(f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.")
+            self.logger.warning(
+                f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
+            )
         # Cycle through the provided colors if there aren't enough
         colors = []
         for i in range(sample_count):
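Note: the truncated loop above cycles a short color list over all samples; modular indexing is the likely equivalent:

    by = ["#FF0000", "#00FF00", "#0000FF"]  # from the docstring example
    sample_count = 5
    colors = [by[i % len(by)] for i in range(sample_count)]
    print(colors)  # ['#FF0000', '#00FF00', '#0000FF', '#FF0000', '#00FF00']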
@@ -2808,10 +2838,10 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_uid':
+
+    elif by == "sample_uid":
         # Use sample_uid to determine position in evenly sampled colormap
-        sample_uids = self.samples_df['sample_uid'].to_list()
+        sample_uids = self.samples_df["sample_uid"].to_list()
         try:
             # Sample colors evenly for the number of samples
             palette_colors = _sample_colors_from_colormap(palette, sample_count)
@@ -2823,29 +2853,29 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_index':
+
+    elif by == "sample_index":
         # Use sample index (position in DataFrame) with evenly sampled colors
         try:
             colors = _sample_colors_from_colormap(palette, sample_count)
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_type':
+
+    elif by == "sample_type":
         # Use sample_type to assign colors - same type gets same color
         # Sample colors evenly across colormap for unique types
-        sample_types = self.samples_df['sample_type'].to_list()
-        unique_types = list(set([t for t in sample_types if t is not None]))
-
+        sample_types = self.samples_df["sample_type"].to_list()
+        unique_types = list({t for t in sample_types if t is not None})
+
         try:
             # Sample colors evenly for unique types
             type_colors = _sample_colors_from_colormap(palette, len(unique_types))
             type_to_color = {}
-
+
             for i, sample_type in enumerate(unique_types):
                 type_to_color[sample_type] = type_colors[i]
-
+
             colors = []
             for sample_type in sample_types:
                 if sample_type is None:
@@ -2856,21 +2886,21 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by == 'sample_name':
+
+    elif by == "sample_name":
         # Use sample_name to assign colors - same name gets same color (unlikely but possible)
         # Sample colors evenly across colormap for unique names
-        sample_names = self.samples_df['sample_name'].to_list()
-        unique_names = list(set([n for n in sample_names if n is not None]))
-
+        sample_names = self.samples_df["sample_name"].to_list()
+        unique_names = list({n for n in sample_names if n is not None})
+
         try:
             # Sample colors evenly for unique names
             name_colors = _sample_colors_from_colormap(palette, len(unique_names))
             name_to_color = {}
-
+
             for i, sample_name in enumerate(unique_names):
                 name_to_color[sample_name] = name_colors[i]
-
+
             colors = []
             for sample_name in sample_names:
                 if sample_name is None:
@@ -2882,14 +2912,16 @@ def sample_color(self, by=None, palette="Turbo256"):
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
     else:
-        self.logger.error(f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.")
+        self.logger.error(
+            f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
+        )
         return
 
     # Update the sample_color column
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     if isinstance(by, list):
         self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
@@ -2901,28 +2933,28 @@ def sample_color(self, by=None, palette="Turbo256"):
 def sample_color_reset(self):
     """
     Reset sample colors to default coloring using the 'turbo' colormap.
-
+
     This function assigns colors by distributing samples evenly across the full
     turbo colormap range, ensuring maximum color diversity and visual distinction
     between samples.
-
+
     Returns:
         None (modifies self.samples_df in place)
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     try:
         from cmap import Colormap
-
+
         # Use turbo colormap
-        cm = Colormap('turbo')
-
+        cm = Colormap("turbo")
+
         # Get sample count and assign colors evenly distributed across colormap
         n_samples = len(self.samples_df)
         colors = []
-
+
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
@@ -2930,9 +2962,9 @@ def sample_color_reset(self):
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
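Note: the normalization above first centers each sample in its bin, then compresses into the 10%-90% band of the colormap:

    n_samples = 4
    for i in range(n_samples):
        v = (i + 0.5) / n_samples  # bin centers: 0.125, 0.375, 0.625, 0.875
        v = 0.1 + (v * 0.8)        # compressed into the 10%-90% band
        print(f"sample {i}: {v:.2f}")  # 0.20, 0.40, 0.60, 0.80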
@@ -2941,14 +2973,14 @@ def sample_color_reset(self):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         # Update the sample_color column
         self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color")
+            pl.Series("sample_color", colors).alias("sample_color"),
         )
-
+
         self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
-
+
     except ImportError:
         self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
     except Exception as e:
@@ -2958,13 +2990,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2972,40 +3004,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3021,7 +3051,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3029,11 +3059,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
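Note: the float-to-hex conversion recurring in these helpers is the usual 0-1 float to 0-255 byte formatting (function name invented for the sketch):

    def rgba_to_hex(color):
        # Channels may be 0-1 floats (as returned by cmap) or already 0-255 ints
        r, g, b = color[:3]
        if isinstance(r, float) and r <= 1.0:
            r, g, b = int(r * 255), int(g * 255), int(b * 255)
        return f"#{r:02x}{g:02x}{b:02x}"

    print(rgba_to_hex((1.0, 0.5, 0.0, 1.0)))  # #ff7f00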
@@ -3042,25 +3072,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(f"Failed to create colormap '{cmap_name}': {e}. "
-                         f"Available palettes: {list(palette_mapping.keys())}")
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3068,51 +3099,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3121,12 +3150,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(f"Failed to create colormap '{cmap_name}': {e}. "
-                         f"Available palettes: {list(palette_mapping.keys())}")
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _matplotlib_to_hex(color_dict):
@@ -3135,32 +3165,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path, 'r') as f:
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema, 'features_df')
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
@@ -3168,38 +3198,38 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df['map_id'].dtype
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
    )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
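Note: the migration reduces to an overwrite plus a cast, mirroring the two polars calls in the hunk (data invented):

    import polars as pl

    samples_df = pl.DataFrame({"map_id": ["1729", "9384"]})  # old string-based IDs
    new_map_ids = list(range(len(samples_df)))

    samples_df = samples_df.with_columns(pl.lit(new_map_ids).alias("map_id"))
    samples_df = samples_df.cast({"map_id": pl.Int64})
    print(samples_df["map_id"].dtype)  # Int64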