masster 0.3.14-py3-none-any.whl → 0.3.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


masster/study/helpers.py CHANGED
@@ -211,7 +211,7 @@ def get_tic(owner, sample=None, label=None):
     return chrom
 
 
-def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
+def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
 
@@ -225,13 +225,20 @@ def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
         owner: Study or Sample instance
         sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
-        mz_tol (float): m/z tolerance (default 0.01)
+        mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
         label (str): Optional label for the chromatogram
 
     Returns:
         Chromatogram
     """
+    # Use default mz_tol from study parameters if not provided
+    if mz_tol is None:
+        if hasattr(owner, 'parameters') and hasattr(owner.parameters, 'eic_mz_tol'):
+            mz_tol = owner.parameters.eic_mz_tol
+        else:
+            mz_tol = 0.01  # fallback default
+
     if mz is None:
         raise ValueError("mz must be provided for EIC computation")
 
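
With this change the tolerance is resolved at call time instead of in the signature. A minimal sketch of the resulting call patterns (hypothetical `study` object; assumes `study.parameters.eic_mz_tol` is set, which is exactly what the added code checks for):

    from masster.study.helpers import get_eic

    # Falls back to study.parameters.eic_mz_tol, or 0.01 if absent
    chrom = get_eic(study, sample="sample_01", mz=301.1412)

    # An explicit tolerance still takes precedence, matching the 0.3.14 behavior
    chrom = get_eic(study, sample="sample_01", mz=301.1412, mz_tol=0.005)
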
@@ -1298,7 +1305,7 @@ def compress_chrom(self):
 # =====================================================================================
 
 
-def name_replace(self, replace_dict):
+def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
 
@@ -1364,7 +1371,7 @@ def name_replace(self, replace_dict):
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
-def name_reset(self):
+def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
 
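
Both renames are breaking for callers of the old names. A hedged before/after sketch (assuming these helpers are exposed on a Study instance, as their `self` parameter suggests; the dictionary contents are illustrative):

    # 0.3.14
    study.name_replace({"blank_01": "blank_A"})
    study.name_reset()

    # 0.3.16
    study.sample_name_replace({"blank_01": "blank_A"})
    study.sample_name_reset()
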
@@ -1446,7 +1453,7 @@ def set_source(self, filename):
     failed_count = 0
 
     # Get all current file_source values
-    current_sources = self.samples_df.get_column("file_source").to_list()
+    current_sources = self.samples_df.get_column("sample_source").to_list()
     sample_names = self.samples_df.get_column("sample_name").to_list()
 
     new_sources = []
@@ -1924,13 +1931,21 @@ def consensus_select(
     chrom_prominence_scaled_mean=None,
     chrom_height_scaled_mean=None,
     rt_delta_mean=None,
+    sortby=None,
+    descending=True,
 ):
     """
     Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
 
     Parameters:
-        mz: m/z range filter (tuple for range, single value for minimum)
-        rt: retention time range filter (tuple for range, single value for minimum)
+        mz: m/z filter with flexible formats:
+            - float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
+            - tuple (mz_min, mz_max): range where mz_max > mz_min
+            - tuple (mz_center, mz_tol): range where mz_tol < mz_center (interpreted as mz_center ± mz_tol)
+        rt: retention time filter with flexible formats:
+            - float: RT value ± default tolerance (uses study.parameters.eic_rt_tol)
+            - tuple (rt_min, rt_max): range where rt_max > rt_min
+            - tuple (rt_center, rt_tol): range where rt_tol < rt_center (interpreted as rt_center ± rt_tol)
         inty_mean: mean intensity filter (tuple for range, single value for minimum)
         consensus_uid: consensus UID filter (list, single value, or tuple for range)
         consensus_id: consensus ID filter (list or single value)
@@ -1943,6 +1958,8 @@ def consensus_select(
         chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
         rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
+        sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
+        descending: sort direction (True for descending, False for ascending, default is True)
 
     Returns:
         polars.DataFrame: Filtered consensus DataFrame
@@ -1957,11 +1974,32 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
+
         if isinstance(mz, tuple) and len(mz) == 2:
-            min_mz, max_mz = mz
+            # Check if second value is smaller than first (indicating mz, mz_tol format)
+            if mz[1] < mz[0]:
+                # First is mz, second is mz_tol
+                mz_center, mz_tol = mz
+                min_mz = mz_center - mz_tol
+                max_mz = mz_center + mz_tol
+            else:
+                # Standard (min_mz, max_mz) format
+                min_mz, max_mz = mz
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
-            consensus = consensus.filter(pl.col("mz") >= mz)
+            # Single float value - use default mz tolerance from study parameters
+            default_mz_tol = getattr(self, 'parameters', None)
+            if default_mz_tol and hasattr(default_mz_tol, 'eic_mz_tol'):
+                default_mz_tol = default_mz_tol.eic_mz_tol
+            else:
+                # Fallback to align_defaults if study parameters not available
+                from masster.study.defaults.align_def import align_defaults
+                default_mz_tol = align_defaults().mz_max_diff
+
+            min_mz = mz - default_mz_tol
+            max_mz = mz + default_mz_tol
+            consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -1969,11 +2007,32 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
+
         if isinstance(rt, tuple) and len(rt) == 2:
-            min_rt, max_rt = rt
+            # Check if second value is smaller than first (indicating rt, rt_tol format)
+            if rt[1] < rt[0]:
+                # First is rt, second is rt_tol
+                rt_center, rt_tol = rt
+                min_rt = rt_center - rt_tol
+                max_rt = rt_center + rt_tol
+            else:
+                # Standard (min_rt, max_rt) format
+                min_rt, max_rt = rt
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
-            consensus = consensus.filter(pl.col("rt") >= rt)
+            # Single float value - use default rt tolerance from study parameters
+            default_rt_tol = getattr(self, 'parameters', None)
+            if default_rt_tol and hasattr(default_rt_tol, 'eic_rt_tol'):
+                default_rt_tol = default_rt_tol.eic_rt_tol
+            else:
+                # Fallback to align_defaults if study parameters not available
+                from masster.study.defaults.align_def import align_defaults
+                default_rt_tol = align_defaults().rt_max_diff
+
+            min_rt = rt - default_rt_tol
+            max_rt = rt + default_rt_tol
+            consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
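
The tuple interpretation is order-dependent: a second element smaller than the first is what flags the center-tolerance form. A few hypothetical calls illustrating the three accepted shapes (assuming `consensus_select` is bound as a Study method, as its `self` parameter suggests):

    hits = study.consensus_select(mz=(301.10, 301.18))  # mz_max > mz_min: explicit range
    hits = study.consensus_select(mz=(301.14, 0.01))    # second value smaller: 301.14 ± 0.01
    hits = study.consensus_select(mz=301.14, rt=215.0)  # bare floats: default tolerances from
                                                        # study.parameters (or align_defaults)
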
@@ -2170,6 +2229,27 @@ def consensus_select(
     else:
         self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
 
+    # Sort the results if sortby is specified
+    if sortby is not None:
+        if isinstance(sortby, str):
+            # Single column
+            if sortby in consensus.columns:
+                consensus = consensus.sort(sortby, descending=descending)
+            else:
+                self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
+        elif isinstance(sortby, (list, tuple)):
+            # Multiple columns
+            valid_columns = [col for col in sortby if col in consensus.columns]
+            invalid_columns = [col for col in sortby if col not in consensus.columns]
+
+            if invalid_columns:
+                self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
+
+            if valid_columns:
+                consensus = consensus.sort(valid_columns, descending=descending)
+        else:
+            self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
+
     return consensus
 
 
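
Unknown sort columns are skipped with a warning rather than raising, so sorting is best-effort. Hypothetical usage on the same assumed `study` object:

    # Highest mean intensity first (descending=True is the default)
    top = study.consensus_select(inty_mean=1e5, sortby="inty_mean")

    # Multi-column ascending sort
    ordered = study.consensus_select(sortby=["mz", "rt"], descending=False)
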
@@ -2276,6 +2356,357 @@ def consensus_delete(self, consensus):
     self.consensus_filter(consensus)
 
 
+# =====================================================================================
+# SAMPLE MANAGEMENT AND DELETION FUNCTIONS
+# =====================================================================================
+
+
+def samples_select(
+    self,
+    sample_uid=None,
+    sample_name=None,
+    sample_type=None,
+    sample_group=None,
+    sample_batch=None,
+    sample_sequence=None,
+    num_features=None,
+    num_ms1=None,
+    num_ms2=None,
+):
+    """
+    Select samples from samples_df based on specified criteria and return the filtered DataFrame.
+
+    Parameters:
+        sample_uid: sample UID filter (list, single value, or tuple for range)
+        sample_name: sample name filter (list or single value)
+        sample_type: sample type filter (list or single value)
+        sample_group: sample group filter (list or single value)
+        sample_batch: sample batch filter (list, single value, or tuple for range)
+        sample_sequence: sample sequence filter (list, single value, or tuple for range)
+        num_features: number of features filter (tuple for range, single value for minimum)
+        num_ms1: number of MS1 spectra filter (tuple for range, single value for minimum)
+        num_ms2: number of MS2 spectra filter (tuple for range, single value for minimum)
+
+    Returns:
+        polars.DataFrame: Filtered samples DataFrame
+    """
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.warning("No samples found in study.")
+        return pl.DataFrame()
+
+    # Early return if no filters provided
+    filter_params = [
+        sample_uid,
+        sample_name,
+        sample_type,
+        sample_group,
+        sample_batch,
+        sample_sequence,
+        num_features,
+        num_ms1,
+        num_ms2,
+    ]
+    if all(param is None for param in filter_params):
+        return self.samples_df.clone()
+
+    initial_count = len(self.samples_df)
+
+    # Pre-check available columns once for efficiency
+    available_columns = set(self.samples_df.columns)
+
+    # Build all filter conditions first, then apply them all at once
+    filter_conditions = []
+    warnings = []
+
+    # Filter by sample_uid
+    if sample_uid is not None:
+        if isinstance(sample_uid, (list, tuple)):
+            if len(sample_uid) == 2 and not isinstance(sample_uid, list):
+                # Treat as range
+                min_uid, max_uid = sample_uid
+                filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
+        else:
+            filter_conditions.append(pl.col("sample_uid") == sample_uid)
+
+    # Filter by sample_name
+    if sample_name is not None:
+        if isinstance(sample_name, list):
+            filter_conditions.append(pl.col("sample_name").is_in(sample_name))
+        else:
+            filter_conditions.append(pl.col("sample_name") == sample_name)
+
+    # Filter by sample_type
+    if sample_type is not None:
+        if "sample_type" in available_columns:
+            if isinstance(sample_type, list):
+                filter_conditions.append(pl.col("sample_type").is_in(sample_type))
+            else:
+                filter_conditions.append(pl.col("sample_type") == sample_type)
+        else:
+            warnings.append("'sample_type' column not found in samples_df")
+
+    # Filter by sample_group
+    if sample_group is not None:
+        if "sample_group" in available_columns:
+            if isinstance(sample_group, list):
+                filter_conditions.append(pl.col("sample_group").is_in(sample_group))
+            else:
+                filter_conditions.append(pl.col("sample_group") == sample_group)
+        else:
+            warnings.append("'sample_group' column not found in samples_df")
+
+    # Filter by sample_batch
+    if sample_batch is not None:
+        if "sample_batch" in available_columns:
+            if isinstance(sample_batch, (list, tuple)):
+                if len(sample_batch) == 2 and not isinstance(sample_batch, list):
+                    # Treat as range
+                    min_batch, max_batch = sample_batch
+                    filter_conditions.append((pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch))
+                else:
+                    # Treat as list
+                    filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
+            else:
+                filter_conditions.append(pl.col("sample_batch") == sample_batch)
+        else:
+            warnings.append("'sample_batch' column not found in samples_df")
+
+    # Filter by sample_sequence
+    if sample_sequence is not None:
+        if "sample_sequence" in available_columns:
+            if isinstance(sample_sequence, (list, tuple)):
+                if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
+                    # Treat as range
+                    min_seq, max_seq = sample_sequence
+                    filter_conditions.append((pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq))
+                else:
+                    # Treat as list
+                    filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
+            else:
+                filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
+        else:
+            warnings.append("'sample_sequence' column not found in samples_df")
+
+    # Filter by num_features
+    if num_features is not None:
+        if "num_features" in available_columns:
+            if isinstance(num_features, tuple) and len(num_features) == 2:
+                min_features, max_features = num_features
+                filter_conditions.append((pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features))
+            else:
+                filter_conditions.append(pl.col("num_features") >= num_features)
+        else:
+            warnings.append("'num_features' column not found in samples_df")
+
+    # Filter by num_ms1
+    if num_ms1 is not None:
+        if "num_ms1" in available_columns:
+            if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
+                min_ms1, max_ms1 = num_ms1
+                filter_conditions.append((pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1))
+            else:
+                filter_conditions.append(pl.col("num_ms1") >= num_ms1)
+        else:
+            warnings.append("'num_ms1' column not found in samples_df")
+
+    # Filter by num_ms2
+    if num_ms2 is not None:
+        if "num_ms2" in available_columns:
+            if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
+                min_ms2, max_ms2 = num_ms2
+                filter_conditions.append((pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2))
+            else:
+                filter_conditions.append(pl.col("num_ms2") >= num_ms2)
+        else:
+            warnings.append("'num_ms2' column not found in samples_df")
+
+    # Log all warnings once at the end for efficiency
+    for warning in warnings:
+        self.logger.warning(warning)
+
+    # Apply all filters at once using lazy evaluation for optimal performance
+    if filter_conditions:
+        # Combine all conditions with AND
+        combined_filter = filter_conditions[0]
+        for condition in filter_conditions[1:]:
+            combined_filter = combined_filter & condition
+
+        # Apply the combined filter using lazy evaluation
+        samples = self.samples_df.lazy().filter(combined_filter).collect()
+    else:
+        samples = self.samples_df.clone()
+
+    final_count = len(samples)
+
+    if final_count == 0:
+        self.logger.warning("No samples remaining after applying selection criteria.")
+    else:
+        self.logger.info(f"Samples selected: {final_count} (out of {initial_count})")
+
+    return samples
+
+
+def samples_delete(self, samples):
+    """
+    Delete samples and all related data from the study based on sample identifiers.
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
+    from all dataframes including:
+    - samples_df: Removes the sample rows
+    - features_df: Removes all features belonging to these samples
+    - consensus_mapping_df: Removes mappings for features from these samples
+    - consensus_ms2: Removes MS2 spectra for features from these samples
+    - feature_maps: Removes the corresponding feature maps
+
+    Also updates map_id values to maintain sequential indices after deletion.
+
+    Parameters:
+        samples: Samples to delete. Can be:
+            - list of int: List of sample_uids to delete
+            - polars.DataFrame: DataFrame obtained from samples_select (will use sample_uid column)
+            - int: Single sample_uid to delete
+
+    Returns:
+        None (modifies study DataFrames and feature_maps in place)
+    """
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.warning("No samples found in study.")
+        return
+
+    # Early return if no samples provided
+    if samples is None:
+        self.logger.warning("No samples provided for deletion.")
+        return
+
+    initial_sample_count = len(self.samples_df)
+
+    # Determine sample_uids to remove
+    if isinstance(samples, pl.DataFrame):
+        if "sample_uid" not in samples.columns:
+            self.logger.error("samples DataFrame must contain 'sample_uid' column")
+            return
+        sample_uids_to_remove = samples["sample_uid"].to_list()
+    elif isinstance(samples, (list, tuple)):
+        sample_uids_to_remove = list(samples)  # Convert tuple to list if needed
+    elif isinstance(samples, int):
+        sample_uids_to_remove = [samples]
+    else:
+        self.logger.error("samples parameter must be a DataFrame, list, tuple, or int")
+        return
+
+    # Early return if no UIDs to remove
+    if not sample_uids_to_remove:
+        self.logger.warning("No sample UIDs provided for deletion.")
+        return
+
+    # Convert to set for faster lookup if list is large
+    if len(sample_uids_to_remove) > 100:
+        sample_uids_set = set(sample_uids_to_remove)
+        # Use the set for filtering if it's significantly smaller
+        if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
+            sample_uids_to_remove = list(sample_uids_set)
+
+    self.logger.info(f"Deleting {len(sample_uids_to_remove)} samples and all related data...")
+
+    # Get feature_uids that need to be removed from features_df
+    feature_uids_to_remove = []
+    initial_features_count = 0
+    if self.features_df is not None and not self.features_df.is_empty():
+        initial_features_count = len(self.features_df)
+        feature_uids_to_remove = self.features_df.filter(
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
+        )["feature_uid"].to_list()
+
+    # Get map_ids to remove from feature_maps (needed before samples_df deletion)
+    map_ids_to_remove = []
+    if hasattr(self, 'feature_maps') and self.feature_maps is not None:
+        # Get map_ids for samples to be deleted
+        map_ids_df = self.samples_df.filter(
+            pl.col("sample_uid").is_in(sample_uids_to_remove)
+        ).select("map_id")
+        if not map_ids_df.is_empty():
+            map_ids_to_remove = map_ids_df["map_id"].to_list()
+
+    # 1. Remove samples from samples_df
+    self.samples_df = self.samples_df.filter(
+        ~pl.col("sample_uid").is_in(sample_uids_to_remove),
+    )
+
+    # 2. Remove corresponding features from features_df
+    removed_features_count = 0
+    if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
+        self.features_df = self.features_df.filter(
+            ~pl.col("sample_uid").is_in(sample_uids_to_remove),
+        )
+        removed_features_count = initial_features_count - len(self.features_df)
+
+    # 3. Remove from consensus_mapping_df
+    removed_mapping_count = 0
+    if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+        initial_mapping_count = len(self.consensus_mapping_df)
+        self.consensus_mapping_df = self.consensus_mapping_df.filter(
+            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
+        )
+        removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
+
+    # 4. Remove from consensus_ms2 if it exists
+    removed_ms2_count = 0
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+        initial_ms2_count = len(self.consensus_ms2)
+        self.consensus_ms2 = self.consensus_ms2.filter(
+            ~pl.col("sample_uid").is_in(sample_uids_to_remove),
+        )
+        removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
+
+    # 5. Remove from feature_maps and update map_id
+    removed_maps_count = 0
+    if hasattr(self, 'feature_maps') and self.feature_maps is not None and map_ids_to_remove:
+        # Remove feature maps in reverse order to maintain indices
+        for map_id in sorted(map_ids_to_remove, reverse=True):
+            if 0 <= map_id < len(self.feature_maps):
+                self.feature_maps.pop(map_id)
+                removed_maps_count += 1
+
+        # Update map_id values in samples_df to maintain sequential indices
+        if len(self.samples_df) > 0:
+            new_map_ids = list(range(len(self.samples_df)))
+            self.samples_df = self.samples_df.with_columns(
+                pl.lit(new_map_ids).alias("map_id")
+            )
+
+    # Calculate and log results
+    removed_sample_count = initial_sample_count - len(self.samples_df)
+    final_sample_count = len(self.samples_df)
+
+    # Create comprehensive summary message
+    summary_parts = [
+        f"Deleted {removed_sample_count} samples",
+    ]
+
+    if removed_features_count > 0:
+        summary_parts.append(f"{removed_features_count} features")
+
+    if removed_mapping_count > 0:
+        summary_parts.append(f"{removed_mapping_count} consensus mappings")
+
+    if removed_ms2_count > 0:
+        summary_parts.append(f"{removed_ms2_count} MS2 spectra")
+
+    if removed_maps_count > 0:
+        summary_parts.append(f"{removed_maps_count} feature maps")
+
+    summary_parts.append(f"Remaining samples: {final_sample_count}")
+
+    self.logger.info(". ".join(summary_parts))
+
+    # Update map_id indices if needed
+    if removed_maps_count > 0 and final_sample_count > 0:
+        self.logger.debug(f"Updated map_id values to range from 0 to {final_sample_count - 1}")
+
+
 # =====================================================================================
 # COLOR PALETTE AND VISUALIZATION FUNCTIONS
 # =====================================================================================
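
The two functions are designed to compose: samples_select returns a filtered DataFrame whose sample_uid column samples_delete consumes directly. A hedged sketch (again assuming Study-bound methods; the filter values are illustrative):

    # Select, inspect, then delete low-quality acquisitions
    low_quality = study.samples_select(sample_type="blank", num_features=(0, 100))
    study.samples_delete(low_quality)   # DataFrame input: uses its sample_uid column

    study.samples_delete([3, 7])        # list of sample_uids
    study.samples_delete(12)            # single sample_uid
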
@@ -2712,3 +3143,43 @@ def _ensure_features_df_schema_order(self):
 
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
+
+
+def migrate_map_id_to_index(self):
+    """
+    Migrate map_id from string-based OpenMS unique IDs to integer indices.
+
+    This function converts the map_id column from string type (with OpenMS unique IDs)
+    to integer type where each map_id corresponds to the index of the feature map
+    in self.features_maps.
+
+    This migration is needed for studies that were created before the map_id format
+    change from OpenMS unique IDs to feature map indices.
+    """
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.warning("No samples to migrate")
+        return
+
+    # Check if migration is needed
+    current_dtype = self.samples_df['map_id'].dtype
+    if current_dtype == pl.Int64:
+        self.logger.info("map_id column is already Int64 type - no migration needed")
+        return
+
+    self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
+
+    # Create new map_id values based on sample order
+    # Each sample gets a map_id that corresponds to its position in features_maps
+    sample_count = len(self.samples_df)
+    new_map_ids = list(range(sample_count))
+
+    # Update the map_id column
+    self.samples_df = self.samples_df.with_columns(
+        pl.lit(new_map_ids).alias("map_id")
+    )
+
+    # Ensure the column is Int64 type
+    self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
+
+    self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
+    self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
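
The migration is idempotent: it returns early when map_id is already Int64, so it should be safe to call once on any study saved before this release. Hypothetical usage:

    # For a study persisted before 0.3.16 with string-based OpenMS map IDs
    study.migrate_map_id_to_index()     # rewrites map_id as 0..n-1 and casts to Int64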