masster 0.3.16.tar.gz → 0.3.18.tar.gz

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those registries.

Potentially problematic release: this version of masster might be problematic.

Files changed (78)
  1. {masster-0.3.16 → masster-0.3.18}/PKG-INFO +1 -1
  2. {masster-0.3.16 → masster-0.3.18}/pyproject.toml +1 -1
  3. {masster-0.3.16 → masster-0.3.18}/src/masster/_version.py +1 -1
  4. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/align_def.py +9 -0
  5. {masster-0.3.16 → masster-0.3.18}/src/masster/study/helpers.py +55 -35
  6. {masster-0.3.16 → masster-0.3.18}/src/masster/study/load.py +92 -46
  7. {masster-0.3.16 → masster-0.3.18}/src/masster/study/plot.py +66 -5
  8. {masster-0.3.16 → masster-0.3.18}/src/masster/study/processing.py +149 -120
  9. {masster-0.3.16 → masster-0.3.18}/uv.lock +1 -1
  10. {masster-0.3.16 → masster-0.3.18}/.github/workflows/publish.yml +0 -0
  11. {masster-0.3.16 → masster-0.3.18}/.github/workflows/security.yml +0 -0
  12. {masster-0.3.16 → masster-0.3.18}/.github/workflows/test.yml +0 -0
  13. {masster-0.3.16 → masster-0.3.18}/.gitignore +0 -0
  14. {masster-0.3.16 → masster-0.3.18}/.pre-commit-config.yaml +0 -0
  15. {masster-0.3.16 → masster-0.3.18}/LICENSE +0 -0
  16. {masster-0.3.16 → masster-0.3.18}/Makefile +0 -0
  17. {masster-0.3.16 → masster-0.3.18}/README.md +0 -0
  18. {masster-0.3.16 → masster-0.3.18}/TESTING.md +0 -0
  19. {masster-0.3.16 → masster-0.3.18}/demo/example_batch_process.py +0 -0
  20. {masster-0.3.16 → masster-0.3.18}/demo/example_sample_process.py +0 -0
  21. {masster-0.3.16 → masster-0.3.18}/src/masster/__init__.py +0 -0
  22. {masster-0.3.16 → masster-0.3.18}/src/masster/chromatogram.py +0 -0
  23. {masster-0.3.16 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
  24. {masster-0.3.16 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  25. {masster-0.3.16 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  26. {masster-0.3.16 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  27. {masster-0.3.16 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  28. {masster-0.3.16 → masster-0.3.18}/src/masster/logger.py +0 -0
  29. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/__init__.py +0 -0
  30. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/__init__.py +0 -0
  31. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  32. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/find_features_def.py +0 -0
  33. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  34. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  35. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/defaults/sample_def.py +0 -0
  36. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/h5.py +0 -0
  37. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/helpers.py +0 -0
  38. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/lib.py +0 -0
  39. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/load.py +0 -0
  40. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/parameters.py +0 -0
  41. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/plot.py +0 -0
  42. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/processing.py +0 -0
  43. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/quant.py +0 -0
  44. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/sample.py +0 -0
  45. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/sample5_schema.json +0 -0
  46. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/save.py +0 -0
  47. {masster-0.3.16 → masster-0.3.18}/src/masster/sample/sciex.py +0 -0
  48. {masster-0.3.16 → masster-0.3.18}/src/masster/spectrum.py +0 -0
  49. {masster-0.3.16 → masster-0.3.18}/src/masster/study/__init__.py +0 -0
  50. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/__init__.py +0 -0
  51. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/export_def.py +0 -0
  52. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  53. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/fill_def.py +0 -0
  54. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/find_consensus_def.py +0 -0
  55. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/find_ms2_def.py +0 -0
  56. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  57. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/integrate_def.py +0 -0
  58. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/merge_def.py +0 -0
  59. {masster-0.3.16 → masster-0.3.18}/src/masster/study/defaults/study_def.py +0 -0
  60. {masster-0.3.16 → masster-0.3.18}/src/masster/study/export.py +0 -0
  61. {masster-0.3.16 → masster-0.3.18}/src/masster/study/h5.py +0 -0
  62. {masster-0.3.16 → masster-0.3.18}/src/masster/study/helpers_optimized.py +0 -0
  63. {masster-0.3.16 → masster-0.3.18}/src/masster/study/parameters.py +0 -0
  64. {masster-0.3.16 → masster-0.3.18}/src/masster/study/save.py +0 -0
  65. {masster-0.3.16 → masster-0.3.18}/src/masster/study/study.py +0 -0
  66. {masster-0.3.16 → masster-0.3.18}/src/masster/study/study5_schema.json +0 -0
  67. {masster-0.3.16 → masster-0.3.18}/tests/conftest.py +0 -0
  68. {masster-0.3.16 → masster-0.3.18}/tests/test_chromatogram.py +0 -0
  69. {masster-0.3.16 → masster-0.3.18}/tests/test_defaults.py +0 -0
  70. {masster-0.3.16 → masster-0.3.18}/tests/test_imports.py +0 -0
  71. {masster-0.3.16 → masster-0.3.18}/tests/test_integration.py +0 -0
  72. {masster-0.3.16 → masster-0.3.18}/tests/test_logger.py +0 -0
  73. {masster-0.3.16 → masster-0.3.18}/tests/test_parameters.py +0 -0
  74. {masster-0.3.16 → masster-0.3.18}/tests/test_sample.py +0 -0
  75. {masster-0.3.16 → masster-0.3.18}/tests/test_spectrum.py +0 -0
  76. {masster-0.3.16 → masster-0.3.18}/tests/test_study.py +0 -0
  77. {masster-0.3.16 → masster-0.3.18}/tests/test_version.py +0 -0
  78. {masster-0.3.16 → masster-0.3.18}/tox.ini +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.3.16
+Version: 0.3.18
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.3.16"
+version = "0.3.18"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.3.16"
+__version__ = "0.3.18"
 
 
 def get_version():
src/masster/study/defaults/align_def.py
@@ -24,6 +24,7 @@ class align_defaults:
         skip_blanks (bool): Whether to skip blank samples. Default is False.
 
         KD algorithm specific parameters:
+        min_samples (int): Minimum number of samples required for KD alignment. Default is 3.
         nr_partitions (int): Number of partitions in m/z dimension. Default is 100.
         warp_enabled (bool): Enable non-linear retention time transformation. Default is True.
         warp_rt_tol (float): RT tolerance for the LOWESS fit. Default is 5.0.
@@ -59,6 +60,7 @@ class align_defaults:
     algo: str = "pc"
 
     # KD algorithm specific parameters
+    min_samples: int = 3
     nr_partitions: int = 100
     warp_enabled: bool = True
     warp_rt_tol: float = 5.0
@@ -137,6 +139,13 @@ class align_defaults:
            "allowed_values": ["pc", "kd"],
        },
        # KD algorithm specific parameters
+       "min_samples": {
+           "dtype": int,
+           "description": "Minimum number of samples required for KD alignment algorithm",
+           "default": 3,
+           "min_value": 2,
+           "max_value": 1000,
+       },
        "nr_partitions": {
            "dtype": int,
            "description": "Number of partitions in m/z dimension for KD algorithm",
src/masster/study/helpers.py
@@ -479,7 +479,9 @@ def get_consensus(self, quant="chrom_area"):
     # sort by consensus_id
     df1 = df1.sort_index()
 
-    df2 = self.get_consensus_matrix(quant=quant)
+    df2_polars = self.get_consensus_matrix(quant=quant)
+    # Convert to pandas for merging (since the result is used for export)
+    df2 = df2_polars.to_pandas().set_index("consensus_uid")
     # sort df2 row by consensus_id
     df2 = df2.sort_index()
     # merge df and df2 on consensus_id
@@ -492,6 +494,7 @@ def get_consensus(self, quant="chrom_area"):
 def get_consensus_matrix(self, quant="chrom_area"):
     """
     Get a matrix of consensus features with samples as columns and consensus features as rows.
+    Optimized implementation that avoids expensive join operations.
     """
     if quant not in self.features_df.columns:
         self.logger.error(
@@ -499,41 +502,58 @@ def get_consensus_matrix(self, quant="chrom_area"):
         )
         return None
 
-    # Use Polars join instead of pandas merge
-    features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
-    consensus_mapping_subset = self.consensus_mapping_df.select([
-        "consensus_uid",
-        "feature_uid",
-    ])
-
-    df1 = features_subset.join(
-        consensus_mapping_subset,
-        on="feature_uid",
-        how="left",
-    )
-
-    # Convert to pandas for pivot operation (Polars pivot is still evolving)
-    df1_pd = df1.to_pandas()
-    df2 = df1_pd.pivot_table(
-        index="consensus_uid",
-        columns="sample_uid",
-        values=quant,
-        aggfunc="max",
-    )
-
-    # Create sample_uid to sample_name mapping using Polars
-    sample_mapping = dict(
-        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
-    )
-    # replace sample_uid with sample_name in df2
-    df2 = df2.rename(columns=sample_mapping)
+    # Create a lookup dictionary from features_df for O(1) value access
+    feature_values = {}
+    for row in self.features_df.iter_rows(named=True):
+        feature_uid = row['feature_uid']
+        sample_uid = row['sample_uid']
+        value = row[quant] if row[quant] is not None else 0
+        feature_values[(feature_uid, sample_uid)] = value
+
+    # Build consensus matrix directly using the consensus_mapping_df
+    matrix_dict = {}
+    sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
+
+    for row in self.consensus_mapping_df.iter_rows(named=True):
+        consensus_uid = row['consensus_uid']
+        sample_uid = row['sample_uid']
+        feature_uid = row['feature_uid']
+
+        # Look up the quantification value
+        key = (feature_uid, sample_uid)
+        value = feature_values.get(key, 0)
+
+        if consensus_uid not in matrix_dict:
+            matrix_dict[consensus_uid] = {}
+
+        sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
+
+        # Take max if multiple features map to same consensus/sample combination
+        if sample_name in matrix_dict[consensus_uid]:
+            matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
+        else:
+            matrix_dict[consensus_uid][sample_name] = value
 
-    # round to integer
-    df2 = df2.round()
-    # set consensus_id as uint64
-    df2.index = df2.index.astype("uint64")
-    # set index to consensus_id
-    df2.index.name = "consensus_uid"
+    # Convert to Polars DataFrame with proper formatting
+    import polars as pl
+
+    # Convert matrix_dict to list of records for Polars
+    records = []
+    for consensus_uid, sample_values in matrix_dict.items():
+        record = {"consensus_uid": consensus_uid}
+        record.update(sample_values)
+        records.append(record)
+
+    # Create Polars DataFrame and set proper data types
+    df2 = pl.DataFrame(records)
+
+    # Fill null values with 0 and round numeric columns
+    numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
+    df2 = df2.with_columns([
+        pl.col("consensus_uid").cast(pl.UInt64),
+        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
+    ])
+
     return df2
 
 
src/masster/study/load.py
@@ -961,51 +961,96 @@ def _get_missing_consensus_sample_combinations(self, uids):
     """
     Efficiently identify which consensus_uid/sample combinations are missing.
     Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
+
+    Optimized for common scenarios:
+    - Early termination for fully-filled studies
+    - Efficient dictionary lookups instead of expensive DataFrame joins
+    - Smart handling of sparse vs dense missing data patterns
     """
-    # Get all consensus UIDs we're interested in
-    consensus_uids_set = set(uids)
-
-    # Get all sample UIDs and create lookup
-    all_sample_info = {}
-    for row in self.samples_df.select([
-        "sample_uid",
-        "sample_name",
-        "sample_path",
-    ]).iter_rows(named=True):
-        all_sample_info[row["sample_uid"]] = {
-            "sample_name": row["sample_name"],
-            "sample_path": row["sample_path"],
-        }
-
-    # Get existing consensus/sample combinations from consensus_mapping_df
-    existing_combinations = set()
-    consensus_mapping_filtered = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(list(consensus_uids_set)),
-    )
-
-    # Join with features_df to get sample_uid information
-    existing_features = consensus_mapping_filtered.join(
-        self.features_df.select(["feature_uid", "sample_uid"]),
-        on="feature_uid",
-        how="inner",
+    if not uids:
+        return []
+
+    n_consensus = len(uids)
+    n_samples = len(self.samples_df)
+    total_possible = n_consensus * n_samples
+
+    # Quick early termination check for fully/nearly filled studies
+    # This handles the common case where fill() is run on an already-filled study
+    consensus_counts = (
+        self.consensus_mapping_df
+        .filter(pl.col("consensus_uid").is_in(uids))
+        .group_by("consensus_uid")
+        .agg(pl.count("feature_uid").alias("count"))
     )
-
-    for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
-        existing_combinations.add((row[0], row[1]))  # (consensus_uid, sample_uid)
-
-    # Find missing combinations
-    missing_combinations = []
-    for consensus_uid in consensus_uids_set:
-        for sample_uid, sample_info in all_sample_info.items():
-            if (consensus_uid, sample_uid) not in existing_combinations:
-                missing_combinations.append((
-                    consensus_uid,
-                    sample_uid,
-                    sample_info["sample_name"],
-                    sample_info["sample_path"],
-                ))
-
-    return missing_combinations
+
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+
+    # If >95% filled, likely no gaps (common case)
+    if total_existing >= total_possible * 0.95:
+        self.logger.debug(f"Study appears {total_existing/total_possible*100:.1f}% filled, using sparse optimization")
+
+        # For sparse missing data, check each consensus feature individually
+        missing_combinations = []
+        uids_set = set(uids)
+
+        # Build efficient lookups
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+
+        # Get existing combinations for target UIDs only
+        existing_by_consensus = {}
+        for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows():
+            if consensus_uid in uids_set and feature_uid in feature_to_sample:
+                if consensus_uid not in existing_by_consensus:
+                    existing_by_consensus[consensus_uid] = set()
+                existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
+
+        # Get sample info once
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+
+        # Check for missing combinations
+        for consensus_uid in uids:
+            existing_samples = existing_by_consensus.get(consensus_uid, set())
+            for sample_uid, sample_name, sample_path in all_samples:
+                if sample_uid not in existing_samples:
+                    missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+
+        return missing_combinations
+
+    else:
+        # For studies with many gaps, use bulk operations
+        self.logger.debug(f"Study {total_existing/total_possible*100:.1f}% filled, using bulk optimization")
+
+        # Build efficient lookups
+        uids_set = set(uids)
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+
+        # Build existing combinations set
+        existing_combinations = {
+            (consensus_uid, feature_to_sample[feature_uid])
+            for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows()
+            if consensus_uid in uids_set and feature_uid in feature_to_sample
+        }
+
+        # Get all sample info
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+
+        # Generate all missing combinations
+        missing_combinations = [
+            (consensus_uid, sample_uid, sample_name, sample_path)
+            for consensus_uid in uids
+            for sample_uid, sample_name, sample_path in all_samples
+            if (consensus_uid, sample_uid) not in existing_combinations
+        ]
+
+        return missing_combinations
 
 
 def sanitize(self):
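
Both branches reduce to a set difference over (consensus_uid, sample_uid) pairs; they differ only in how the existing set is assembled. A minimal sketch of the core step (toy values, names illustrative):

    uids = [10, 11]
    samples = [("s1", "A", "/data/a.mzML"), ("s2", "B", "/data/b.mzML")]
    existing = {(10, "s1"), (11, "s1"), (11, "s2")}  # cells that already have a feature

    missing = [
        (cuid, suid, name, path)
        for cuid in uids
        for suid, name, path in samples
        if (cuid, suid) not in existing
    ]
    print(missing)  # [(10, 's2', 'B', '/data/b.mzML')]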
@@ -1334,7 +1379,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1475,7 +1520,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1576,7 +1621,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1650,3 +1695,4 @@ def _sample_color_reset_optimized(self):
     )
 
     self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
+
src/masster/study/plot.py
@@ -17,7 +17,7 @@ hv.extension("bokeh")
 from bokeh.layouts import row as bokeh_row
 
 
-def plot_alignment(self, maps: bool = True, filename: str | None = None, width: int = 450, height: int = 450, markersize: int = 3):
+def plot_alignment(self, maps: bool = True, samples: int | list[int | str] | None = None, filename: str | None = None, width: int = 450, height: int = 450, markersize: int = 3):
     """Visualize retention time alignment using two synchronized Bokeh scatter plots.
 
     - When ``maps=True`` the function reads ``self.features_maps`` (list of FeatureMap)
@@ -27,6 +27,11 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
 
     Parameters
     - maps: whether to use feature maps (default True).
+    - samples: Sample selection parameter, interpreted like in plot_samples_2d:
+        - None: show all samples
+        - int: show a random subset of N samples
+        - list of ints: show samples with these sample_uids
+        - list of strings: show samples with these sample_names
     - filename: optional HTML file path to save the plot.
     - width/height: pixel size of each subplot.
     - markersize: base marker size.
@@ -54,6 +59,32 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
         self.logger.error("No feature maps available for plotting.")
         return
 
+    # Get sample_uids to limit which samples to show
+    sample_uids_to_show = self._get_sample_uids(samples)
+
+    # Filter feature maps based on sample selection
+    if sample_uids_to_show is not None:
+        # Get sample indices for the selected sample_uids
+        selected_indices = []
+        if hasattr(self, 'samples_df') and self.samples_df is not None and not self.samples_df.is_empty():
+            samples_info = self.samples_df.to_pandas()
+            for idx, row in samples_info.iterrows():
+                if row.get('sample_uid') in sample_uids_to_show:
+                    selected_indices.append(idx)
+        else:
+            # If no samples_df, just limit to the first N samples
+            if isinstance(samples, int):
+                selected_indices = list(range(min(samples, len(fmaps))))
+            else:
+                selected_indices = list(range(len(fmaps)))
+
+        # Filter feature maps to only include selected indices
+        fmaps = [fmaps[i] for i in selected_indices if i < len(fmaps)]
+
+        if not fmaps:
+            self.logger.error("No feature maps match the selected samples.")
+            return
+
     # Reference (first) sample: use current RT for both before and after
     ref = fmaps[0]
     ref_rt = [f.getRT() for f in ref]
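
Assuming a Study with loaded feature maps, the new parameter is passed the same way as in plot_samples_2d (the study object and output file name below are illustrative):

    study.plot_alignment(samples=5)              # random subset of 5 samples
    study.plot_alignment(samples=[1, 2, 7])      # by sample_uid
    study.plot_alignment(
        samples=["QC_01", "QC_02"],              # by sample_name
        filename="alignment_qc.html",
    )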
@@ -143,6 +174,28 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
         self.logger.error("No sample identifier column found in features_df.")
         return
 
+    # Get sample_uids to limit which samples to show
+    sample_uids_to_show = self._get_sample_uids(samples)
+
+    # Filter features_df based on sample selection if specified
+    if sample_uids_to_show is not None:
+        if sample_col == 'sample_uid':
+            features_df = features_df.filter(pl.col('sample_uid').is_in(sample_uids_to_show))
+        else:
+            # Need to convert sample names to sample_uids if using sample_name column
+            if 'sample_uid' in features_df.columns:
+                # Filter by sample_uid even though we're using sample_name as the primary column
+                features_df = features_df.filter(pl.col('sample_uid').is_in(sample_uids_to_show))
+            else:
+                # Convert sample_uids to sample_names and filter
+                sample_names_to_show = []
+                if hasattr(self, 'samples_df') and self.samples_df is not None:
+                    for uid in sample_uids_to_show:
+                        matching_rows = self.samples_df.filter(pl.col("sample_uid") == uid)
+                        if not matching_rows.is_empty():
+                            sample_names_to_show.append(matching_rows.row(0, named=True)["sample_name"])
+                features_df = features_df.filter(pl.col('sample_name').is_in(sample_names_to_show))
+
     # Get unique samples using Polars
     samples = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
 
@@ -1649,11 +1702,19 @@ def plot_pca(
 
     self.logger.debug(f"Performing PCA on consensus matrix with shape: {consensus_matrix.shape}")
 
-    # Convert consensus matrix to numpy if it's not already
-    if hasattr(consensus_matrix, "values"):
+    # Convert consensus matrix to numpy - handle both Polars and pandas DataFrames
+    if hasattr(consensus_matrix, "to_numpy"):
+        # Polars or pandas DataFrame
+        if hasattr(consensus_matrix, "select"):
+            # Polars DataFrame - exclude the consensus_uid column
+            numeric_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
+            matrix_data = consensus_matrix.select(numeric_cols).to_numpy()
+        else:
+            # Pandas DataFrame
+            matrix_data = consensus_matrix.to_numpy()
+    elif hasattr(consensus_matrix, "values"):
+        # Pandas DataFrame
         matrix_data = consensus_matrix.values
-    elif hasattr(consensus_matrix, "to_numpy"):
-        matrix_data = consensus_matrix.to_numpy()
     else:
         matrix_data = np.array(consensus_matrix)
 
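The branch order matters here because Polars frames also expose to_numpy; the select check is what distinguishes them from pandas, which has no select method. A small sketch of the same duck typing (toy frames; the helper name is illustrative):

    import numpy as np
    import pandas as pd
    import polars as pl

    def to_matrix(frame):
        if hasattr(frame, "to_numpy"):
            if hasattr(frame, "select"):  # Polars: drop the id column first
                cols = [c for c in frame.columns if c != "consensus_uid"]
                return frame.select(cols).to_numpy()
            return frame.to_numpy()       # pandas
        return np.array(frame)            # plain sequences

    print(to_matrix(pl.DataFrame({"consensus_uid": [1], "s1": [2.0]})))  # [[2.]]
    print(to_matrix(pd.DataFrame({"s1": [2.0]})))                        # [[2.]]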
src/masster/study/processing.py
@@ -33,6 +33,7 @@ def align(self, **kwargs):
     - algo (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).
 
     KD algorithm specific parameters:
+    - min_samples (int): Minimum number of samples required for KD alignment.
     - nr_partitions (int): Number of partitions in m/z dimension.
     - warp_enabled (bool): Enable non-linear retention time transformation.
     - warp_rt_tol (float): RT tolerance for the LOWESS fit.
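
The documented keywords pass straight through align()'s **kwargs; a call taking the KD path with the defaults from align_def.py would look like this (the study object is assumed):

    study.align(
        algo="kd",
        min_samples=3,
        nr_partitions=100,
        warp_enabled=True,
        warp_rt_tol=5.0,
    )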
@@ -87,131 +88,17 @@ def align(self, **kwargs):
 
     fmaps = self.features_maps
 
-    # Initialize OpenMS parameters
-    params_oms = oms.Param()
-    # Choose alignment algorithm based on parameter
+    # Choose alignment algorithm
     algo = params.get("algo").lower()
-
-    # Set common parameters for both algorithms
-    if algo == "pc":
-        # Parameters specific to PoseClustering
-        params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
-        params_oms.setValue("pairfinder:ignore_charge", "true")
-        params_oms.setValue("max_num_peaks_considered", 1000)
-        params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
-        params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
-        params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
-        params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
-        params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
-        params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
-        params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
-
-        """
-        {b'max_num_peaks_considered': 1000,
-        b'superimposer:mz_pair_max_distance': 0.5,
-        b'superimposer:rt_pair_distance_fraction': 0.1,
-        b'superimposer:num_used_points': 2000,
-        b'superimposer:scaling_bucket_size': 0.005,
-        b'superimposer:shift_bucket_size': 3.0,
-        b'superimposer:max_shift': 1000.0,
-        b'superimposer:max_scaling': 2.0,
-        b'superimposer:dump_buckets': '',
-        b'superimposer:dump_pairs': '',
-        b'pairfinder:second_nearest_gap': 2.0,
-        b'pairfinder:use_identifications': 'false',
-        b'pairfinder:ignore_charge': 'false',
-        b'pairfinder:ignore_adduct': 'true',
-        b'pairfinder:distance_RT:max_difference': 100.0,
-        b'pairfinder:distance_RT:exponent': 1.0,
-        b'pairfinder:distance_RT:weight': 1.0,
-        b'pairfinder:distance_MZ:max_difference': 0.3,
-        b'pairfinder:distance_MZ:unit': 'Da',
-        b'pairfinder:distance_MZ:exponent': 2.0,
-        b'pairfinder:distance_MZ:weight': 1.0,
-        b'pairfinder:distance_intensity:exponent': 1.0,
-        b'pairfinder:distance_intensity:weight': 0.0,
-        b'pairfinder:distance_intensity:log_transform': 'disabled'}
-        """
-    elif algo == "kd":
-        # Parameters specific to KD algorithm
-        params_oms.setValue("mz_unit", "Da")
-        params_oms.setValue("nr_partitions", params.get("nr_partitions"))
-
-        # Warp parameters for non-linear RT transformation
-        params_oms.setValue("warp:enabled", "true" if params.get("warp_enabled") else "false")
-        params_oms.setValue("warp:rt_tol", params.get("warp_rt_tol"))
-        params_oms.setValue("warp:mz_tol", params.get("warp_mz_tol"))
-        params_oms.setValue("warp:max_pairwise_log_fc", params.get("warp_max_pairwise_log_fc"))
-        params_oms.setValue("warp:min_rel_cc_size", params.get("warp_min_rel_cc_size"))
-        params_oms.setValue("warp:max_nr_conflicts", params.get("warp_max_nr_conflicts"))
-
-        # Link parameters
-        params_oms.setValue("link:rt_tol", params.get("link_rt_tol"))
-        params_oms.setValue("link:mz_tol", params.get("link_mz_tol"))
-        params_oms.setValue("link:charge_merging", params.get("link_charge_merging"))
-        params_oms.setValue("link:adduct_merging", params.get("link_adduct_merging"))
-
-        # Distance parameters
-        params_oms.setValue("distance_RT:exponent", params.get("distance_RT_exponent"))
-        params_oms.setValue("distance_RT:weight", params.get("distance_RT_weight"))
-        params_oms.setValue("distance_MZ:exponent", params.get("distance_MZ_exponent"))
-        params_oms.setValue("distance_MZ:weight", params.get("distance_MZ_weight"))
-        params_oms.setValue("distance_intensity:exponent", params.get("distance_intensity_exponent"))
-        params_oms.setValue("distance_intensity:weight", params.get("distance_intensity_weight"))
-        params_oms.setValue("distance_intensity:log_transform", params.get("distance_intensity_log_transform"))
-
-        # LOWESS parameters
-        params_oms.setValue("LOWESS:span", params.get("LOWESS_span"))
-        params_oms.setValue("LOWESS:num_iterations", params.get("LOWESS_num_iterations"))
-        params_oms.setValue("LOWESS:delta", params.get("LOWESS_delta"))
-        params_oms.setValue("LOWESS:interpolation_type", params.get("LOWESS_interpolation_type"))
-        params_oms.setValue("LOWESS:extrapolation_type", params.get("LOWESS_extrapolation_type"))
-
+
     if algo == "pc":
-        aligner = oms.MapAlignmentAlgorithmPoseClustering()
-        self.logger.info("Starting alignment with PoseClustering")
-        # set ref_index to feature map index with largest number of features
-        ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
-        self.logger.debug(
-            f"Reference map is {self.samples_df.row(ref_index, named=True)['sample_name']}",
-        )
-        aligner.setParameters(params_oms)
-        aligner.setReference(fmaps[ref_index])
-        self.logger.debug(f"Parameters for alignment: {params}")
-        # perform alignment and transformation of feature maps to the reference map (exclude reference map)
-        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-        for index, fm in tqdm(
-            list(enumerate(fmaps)),
-            total=len(fmaps),
-            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
-            disable=tdqm_disable,
-        ):
-            if index == ref_index:
-                continue
-            if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
-                continue
-            trafo = oms.TransformationDescription()
-            aligner.align(fm, trafo)
-            transformer = oms.MapAlignmentTransformer()
-            transformer.transformRetentionTimes(fm, trafo, True)
-
-        self.alignment_ref_index = ref_index
-
+        _align_pose_clustering(self, fmaps, params)
+
     elif algo == "kd":
-        # KD algorithm requires num_maps and Param parameters
-        num_maps = len(fmaps)
-        aligner = oms.MapAlignmentAlgorithmKD(3, params_oms)
-        self.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
-
-        kdtree = oms.KDTreeFeatureMaps()
-        kdtree.addMaps(fmaps)  # Add all feature maps to the KDTree
-        # kdtree.optimizeTree()
-        aligner.addRTFitData(kdtree)
-        aligner.fitLOWESS()
-        aligner.transform(kdtree)
-
+        _align_kd_algorithm(self, fmaps, params)
     else:
         self.logger.error(f"Unknown alignment algorithm '{algo}'")
+        self.logger.error(f"Unknown alignment algorithm '{algo}'")
 
     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
@@ -1163,3 +1050,145 @@ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
         else:
             break
     return chrom.rt[idx]
+
+
+def _align_pose_clustering(study_obj, fmaps, params):
+    """Perform alignment using PoseClustering algorithm."""
+    import pyopenms as oms
+    from tqdm import tqdm
+    from datetime import datetime
+
+    # Create PC-specific OpenMS parameters
+    params_oms = oms.Param()
+    params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
+    params_oms.setValue("pairfinder:ignore_charge", "true")
+    params_oms.setValue("max_num_peaks_considered", 1000)
+    params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+    params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
+    params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
+    params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
+    params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
+    params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
+
+    aligner = oms.MapAlignmentAlgorithmPoseClustering()
+    study_obj.logger.info("Starting alignment with PoseClustering")
+
+    # Set ref_index to feature map index with largest number of features
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+    study_obj.logger.debug(
+        f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
+    )
+
+    aligner.setParameters(params_oms)
+    aligner.setReference(fmaps[ref_index])
+    study_obj.logger.debug(f"Parameters for alignment: {params}")
+
+    # Perform alignment and transformation of feature maps to the reference map (exclude reference map)
+    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
+    for index, fm in tqdm(
+        list(enumerate(fmaps)),
+        total=len(fmaps),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Align feature maps",
+        disable=tdqm_disable,
+    ):
+        if index == ref_index:
+            continue
+        if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
+            continue
+        trafo = oms.TransformationDescription()
+        aligner.align(fm, trafo)
+        transformer = oms.MapAlignmentTransformer()
+        transformer.transformRetentionTimes(fm, trafo, True)
+
+    study_obj.alignment_ref_index = ref_index
+
+
+def _align_kd_algorithm(study_obj, fmaps, params):
+    """Perform alignment using KD algorithm."""
+    import pyopenms as oms
+
+    num_maps = len(fmaps)
+    study_obj.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
+
+    try:
+        # Use the EXACT approach from test_oms.py that works
+        # First parameter is DIMENSIONS (3), not min_samples!
+        study_obj.logger.debug("Creating MapAlignmentAlgorithmKD with 3 dimensions and empty parameters...")
+        empty_params = oms.Param()  # Empty params - this is what worked in test_oms.py!
+        aligner = oms.MapAlignmentAlgorithmKD(3, empty_params)  # 3 = dimensions, not min_samples
+        study_obj.logger.debug("Created MapAlignmentAlgorithmKD successfully")
+
+        # Create KD-tree structure
+        kdtree = oms.KDTreeFeatureMaps()
+
+        # Set all required warping parameters based on OpenMS requirements
+        kd_params = oms.Param()
+        # Core warp parameters that OpenMS expects
+        kd_params.setValue(b"warp:min_rel_cc_size", 0.2, b"Minimum relative connected component size")
+        kd_params.setValue(b"warp:max_ratio_small_big", 0.5, b"Maximum ratio of small to big connected component")
+        kd_params.setValue(b"warp:min_score", 0.3, b"Minimum score for warping")
+        kd_params.setValue(b"warp:rt_tol", 5.0, b"RT tolerance for feature matching")
+        kd_params.setValue(b"warp:mz_tol", 0.015, b"m/z tolerance for feature matching")
+        # Additional potentially required parameters
+        kd_params.setValue(b"warp:max_shift", 30.0, b"Maximum RT shift allowed")
+        kd_params.setValue(b"warp:bins", 100, b"Number of bins for warping")
+        kdtree.setParameters(kd_params)
+
+        # Add all feature maps to KD-tree (NO limiting - this worked with 38k features!)
+        study_obj.logger.debug("Adding maps to KD-tree structure...")
+        kdtree.addMaps(fmaps)
+        study_obj.logger.debug("Successfully added maps to KD-tree")
+
+        # Add RT fitting data (this is where the magic happens)
+        study_obj.logger.debug("Adding RT fitting data to aligner...")
+        aligner.addRTFitData(kdtree)
+        study_obj.logger.debug("Successfully added RT fitting data")
+
+        # Perform LOWESS fitting
+        study_obj.logger.debug("Performing LOWESS fitting...")
+        aligner.fitLOWESS()
+        study_obj.logger.debug("Successfully completed LOWESS fitting")
+
+        # Apply transformations to feature maps
+        study_obj.logger.debug("Applying transformations to feature maps...")
+        for i, fmap in enumerate(fmaps):
+            trafo = oms.TransformationDescription()
+            aligner.getTransformation(i, trafo)
+            oms.MapAlignmentTransformer.transformRetentionTimes(fmap, trafo, True)
+
+        study_obj.logger.info("KD alignment completed successfully")
+
+    except Exception as e:
+        study_obj.logger.error(f"KD alignment failed with error: {e}")
+        study_obj.logger.info("Falling back to PoseClustering alignment...")
+
+        # Fallback to pose clustering with basic parameters
+        _align_pose_clustering_fallback(study_obj, fmaps, params)
+
+
+def _align_pose_clustering_fallback(study_obj, fmaps, params):
+    """Fallback PoseClustering alignment with minimal parameters."""
+    import pyopenms as oms
+
+    aligner = oms.MapAlignmentAlgorithmPoseClustering()
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+
+    # Set up basic parameters for pose clustering
+    pc_params = oms.Param()
+    pc_params.setValue("max_num_peaks_considered", 1000)
+    pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+
+    aligner.setParameters(pc_params)
+    aligner.setReference(fmaps[ref_index])
+
+    for index, fm in enumerate(fmaps):
+        if index == ref_index:
+            continue
+        trafo = oms.TransformationDescription()
+        aligner.align(fm, trafo)
+        transformer = oms.MapAlignmentTransformer()
+        transformer.transformRetentionTimes(fm, trafo, True)
+
+    study_obj.alignment_ref_index = ref_index
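
Stripped of logging and the fallback, the KD path is a short pyopenms sequence. The sketch below uses only API names that appear in the diff above, and, as the fallback handling suggests, may still need the warp/link parameters tuned before it runs on real data:

    import pyopenms as oms

    def kd_align(fmaps):
        # first constructor argument is the number of dimensions, per the comment above
        aligner = oms.MapAlignmentAlgorithmKD(3, oms.Param())
        kdtree = oms.KDTreeFeatureMaps()
        kdtree.addMaps(fmaps)           # index all feature maps
        aligner.addRTFitData(kdtree)    # collect RT anchor pairs
        aligner.fitLOWESS()             # fit the warp functions
        for i, fmap in enumerate(fmaps):
            trafo = oms.TransformationDescription()
            aligner.getTransformation(i, trafo)
            oms.MapAlignmentTransformer.transformRetentionTimes(fmap, trafo, True)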
uv.lock
@@ -1372,7 +1372,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.3.16"
+version = "0.3.18"
 source = { editable = "." }
 dependencies = [
     { name = "alphabase" },