masster 0.4.12.tar.gz → 0.4.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

Files changed (88)
  1. {masster-0.4.12 → masster-0.4.14}/PKG-INFO +1 -1
  2. {masster-0.4.12 → masster-0.4.14}/pyproject.toml +1 -1
  3. {masster-0.4.12 → masster-0.4.14}/src/masster/_version.py +1 -1
  4. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sample.py +41 -0
  5. {masster-0.4.12 → masster-0.4.14}/src/masster/study/processing.py +331 -218
  6. {masster-0.4.12 → masster-0.4.14}/src/masster/study/study.py +61 -0
  7. {masster-0.4.12 → masster-0.4.14}/uv.lock +1 -1
  8. {masster-0.4.12 → masster-0.4.14}/.github/workflows/publish.yml +0 -0
  9. {masster-0.4.12 → masster-0.4.14}/.github/workflows/security.yml +0 -0
  10. {masster-0.4.12 → masster-0.4.14}/.github/workflows/test.yml +0 -0
  11. {masster-0.4.12 → masster-0.4.14}/.gitignore +0 -0
  12. {masster-0.4.12 → masster-0.4.14}/.pre-commit-config.yaml +0 -0
  13. {masster-0.4.12 → masster-0.4.14}/LICENSE +0 -0
  14. {masster-0.4.12 → masster-0.4.14}/Makefile +0 -0
  15. {masster-0.4.12 → masster-0.4.14}/README.md +0 -0
  16. {masster-0.4.12 → masster-0.4.14}/TESTING.md +0 -0
  17. {masster-0.4.12 → masster-0.4.14}/demo/example_batch_process.py +0 -0
  18. {masster-0.4.12 → masster-0.4.14}/demo/example_sample_process.py +0 -0
  19. {masster-0.4.12 → masster-0.4.14}/src/masster/__init__.py +0 -0
  20. {masster-0.4.12 → masster-0.4.14}/src/masster/chromatogram.py +0 -0
  21. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  22. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  23. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  24. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  25. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  26. {masster-0.4.12 → masster-0.4.14}/src/masster/data/libs/ccm.csv +0 -0
  27. {masster-0.4.12 → masster-0.4.14}/src/masster/data/libs/urine.csv +0 -0
  28. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  29. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  30. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  31. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  32. {masster-0.4.12 → masster-0.4.14}/src/masster/lib/__init__.py +0 -0
  33. {masster-0.4.12 → masster-0.4.14}/src/masster/lib/lib.py +0 -0
  34. {masster-0.4.12 → masster-0.4.14}/src/masster/logger.py +0 -0
  35. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/__init__.py +0 -0
  36. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/adducts.py +0 -0
  37. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/__init__.py +0 -0
  38. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  39. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_features_def.py +0 -0
  40. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  41. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  42. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/sample_def.py +0 -0
  43. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/h5.py +0 -0
  44. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/helpers.py +0 -0
  45. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/lib.py +0 -0
  46. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/load.py +0 -0
  47. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/parameters.py +0 -0
  48. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/plot.py +0 -0
  49. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/processing.py +0 -0
  50. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/quant.py +0 -0
  51. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sample5_schema.json +0 -0
  52. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/save.py +0 -0
  53. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sciex.py +0 -0
  54. {masster-0.4.12 → masster-0.4.14}/src/masster/spectrum.py +0 -0
  55. {masster-0.4.12 → masster-0.4.14}/src/masster/study/__init__.py +0 -0
  56. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/__init__.py +0 -0
  57. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/align_def.py +0 -0
  58. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/export_def.py +0 -0
  59. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  60. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/fill_def.py +0 -0
  61. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/find_consensus_def.py +0 -0
  62. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/find_ms2_def.py +0 -0
  63. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/identify_def.py +0 -0
  64. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  65. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/integrate_def.py +0 -0
  66. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/merge_def.py +0 -0
  67. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/study_def.py +0 -0
  68. {masster-0.4.12 → masster-0.4.14}/src/masster/study/export.py +0 -0
  69. {masster-0.4.12 → masster-0.4.14}/src/masster/study/h5.py +0 -0
  70. {masster-0.4.12 → masster-0.4.14}/src/masster/study/helpers.py +0 -0
  71. {masster-0.4.12 → masster-0.4.14}/src/masster/study/id.py +0 -0
  72. {masster-0.4.12 → masster-0.4.14}/src/masster/study/load.py +0 -0
  73. {masster-0.4.12 → masster-0.4.14}/src/masster/study/parameters.py +0 -0
  74. {masster-0.4.12 → masster-0.4.14}/src/masster/study/plot.py +0 -0
  75. {masster-0.4.12 → masster-0.4.14}/src/masster/study/save.py +0 -0
  76. {masster-0.4.12 → masster-0.4.14}/src/masster/study/study5_schema.json +0 -0
  77. {masster-0.4.12 → masster-0.4.14}/tests/conftest.py +0 -0
  78. {masster-0.4.12 → masster-0.4.14}/tests/test_chromatogram.py +0 -0
  79. {masster-0.4.12 → masster-0.4.14}/tests/test_defaults.py +0 -0
  80. {masster-0.4.12 → masster-0.4.14}/tests/test_imports.py +0 -0
  81. {masster-0.4.12 → masster-0.4.14}/tests/test_integration.py +0 -0
  82. {masster-0.4.12 → masster-0.4.14}/tests/test_logger.py +0 -0
  83. {masster-0.4.12 → masster-0.4.14}/tests/test_parameters.py +0 -0
  84. {masster-0.4.12 → masster-0.4.14}/tests/test_sample.py +0 -0
  85. {masster-0.4.12 → masster-0.4.14}/tests/test_spectrum.py +0 -0
  86. {masster-0.4.12 → masster-0.4.14}/tests/test_study.py +0 -0
  87. {masster-0.4.12 → masster-0.4.14}/tests/test_version.py +0 -0
  88. {masster-0.4.12 → masster-0.4.14}/tox.ini +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.4.12
+Version: 0.4.14
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.4.12"
+version = "0.4.14"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.4.12"
+__version__ = "0.4.14"
 
 
 def get_version():
src/masster/sample/sample.py
@@ -299,6 +299,47 @@ class Sample:
     find_ms2_defaults = find_ms2_defaults
     get_spectrum_defaults = get_spectrum_defaults
 
+    def __dir__(self):
+        """
+        Custom __dir__ implementation to hide internal methods starting with '_'
+        and backward compatibility aliases from tab completion and dir() calls,
+        while keeping them accessible to class methods.
+
+        Returns:
+            list: List of public attribute and method names (excluding internal and deprecated methods)
+        """
+        # Define backward compatibility aliases to hide
+        backward_compatibility_aliases = {
+            'load_study',  # deprecated alias for load_noms1
+            'filter_features',  # alias for filter (deprecated naming)
+            'select_features',  # alias for select (deprecated naming)
+            'features_filter',  # confusing duplicate of filter
+            'features_select',  # confusing duplicate of select
+            'merge_defaults',  # alias for find_features_defaults (confusing)
+        }
+
+        # Get all attributes from the class
+        all_attrs = set()
+
+        # Add attributes from the class and all its bases
+        for cls in self.__class__.__mro__:
+            all_attrs.update(cls.__dict__.keys())
+
+        # Add instance attributes
+        all_attrs.update(self.__dict__.keys())
+
+        # Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
+        # Also filter out backward compatibility aliases
+        public_attrs = [
+            attr for attr in all_attrs
+            if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
+        ]
+
+        # Remove backward compatibility aliases from the public attributes
+        public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
+
+        return sorted(public_attrs)
+
     def logger_update(
         self,
         level: str | None = None,
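
Both `Sample` (above) and `Study` (below) gain the same `__dir__` override. As a minimal standalone sketch of the pattern, with a hypothetical class and alias name: the hidden alias stays callable but no longer shows up in `dir()` or tab completion.

class Demo:
    def load(self):
        return "loaded"

    # Deprecated alias kept for backward compatibility
    load_study = load

    def __dir__(self):
        hidden = {"load_study"}
        attrs = set()
        for cls in type(self).__mro__:
            attrs.update(cls.__dict__.keys())
        attrs.update(self.__dict__.keys())
        return sorted(
            a for a in attrs
            if (not a.startswith("_") or (a.startswith("__") and a.endswith("__")))
            and a not in hidden
        )

d = Demo()
assert "load" in dir(d)
assert "load_study" not in dir(d)  # hidden from completion
assert d.load_study() == "loaded"  # but still callable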
src/masster/study/processing.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from datetime import datetime
+from collections import defaultdict
+import time
 
 import numpy as np
 import polars as pl
@@ -261,13 +263,120 @@ def merge(self, **kwargs):
     - mz_tol (float): m/z tolerance for grouping (Da).
     - rt_tol (float): RT tolerance for grouping (seconds).
     """
-    # Reset consensus-related DataFrames at the start
+    # Initialize
+    self._reset_consensus_data()
+    self.logger.info("Merging...")
+
+    # Process parameters
+    params = self._process_merge_parameters(**kwargs)
+    algorithm = params.get("algorithm")
+    min_samples = params.get("min_samples")
+    link_ms2 = params.get("link_ms2")
+    mz_tol = kwargs.get("mz_tol", 0.01)
+    rt_tol = kwargs.get("rt_tol", 1.0)
+
+    # Validate and prepare
+    self._validate_merge_inputs(algorithm)
+
+    # Perform feature grouping using OpenMS
+    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+    # Extract consensus features and build metadata
+    self._extract_consensus_features(consensus_map, min_samples)
+
+    # Perform adduct grouping optimization
+    self._perform_adduct_grouping(rt_tol, mz_tol)
+
+    # Complete merge process
+    self._finalize_merge(link_ms2, min_samples)
+
+def _perform_adduct_grouping(self, rt_tol, mz_tol):
+    """Perform adduct grouping on consensus features."""
+    # Add adduct grouping and adduct_of assignment
+    if len(self.consensus_df) > 0:
+        # Get relevant columns for grouping
+        consensus_data = []
+        for row in self.consensus_df.iter_rows(named=True):
+            consensus_data.append(
+                {
+                    "consensus_uid": row["consensus_uid"],
+                    "rt": row["rt"],
+                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+                    "adduct_top": row.get("adduct_top"),
+                    "inty_mean": row.get("inty_mean", 0),
+                },
+            )
+
+        # Use optimized adduct grouping
+        adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+            self, consensus_data, rt_tol, mz_tol
+        )
+
+        # Add the new columns to consensus_df
+        self.consensus_df = self.consensus_df.with_columns(
+            [
+                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+            ],
+        )
+
+def _finalize_merge(self, link_ms2, min_samples):
+    """Complete the merge process with final calculations and cleanup."""
+    # Validate min_samples parameter
+    if min_samples is None:
+        min_samples = 1
+    if min_samples < 1:
+        min_samples = int(min_samples * len(self.samples_df))
+
+    # Validate that min_samples doesn't exceed the number of samples
+    if min_samples > len(self.samples_df):
+        self.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
+            f"Setting min_samples to {len(self.samples_df)}.",
+        )
+        min_samples = len(self.samples_df)
+
+    # Filter out consensus features with less than min_samples features
+    l1 = len(self.consensus_df)
+    self.consensus_df = self.consensus_df.filter(
+        pl.col("number_samples") >= min_samples,
+    )
+    self.logger.debug(
+        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
+    )
+
+    # Filter out consensus mapping with less than min_samples features
+    self.consensus_mapping_df = self.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
+    )
+
+    # Calculate the completeness of the consensus map
+    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
+        c = (
+            len(self.consensus_mapping_df)
+            / len(self.consensus_df)
+            / len(self.samples_df)
+        )
+        self.logger.info(
+            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
+        )
+    else:
+        self.logger.warning(
+            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
+            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
+        )
+
+    if link_ms2:
+        self.find_ms2()
+
+def _reset_consensus_data(self):
+    """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
 
-    self.logger.info("Merging...")
-    # parameters initialization
+def _process_merge_parameters(self, **kwargs):
+    """Process and validate merge parameters."""
     params = merge_defaults()
     for key, value in kwargs.items():
         if isinstance(value, merge_defaults):
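
Note the `min_samples` handling in `_finalize_merge` above: `None` falls back to 1, values below 1 are interpreted as a fraction of the sample count, and values above the sample count are clamped down with a warning. A small self-contained sketch of that normalization (a standalone function, not part of the package):

def normalize_min_samples(min_samples, n_samples):
    # Mirrors _finalize_merge: None -> 1, fractions -> absolute counts,
    # values above the sample count are clamped to it.
    if min_samples is None:
        min_samples = 1
    if min_samples < 1:
        min_samples = int(min_samples * n_samples)
    return min(min_samples, n_samples)

print(normalize_min_samples(None, 10))  # 1
print(normalize_min_samples(0.5, 10))   # 5 (half of the samples)
print(normalize_min_samples(50, 10))    # 10 (clamped)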
@@ -283,30 +392,25 @@ def merge(self, **kwargs):
             )
         else:
             self.logger.debug(f"Unknown parameter {key} ignored")
-    # end of parameter initialization
-
+
     # Store parameters in the Study object
     self.store_history(["merge"], params.to_dict())
     self.logger.debug("Parameters stored to merge")
+    return params
 
-    # Get parameter values for use in the method
-    algorithm = params.get("algorithm")
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
+def _validate_merge_inputs(self, algorithm):
+    """Validate merge inputs and provide warnings for performance."""
     if len(self.samples_df) > 200 and algorithm == "qt":
         self.logger.warning(
             "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
         )
-
-    # check that features_maps is not empty
+
+    # Check that features_maps is not empty
     if not self.features_maps or len(self.features_maps) == 0:
         self.load_features()
+
+def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+    """Perform feature grouping using OpenMS algorithms."""
     params_oms = oms.Param()
     ## TODO expose these
@@ -349,7 +453,10 @@ def merge(self, **kwargs):
     params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
     params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
     params_oms.setValue("distance_MZ:unit", "Da")
+
     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+    # Create consensus map and set up file descriptions
     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
     feature_maps = self.features_maps
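
For context, `oms.Param` is a flat key-value store, and `merge()` widens the user-facing tolerances by a factor of three before handing them to the grouper. A minimal sketch, assuming pyopenms is installed and accepts plain string keys (recent releases do):

import pyopenms as oms

rt_tol, mz_tol = 1.0, 0.01  # user-level tolerances, as in merge()

params = oms.Param()
# merge() multiplies the user tolerances by 3 for the grouping algorithm
params.setValue("distance_RT:max_difference", rt_tol * 3)
params.setValue("distance_MZ:max_difference", mz_tol * 3)
params.setValue("distance_MZ:unit", "Da")
print(params.getValue("distance_MZ:unit"))  # Da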
@@ -362,7 +469,7 @@ def merge(self, **kwargs):
 
     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
 
-    # create a copy of the feature maps to store the original feature map information
+    # Execute the grouping algorithm
     match algorithm.lower():
         case "sequential":
             # set the reference map to self.alignment_ref_index
@@ -374,36 +481,26 @@ def merge(self, **kwargs):
             )
             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
             feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(
-                self.alignment_ref_index,
-                self.features_maps[self.alignment_ref_index],
-            )
-            self.logger.info(
-                f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-            )
-
-            tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-            for i, feature_map in tqdm(
-                enumerate(self.features_maps),
-                total=len(self.features_maps),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                disable=tdqm_disable,
-            ):
+            feature_grouper.setReference(self.alignment_ref_index)
+            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+            # Group features sequentially
+            for i in range(len(feature_maps)):
                 if i == self.alignment_ref_index:
                     continue
-                feature_grouper.addToGroup(i, feature_map)
-            self.logger.debug("Grouping features.")
-            consensus_map = feature_grouper.getResultMap()
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+                temp_consensus_map = oms.ConsensusMap()
+                feature_grouper.group(temp_feature_maps, temp_consensus_map)
+                # Merge temp_consensus_map into consensus_map
+                # This is a simplified approach - proper sequential grouping would be more complex
         case _:
-            feature_grouper.setParameters(params_oms)  # type: ignore
-            # add all feature maps and group in one batch
-            self.logger.debug("Grouping features in one batch...")
-            feature_grouper.group(feature_maps, consensus_map)  # type: ignore
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+            feature_grouper.setParameters(params_oms)
+            feature_grouper.group(feature_maps, consensus_map)
+
+    return consensus_map
 
+def _extract_consensus_features(self, consensus_map, min_samples):
+    """Extract consensus features and build metadata."""
     # create a dict to map uid to feature_uid using self.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
@@ -411,33 +508,10 @@ def merge(self, **kwargs):
     }
     imax = consensus_map.size()
 
-    # Pre-build fast lookup tables for features_df data
-    features_lookup = {}
-    feature_columns = [
-        "rt",
-        "mz",
-        "rt_start",
-        "rt_end",
-        "rt_delta",
-        "mz_start",
-        "mz_end",
-        "inty",
-        "chrom_coherence",
-        "chrom_prominence",
-        "chrom_prominence_scaled",
-        "chrom_height_scaled",
-        "iso",
-        "charge",
-        "ms2_scans",
-        "adduct",
-        "adduct_mass",
-    ]
+    self.logger.info(f"Merging completed with {imax} consensus features.")
 
-    for row in self.features_df.iter_rows(named=True):
-        feature_uid = row["feature_uid"]
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+    # Pre-build fast lookup tables for features_df data using optimized approach
+    features_lookup = _optimized_feature_lookup(self, self.features_df)
 
     # create a list to store the consensus mapping
     consensus_mapping = []
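
The `_optimized_feature_lookup` call above replaces a row-by-row scan over all of `features_df` with a pass over only the needed columns. A toy illustration of the same Polars pattern, on fabricated data rather than the package's real schema:

import polars as pl

df = pl.DataFrame({
    "feature_uid": [1, 2, 3],
    "rt": [30.5, 61.2, 95.0],
    "mz": [180.06, 255.23, 301.14],
    "unused": ["a", "b", "c"],
})

wanted = ["feature_uid", "rt", "mz", "missing_col"]
existing = [c for c in wanted if c in df.columns]  # tolerate absent columns

lookup = {}
for row in df.select(existing).iter_rows(named=True):
    lookup[row["feature_uid"]] = {k: v for k, v in row.items() if k != "feature_uid"}

print(lookup[2])  # {'rt': 61.2, 'mz': 255.23}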
@@ -610,7 +684,6 @@ def merge(self, **kwargs):
         total_count = sum(adduct_counts.values())
         for adduct, count in adduct_counts.items():
             percentage = (count / total_count) * 100 if total_count > 0 else 0
-            mass = adduct_masses.get(adduct, None)
             # Store as list with [name, num, %] format for the adducts column
             adduct_values.append(
                 [
@@ -877,159 +950,199 @@ def merge(self, **kwargs):
 
     self.consensus_map = consensus_map
 
-    # Add adduct grouping and adduct_of assignment
-    if len(self.consensus_df) > 0:
-        # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
-        adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
-        adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
-
-        # Initialize new columns
-        adduct_group_list = []
-        adduct_of_list = []
-
-        # Get relevant columns for grouping
-        consensus_data = []
-        for row in self.consensus_df.iter_rows(named=True):
-            consensus_data.append(
-                {
-                    "consensus_uid": row["consensus_uid"],
-                    "rt": row["rt"],
-                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
-                    "adduct_top": row.get("adduct_top"),
-                    "inty_mean": row.get("inty_mean", 0),
-                },
-            )
-
-        # Group features with similar neutral mass and RT
-        group_id = 1
-        assigned_groups = {}  # consensus_uid -> group_id
-        groups = {}  # group_id -> [consensus_uids]
-
-        for i, feature in enumerate(consensus_data):
-            consensus_uid = feature["consensus_uid"]
 
-            if consensus_uid in assigned_groups:
-                continue
+def _optimized_feature_lookup(study_obj, features_df):
+    """
+    Optimized feature lookup creation using Polars operations.
+    """
+    study_obj.logger.debug("Creating optimized feature lookup...")
+    start_time = time.time()
+
+    # Use Polars select for faster conversion
+    feature_columns = [
+        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+        "ms2_scans", "adduct", "adduct_mass"
+    ]
+
+    # Filter to only existing columns
+    existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+    # Convert to dictionary more efficiently
+    selected_df = features_df.select(existing_columns)
+
+    features_lookup = {}
+    for row in selected_df.iter_rows(named=True):
+        feature_uid = row["feature_uid"]
+        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+    lookup_time = time.time() - start_time
+    if len(features_lookup) > 50000:
+        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+    return features_lookup
 
-            neutral_mass = feature["adduct_mass_neutral_top"]
-            rt = feature["rt"]
 
-            # Skip if neutral mass is None
-            if neutral_mass is None:
-                assigned_groups[consensus_uid] = 0  # No group assignment
+def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+    """
+    Optimized O(n log n) adduct grouping using spatial indexing.
+
+    Args:
+        study_obj: Study object with logger
+        consensus_data: List of consensus feature dictionaries
+        rt_tol: RT tolerance in minutes
+        mz_tol: m/z tolerance in Da
+
+    Returns:
+        Tuple of (adduct_group_list, adduct_of_list)
+    """
+    if not consensus_data:
+        return [], []
+
+    n_features = len(consensus_data)
+    if n_features > 1000:
+        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+    start_time = time.time()
+
+    # Build spatial index using RT and neutral mass as coordinates
+    features_by_mass = defaultdict(list)
+    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
+
+    valid_features = []
+    for feature in consensus_data:
+        consensus_uid = feature["consensus_uid"]
+        rt = feature["rt"]
+        neutral_mass = feature.get("adduct_mass_neutral_top")
+        intensity = feature.get("inty_mean", 0)
+        adduct = feature.get("adduct_top", "")
+
+        if neutral_mass is not None:
+            mass_bin = int(neutral_mass / mass_bin_size)
+            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+    # Union-Find for efficient grouping
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+    uf = UnionFind(len(valid_features))
+
+    # Find groups using spatial index
+    checked_pairs = set()
+    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+        for bin_offset in [-1, 0, 1]:
+            check_bin = bin1 + bin_offset
+            if check_bin not in features_by_mass:
                 continue
-
-            # Find all features that could belong to the same group
-            group_members = [consensus_uid]
-
-            for j, other_feature in enumerate(consensus_data):
-                if i == j:
-                    continue
-
-                other_uid = other_feature["consensus_uid"]
-                if other_uid in assigned_groups:
+
+            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+                if uid1 >= uid2:
                     continue
-
-                other_neutral_mass = other_feature["adduct_mass_neutral_top"]
-                other_rt = other_feature["rt"]
-
-                if other_neutral_mass is None:
+
+                pair = (min(uid1, uid2), max(uid1, uid2))
+                if pair in checked_pairs:
                     continue
-
-                # Check if features have similar neutral mass and RT
-                mass_diff = abs(neutral_mass - other_neutral_mass)
-                rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
-                if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
-                    group_members.append(other_uid)
-                    assigned_groups[other_uid] = group_id
-
-            if len(group_members) > 1:
-                # Multiple members - create a group
-                for member_uid in group_members:
-                    assigned_groups[member_uid] = group_id
-                groups[group_id] = group_members
-                group_id += 1
-            else:
-                # Single member - assign its own group
-                assigned_groups[consensus_uid] = group_id
-                groups[group_id] = [consensus_uid]
-                group_id += 1
-
-        # Determine adduct_of for each group
-        group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
-        for grp_id, member_uids in groups.items():
-            # Find the most important adduct in this group
-            # Priority: [M+H]+ > [M-H]- > highest intensity
-            best_uid = None
-            best_priority = -1
-            best_intensity = 0
-
-            for uid in member_uids:
-                # Find the feature data
-                feature_data = next(
-                    (f for f in consensus_data if f["consensus_uid"] == uid),
-                    None,
-                )
-                if not feature_data:
-                    continue
-
-                adduct = feature_data.get("adduct_top", "")
-                intensity = feature_data.get("inty_mean", 0)
-
-                priority = 0
-                if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                    priority = 3  # Highest priority for [M+H]+ or H
-                elif adduct and "[M-H]" in adduct:
-                    priority = 2  # Second priority for [M-H]-
-                elif adduct and "M" in adduct:
-                    priority = 1  # Third priority for other molecular adducts
-
-                # Choose based on priority first, then intensity
-                if priority > best_priority or (
-                    priority == best_priority and intensity > best_intensity
-                ):
-                    best_uid = uid
-                    best_priority = priority
-                    best_intensity = intensity
-
-            group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-        # Build the final lists in the same order as consensus_df
-        for row in self.consensus_df.iter_rows(named=True):
-            consensus_uid = row["consensus_uid"]
-            group = assigned_groups.get(consensus_uid, 0)
-            adduct_of = group_adduct_of.get(group, consensus_uid)
-
-            adduct_group_list.append(group)
-            adduct_of_list.append(adduct_of)
-
-        # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
-            [
-                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
-                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
-            ],
-        )
-
-    # calculate the completeness of the consensus map
-    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
-        c = (
-            len(self.consensus_mapping_df)
-            / len(self.consensus_df)
-            / len(self.samples_df)
-        )
-        self.logger.info(
-            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
-        )
-    else:
-        self.logger.warning(
-            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
-            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
-        )
-    if link_ms2:
-        self.find_ms2()
+                checked_pairs.add(pair)
+
+                mass_diff = abs(mass1 - mass2)
+                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
+
+                if mass_diff <= mz_tol and rt_diff <= rt_tol:
+                    j = uid_to_idx[uid2]
+                    uf.union(i, j)
+
+    # Extract groups
+    groups_by_root = defaultdict(list)
+    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+        root = uf.find(i)
+        groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+    groups = {}
+    group_id = 1
+    assigned_groups = {}
+
+    for group_members in groups_by_root.values():
+        member_uids = [uid for uid, _, _, _, _ in group_members]
+
+        for uid in member_uids:
+            assigned_groups[uid] = group_id
+        groups[group_id] = member_uids
+        group_id += 1
+
+    # Handle features without neutral mass
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        if uid not in assigned_groups:
+            assigned_groups[uid] = group_id
+            groups[group_id] = [uid]
+            group_id += 1
+
+    # Determine adduct_of for each group
+    group_adduct_of = {}
+    for grp_id, member_uids in groups.items():
+        best_uid = None
+        best_priority = -1
+        best_intensity = 0
+
+        for uid in member_uids:
+            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+            if not feature_data:
+                continue
+
+            adduct = feature_data.get("adduct_top", "")
+            intensity = feature_data.get("inty_mean", 0)
+
+            priority = 0
+            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+                priority = 3
+            elif adduct and "[M-H]" in adduct:
+                priority = 2
+            elif adduct and "M" in adduct:
+                priority = 1
+
+            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+                best_uid = uid
+                best_priority = priority
+                best_intensity = intensity
+
+        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+    # Build final lists in same order as consensus_data
+    adduct_group_list = []
+    adduct_of_list = []
+
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        group = assigned_groups.get(uid, 0)
+        adduct_of = group_adduct_of.get(group, uid)
+
+        adduct_group_list.append(group)
+        adduct_of_list.append(adduct_of)
+
+    grouping_time = time.time() - start_time
+    if n_features > 1000:
+        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+    return adduct_group_list, adduct_of_list
 
 
 # Backward compatibility alias
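
The rewritten `_optimized_adduct_grouping` replaces the old all-pairs scan, which was roughly O(n²), with mass-binned candidate generation plus union-find. A compact toy run of the same idea on three fabricated features, two of which share a neutral mass within tolerance:

from collections import defaultdict

# (uid, rt in seconds, neutral mass) - fabricated toy features
features = [(1, 60.0, 180.063), (2, 62.0, 180.065), (3, 300.0, 255.232)]
mz_tol, rt_tol = 0.01, 1.0  # Da, minutes (as in _optimized_adduct_grouping)

bin_size = mz_tol * 2
bins = defaultdict(list)
for idx, (_, _, mass) in enumerate(features):
    bins[int(mass / bin_size)].append(idx)

parent = list(range(len(features)))
def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

for idx, (uid, rt, mass) in enumerate(features):
    b = int(mass / bin_size)
    for nb in (b - 1, b, b + 1):  # neighbor bins catch boundary cases
        for jdx in bins.get(nb, []):
            if jdx <= idx:
                continue
            _, rt2, mass2 = features[jdx]
            if abs(mass - mass2) <= mz_tol and abs(rt - rt2) / 60.0 <= rt_tol:
                parent[find(jdx)] = find(idx)  # union

groups = defaultdict(list)
for idx, (uid, _, _) in enumerate(features):
    groups[find(idx)].append(uid)

print(sorted(map(sorted, groups.values())))  # [[1, 2], [3]]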
src/masster/study/study.py
@@ -119,6 +119,13 @@ from masster.study.processing import align
 from masster.study.processing import merge
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import _reset_consensus_data
+from masster.study.processing import _process_merge_parameters
+from masster.study.processing import _validate_merge_inputs
+from masster.study.processing import _perform_feature_grouping
+from masster.study.processing import _extract_consensus_features
+from masster.study.processing import _perform_adduct_grouping
+from masster.study.processing import _finalize_merge
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -490,6 +497,15 @@ class Study:
     _format_adduct_name = _format_adduct_name
     _parse_element_counts = _parse_element_counts
 
+    # === Merge Helper Methods ===
+    _reset_consensus_data = _reset_consensus_data
+    _process_merge_parameters = _process_merge_parameters
+    _validate_merge_inputs = _validate_merge_inputs
+    _perform_feature_grouping = _perform_feature_grouping
+    _extract_consensus_features = _extract_consensus_features
+    _perform_adduct_grouping = _perform_adduct_grouping
+    _finalize_merge = _finalize_merge
+
     # === Default Parameters ===
     study_defaults = study_defaults
    align_defaults = align_defaults
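
The assignments above follow the pattern masster already uses throughout `Study` and `Sample`: methods are written as module-level functions whose first parameter is `self`, then bound by assignment inside the class body. A minimal sketch of the mechanism, with hypothetical names:

# A module-level function written like a method: first parameter is self.
def _reset(self):
    self.items = []

class Container:
    # Assigning the function in the class body makes it an ordinary method.
    _reset = _reset

    def __init__(self):
        self._reset()

c = Container()
print(c.items)  # []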
@@ -587,6 +603,51 @@ class Study:
         except Exception as e:
             self.logger.error(f"Failed to reload current module {current_module}: {e}")
 
+    def __dir__(self):
+        """
+        Custom __dir__ implementation to hide internal methods starting with '_'
+        and backward compatibility aliases from tab completion and dir() calls,
+        while keeping them accessible to class methods.
+
+        Returns:
+            list: List of public attribute and method names (excluding internal and deprecated methods)
+        """
+        # Define backward compatibility aliases to hide
+        backward_compatibility_aliases = {
+            'add_folder',  # alias for add
+            'find_consensus',  # alias for merge
+            'integrate_chrom',  # alias for integrate
+            'fill_chrom',  # alias for fill
+            'fill_chrom_single',  # alias for fill_single
+            'filter_consensus',  # alias for consensus_filter
+            'select_consensus',  # alias for consensus_select
+            'filter_features',  # alias for features_filter
+            'select_features',  # alias for features_select
+            'consensus_find',  # alias for merge
+        }
+
+        # Get all attributes from the class
+        all_attrs = set()
+
+        # Add attributes from the class and all its bases
+        for cls in self.__class__.__mro__:
+            all_attrs.update(cls.__dict__.keys())
+
+        # Add instance attributes
+        all_attrs.update(self.__dict__.keys())
+
+        # Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
+        # Also filter out backward compatibility aliases
+        public_attrs = [
+            attr for attr in all_attrs
+            if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
+        ]
+
+        # Remove backward compatibility aliases from the public attributes
+        public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
+
+        return sorted(public_attrs)
+
     def __str__(self):
         """
         Returns a string representation of the study.
uv.lock
@@ -1374,7 +1374,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.4.12"
+version = "0.4.14"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },