masster 0.4.18__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. See the advisory details on the package registry page for more information.

masster/__init__.py CHANGED
@@ -27,5 +27,4 @@ __all__ = [
27
27
  "Study",
28
28
  "Wizard",
29
29
  "__version__",
30
- # "get_version",
31
30
  ]
masster/_version.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.4.18"
4
+ __version__ = "0.4.20"
5
5
 
6
6
 
7
7
  def get_version():
masster/logger.py CHANGED
@@ -55,6 +55,9 @@ class MassterLogger:
55
55
  # Convert string sink to actual object
56
56
  if sink == "sys.stdout" or sink is None:
57
57
  self.sink = sys.stdout
58
+ elif isinstance(sink, str) and sink != "sys.stdout":
59
+ # If it's a file path string, open the file for writing
60
+ self.sink = open(sink, "a", encoding="utf-8")
58
61
  else:
59
62
  self.sink = sink
60
63
 
@@ -67,6 +70,21 @@ class MassterLogger:
67
70
  # Remove any existing handlers to prevent duplicates
68
71
  if self.logger_instance.hasHandlers():
69
72
  self.logger_instance.handlers.clear()
73
+
74
+ # Also ensure no duplicate handlers on parent loggers
75
+ parent = self.logger_instance.parent
76
+ while parent:
77
+ if parent.name == "masster" and parent.hasHandlers():
78
+ # Remove duplicate handlers from masster parent logger
79
+ unique_handlers = []
80
+ handler_types = set()
81
+ for handler in parent.handlers:
82
+ handler_type = type(handler)
83
+ if handler_type not in handler_types:
84
+ unique_handlers.append(handler)
85
+ handler_types.add(handler_type)
86
+ parent.handlers = unique_handlers
87
+ parent = parent.parent
70
88
 
71
89
  self.logger_instance.setLevel(getattr(logging, self.level))
72
90
 
@@ -129,6 +147,17 @@ class MassterLogger:
129
147
 
130
148
  # Prevent propagation to avoid duplicate messages
131
149
  self.logger_instance.propagate = False
150
+
151
+ # Additional fix: ensure no duplicate handlers in the entire logging hierarchy
152
+ masster_logger = logging.getLogger("masster")
153
+ if masster_logger.hasHandlers():
154
+ # Keep only one handler per type
155
+ unique_handlers = {}
156
+ for handler in masster_logger.handlers:
157
+ handler_key = (type(handler).__name__, getattr(handler, 'stream', None))
158
+ if handler_key not in unique_handlers:
159
+ unique_handlers[handler_key] = handler
160
+ masster_logger.handlers = list(unique_handlers.values())
132
161
 
133
162
  def update_level(self, level: str):
134
163
  """Update the logging level."""
@@ -326,7 +355,20 @@ class MassterLogger:
326
355
  """Remove this logger's handler."""
327
356
  if self.handler:
328
357
  self.logger_instance.removeHandler(self.handler)
358
+ # Close the file handle if it's not stdout
359
+ if hasattr(self.sink, 'close') and self.sink != sys.stdout:
360
+ try:
361
+ self.sink.close()
362
+ except Exception:
363
+ pass # Ignore close errors
329
364
  self.handler = None
330
365
 
366
+ def __del__(self):
367
+ """Cleanup when the logger is destroyed."""
368
+ try:
369
+ self.remove()
370
+ except Exception:
371
+ pass # Ignore cleanup errors during destruction
372
+
331
373
  def __repr__(self):
332
374
  return f"MassterLogger(type={self.instance_type}, id={self.instance_id}, level={self.level})"
masster/sample/load.py CHANGED
@@ -37,21 +37,22 @@ See Also:
37
37
  """
38
38
 
39
39
  import os
40
-
40
+ import warnings
41
41
  from datetime import datetime
42
42
 
43
43
  import numpy as np
44
44
  import pandas as pd
45
45
  import polars as pl
46
- import pyopenms as oms
47
-
48
46
  from tqdm import tqdm
49
47
 
50
48
  from masster.chromatogram import Chromatogram
51
-
52
- # Parameters removed - using hardcoded defaults
53
49
  from masster.spectrum import Spectrum
54
50
 
51
+ # Import pyopenms with suppressed warnings
52
+ with warnings.catch_warnings():
53
+ warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
54
+ import pyopenms as oms
55
+
55
56
 
56
57
  def load(
57
58
  self,
masster/sample/sample.py CHANGED
@@ -56,15 +56,6 @@ from masster.sample.helpers import _estimate_memory_usage
56
56
  from masster.sample.helpers import _get_scan_uids
57
57
  from masster.sample.helpers import _get_feature_uids
58
58
  from masster.sample.helpers import _features_sync
59
-
60
- # from masster.sample.helpers import _parse_adduct_specs
61
- # from masster.sample.helpers import _calculate_adduct_mass_shift
62
- # from masster.sample.helpers import _parse_formula_expression
63
- # from masster.sample.helpers import _calculate_molecular_mass
64
- # from masster.sample.helpers import _parse_legacy_adduct_format
65
- # from masster.sample.helpers import _extract_adduct_probability
66
- # from masster.sample.helpers import _detect_adduct_groups_direct
67
- # from masster.sample.helpers import _check_adduct_relationship
68
59
  from masster.sample.adducts import _get_adducts
69
60
  from masster.sample.adducts import find_adducts
70
61
  from masster.sample.helpers import features_delete
@@ -1,7 +1,7 @@
1
1
  """Parameter class for Study merge method."""
2
2
 
3
3
  from dataclasses import dataclass, field
4
- from typing import Any
4
+ from typing import Any, Optional
5
5
 
6
6
 
7
7
  @dataclass
@@ -36,6 +36,9 @@ class merge_defaults:
36
36
  max_nr_conflicts: int = 0
37
37
  link_ms2: bool = True
38
38
 
39
+ # Parallel processing parameters
40
+ threads: Optional[int] = None
41
+
39
42
  # KD-Strict specific parameters
40
43
  optimize_rt_tol: bool = False
41
44
  rt_tol_range: tuple = (0.5, 4.0)
@@ -115,6 +118,14 @@ class merge_defaults:
115
118
  "description": "Whether to link MS2 spectra to consensus features",
116
119
  "default": True,
117
120
  },
121
+ # Parallel processing parameters
122
+ "threads": {
123
+ "dtype": [int, type(None)],
124
+ "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
125
+ "default": None,
126
+ "min_value": 1,
127
+ "max_value": 32,
128
+ },
118
129
  # KD-Strict specific parameters
119
130
  "optimize_rt_tol": {
120
131
  "dtype": bool,
@@ -217,7 +228,37 @@ class merge_defaults:
217
228
  metadata = self._param_metadata[param_name]
218
229
  expected_dtype = metadata["dtype"]
219
230
 
220
- # Type checking
231
+ # Handle Optional types (list of types including None)
232
+ if isinstance(expected_dtype, list):
233
+ # Check if value matches any of the allowed types
234
+ valid_type = False
235
+ for dtype in expected_dtype:
236
+ if dtype is type(None) and value is None:
237
+ return True # None is explicitly allowed
238
+ elif dtype is int and isinstance(value, int):
239
+ valid_type = True
240
+ break
241
+ elif dtype is float and isinstance(value, (int, float)):
242
+ valid_type = True
243
+ break
244
+ elif dtype is bool and isinstance(value, bool):
245
+ valid_type = True
246
+ break
247
+ elif dtype is str and isinstance(value, str):
248
+ valid_type = True
249
+ break
250
+
251
+ if not valid_type:
252
+ return False
253
+
254
+ # For None values, skip further validation
255
+ if value is None:
256
+ return True
257
+
258
+ # Use the first non-None type for range validation
259
+ expected_dtype = next((dt for dt in expected_dtype if dt is not type(None)), expected_dtype[0])
260
+
261
+ # Type checking for non-Optional types
221
262
  if expected_dtype is int:
222
263
  if not isinstance(value, int):
223
264
  try:
masster/study/helpers.py CHANGED
@@ -641,20 +641,61 @@ def get_gaps_stats(self, uids=None):
641
641
 
642
642
 
643
643
  # TODO is uid not supposed to be a list anymore?
644
- def get_consensus_matches(self, uids=None):
644
+ def get_consensus_matches(self, uids=None, filled=True):
645
+ """
646
+ Get feature matches for consensus UIDs with optimized join operation.
647
+
648
+ Parameters:
649
+ uids: Consensus UID(s) to get matches for. Can be:
650
+ - None: get matches for all consensus features
651
+ - int: single consensus UID (converted to list)
652
+ - list: multiple consensus UIDs
653
+ filled (bool): Whether to include filled rows (True) or exclude them (False).
654
+ Default is True to maintain backward compatibility.
655
+
656
+ Returns:
657
+ pl.DataFrame: Feature matches for the specified consensus UIDs
658
+ """
659
+ # Handle single int by converting to list
660
+ if isinstance(uids, int):
661
+ uids = [uids]
662
+
645
663
  uids = self._get_consensus_uids(uids)
646
-
647
- # find all rows in consensus_mapping_df with consensus_id=id - use Polars filtering
648
- fid = (
649
- self.consensus_mapping_df.filter(
650
- pl.col("consensus_uid").is_in(uids),
664
+
665
+ if not uids:
666
+ return pl.DataFrame()
667
+
668
+ # Early validation checks
669
+ if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
670
+ self.logger.warning("No consensus mapping data available")
671
+ return pl.DataFrame()
672
+
673
+ if self.features_df is None or self.features_df.is_empty():
674
+ self.logger.warning("No feature data available")
675
+ return pl.DataFrame()
676
+
677
+ # Build the query with optional filled filter
678
+ features_query = self.features_df.lazy()
679
+
680
+ # Apply filled filter if specified
681
+ if not filled and "filled" in self.features_df.columns:
682
+ features_query = features_query.filter(~pl.col("filled"))
683
+
684
+ # Optimized single-pass operation using join instead of two separate filters
685
+ # This avoids creating intermediate Python lists and leverages Polars' optimized joins
686
+ matches = (
687
+ features_query
688
+ .join(
689
+ self.consensus_mapping_df
690
+ .lazy()
691
+ .filter(pl.col("consensus_uid").is_in(uids))
692
+ .select("feature_uid"), # Only select what we need for the join
693
+ on="feature_uid",
694
+ how="inner"
651
695
  )
652
- .select("feature_uid")
653
- .to_series()
654
- .to_list()
696
+ .collect(streaming=True) # Use streaming for memory efficiency with large datasets
655
697
  )
656
- # select all rows in features_df with uid in fid
657
- matches = self.features_df.filter(pl.col("feature_uid").is_in(fid)).clone()
698
+
658
699
  return matches
659
700
 
660
701