masster 0.3.10-py3-none-any.whl → 0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

@@ -11,6 +11,7 @@ This module contains the optimized version of features_select that:

 import polars as pl

+
 def features_select_optimized(
     self,
     mz=None,
@@ -29,14 +30,14 @@ def features_select_optimized(
 ):
     """
     Optimized version of features_select with improved performance.
-
+
     Key optimizations:
     - Combines all filters into a single expression
     - Uses lazy evaluation for better performance
     - Reduces logging overhead
     - Pre-checks column existence once
     - Early return for no filters
-
+
     Args:
         mz: mass-to-charge ratio filter (tuple for range, single value for minimum)
         rt: retention time filter (tuple for range, single value for minimum)
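The tuple-vs-scalar semantics documented above can be illustrated with a hypothetical call; the `study` object and threshold values below are assumptions for illustration, not part of this diff:

```python
# Hypothetical usage of the documented filter semantics.
feats = study.features_select(
    mz=(100.0, 500.0),  # 2-tuple: keep features with 100 <= mz <= 500
    rt=60.0,            # scalar: keep features with rt >= 60
    quality=0.8,        # scalar: keep features with quality >= 0.8
)
```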
@@ -51,30 +52,42 @@ def features_select_optimized(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided
-    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
-                     feature_uid, filled, quality, chrom_coherence,
-                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -82,7 +95,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -90,7 +103,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -98,7 +111,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -111,24 +124,24 @@ def features_select_optimized(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -141,7 +154,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -154,7 +167,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -164,7 +177,7 @@ def features_select_optimized(
                 filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -175,75 +188,85 @@ def features_select_optimized(
             filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
            else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log warnings once at the end
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once if any exist
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation for better performance
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
         removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
-
+
     return feats

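For readers outside the diff context, the combine-then-filter pattern in this hunk can be sketched standalone; the toy DataFrame and thresholds below are assumptions:

```python
from functools import reduce

import polars as pl

# Toy data standing in for features_df.
df = pl.DataFrame({"mz": [150.0, 350.0, 800.0], "rt": [30.0, 90.0, 120.0]})

# Build independent boolean expressions, then AND-fold them into one predicate,
# mirroring the loop over filter_conditions above.
conditions = [pl.col("mz") >= 100.0, pl.col("mz") <= 500.0, pl.col("rt") >= 60.0]
combined = reduce(lambda acc, cond: acc & cond, conditions)

# Lazy evaluation hands the whole predicate to the query optimizer in one pass.
feats = df.lazy().filter(combined).collect()
print(feats)  # expected: the single row with mz=350.0, rt=90.0
```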
@@ -267,51 +290,70 @@ def features_select_benchmarked(
     Benchmarked version that compares old vs new implementation performance.
     """
     import time
-
+
     # Call the original method for comparison
     start_time = time.perf_counter()
     _ = self.features_select_original(
-        mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
-        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
-        quality=quality, chrom_coherence=chrom_coherence,
-        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
-        chrom_height_scaled=chrom_height_scaled
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     original_time = time.perf_counter() - start_time
-
+
     # Call the optimized method
     start_time = time.perf_counter()
     result_optimized = features_select_optimized(
-        self, mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
-        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
-        quality=quality, chrom_coherence=chrom_coherence,
-        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
-        chrom_height_scaled=chrom_height_scaled
+        self,
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     optimized_time = time.perf_counter() - start_time
-
+
     # Log performance comparison
-    speedup = original_time / optimized_time if optimized_time > 0 else float('inf')
-    self.logger.info(f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x")
-
+    speedup = original_time / optimized_time if optimized_time > 0 else float("inf")
+    self.logger.info(
+        f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x"
+    )
+
     return result_optimized


 def monkey_patch_study():
     """
     Apply the optimized features_select method to the Study class.
-
+
     Call this function to replace the original features_select with the optimized version.
     """
     from masster.study.study import Study
-
+
     # Store original method for benchmarking
     Study.features_select_original = Study.features_select
-
+
     # Replace with optimized version
     Study.features_select = features_select_optimized
-
+
     # Add benchmarked version as an option
     Study.features_select_benchmarked = features_select_benchmarked
-
+
     print("Successfully patched Study.features_select with optimized version")
masster/study/load.py CHANGED
@@ -48,10 +48,10 @@ def add(
         folder = os.getcwd()

     self.logger.debug(f"Adding files from: {folder}")
-
+
     # Define file extensions to search for in order of priority
     extensions = [".sample5", ".wiff", ".raw", ".mzML"]
-
+
     # Check if folder contains glob patterns
     if not any(char in folder for char in ["*", "?", "[", "]"]):
         search_folder = folder
@@ -68,7 +68,7 @@ def add(
     for ext in extensions:
         if max_files is not None and counter >= max_files:
             break
-
+
         # Build search pattern
         if any(char in folder for char in ["*", "?", "[", "]"]):
             # If folder already contains glob patterns, modify the extension
@@ -78,16 +78,16 @@ def add(
             pattern = os.path.join(search_folder, "**", f"*{ext}")
         else:
             pattern = os.path.join(search_folder, "**", f"*{ext}")
-
+
         files = glob.glob(pattern, recursive=True)
-
+
         if len(files) > 0:
             # Limit files if max_files is specified
             remaining_slots = max_files - counter if max_files is not None else len(files)
             files = files[:remaining_slots]
-
+
             self.logger.debug(f"Found {len(files)} {ext} files")
-
+
             # Process files
             for i, file in enumerate(
                 tqdm(
@@ -99,18 +99,18 @@ def add(
             ):
                 if max_files is not None and counter >= max_files:
                     break
-
+
                 # Get filename without extension for blacklist check
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
-
+
                 # Check if this filename (without extension) is already in blacklist
                 if filename_no_ext in blacklist:
                     self.logger.debug(f"Skipping {file} - filename already processed")
                     continue
-
+
                 self.logger.debug(f"Add file {counter + 1}: {file}")
-
+
                 # Try to add the sample
                 try:
                     self.add_sample(file=file, reset=reset, adducts=adducts)
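The discovery loop in this function searches each extension recursively, in priority order, and stops once `max_files` is reached; a standalone sketch of that pattern, where the folder path and cap are assumptions:

```python
import glob
import os

extensions = [".sample5", ".wiff", ".raw", ".mzML"]  # priority order from the hunk above
folder = "/data/my_study"  # assumed example path
max_files = 10             # assumed cap

found: list[str] = []
for ext in extensions:
    if len(found) >= max_files:
        break
    # "**" combined with recursive=True descends into subdirectories.
    pattern = os.path.join(folder, "**", f"*{ext}")
    files = glob.glob(pattern, recursive=True)
    found.extend(files[: max_files - len(found)])
```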
@@ -138,11 +138,11 @@ def add(
 # TODO type is not used
 def add_sample(self, file, type=None, reset=False, adducts=None):
     self.logger.debug(f"Adding: {file}")
-
+
     # Extract sample name by removing any known extension
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     # check if sample_name is already in the samples_df
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(
@@ -163,7 +163,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Load the sample based on file type
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
     if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         ddaobj.load(file)
     else:
@@ -178,7 +178,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
+        ddaobj.find_ms2()

     self.features_maps.append(ddaobj.features)

@@ -194,7 +194,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         # If input is already .sample5, keep it in original location
         final_sample_path = file
         self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
+
         # Check if there's a corresponding featureXML file in the same directory
         featurexml_path = file.replace(".sample5", ".featureXML")
         if os.path.exists(featurexml_path):
@@ -218,7 +218,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Count MS1 and MS2 scans from the loaded sample
     ms1_count = 0
     ms2_count = 0
-    if hasattr(ddaobj, 'scans_df') and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
         ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
         ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)

@@ -230,7 +230,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
             "sample_type": [sample_type],
             "size": [int(ddaobj.features.size())],
             "map_id": [map_id_value],
-            "file_source": [getattr(ddaobj, 'file_source', file)],
+            "file_source": [getattr(ddaobj, "file_source", file)],
             "ms1": [ms1_count],
             "ms2": [ms2_count],
         },
@@ -304,8 +304,8 @@ def load(self, filename=None):
     else:
         self.logger.error("Either filename or folder must be provided")
         return
-
-    #self.logger.info(f"Loading study from {filename}")
+
+    # self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
     # After loading the study, check if consensus XML exists and load it
     consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -566,7 +566,20 @@ def _fill_chrom_single_impl(
             rows_to_add.append(new_row)

     # Create and add new DataFrame
-    new_df = pl.from_dicts(rows_to_add)
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)

     # Cast columns to match existing schema
     cast_exprs = []
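The replacement above guards against two failure modes of `pl.from_dicts`: mixed int/float values across rows, and schema inference that by default only examines a prefix of the input. A minimal reproduction sketch, with illustrative column names:

```python
import polars as pl

# An int in the first row and a float later would trip naive schema inference.
rows = [{"mz": 100, "rt": 1.5}, {"mz": 100.5, "rt": 2.0}]

# Pre-casting numeric fields row by row, as the diff does, makes types uniform...
for row in rows:
    for key in ("mz", "rt"):
        if row[key] is not None:
            row[key] = float(row[key])

# ...and infer_schema_length=len(rows) makes inference scan every row.
df = pl.from_dicts(rows, infer_schema_length=len(rows))
print(df.schema)  # mz and rt both inferred as Float64
```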
@@ -606,8 +619,9 @@ def fill_single(self, **kwargs):
     """
     # parameters initialization
     from masster.study.defaults import fill_defaults
+
     params = fill_defaults()
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -959,7 +973,20 @@ def _fill_chrom_impl(
             rows_to_add.append(new_row)

     # Create and add new DataFrame
-    new_df = pl.from_dicts(rows_to_add)
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)

     # Cast columns to match existing schema
     cast_exprs = []
@@ -1001,7 +1028,7 @@ def fill(self, **kwargs):
     # parameters initialization
     params = fill_defaults()
     num_workers = kwargs.get("num_workers", 4)  # Default parameter not in defaults class
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -1228,4 +1255,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-