masster 0.2.5-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1685 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +393 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -736
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1406 -886
  41. masster/study/helpers.py +1713 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1231 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +161 -134
  48. masster/study/study.py +612 -522
  49. masster/study/study5_schema.json +253 -241
  50. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/METADATA +15 -10
  51. masster-0.3.1.dist-info/RECORD +59 -0
  52. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.5.dist-info/RECORD +0 -50
  54. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/WHEEL +0 -0
  55. {masster-0.2.5.dist-info → masster-0.3.1.dist-info}/entry_points.txt +0 -0
masster/study/helpers_optimized.py
@@ -0,0 +1,317 @@
+"""
+Optimized features_select method for improved performance.
+
+This module contains the optimized version of features_select that:
+1. Combines all filters into a single expression
+2. Uses lazy evaluation
+3. Reduces logging overhead
+4. Pre-checks column existence
+5. Implements early returns
+"""
+
+import polars as pl
+
+def features_select_optimized(
+    self,
+    mz=None,
+    rt=None,
+    inty=None,
+    sample_uid=None,
+    sample_name=None,
+    consensus_uid=None,
+    feature_uid=None,
+    filled=None,
+    quality=None,
+    chrom_coherence=None,
+    chrom_prominence=None,
+    chrom_prominence_scaled=None,
+    chrom_height_scaled=None,
+):
+    """
+    Optimized version of features_select with improved performance.
+
+    Key optimizations:
+    - Combines all filters into a single expression
+    - Uses lazy evaluation for better performance
+    - Reduces logging overhead
+    - Pre-checks column existence once
+    - Early return for no filters
+
+    Args:
+        mz: mass-to-charge ratio filter (tuple for range, single value for minimum)
+        rt: retention time filter (tuple for range, single value for minimum)
+        inty: intensity filter (tuple for range, single value for minimum)
+        sample_uid: sample UID filter (list, single value, or tuple for range)
+        sample_name: sample name filter (list or single value)
+        consensus_uid: consensus UID filter (list, single value, or tuple for range)
+        feature_uid: feature UID filter (list, single value, or tuple for range)
+        filled: filter for filled/not filled features (bool)
+        quality: quality score filter (tuple for range, single value for minimum)
+        chrom_coherence: chromatogram coherence filter (tuple for range, single value for minimum)
+        chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
+        chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
+        chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
+
+    Returns:
+        polars.DataFrame: Filtered features DataFrame
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features found in study.")
+        return pl.DataFrame()
+
+    # Early return if no filters provided
+    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
+                     feature_uid, filled, quality, chrom_coherence,
+                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    if all(param is None for param in filter_params):
+        return self.features_df.clone()
+
+    initial_count = len(self.features_df)
+
+    # Pre-check available columns once
+    available_columns = set(self.features_df.columns)
+
+    # Build all filter conditions
+    filter_conditions = []
+    warnings = []
+
+    # Filter by m/z
+    if mz is not None:
+        if isinstance(mz, tuple) and len(mz) == 2:
+            min_mz, max_mz = mz
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+        else:
+            filter_conditions.append(pl.col("mz") >= mz)
+
+    # Filter by retention time
+    if rt is not None:
+        if isinstance(rt, tuple) and len(rt) == 2:
+            min_rt, max_rt = rt
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+        else:
+            filter_conditions.append(pl.col("rt") >= rt)
+
+    # Filter by intensity
+    if inty is not None:
+        if isinstance(inty, tuple) and len(inty) == 2:
+            min_inty, max_inty = inty
+            filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
+        else:
+            filter_conditions.append(pl.col("inty") >= inty)
+
+    # Filter by sample_uid
+    if sample_uid is not None:
+        if isinstance(sample_uid, (list, tuple)):
+            if len(sample_uid) == 2 and not isinstance(sample_uid, list):
+                # Treat as range
+                min_uid, max_uid = sample_uid
+                filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
+        else:
+            filter_conditions.append(pl.col("sample_uid") == sample_uid)
+
+    # Filter by sample_name (requires pre-processing)
+    if sample_name is not None:
+        # Get sample_uids for the given sample names
+        if isinstance(sample_name, list):
+            sample_uids_for_names = self.samples_df.filter(
+                pl.col("sample_name").is_in(sample_name)
+            )["sample_uid"].to_list()
+        else:
+            sample_uids_for_names = self.samples_df.filter(
+                pl.col("sample_name") == sample_name
+            )["sample_uid"].to_list()
+
+        if sample_uids_for_names:
+            filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
+        else:
+            filter_conditions.append(pl.lit(False))  # No matching samples
+
+    # Filter by consensus_uid
+    if consensus_uid is not None:
+        if isinstance(consensus_uid, (list, tuple)):
+            if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
+                # Treat as range
+                min_uid, max_uid = consensus_uid
+                filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
+        else:
+            filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
+
+    # Filter by feature_uid
+    if feature_uid is not None:
+        if isinstance(feature_uid, (list, tuple)):
+            if len(feature_uid) == 2 and not isinstance(feature_uid, list):
+                # Treat as range
+                min_uid, max_uid = feature_uid
+                filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
+            else:
+                # Treat as list
+                filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
+        else:
+            filter_conditions.append(pl.col("feature_uid") == feature_uid)
+
+    # Filter by filled status
+    if filled is not None:
+        if "filled" in available_columns:
+            if filled:
+                filter_conditions.append(pl.col("filled"))
+            else:
+                filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
+        else:
+            warnings.append("'filled' column not found in features_df")
+
+    # Filter by quality
+    if quality is not None:
+        if "quality" in available_columns:
+            if isinstance(quality, tuple) and len(quality) == 2:
+                min_quality, max_quality = quality
+                filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
+            else:
+                filter_conditions.append(pl.col("quality") >= quality)
+        else:
+            warnings.append("'quality' column not found in features_df")
+
+    # Filter by chromatogram coherence
+    if chrom_coherence is not None:
+        if "chrom_coherence" in available_columns:
+            if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
+                min_coherence, max_coherence = chrom_coherence
+                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+            else:
+                filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
+        else:
+            warnings.append("'chrom_coherence' column not found in features_df")
+
+    # Filter by chromatogram prominence
+    if chrom_prominence is not None:
+        if "chrom_prominence" in available_columns:
+            if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
+                min_prominence, max_prominence = chrom_prominence
+                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+            else:
+                filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
+        else:
+            warnings.append("'chrom_prominence' column not found in features_df")
+
+    # Filter by scaled chromatogram prominence
+    if chrom_prominence_scaled is not None:
+        if "chrom_prominence_scaled" in available_columns:
+            if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
+                min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
+                filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+            else:
+                filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
+        else:
+            warnings.append("'chrom_prominence_scaled' column not found in features_df")
+
+    # Filter by scaled chromatogram height
+    if chrom_height_scaled is not None:
+        if "chrom_height_scaled" in available_columns:
+            if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
+                min_height_scaled, max_height_scaled = chrom_height_scaled
+                filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+            else:
+                filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
+        else:
+            warnings.append("'chrom_height_scaled' column not found in features_df")
+
+    # Log warnings once at the end
+    for warning in warnings:
+        self.logger.warning(warning)
+
+    # Apply all filters at once if any exist
+    if filter_conditions:
+        # Combine all conditions with AND
+        combined_filter = filter_conditions[0]
+        for condition in filter_conditions[1:]:
+            combined_filter = combined_filter & condition
+
+        # Apply the combined filter using lazy evaluation for better performance
+        feats = self.features_df.lazy().filter(combined_filter).collect()
+    else:
+        feats = self.features_df.clone()
+
+    final_count = len(feats)
+
+    if final_count == 0:
+        self.logger.warning("No features remaining after applying selection criteria.")
+    else:
+        removed_count = initial_count - final_count
+        self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
+
+    return feats
+
+
+def features_select_benchmarked(
+    self,
+    mz=None,
+    rt=None,
+    inty=None,
+    sample_uid=None,
+    sample_name=None,
+    consensus_uid=None,
+    feature_uid=None,
+    filled=None,
+    quality=None,
+    chrom_coherence=None,
+    chrom_prominence=None,
+    chrom_prominence_scaled=None,
+    chrom_height_scaled=None,
+):
+    """
+    Benchmarked version that compares old vs new implementation performance.
+    """
+    import time
+
+    # Call the original method for comparison
+    start_time = time.perf_counter()
+    _ = self.features_select_original(
+        mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
+        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
+        quality=quality, chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled
+    )
+    original_time = time.perf_counter() - start_time
+
+    # Call the optimized method
+    start_time = time.perf_counter()
+    result_optimized = features_select_optimized(
+        self, mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
+        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
+        quality=quality, chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled
+    )
+    optimized_time = time.perf_counter() - start_time
+
+    # Log performance comparison
+    speedup = original_time / optimized_time if optimized_time > 0 else float('inf')
+    self.logger.info(f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x")
+
+    return result_optimized
+
+
+def monkey_patch_study():
+    """
+    Apply the optimized features_select method to the Study class.
+
+    Call this function to replace the original features_select with the optimized version.
+    """
+    from masster.study.study import Study
+
+    # Store original method for benchmarking
+    Study.features_select_original = Study.features_select
+
+    # Replace with optimized version
+    Study.features_select = features_select_optimized
+
+    # Add benchmarked version as an option
+    Study.features_select_benchmarked = features_select_benchmarked
+
+    print("Successfully patched Study.features_select with optimized version")