masster-0.3.10-py3-none-any.whl → masster-0.3.11-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in one of the supported registries. It is provided for informational purposes only.
Note: this release has been flagged as potentially problematic.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +16 -6
- masster/sample/defaults/sample_def.py +1 -1
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +137 -136
- masster/sample/load.py +13 -9
- masster/sample/plot.py +156 -131
- masster/sample/processing.py +18 -12
- masster/sample/sample.py +4 -4
- masster/sample/sample5_schema.json +62 -62
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +224 -6
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +293 -245
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +51 -25
- masster/study/plot.py +453 -17
- masster/study/processing.py +159 -76
- masster/study/save.py +7 -7
- masster/study/study.py +97 -88
- masster/study/study5_schema.json +82 -82
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/RECORD +33 -31
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers_optimized.py
CHANGED

@@ -11,6 +11,7 @@ This module contains the optimized version of features_select that:
 
 import polars as pl
 
+
 def features_select_optimized(
     self,
     mz=None,
@@ -29,14 +30,14 @@ def features_select_optimized(
 ):
     """
     Optimized version of features_select with improved performance.
-
+
     Key optimizations:
     - Combines all filters into a single expression
     - Uses lazy evaluation for better performance
     - Reduces logging overhead
     - Pre-checks column existence once
     - Early return for no filters
-
+
     Args:
         mz: mass-to-charge ratio filter (tuple for range, single value for minimum)
         rt: retention time filter (tuple for range, single value for minimum)
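The docstring above names the pattern the rest of the function implements: collect every active filter as a polars expression, fold the expressions into one predicate, and evaluate it in a single lazy pass. A minimal, self-contained sketch of that pattern on toy data (nothing here is masster API):

import polars as pl

# Toy features table; column names mirror the ones used in this module.
df = pl.DataFrame({"mz": [100.5, 250.1, 410.9], "rt": [12.0, 35.5, 61.2]})

# Each active filter becomes an expression...
conditions = [pl.col("mz") >= 200, pl.col("rt") <= 60]

# ...the expressions are folded into one predicate with AND...
combined = conditions[0]
for cond in conditions[1:]:
    combined = combined & cond

# ...and applied once, lazily, so polars can optimize the whole predicate.
print(df.lazy().filter(combined).collect())  # keeps only the mz=250.1 row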
@@ -51,30 +52,42 @@ def features_select_optimized(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided
-    filter_params = [
-
-
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -82,7 +95,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -90,7 +103,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -98,7 +111,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -111,24 +124,24 @@ def features_select_optimized(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -141,7 +154,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -154,7 +167,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -164,7 +177,7 @@ def features_select_optimized(
                 filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -175,75 +188,85 @@ def features_select_optimized(
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
            else:
                filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
        else:
            warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log warnings once at the end
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once if any exist
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation for better performance
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
         removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
-
+
     return feats
 
 
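A side note on the AND-fold in the hunk above: the explicit loop is equivalent to a functools.reduce over the condition list. This is an alternative formulation, not what the module ships:

import operator
from functools import reduce

import polars as pl

filter_conditions = [pl.col("mz") >= 200, pl.col("rt") <= 60]
# reduce folds the list with & exactly like the loop in the hunk above.
combined_filter = reduce(operator.and_, filter_conditions)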
@@ -267,51 +290,70 @@ def features_select_benchmarked(
     Benchmarked version that compares old vs new implementation performance.
     """
     import time
-
+
     # Call the original method for comparison
     start_time = time.perf_counter()
     _ = self.features_select_original(
-        mz=mz,
-
-
-
-
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     original_time = time.perf_counter() - start_time
-
+
     # Call the optimized method
     start_time = time.perf_counter()
     result_optimized = features_select_optimized(
-        self,
-
-
-
-
+        self,
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     optimized_time = time.perf_counter() - start_time
-
+
     # Log performance comparison
-    speedup = original_time / optimized_time if optimized_time > 0 else float(
-    self.logger.info(
-
+    speedup = original_time / optimized_time if optimized_time > 0 else float("inf")
+    self.logger.info(
+        f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x"
+    )
+
     return result_optimized
 
 
 def monkey_patch_study():
     """
     Apply the optimized features_select method to the Study class.
-
+
     Call this function to replace the original features_select with the optimized version.
     """
     from masster.study.study import Study
-
+
     # Store original method for benchmarking
     Study.features_select_original = Study.features_select
-
+
     # Replace with optimized version
     Study.features_select = features_select_optimized
-
+
     # Add benchmarked version as an option
     Study.features_select_benchmarked = features_select_benchmarked
-
+
     print("Successfully patched Study.features_select with optimized version")
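Because monkey_patch_study() rebinds the method on the Study class itself, every existing and future instance picks up the optimized version. A usage sketch; the Study construction and data-loading steps are assumptions, since this diff does not show them:

from masster.study.study import Study
from masster.study.helpers_optimized import monkey_patch_study

monkey_patch_study()  # Study.features_select now points at the optimized function

study = Study()  # constructor arguments are assumed/omitted here
# ... add samples or load an existing .study5 file ...
feats = study.features_select(mz=(200.0, 500.0), rt=(10.0, 60.0))
# study.features_select_benchmarked(...) additionally logs an old-vs-new timing.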
masster/study/load.py
CHANGED
@@ -48,10 +48,10 @@ def add(
         folder = os.getcwd()
 
     self.logger.debug(f"Adding files from: {folder}")
-
+
     # Define file extensions to search for in order of priority
     extensions = [".sample5", ".wiff", ".raw", ".mzML"]
-
+
     # Check if folder contains glob patterns
     if not any(char in folder for char in ["*", "?", "[", "]"]):
         search_folder = folder
@@ -68,7 +68,7 @@ def add(
     for ext in extensions:
         if max_files is not None and counter >= max_files:
             break
-
+
         # Build search pattern
         if any(char in folder for char in ["*", "?", "[", "]"]):
             # If folder already contains glob patterns, modify the extension
@@ -78,16 +78,16 @@ def add(
                 pattern = os.path.join(search_folder, "**", f"*{ext}")
         else:
             pattern = os.path.join(search_folder, "**", f"*{ext}")
-
+
         files = glob.glob(pattern, recursive=True)
-
+
         if len(files) > 0:
             # Limit files if max_files is specified
             remaining_slots = max_files - counter if max_files is not None else len(files)
             files = files[:remaining_slots]
-
+
             self.logger.debug(f"Found {len(files)} {ext} files")
-
+
             # Process files
             for i, file in enumerate(
                 tqdm(
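The search above leans on glob's recursive mode: a "**" path component plus recursive=True matches files at any directory depth, and iterating the extensions in list order gives .sample5 files priority over the raw formats. A standalone sketch of the same pattern (the folder path is hypothetical):

import glob
import os

folder = "/data/my_study"  # hypothetical input folder
for ext in [".sample5", ".wiff", ".raw", ".mzML"]:  # priority order as above
    pattern = os.path.join(folder, "**", f"*{ext}")
    files = glob.glob(pattern, recursive=True)
    print(f"{ext}: {len(files)} file(s)")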
@@ -99,18 +99,18 @@ def add(
             ):
                 if max_files is not None and counter >= max_files:
                     break
-
+
                 # Get filename without extension for blacklist check
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
-
+
                 # Check if this filename (without extension) is already in blacklist
                 if filename_no_ext in blacklist:
                     self.logger.debug(f"Skipping {file} - filename already processed")
                     continue
-
+
                 self.logger.debug(f"Add file {counter + 1}: {file}")
-
+
                 # Try to add the sample
                 try:
                     self.add_sample(file=file, reset=reset, adducts=adducts)
@@ -138,11 +138,11 @@ def add(
 # TODO type is not used
 def add_sample(self, file, type=None, reset=False, adducts=None):
     self.logger.debug(f"Adding: {file}")
-
+
     # Extract sample name by removing any known extension
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     # check if sample_name is already in the samples_df
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(
@@ -163,7 +163,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Load the sample based on file type
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
     if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         ddaobj.load(file)
     else:
@@ -178,7 +178,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
+        ddaobj.find_ms2()
 
     self.features_maps.append(ddaobj.features)
 
@@ -194,7 +194,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         # If input is already .sample5, keep it in original location
         final_sample_path = file
         self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
+
         # Check if there's a corresponding featureXML file in the same directory
         featurexml_path = file.replace(".sample5", ".featureXML")
         if os.path.exists(featurexml_path):
@@ -218,7 +218,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Count MS1 and MS2 scans from the loaded sample
     ms1_count = 0
     ms2_count = 0
-    if hasattr(ddaobj,
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
         ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
         ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
 
@@ -230,7 +230,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
             "sample_type": [sample_type],
             "size": [int(ddaobj.features.size())],
             "map_id": [map_id_value],
-            "file_source": [getattr(ddaobj,
+            "file_source": [getattr(ddaobj, "file_source", file)],
             "ms1": [ms1_count],
             "ms2": [ms2_count],
         },
@@ -304,8 +304,8 @@ def load(self, filename=None):
     else:
         self.logger.error("Either filename or folder must be provided")
         return
-
-    #self.logger.info(f"Loading study from {filename}")
+
+    # self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
     # After loading the study, check if consensus XML exists and load it
     consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -566,7 +566,20 @@ def _fill_chrom_single_impl(
             rows_to_add.append(new_row)
 
     # Create and add new DataFrame
-
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)
 
     # Cast columns to match existing schema
     cast_exprs = []
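The casting loop added in the hunk above protects pl.from_dicts from mixed-type rows: polars infers the schema from a sample of rows, so if the sampled dicts hold ints and a later dict holds a float, inference can settle on an integer dtype and fail on the stragglers. Passing infer_schema_length=len(rows) plus the explicit float()/int() casts removes both failure modes. A minimal reproduction of the idea, independent of masster's actual row layout:

import polars as pl

rows = [{"mz": 100, "rt": 1}, {"mz": 250.5, "rt": 2.75}]  # int first, float later

# Normalize known-numeric keys up front, as the hunk above does.
for row in rows:
    for key in ("mz", "rt"):
        if row[key] is not None:
            row[key] = float(row[key])

df = pl.from_dicts(rows, infer_schema_length=len(rows))
print(df.schema)  # both columns land on Float64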
@@ -606,8 +619,9 @@ def fill_single(self, **kwargs):
     """
     # parameters initialization
     from masster.study.defaults import fill_defaults
+
     params = fill_defaults()
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -959,7 +973,20 @@ def _fill_chrom_impl(
             rows_to_add.append(new_row)
 
     # Create and add new DataFrame
-
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)
 
     # Cast columns to match existing schema
     cast_exprs = []
@@ -1001,7 +1028,7 @@ def fill(self, **kwargs):
     # parameters initialization
     params = fill_defaults()
     num_workers = kwargs.get("num_workers", 4)  # Default parameter not in defaults class
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -1228,4 +1255,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-