masster 0.3.14-py3-none-any.whl → 0.3.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +577 -0
- masster/sample/helpers.py +9 -2
- masster/sample/load.py +68 -7
- masster/sample/plot.py +43 -34
- masster/sample/sample.py +4 -0
- masster/spectrum.py +3 -0
- masster/study/defaults/fill_def.py +3 -3
- masster/study/defaults/study_def.py +20 -0
- masster/study/export.py +3 -0
- masster/study/h5.py +120 -23
- masster/study/helpers.py +482 -11
- masster/study/load.py +566 -205
- masster/study/plot.py +9 -2
- masster/study/study.py +32 -13
- masster/study/study5_schema.json +17 -5
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/METADATA +1 -1
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/RECORD +21 -21
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/WHEEL +0 -0
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/entry_points.txt +0 -0
- {masster-0.3.14.dist-info → masster-0.3.16.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -40,7 +40,21 @@ def add(
         reset=False,
         adducts=None,
         max_files=None,
+        fast=True,
     ):
+        """Add samples from a folder to the study.
+
+        Args:
+            folder (str, optional): Path to folder containing sample files.
+                Defaults to study folder or current working directory.
+            reset (bool, optional): Whether to reset the study before adding samples.
+                Defaults to False.
+            adducts (optional): Adducts to use for sample loading. Defaults to None.
+            max_files (int, optional): Maximum number of files to process.
+                Defaults to None (no limit).
+            fast (bool, optional): Whether to use optimized loading that skips ms1_df
+                for better performance. Defaults to True.
+        """
         if folder is None:
             if self.folder is not None:
                 folder = self.folder
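Note: a minimal usage sketch of the new fast flag follows; the Study import path and constructor shown here are illustrative assumptions, not part of this diff.

    from masster import Study  # assumed import path

    study = Study()                           # assumed constructor
    study.add(folder="raw_data", fast=True)   # optimized: skips ms1_df loading
    study.add(folder="raw_data", fast=False)  # standard: full load, including ms1_df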
@@ -85,39 +99,29 @@ def add(
 
         self.logger.debug(f"Found {len(files)} {ext} files")
 
-        # Add each file individually
-        for i, file in enumerate(
-            tqdm(
-                files,
-                total=len(files),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
-                disable=tdqm_disable,
-            ),
-        ):
+        # Filter files not already processed and respect max_files limit
+        files_to_process = []
+        for file in files:
             if max_files is not None and counter >= max_files:
                 break
-
+
             # Get filename without extension for blacklist check
             basename = os.path.basename(file)
             filename_no_ext = os.path.splitext(basename)[0]
-
+
             # Check if this filename (without extension) is already in blacklist
-            if filename_no_ext in blacklist:
-                self.logger.debug(f"Skipping {file}: already in study")
-                continue
-
-            try:
-                self.add_sample(
-                    file,
-                    reset=reset,
-                    adducts=adducts,
-                )
-                blacklist.add(filename_no_ext)
-                counter += 1
+            if filename_no_ext not in blacklist:
+                files_to_process.append(file)
+                if len(files_to_process) + counter >= (max_files or float('inf')):
+                    break
+
+        # Batch process all files of this extension using ultra-optimized method
+        if files_to_process:
+            self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+            successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+            counter += successful
+            if successful > 0:
                 not_zero = True
-            except Exception as e:
-                self.logger.warning(f"Failed to add sample {file}: {e}")
-                continue
 
         if max_files is not None and counter >= max_files:
             self.logger.debug(
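Note: the rewritten loop only collects candidate files and hands the real work to _add_samples_batch. The guard `len(files_to_process) + counter >= (max_files or float('inf'))` treats both None and 0 as "no limit". A self-contained sketch of the same filtering pattern (the helper name is hypothetical):

    import os

    def filter_new_files(files, blacklist, counter=0, max_files=None):
        # Keep files whose basename (without extension) has not been processed,
        # stopping once the running total would reach max_files.
        files_to_process = []
        for file in files:
            name = os.path.splitext(os.path.basename(file))[0]
            if name not in blacklist:
                files_to_process.append(file)
                if len(files_to_process) + counter >= (max_files or float('inf')):
                    break
        return files_to_process

    print(filter_new_files(["s1.mzML", "qc1.mzML", "s2.raw"], {"qc1"}, max_files=2))
    # ['s1.mzML', 's2.raw']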
@@ -133,187 +137,43 @@ def add(
 
 
     # TODO type is not used
-    def add_sample(self, file, type=None, reset=False, adducts=None):
-        """
-        Add a single sample to the study.
-
-        Args:
-            file (str): Path to the sample file
-            type (str, optional): File type to force. Defaults to None (auto-detect).
-            reset (bool, optional): Whether to reset the study. Defaults to False.
-            adducts (optional): Adducts to use for sample loading. Defaults to None.
-        """
-
-        basename = os.path.basename(file)
-        sample_name = os.path.splitext(basename)[0]
-
-        # Validate the input file
-        if not os.path.exists(file):
-            self.logger.error(f"File {file} does not exist.")
-            return
-
-        # Skip samples that are already in the study
-        if sample_name in self.samples_df["sample_name"].to_list():
-            self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
-            return
-
-        # Load the sample based on file type
-        ddaobj = Sample()
-        ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
-        if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-            ddaobj.load(file)
-        else:
-            self.logger.error(f"Unsupported file format: {file}")
-            return
-        if ddaobj.features_df is None and not reset:
-            self.logger.debug(
-                f"File {file} will be newly processed.",
-            )
-            ddaobj.features = None
-
-        if ddaobj.features is None or reset:
-            ddaobj.find_features()
-            ddaobj.find_adducts(adducts=adducts)
-            ddaobj.find_ms2()
-
-        self.features_maps.append(ddaobj.features)
-
-        sample_type = "sample" if type is None else type
-        if "qc" in sample_name.lower():
-            sample_type = "qc"
-        if "blank" in sample_name.lower():
-            sample_type = "blank"
-        map_id_value = str(ddaobj.features.getUniqueId())
-
-        # Determine the final sample path based on file type
-        if file.endswith(".sample5"):
-            # If input is already .sample5, keep it in original location
-            final_sample_path = file
-            self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
-            # Check if there's a corresponding featureXML file in the same directory
-            featurexml_path = file.replace(".sample5", ".featureXML")
-            if os.path.exists(featurexml_path):
-                self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
-            else:
-                self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
-        else:
-            # For .wiff, .mzML, .raw files, save to study folder (original behavior)
-            if self.folder is not None:
-                if not os.path.exists(self.folder):
-                    os.makedirs(self.folder)
-                final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
-                ddaobj.save(final_sample_path)
-                self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
-            else:
-                # If no study folder is set, save in current directory
-                final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
-                ddaobj.save(final_sample_path)
-                self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
-
-        # Count MS1 and MS2 scans from the loaded sample
-        ms1_count = 0
-        ms2_count = 0
-        if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-            ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
-            ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
-
-        new_sample = pl.DataFrame(
-            {
-                "sample_uid": [int(len(self.samples_df) + 1)],
-                "sample_name": [sample_name],
-                "sample_path": [final_sample_path],  # Use the determined path
-                "sample_type": [sample_type],
-                "size": [int(ddaobj.features.size())],
-                "map_id": [map_id_value],
-                "file_source": [getattr(ddaobj, "file_source", file)],
-                "ms1": [ms1_count],
-                "ms2": [ms2_count],
-                "sample_color": [None],  # Will be set by set_sample_color below
-            },
-            schema={
-                "sample_uid": pl.Int64,
-                "sample_name": pl.Utf8,
-                "sample_path": pl.Utf8,
-                "sample_type": pl.Utf8,
-                "size": pl.Int64,
-                "map_id": pl.Utf8,
-                "file_source": pl.Utf8,
-                "ms1": pl.Int64,
-                "ms2": pl.Int64,
-                "sample_color": pl.Utf8,
-            },
-        )
-        self.samples_df = pl.concat([self.samples_df, new_sample])
-
-        # Optimized DataFrame operations - chain operations instead of multiple clones
-        columns_to_add = [
-            pl.lit(len(self.samples_df)).alias("sample_uid"),
-            pl.lit(False).alias("filled"),
-            pl.lit(-1.0).alias("chrom_area"),
-        ]
-
-        # Only add rt_original if it doesn't exist
-        if "rt_original" not in ddaobj.features_df.columns:
-            columns_to_add.append(pl.col("rt").alias("rt_original"))
-
-        f_df = ddaobj.features_df.with_columns(columns_to_add)
-
-        if self.features_df.is_empty():
-            # Create new features_df with feature_uid column
-            self.features_df = f_df.with_columns(
-                pl.int_range(pl.len()).add(1).alias("feature_uid"),
-            ).select(
-                ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+    def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+        """
+        Add a single sample to the study.
+
+        Args:
+            file (str): Path to the sample file
+            type (str, optional): File type to force. Defaults to None (auto-detect).
+            reset (bool, optional): Whether to reset the study. Defaults to False.
+            adducts (optional): Adducts to use for sample loading. Defaults to None.
+            fast (bool, optional): Whether to use optimized loading that skips ms1_df
+                for better performance. Defaults to True.
+
+        Returns:
+            bool: True if successful, False otherwise.
+        """
+        if fast:
+            # Use optimized method for better performance
+            success = self._add_sample_optimized(
+                file,
+                type=type,
+                reset=reset,
+                adducts=adducts,
+                skip_color_reset=False,  # Do color reset for individual calls
+                skip_schema_check=True  # Skip schema check for performance (safe with diagonal concat)
             )
-            # Ensure column order matches schema from the very beginning
-            self._ensure_features_df_schema_order()
         else:
-            # Append to the existing features_df with a new feature_uid offset
-            offset = self.features_df["feature_uid"].max() + 1
-            f_df = f_df.with_columns(
-                pl.int_range(pl.len())
-                .add(offset)
-                .alias("feature_uid"),
+            # Use standard method with full ms1_df loading
+            success = self._add_sample_standard(
+                file,
+                type=type,
+                reset=reset,
+                adducts=adducts,
+                skip_color_reset=False,  # Do color reset for individual calls
+                skip_schema_check=True  # Skip schema check for performance
             )
-
-            # Reorganize f_df columns to match self.features_df column order and schema
-            target_columns = self.features_df.columns
-            target_schema = self.features_df.schema
-            f_df_columns = f_df.columns
-
-            # Create select expressions for reordering and type casting
-            select_exprs = []
-            for col in target_columns:
-                if col in f_df_columns:
-                    # Cast to the expected type
-                    expected_dtype = target_schema[col]
-                    select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
-                else:
-                    # Add missing columns with null values of the correct type
-                    expected_dtype = target_schema[col]
-                    select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
-
-            # Add any extra columns from f_df that aren't in target_columns (keep their original types)
-            for col in f_df_columns:
-                if col not in target_columns:
-                    select_exprs.append(pl.col(col))
-
-            # Reorder and type-cast f_df columns
-            f_df = f_df.select(select_exprs)
-
-            self.features_df = pl.concat([self.features_df, f_df])
-
-            # Ensure features_df column order matches schema
-            self._ensure_features_df_schema_order()
 
-
-        self.sample_color_reset()
-
-        self.logger.debug(
-            f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
-        )
+        return success
 
 
     def load(self, filename=None):
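Note: add_sample is now a thin dispatcher and returns a bool instead of None. A hedged sketch of calling it, assuming an existing study object (method names are taken from this diff):

    # fast=True routes to _add_sample_optimized (skips ms1_df, diagonal concat);
    # fast=False routes to _add_sample_standard (full load, including ms1_df).
    if not study.add_sample("raw_data/qc_run1.mzML", fast=True):
        print("sample rejected: duplicate name, missing file, or unsupported type")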
@@ -931,8 +791,6 @@ def _fill_chrom_impl(
         })
 
         total_missing = len(missing_combinations_df)
-        total_samples = len(samples_to_process)
-
         self.logger.debug(
             f"Gap filling for {total_missing} missing features...",
         )
@@ -1289,3 +1147,506 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
         self.consensus_map = oms.ConsensusMap()
         fh.load(filename, self.consensus_map)
         self.logger.debug(f"Loaded consensus map from {filename}.")
+
+    def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+        """
+        Optimized batch addition of samples.
+
+        Args:
+            files (list): List of file paths to process
+            reset (bool): Whether to reset features before processing
+            adducts: Adducts to use for sample loading
+            blacklist (set): Set of filenames already processed
+            fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
+
+        Performance optimizations:
+        1. No per-sample color reset
+        2. No schema enforcement during addition
+        3. Simplified DataFrame operations
+        4. Batch progress reporting
+        """
+        if not files:
+            return 0
+
+        if blacklist is None:
+            blacklist = set()
+
+        self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
+
+        successful_additions = 0
+        failed_additions = 0
+
+        # Progress reporting setup
+        tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+        for i, file in enumerate(
+            tqdm(
+                files,
+                total=len(files),
+                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
+                disable=tqdm_disable,
+            )
+        ):
+            try:
+                # Choose between optimized and standard loading
+                if fast:
+                    success = self._add_sample_optimized(
+                        file,
+                        reset=reset,
+                        adducts=adducts,
+                        skip_color_reset=True,  # Skip color reset during batch
+                        skip_schema_check=True  # Skip schema enforcement
+                    )
+                else:
+                    success = self._add_sample_standard(
+                        file,
+                        reset=reset,
+                        adducts=adducts,
+                        skip_color_reset=True,  # Skip color reset during batch
+                        skip_schema_check=True  # Skip schema enforcement
+                    )
+
+                if success:
+                    # Add to blacklist for filename tracking
+                    basename = os.path.basename(file)
+                    filename_no_ext = os.path.splitext(basename)[0]
+                    blacklist.add(filename_no_ext)
+                    successful_additions += 1
+
+            except Exception as e:
+                self.logger.warning(f"Failed to add sample {file}: {e}")
+                failed_additions += 1
+                continue
+
+        # Final cleanup operations done once at the end
+        if successful_additions > 0:
+            self.logger.debug("Performing final batch cleanup...")
+
+            # Optional: Only do schema enforcement if specifically needed (usually not required)
+            # self._ensure_features_df_schema_order()
+
+            # Color assignment done once for all samples
+            self._sample_color_reset_optimized()
+
+        self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
+
+        return successful_additions
+
+    def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+        """
+        Optimized add_sample with performance improvements integrated.
+
+        Removes:
+        - Schema enforcement (_ensure_features_df_schema_order)
+        - Complex column alignment and type casting
+        - Per-addition color reset
+        - Unnecessary column reordering
+
+        Returns True if successful, False otherwise.
+        """
+        self.logger.debug(f"Adding: {file}")
+
+        # Basic validation
+        basename = os.path.basename(file)
+        sample_name = os.path.splitext(basename)[0]
+
+        if sample_name in self.samples_df["sample_name"].to_list():
+            self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+            return False
+
+        if not os.path.exists(file):
+            self.logger.error(f"File {file} does not exist.")
+            return False
+
+        if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+            self.logger.error(f"Unsupported file type: {file}")
+            return False
+
+        # Load sample
+        ddaobj = Sample()
+        ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+        # Use standard loading method temporarily to test if this fixes the astuple error
+        ddaobj.load(file)
+
+        if ddaobj.features_df is None and not reset:
+            ddaobj.features = None
+
+        if ddaobj.features is None or reset:
+            ddaobj.find_features()
+            ddaobj.find_adducts(adducts=adducts)
+            ddaobj.find_ms2()
+
+        self.features_maps.append(ddaobj.features)
+
+        # Determine sample type
+        sample_type = "sample" if type is None else type
+        if "qc" in sample_name.lower():
+            sample_type = "qc"
+        if "blank" in sample_name.lower():
+            sample_type = "blank"
+
+        map_id_value = len(self.features_maps) - 1
+
+        # Handle file paths
+        if file.endswith(".sample5"):
+            final_sample_path = file
+            self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+        else:
+            if self.folder is not None:
+                if not os.path.exists(self.folder):
+                    os.makedirs(self.folder)
+                final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+            else:
+                final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+            ddaobj.save(final_sample_path)
+            self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+        # Efficient scan counting
+        ms1_count = ms2_count = 0
+        if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+            scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+            ms_levels = scan_counts.get("ms_level", [])
+            counts = scan_counts.get("len", [])
+            for level, count in zip(ms_levels, counts):
+                if level == 1:
+                    ms1_count = count
+                elif level == 2:
+                    ms2_count = count
+
+        # Create sample entry
+        next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+        new_sample = pl.DataFrame({
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj.features.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        })
+
+        self.samples_df = pl.concat([self.samples_df, new_sample])
+
+        # SIMPLIFIED feature processing
+        current_sample_uid = len(self.samples_df) - 1
+
+        # Add required columns with minimal operations
+        columns_to_add = [
+            pl.lit(current_sample_uid).alias("sample_uid"),
+            pl.lit(False).alias("filled"),
+            pl.lit(-1.0).alias("chrom_area"),
+        ]
+
+        # Only add rt_original if it doesn't exist
+        if "rt_original" not in ddaobj.features_df.columns:
+            columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+        f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+        if self.features_df.is_empty():
+            # First sample
+            self.features_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(1).alias("feature_uid")
+            )
+        else:
+            # Subsequent samples - minimal overhead
+            offset = self.features_df["feature_uid"].max() + 1
+            f_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            )
+
+        # OPTIMIZED: Use diagonal concatenation without any schema enforcement
+        # This is the fastest concatenation method in Polars and handles type mismatches automatically
+        self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+        # REMOVED ALL EXPENSIVE OPERATIONS:
+        # - No _ensure_features_df_schema_order()
+        # - No complex column alignment
+        # - No type casting loops
+        # - No sample_color_reset()
+
+        self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
+        return True
+
+
+    def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+        """
+        Standard add_sample method that uses full sample loading (includes ms1_df).
+
+        This method uses the standard sample.load() method which loads all data
+        including ms1_df, providing full functionality but potentially slower performance
+        for large MS1 datasets.
+
+        Returns True if successful, False otherwise.
+        """
+        self.logger.debug(f"Adding (standard): {file}")
+
+        # Basic validation
+        basename = os.path.basename(file)
+        sample_name = os.path.splitext(basename)[0]
+
+        if sample_name in self.samples_df["sample_name"].to_list():
+            self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+            return False
+
+        if not os.path.exists(file):
+            self.logger.error(f"File {file} does not exist.")
+            return False
+
+        if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+            self.logger.error(f"Unsupported file type: {file}")
+            return False
+
+        # Load sample using standard method (includes ms1_df)
+        ddaobj = Sample()
+        ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+        # Use standard loading method that loads all data including ms1_df
+        ddaobj.load(file)
+
+        if ddaobj.features_df is None and not reset:
+            ddaobj.features = None
+
+        if ddaobj.features is None or reset:
+            ddaobj.find_features()
+            ddaobj.find_adducts(adducts=adducts)
+            ddaobj.find_ms2()
+
+        self.features_maps.append(ddaobj.features)
+
+        # Determine sample type
+        sample_type = "sample" if type is None else type
+        if "qc" in sample_name.lower():
+            sample_type = "qc"
+        if "blank" in sample_name.lower():
+            sample_type = "blank"
+
+        map_id_value = len(self.features_maps) - 1
+
+        # Handle file paths
+        if file.endswith(".sample5"):
+            final_sample_path = file
+            self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+        else:
+            if self.folder is not None:
+                if not os.path.exists(self.folder):
+                    os.makedirs(self.folder)
+                final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+            else:
+                final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+            ddaobj.save(final_sample_path)
+            self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+        # Efficient scan counting
+        ms1_count = ms2_count = 0
+        if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+            scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+            ms_levels = scan_counts.get("ms_level", [])
+            counts = scan_counts.get("len", [])
+            for level, count in zip(ms_levels, counts):
+                if level == 1:
+                    ms1_count = count
+                elif level == 2:
+                    ms2_count = count
+
+        # Create sample entry
+        next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+        new_sample = pl.DataFrame({
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj.features.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        })
+
+        self.samples_df = pl.concat([self.samples_df, new_sample])
+
+        # SIMPLIFIED feature processing
+        current_sample_uid = len(self.samples_df) - 1
+
+        # Add required columns with minimal operations
+        columns_to_add = [
+            pl.lit(current_sample_uid).alias("sample_uid"),
+            pl.lit(False).alias("filled"),
+            pl.lit(-1.0).alias("chrom_area"),
+        ]
+
+        # Only add rt_original if it doesn't exist
+        if "rt_original" not in ddaobj.features_df.columns:
+            columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+        f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+        if self.features_df.is_empty():
+            # First sample
+            self.features_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(1).alias("feature_uid")
+            )
+        else:
+            # Subsequent samples - minimal overhead
+            offset = self.features_df["feature_uid"].max() + 1
+            f_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            )
+
+        # Use diagonal concatenation for flexibility
+        self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+        self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+        return True
+        # Use standard loading method that loads all data including ms1_df
+        ddaobj.load(file)
+
+        if ddaobj.features_df is None and not reset:
+            ddaobj.features = None
+
+        if ddaobj.features is None or reset:
+            ddaobj.find_features()
+            ddaobj.find_adducts(adducts=adducts)
+            ddaobj.find_ms2()
+
+        self.features_maps.append(ddaobj.features)
+
+        # Determine sample type
+        sample_type = "sample" if type is None else type
+        if "qc" in sample_name.lower():
+            sample_type = "qc"
+        if "blank" in sample_name.lower():
+            sample_type = "blank"
+
+        map_id_value = len(self.features_maps) - 1
+
+        # Handle file paths
+        if file.endswith(".sample5"):
+            final_sample_path = file
+            self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+        else:
+            if self.folder is not None:
+                if not os.path.exists(self.folder):
+                    os.makedirs(self.folder)
+                final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+            else:
+                final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+            ddaobj.save(final_sample_path)
+            self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+        # Efficient scan counting
+        ms1_count = ms2_count = 0
+        if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+            scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+            ms_levels = scan_counts.get("ms_level", [])
+            counts = scan_counts.get("len", [])
+            for level, count in zip(ms_levels, counts):
+                if level == 1:
+                    ms1_count = count
+                elif level == 2:
+                    ms2_count = count
+
+        # Create sample entry
+        next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+        new_sample = pl.DataFrame({
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj.features.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        })
+
+        self.samples_df = pl.concat([self.samples_df, new_sample])
+
+        # SIMPLIFIED feature processing
+        current_sample_uid = len(self.samples_df) - 1
+
+        # Add required columns with minimal operations
+        columns_to_add = [
+            pl.lit(current_sample_uid).alias("sample_uid"),
+            pl.lit(False).alias("filled"),
+            pl.lit(-1.0).alias("chrom_area"),
+        ]
+
+        # Only add rt_original if it doesn't exist
+        if "rt_original" not in ddaobj.features_df.columns:
+            columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+        f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+        if self.features_df.is_empty():
+            # First sample
+            self.features_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(1).alias("feature_uid")
+            )
+        else:
+            # Subsequent samples - minimal overhead
+            offset = self.features_df["feature_uid"].max() + 1
+            f_df = f_df.with_columns(
+                pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            )
+
+        # Use diagonal concatenation for flexibility
+        self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+        self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+        return True
+
+
+    def _sample_color_reset_optimized(self):
+        """
+        Optimized version of sample_color_reset that caches colormap initialization.
+        """
+        if self.samples_df is None or len(self.samples_df) == 0:
+            self.logger.warning("No samples found in study.")
+            return
+
+        # Cache the colormap if not already cached
+        if not hasattr(self, '_cached_colormap'):
+            try:
+                from cmap import Colormap
+                self._cached_colormap = Colormap('turbo')
+            except ImportError:
+                self.logger.warning("cmap package not available, using default colors")
+                return
+
+        cm = self._cached_colormap
+        n_samples = len(self.samples_df)
+
+        # Pre-allocate colors list for better performance
+        colors = [None] * n_samples
+
+        # Vectorized color generation
+        for i in range(n_samples):
+            normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
+            color_rgba = cm(normalized_value)
+
+            if len(color_rgba) >= 3:
+                r, g, b = color_rgba[:3]
+                if max(color_rgba[:3]) <= 1.0:
+                    r, g, b = int(r * 255), int(g * 255), int(b * 255)
+                colors[i] = f"#{r:02x}{g:02x}{b:02x}"
+
+        # Update the sample_color column efficiently
+        self.samples_df = self.samples_df.with_columns(
+            pl.Series("sample_color", colors).alias("sample_color")
+        )
+
+        self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
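Note: both new helpers append features via Polars diagonal concatenation, which unions columns by name and fills gaps with nulls instead of requiring identical schemas. A self-contained demonstration, independent of masster:

    import polars as pl

    a = pl.DataFrame({"feature_uid": [1, 2], "mz": [100.1, 200.2]})
    b = pl.DataFrame({"feature_uid": [3], "mz": [300.3], "rt_original": [12.5]})

    # Columns are aligned by name; rows from `a` get a null rt_original.
    print(pl.concat([a, b], how="diagonal"))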
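Note: the scan counting in the new helpers replaces the old pair of ms_level filters with a single group_by pass. The same pattern on a toy scans_df:

    import polars as pl

    scans_df = pl.DataFrame({"ms_level": [1, 1, 2, 2, 2]})
    counts = scans_df.group_by("ms_level").len().to_dict(as_series=False)
    by_level = dict(zip(counts["ms_level"], counts["len"]))
    print(by_level.get(1, 0), by_level.get(2, 0))  # 2 3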
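Note: _sample_color_reset_optimized maps sample i to colormap position 0.1 + ((i + 0.5) / n_samples) * 0.8, spreading samples over the middle 80% of the 'turbo' colormap and avoiding its dark ends. A sketch of the position math and float-to-hex conversion without the cmap dependency (the RGB triples below are placeholders, not real 'turbo' lookups):

    def to_hex(rgb):
        # rgb: floats in [0, 1], as a colormap lookup would return
        r, g, b = (int(c * 255) for c in rgb[:3])
        return f"#{r:02x}{g:02x}{b:02x}"

    n_samples = 4
    for i in range(n_samples):
        pos = 0.1 + ((i + 0.5) / n_samples) * 0.8   # 0.2, 0.4, 0.6, 0.8
        print(round(pos, 2), to_hex((pos, 0.5, 1 - pos)))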