masster 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +577 -0
- masster/sample/load.py +57 -0
- masster/sample/sample.py +4 -0
- masster/spectrum.py +3 -0
- masster/study/defaults/align_def.py +9 -0
- masster/study/defaults/fill_def.py +3 -3
- masster/study/export.py +3 -0
- masster/study/load.py +653 -258
- masster/study/processing.py +149 -120
- masster/study/study.py +8 -0
- masster/study/study5_schema.json +3 -0
- {masster-0.3.15.dist-info → masster-0.3.17.dist-info}/METADATA +1 -1
- {masster-0.3.15.dist-info → masster-0.3.17.dist-info}/RECORD +17 -17
- {masster-0.3.15.dist-info → masster-0.3.17.dist-info}/WHEEL +0 -0
- {masster-0.3.15.dist-info → masster-0.3.17.dist-info}/entry_points.txt +0 -0
- {masster-0.3.15.dist-info → masster-0.3.17.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
```diff
@@ -40,7 +40,21 @@ def add(
     reset=False,
     adducts=None,
     max_files=None,
+    fast=True,
 ):
+    """Add samples from a folder to the study.
+
+    Args:
+        folder (str, optional): Path to folder containing sample files.
+            Defaults to study folder or current working directory.
+        reset (bool, optional): Whether to reset the study before adding samples.
+            Defaults to False.
+        adducts (optional): Adducts to use for sample loading. Defaults to None.
+        max_files (int, optional): Maximum number of files to process.
+            Defaults to None (no limit).
+        fast (bool, optional): Whether to use optimized loading that skips ms1_df
+            for better performance. Defaults to True.
+    """
     if folder is None:
         if self.folder is not None:
             folder = self.folder
```
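The new `fast` flag threads through the whole import path and defaults to True, so folder imports skip per-sample `ms1_df` loading unless the caller opts out. A minimal usage sketch (the `Study` import path and constructor arguments are assumptions for illustration, not taken from this diff):

```python
from masster import Study  # assumed import; adjust to the actual package layout

study = Study(folder="./my_study")            # hypothetical constructor call
study.add(folder="./raw_data", max_files=10)  # fast=True by default, skips ms1_df
study.add(folder="./raw_data", fast=False)    # opt back into full ms1_df loading
```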
```diff
@@ -85,39 +99,29 @@ def add(
 
         self.logger.debug(f"Found {len(files)} {ext} files")
 
-        #
-
-
-                files,
-                total=len(files),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
-                disable=tdqm_disable,
-            ),
-        ):
+        # Filter files not already processed and respect max_files limit
+        files_to_process = []
+        for file in files:
             if max_files is not None and counter >= max_files:
                 break
-
+
             # Get filename without extension for blacklist check
             basename = os.path.basename(file)
             filename_no_ext = os.path.splitext(basename)[0]
-
+
             # Check if this filename (without extension) is already in blacklist
-            if filename_no_ext in blacklist:
-
-
-
-
-
-
-
-
-
-
-                counter += 1
+            if filename_no_ext not in blacklist:
+                files_to_process.append(file)
+                if len(files_to_process) + counter >= (max_files or float('inf')):
+                    break
+
+        # Batch process all files of this extension using ultra-optimized method
+        if files_to_process:
+            self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+            successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+            counter += successful
+            if successful > 0:
                 not_zero = True
-            except Exception as e:
-                self.logger.warning(f"Failed to add sample {file}: {e}")
-                continue
 
         if max_files is not None and counter >= max_files:
             self.logger.debug(
```
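The rewritten loop only collects candidate paths; all loading now happens in one `_add_samples_batch` call per extension. Note the `(max_files or float('inf'))` cap: `None` is falsy in Python, so "no limit" becomes infinity. A self-contained sketch of the same filtering pass (the helper name `select_files` is made up for illustration):

```python
import os

def select_files(files, blacklist, counter=0, max_files=None):
    # Mirrors the new filter: skip names already processed, cap the batch size.
    files_to_process = []
    for file in files:
        name = os.path.splitext(os.path.basename(file))[0]
        if name not in blacklist:
            files_to_process.append(file)
            # None is falsy, so `None or float('inf')` disables the cap
            if len(files_to_process) + counter >= (max_files or float('inf')):
                break
    return files_to_process

print(select_files(["a.mzML", "b.mzML", "c.mzML"], blacklist={"b"}, max_files=2))
# -> ['a.mzML', 'c.mzML']
```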
```diff
@@ -133,198 +137,43 @@ def add(
 
 
 # TODO type is not used
-def add_sample(self, file, type=None, reset=False, adducts=None):
-
-
-    # Extract sample name by removing any known extension
-    basename = os.path.basename(file)
-    sample_name = os.path.splitext(basename)[0]
-
-    # check if sample_name is already in the samples_df
-    if sample_name in self.samples_df["sample_name"].to_list():
-        self.logger.warning(
-            f"Sample {sample_name} already exists in the study. Skipping.",
-        )
-        return
-
-    # check if file exists
-    if not os.path.exists(file):
-        self.logger.error(f"File {file} does not exist.")
-        return
-
-    # Check for supported file extensions
-    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-        self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
-        return
-
-    # Load the sample based on file type
-    ddaobj = Sample()
-    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
-    if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-        ddaobj.load(file)
-    else:
-        self.logger.error(f"Unsupported file format: {file}")
-        return
-    if ddaobj.features_df is None and not reset:
-        self.logger.debug(
-            f"File {file} will be newly processed.",
-        )
-        ddaobj.features = None
-
-    if ddaobj.features is None or reset:
-        ddaobj.find_features()
-        ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
-
-    self.features_maps.append(ddaobj.features)
-
-    sample_type = "sample" if type is None else type
-    if "qc" in sample_name.lower():
-        sample_type = "qc"
-    if "blank" in sample_name.lower():
-        sample_type = "blank"
+def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+    """
+    Add a single sample to the study.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
-            ddaobj.save(final_sample_path)
-            self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
-        else:
-            # If no study folder is set, save in current directory
-            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
-            ddaobj.save(final_sample_path)
-            self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
-
-    # Count MS1 and MS2 scans from the loaded sample
-    ms1_count = 0
-    ms2_count = 0
-    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-        ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
-        ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
-
-    # Calculate next sequence number
-    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],  # Use the determined path
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set by set_sample_color below
-            "sample_group": [""],  # Default empty string
-            "sample_batch": [1],  # Default batch 1
-            "sample_sequence": [next_sequence],  # Increasing sequence number
-            "num_features": [int(ddaobj.features.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-        schema={
-            "sample_uid": pl.Int64,
-            "sample_name": pl.Utf8,
-            "sample_path": pl.Utf8,
-            "sample_type": pl.Utf8,
-            "map_id": pl.Int64,
-            "sample_source": pl.Utf8,
-            "sample_color": pl.Utf8,
-            "sample_group": pl.Utf8,
-            "sample_batch": pl.Int64,
-            "sample_sequence": pl.Int64,
-            "num_features": pl.Int64,
-            "num_ms1": pl.Int64,
-            "num_ms2": pl.Int64,
-        },
-    )
-    self.samples_df = pl.concat([self.samples_df, new_sample])
-
-    # Optimized DataFrame operations - chain operations instead of multiple clones
-    columns_to_add = [
-        pl.lit(len(self.samples_df)).alias("sample_uid"),
-        pl.lit(False).alias("filled"),
-        pl.lit(-1.0).alias("chrom_area"),
-    ]
-
-    # Only add rt_original if it doesn't exist
-    if "rt_original" not in ddaobj.features_df.columns:
-        columns_to_add.append(pl.col("rt").alias("rt_original"))
-
-    f_df = ddaobj.features_df.with_columns(columns_to_add)
-
-    if self.features_df.is_empty():
-        # Create new features_df with feature_uid column
-        self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid"),
-        ).select(
-            ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+    Args:
+        file (str): Path to the sample file
+        type (str, optional): File type to force. Defaults to None (auto-detect).
+        reset (bool, optional): Whether to reset the study. Defaults to False.
+        adducts (optional): Adducts to use for sample loading. Defaults to None.
+        fast (bool, optional): Whether to use optimized loading that skips ms1_df
+            for better performance. Defaults to True.
+
+    Returns:
+        bool: True if successful, False otherwise.
+    """
+    if fast:
+        # Use optimized method for better performance
+        success = self._add_sample_optimized(
+            file,
+            type=type,
+            reset=reset,
+            adducts=adducts,
+            skip_color_reset=False,  # Do color reset for individual calls
+            skip_schema_check=True  # Skip schema check for performance (safe with diagonal concat)
         )
-        # Ensure column order matches schema from the very beginning
-        self._ensure_features_df_schema_order()
     else:
-
-
-
-
-
-
+        # Use standard method with full ms1_df loading
+        success = self._add_sample_standard(
+            file,
+            type=type,
+            reset=reset,
+            adducts=adducts,
+            skip_color_reset=False,  # Do color reset for individual calls
+            skip_schema_check=True  # Skip schema check for performance
        )
-
-        # Reorganize f_df columns to match self.features_df column order and schema
-        target_columns = self.features_df.columns
-        target_schema = self.features_df.schema
-        f_df_columns = f_df.columns
-
-        # Create select expressions for reordering and type casting
-        select_exprs = []
-        for col in target_columns:
-            if col in f_df_columns:
-                # Cast to the expected type
-                expected_dtype = target_schema[col]
-                select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
-            else:
-                # Add missing columns with null values of the correct type
-                expected_dtype = target_schema[col]
-                select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
-
-        # Add any extra columns from f_df that aren't in target_columns (keep their original types)
-        for col in f_df_columns:
-            if col not in target_columns:
-                select_exprs.append(pl.col(col))
-
-        # Reorder and type-cast f_df columns
-        f_df = f_df.select(select_exprs)
-
-        self.features_df = pl.concat([self.features_df, f_df])
-
-    # Ensure features_df column order matches schema
-    self._ensure_features_df_schema_order()
-
-    # Auto-assign colors when new sample is added (reset all colors using turbo colormap based on UID)
-    self.sample_color_reset()
 
-
-        f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
-    )
+    return success
 
 
 def load(self, filename=None):
```
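`add_sample` is now a thin dispatcher that returns a bool instead of doing the load, save, and DataFrame bookkeeping inline; the two private helpers it delegates to are defined further down in this file. Continuing the sketch above (file names are illustrative; per the helpers below, the sample type is still inferred from "qc"/"blank" substrings in the file name):

```python
ok = study.add_sample("qc_pool_01.mzML")      # fast path: skips ms1_df, typed as "qc"
if not ok:
    print("sample rejected, unsupported, or already present")

study.add_sample("plasma_A.raw", fast=False)  # standard path: full ms1_df loading
```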
```diff
@@ -942,8 +791,6 @@ def _fill_chrom_impl(
         })
 
         total_missing = len(missing_combinations_df)
-        total_samples = len(samples_to_process)
-
         self.logger.debug(
             f"Gap filling for {total_missing} missing features...",
         )
```
```diff
@@ -1114,51 +961,96 @@ def _get_missing_consensus_sample_combinations(self, uids):
     """
     Efficiently identify which consensus_uid/sample combinations are missing.
     Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
+
+    Optimized for common scenarios:
+    - Early termination for fully-filled studies
+    - Efficient dictionary lookups instead of expensive DataFrame joins
+    - Smart handling of sparse vs dense missing data patterns
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Get existing consensus/sample combinations from consensus_mapping_df
-    existing_combinations = set()
-    consensus_mapping_filtered = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(list(consensus_uids_set)),
-    )
-
-    # Join with features_df to get sample_uid information
-    existing_features = consensus_mapping_filtered.join(
-        self.features_df.select(["feature_uid", "sample_uid"]),
-        on="feature_uid",
-        how="inner",
+    if not uids:
+        return []
+
+    n_consensus = len(uids)
+    n_samples = len(self.samples_df)
+    total_possible = n_consensus * n_samples
+
+    # Quick early termination check for fully/nearly filled studies
+    # This handles the common case where fill() is run on an already-filled study
+    consensus_counts = (
+        self.consensus_mapping_df
+        .filter(pl.col("consensus_uid").is_in(uids))
+        .group_by("consensus_uid")
+        .agg(pl.count("feature_uid").alias("count"))
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+
+    # If >95% filled, likely no gaps (common case)
+    if total_existing >= total_possible * 0.95:
+        self.logger.debug(f"Study appears {total_existing/total_possible*100:.1f}% filled, using sparse optimization")
+
+        # For sparse missing data, check each consensus feature individually
+        missing_combinations = []
+        uids_set = set(uids)
+
+        # Build efficient lookups
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+
+        # Get existing combinations for target UIDs only
+        existing_by_consensus = {}
+        for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows():
+            if consensus_uid in uids_set and feature_uid in feature_to_sample:
+                if consensus_uid not in existing_by_consensus:
+                    existing_by_consensus[consensus_uid] = set()
+                existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
+
+        # Get sample info once
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+
+        # Check for missing combinations
+        for consensus_uid in uids:
+            existing_samples = existing_by_consensus.get(consensus_uid, set())
+            for sample_uid, sample_name, sample_path in all_samples:
+                if sample_uid not in existing_samples:
+                    missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+
+        return missing_combinations
+
+    else:
+        # For studies with many gaps, use bulk operations
+        self.logger.debug(f"Study {total_existing/total_possible*100:.1f}% filled, using bulk optimization")
+
+        # Build efficient lookups
+        uids_set = set(uids)
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+
+        # Build existing combinations set
+        existing_combinations = {
+            (consensus_uid, feature_to_sample[feature_uid])
+            for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows()
+            if consensus_uid in uids_set and feature_uid in feature_to_sample
+        }
+
+        # Get all sample info
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+
+        # Generate all missing combinations
+        missing_combinations = [
+            (consensus_uid, sample_uid, sample_name, sample_path)
+            for consensus_uid in uids
+            for sample_uid, sample_name, sample_path in all_samples
+            if (consensus_uid, sample_uid) not in existing_combinations
+        ]
+
+        return missing_combinations
 
 
 def sanitize(self):
```
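Both branches replace the old DataFrame join with plain Python set arithmetic over `(consensus_uid, sample_uid)` pairs. A self-contained toy reproduction of the bulk branch, with made-up UIDs and paths:

```python
# feature_uid -> sample_uid, as built from features_df
feature_to_sample = {101: 1, 102: 2, 103: 1}
# (consensus_uid, feature_uid) pairs, as in consensus_mapping_df
consensus_mapping = [(10, 101), (10, 102), (11, 103)]
samples = [(1, "s1", "/data/s1.sample5"), (2, "s2", "/data/s2.sample5")]
uids = [10, 11]

uids_set = set(uids)
existing = {
    (c_uid, feature_to_sample[f_uid])
    for c_uid, f_uid in consensus_mapping
    if c_uid in uids_set and f_uid in feature_to_sample
}
missing = [
    (c_uid, s_uid, name, path)
    for c_uid in uids
    for s_uid, name, path in samples
    if (c_uid, s_uid) not in existing
]
print(missing)  # [(11, 2, 's2', '/data/s2.sample5')]
```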
```diff
@@ -1300,3 +1192,506 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
+
+def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+    """
+    Optimized batch addition of samples.
+
+    Args:
+        files (list): List of file paths to process
+        reset (bool): Whether to reset features before processing
+        adducts: Adducts to use for sample loading
+        blacklist (set): Set of filenames already processed
+        fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
+
+    Performance optimizations:
+    1. No per-sample color reset
+    2. No schema enforcement during addition
+    3. Simplified DataFrame operations
+    4. Batch progress reporting
+    """
+    if not files:
+        return 0
+
+    if blacklist is None:
+        blacklist = set()
+
+    self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
+
+    successful_additions = 0
+    failed_additions = 0
+
+    # Progress reporting setup
+    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    for i, file in enumerate(
+        tqdm(
+            files,
+            total=len(files),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
+            disable=tqdm_disable,
+        )
+    ):
+        try:
+            # Choose between optimized and standard loading
+            if fast:
+                success = self._add_sample_optimized(
+                    file,
+                    reset=reset,
+                    adducts=adducts,
+                    skip_color_reset=True,  # Skip color reset during batch
+                    skip_schema_check=True  # Skip schema enforcement
+                )
+            else:
+                success = self._add_sample_standard(
+                    file,
+                    reset=reset,
+                    adducts=adducts,
+                    skip_color_reset=True,  # Skip color reset during batch
+                    skip_schema_check=True  # Skip schema enforcement
+                )
+
+            if success:
+                # Add to blacklist for filename tracking
+                basename = os.path.basename(file)
+                filename_no_ext = os.path.splitext(basename)[0]
+                blacklist.add(filename_no_ext)
+                successful_additions += 1
+
+        except Exception as e:
+            self.logger.warning(f"Failed to add sample {file}: {e}")
+            failed_additions += 1
+            continue
+
+    # Final cleanup operations done once at the end
+    if successful_additions > 0:
+        self.logger.debug("Performing final batch cleanup...")
+
+        # Optional: Only do schema enforcement if specifically needed (usually not required)
+        # self._ensure_features_df_schema_order()
+
+        # Color assignment done once for all samples
+        self._sample_color_reset_optimized()
+
+    self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
+
+    return successful_additions
+
+def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+    """
+    Optimized add_sample with performance improvements integrated.
+
+    Removes:
+    - Schema enforcement (_ensure_features_df_schema_order)
+    - Complex column alignment and type casting
+    - Per-addition color reset
+    - Unnecessary column reordering
+
+    Returns True if successful, False otherwise.
+    """
+    self.logger.debug(f"Adding: {file}")
+
+    # Basic validation
+    basename = os.path.basename(file)
+    sample_name = os.path.splitext(basename)[0]
+
+    if sample_name in self.samples_df["sample_name"].to_list():
+        self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+        return False
+
+    if not os.path.exists(file):
+        self.logger.error(f"File {file} does not exist.")
+        return False
+
+    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        self.logger.error(f"Unsupported file type: {file}")
+        return False
+
+    # Load sample
+    ddaobj = Sample()
+    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+    # Use standard loading method temporarily to test if this fixes the astuple error
+    ddaobj.load(file)
+
+    if ddaobj.features_df is None and not reset:
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    # Determine sample type
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+
+    map_id_value = len(self.features_maps) - 1
+
+    # Handle file paths
+    if file.endswith(".sample5"):
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+    else:
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+        else:
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+        ddaobj.save(final_sample_path)
+        self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+    # Efficient scan counting
+    ms1_count = ms2_count = 0
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        ms_levels = scan_counts.get("ms_level", [])
+        counts = scan_counts.get("len", [])
+        for level, count in zip(ms_levels, counts):
+            if level == 1:
+                ms1_count = count
+            elif level == 2:
+                ms2_count = count
+
+    # Create sample entry
+    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj.features.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })
+
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # SIMPLIFIED feature processing
+    current_sample_uid = len(self.samples_df) - 1
+
+    # Add required columns with minimal operations
+    columns_to_add = [
+        pl.lit(current_sample_uid).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # First sample
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid")
+        )
+    else:
+        # Subsequent samples - minimal overhead
+        offset = self.features_df["feature_uid"].max() + 1
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+        )
+
+    # OPTIMIZED: Use diagonal concatenation without any schema enforcement
+    # This is the fastest concatenation method in Polars and handles type mismatches automatically
+    self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+    # REMOVED ALL EXPENSIVE OPERATIONS:
+    # - No _ensure_features_df_schema_order()
+    # - No complex column alignment
+    # - No type casting loops
+    # - No sample_color_reset()
+
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
+    return True
+
+
+def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+    """
+    Standard add_sample method that uses full sample loading (includes ms1_df).
+
+    This method uses the standard sample.load() method which loads all data
+    including ms1_df, providing full functionality but potentially slower performance
+    for large MS1 datasets.
+
+    Returns True if successful, False otherwise.
+    """
+    self.logger.debug(f"Adding (standard): {file}")
+
+    # Basic validation
+    basename = os.path.basename(file)
+    sample_name = os.path.splitext(basename)[0]
+
+    if sample_name in self.samples_df["sample_name"].to_list():
+        self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+        return False
+
+    if not os.path.exists(file):
+        self.logger.error(f"File {file} does not exist.")
+        return False
+
+    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        self.logger.error(f"Unsupported file type: {file}")
+        return False
+
+    # Load sample using standard method (includes ms1_df)
+    ddaobj = Sample()
+    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+    # Use standard loading method that loads all data including ms1_df
+    ddaobj.load(file)
+
+    if ddaobj.features_df is None and not reset:
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    # Determine sample type
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+
+    map_id_value = len(self.features_maps) - 1
+
+    # Handle file paths
+    if file.endswith(".sample5"):
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+    else:
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+        else:
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+        ddaobj.save(final_sample_path)
+        self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+    # Efficient scan counting
+    ms1_count = ms2_count = 0
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        ms_levels = scan_counts.get("ms_level", [])
+        counts = scan_counts.get("len", [])
+        for level, count in zip(ms_levels, counts):
+            if level == 1:
+                ms1_count = count
+            elif level == 2:
+                ms2_count = count
+
+    # Create sample entry
+    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj.features.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })
+
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # SIMPLIFIED feature processing
+    current_sample_uid = len(self.samples_df) - 1
+
+    # Add required columns with minimal operations
+    columns_to_add = [
+        pl.lit(current_sample_uid).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # First sample
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid")
+        )
+    else:
+        # Subsequent samples - minimal overhead
+        offset = self.features_df["feature_uid"].max() + 1
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+        )
+
+    # Use diagonal concatenation for flexibility
+    self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+    return True
+    # Use standard loading method that loads all data including ms1_df
+    ddaobj.load(file)
+
+    if ddaobj.features_df is None and not reset:
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    # Determine sample type
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+
+    map_id_value = len(self.features_maps) - 1
+
+    # Handle file paths
+    if file.endswith(".sample5"):
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+    else:
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+        else:
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+        ddaobj.save(final_sample_path)
+        self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+    # Efficient scan counting
+    ms1_count = ms2_count = 0
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        ms_levels = scan_counts.get("ms_level", [])
+        counts = scan_counts.get("len", [])
+        for level, count in zip(ms_levels, counts):
+            if level == 1:
+                ms1_count = count
+            elif level == 2:
+                ms2_count = count
+
+    # Create sample entry
+    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj.features.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })
+
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # SIMPLIFIED feature processing
+    current_sample_uid = len(self.samples_df) - 1
+
+    # Add required columns with minimal operations
+    columns_to_add = [
+        pl.lit(current_sample_uid).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # First sample
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid")
+        )
+    else:
+        # Subsequent samples - minimal overhead
+        offset = self.features_df["feature_uid"].max() + 1
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+        )
+
+    # Use diagonal concatenation for flexibility
+    self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+    return True
+
+
+def _sample_color_reset_optimized(self):
+    """
+    Optimized version of sample_color_reset that caches colormap initialization.
+    """
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    # Cache the colormap if not already cached
+    if not hasattr(self, '_cached_colormap'):
+        try:
+            from cmap import Colormap
+            self._cached_colormap = Colormap('turbo')
+        except ImportError:
+            self.logger.warning("cmap package not available, using default colors")
+            return
+
+    cm = self._cached_colormap
+    n_samples = len(self.samples_df)
+
+    # Pre-allocate colors list for better performance
+    colors = [None] * n_samples
+
+    # Vectorized color generation
+    for i in range(n_samples):
+        normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
+        color_rgba = cm(normalized_value)
+
+        if len(color_rgba) >= 3:
+            r, g, b = color_rgba[:3]
+            if max(color_rgba[:3]) <= 1.0:
+                r, g, b = int(r * 255), int(g * 255), int(b * 255)
+            colors[i] = f"#{r:02x}{g:02x}{b:02x}"
+
+    # Update the sample_color column efficiently
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_color", colors).alias("sample_color")
+    )
+
+    self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
```