masster 0.3.15-py3-none-any.whl → 0.3.17-py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Potentially problematic release.


This version of masster might be problematic.

masster/study/load.py CHANGED
@@ -40,7 +40,21 @@ def add(
  reset=False,
  adducts=None,
  max_files=None,
+ fast=True,
  ):
+ """Add samples from a folder to the study.
+
+ Args:
+ folder (str, optional): Path to folder containing sample files.
+ Defaults to study folder or current working directory.
+ reset (bool, optional): Whether to reset the study before adding samples.
+ Defaults to False.
+ adducts (optional): Adducts to use for sample loading. Defaults to None.
+ max_files (int, optional): Maximum number of files to process.
+ Defaults to None (no limit).
+ fast (bool, optional): Whether to use optimized loading that skips ms1_df
+ for better performance. Defaults to True.
+ """
  if folder is None:
  if self.folder is not None:
  folder = self.folder
@@ -85,39 +99,29 @@ def add(

  self.logger.debug(f"Found {len(files)} {ext} files")

- # Process files
- for i, file in enumerate(
- tqdm(
- files,
- total=len(files),
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
- disable=tdqm_disable,
- ),
- ):
+ # Filter files not already processed and respect max_files limit
+ files_to_process = []
+ for file in files:
  if max_files is not None and counter >= max_files:
  break
-
+
  # Get filename without extension for blacklist check
  basename = os.path.basename(file)
  filename_no_ext = os.path.splitext(basename)[0]
-
+
  # Check if this filename (without extension) is already in blacklist
- if filename_no_ext in blacklist:
- self.logger.debug(f"Skipping {file} - filename already processed")
- continue
-
- self.logger.debug(f"Add file {counter + 1}: {file}")
-
- # Try to add the sample
- try:
- self.add_sample(file=file, reset=reset, adducts=adducts)
- # If successful, add to blacklist and increment counter
- blacklist.add(filename_no_ext)
- counter += 1
+ if filename_no_ext not in blacklist:
+ files_to_process.append(file)
+ if len(files_to_process) + counter >= (max_files or float('inf')):
+ break
+
+ # Batch process all files of this extension using ultra-optimized method
+ if files_to_process:
+ self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+ successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+ counter += successful
+ if successful > 0:
  not_zero = True
- except Exception as e:
- self.logger.warning(f"Failed to add sample {file}: {e}")
- continue

  if max_files is not None and counter >= max_files:
  self.logger.debug(
@@ -133,198 +137,43 @@ def add(


  # TODO type is not used
- def add_sample(self, file, type=None, reset=False, adducts=None):
- self.logger.debug(f"Adding: {file}")
-
- # Extract sample name by removing any known extension
- basename = os.path.basename(file)
- sample_name = os.path.splitext(basename)[0]
-
- # check if sample_name is already in the samples_df
- if sample_name in self.samples_df["sample_name"].to_list():
- self.logger.warning(
- f"Sample {sample_name} already exists in the study. Skipping.",
- )
- return
-
- # check if file exists
- if not os.path.exists(file):
- self.logger.error(f"File {file} does not exist.")
- return
-
- # Check for supported file extensions
- if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
- self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
- return
-
- # Load the sample based on file type
- ddaobj = Sample()
- ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
- if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
- ddaobj.load(file)
- else:
- self.logger.error(f"Unsupported file format: {file}")
- return
- if ddaobj.features_df is None and not reset:
- self.logger.debug(
- f"File {file} will be newly processed.",
- )
- ddaobj.features = None
-
- if ddaobj.features is None or reset:
- ddaobj.find_features()
- ddaobj.find_adducts(adducts=adducts)
- ddaobj.find_ms2()
-
- self.features_maps.append(ddaobj.features)
-
- sample_type = "sample" if type is None else type
- if "qc" in sample_name.lower():
- sample_type = "qc"
- if "blank" in sample_name.lower():
- sample_type = "blank"
+ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+ """
+ Add a single sample to the study.

- # Use the index of the feature map in self.features_maps as map_id
- map_id_value = len(self.features_maps) - 1
-
- # Determine the final sample path based on file type
- if file.endswith(".sample5"):
- # If input is already .sample5, keep it in original location
- final_sample_path = file
- self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
- # Check if there's a corresponding featureXML file in the same directory
- featurexml_path = file.replace(".sample5", ".featureXML")
- if os.path.exists(featurexml_path):
- self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
- else:
- self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
- else:
- # For .wiff, .mzML, .raw files, save to study folder (original behavior)
- if self.folder is not None:
- if not os.path.exists(self.folder):
- os.makedirs(self.folder)
- final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
- ddaobj.save(final_sample_path)
- self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
- else:
- # If no study folder is set, save in current directory
- final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
- ddaobj.save(final_sample_path)
- self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
-
- # Count MS1 and MS2 scans from the loaded sample
- ms1_count = 0
- ms2_count = 0
- if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
- ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
- ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
-
- # Calculate next sequence number
- next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-
- new_sample = pl.DataFrame(
- {
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path], # Use the determined path
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set by set_sample_color below
- "sample_group": [""], # Default empty string
- "sample_batch": [1], # Default batch 1
- "sample_sequence": [next_sequence], # Increasing sequence number
- "num_features": [int(ddaobj.features.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "map_id": pl.Int64,
- "sample_source": pl.Utf8,
- "sample_color": pl.Utf8,
- "sample_group": pl.Utf8,
- "sample_batch": pl.Int64,
- "sample_sequence": pl.Int64,
- "num_features": pl.Int64,
- "num_ms1": pl.Int64,
- "num_ms2": pl.Int64,
- },
- )
- self.samples_df = pl.concat([self.samples_df, new_sample])
-
- # Optimized DataFrame operations - chain operations instead of multiple clones
- columns_to_add = [
- pl.lit(len(self.samples_df)).alias("sample_uid"),
- pl.lit(False).alias("filled"),
- pl.lit(-1.0).alias("chrom_area"),
- ]
-
- # Only add rt_original if it doesn't exist
- if "rt_original" not in ddaobj.features_df.columns:
- columns_to_add.append(pl.col("rt").alias("rt_original"))
-
- f_df = ddaobj.features_df.with_columns(columns_to_add)
-
- if self.features_df.is_empty():
- # Create new features_df with feature_uid column
- self.features_df = f_df.with_columns(
- pl.int_range(pl.len()).add(1).alias("feature_uid"),
- ).select(
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+ Args:
+ file (str): Path to the sample file
+ type (str, optional): File type to force. Defaults to None (auto-detect).
+ reset (bool, optional): Whether to reset the study. Defaults to False.
+ adducts (optional): Adducts to use for sample loading. Defaults to None.
+ fast (bool, optional): Whether to use optimized loading that skips ms1_df
+ for better performance. Defaults to True.
+
+ Returns:
+ bool: True if successful, False otherwise.
+ """
+ if fast:
+ # Use optimized method for better performance
+ success = self._add_sample_optimized(
+ file,
+ type=type,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=False, # Do color reset for individual calls
+ skip_schema_check=True # Skip schema check for performance (safe with diagonal concat)
  )
- # Ensure column order matches schema from the very beginning
- self._ensure_features_df_schema_order()
  else:
- offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
- # Chain operations and add to existing DataFrame
- f_df = f_df.with_columns(
- pl.int_range(pl.len()).add(offset).alias("feature_uid"),
- ).select(
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+ # Use standard method with full ms1_df loading
+ success = self._add_sample_standard(
+ file,
+ type=type,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=False, # Do color reset for individual calls
+ skip_schema_check=True # Skip schema check for performance
  )
-
- # Reorganize f_df columns to match self.features_df column order and schema
- target_columns = self.features_df.columns
- target_schema = self.features_df.schema
- f_df_columns = f_df.columns
-
- # Create select expressions for reordering and type casting
- select_exprs = []
- for col in target_columns:
- if col in f_df_columns:
- # Cast to the expected type
- expected_dtype = target_schema[col]
- select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
- else:
- # Add missing columns with null values of the correct type
- expected_dtype = target_schema[col]
- select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
-
- # Add any extra columns from f_df that aren't in target_columns (keep their original types)
- for col in f_df_columns:
- if col not in target_columns:
- select_exprs.append(pl.col(col))
-
- # Reorder and type-cast f_df columns
- f_df = f_df.select(select_exprs)
-
- self.features_df = pl.concat([self.features_df, f_df])
-
- # Ensure features_df column order matches schema
- self._ensure_features_df_schema_order()
-
- # Auto-assign colors when new sample is added (reset all colors using turbo colormap based on UID)
- self.sample_color_reset()

- self.logger.debug(
- f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
- )
+ return success


  def load(self, filename=None):
@@ -942,8 +791,6 @@ def _fill_chrom_impl(
  })

  total_missing = len(missing_combinations_df)
- total_samples = len(samples_to_process)
-
  self.logger.debug(
  f"Gap filling for {total_missing} missing features...",
  )
@@ -1114,51 +961,96 @@ def _get_missing_consensus_sample_combinations(self, uids):
  """
  Efficiently identify which consensus_uid/sample combinations are missing.
  Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
+
+ Optimized for common scenarios:
+ - Early termination for fully-filled studies
+ - Efficient dictionary lookups instead of expensive DataFrame joins
+ - Smart handling of sparse vs dense missing data patterns
  """
- # Get all consensus UIDs we're interested in
- consensus_uids_set = set(uids)
-
- # Get all sample UIDs and create lookup
- all_sample_info = {}
- for row in self.samples_df.select([
- "sample_uid",
- "sample_name",
- "sample_path",
- ]).iter_rows(named=True):
- all_sample_info[row["sample_uid"]] = {
- "sample_name": row["sample_name"],
- "sample_path": row["sample_path"],
- }
-
- # Get existing consensus/sample combinations from consensus_mapping_df
- existing_combinations = set()
- consensus_mapping_filtered = self.consensus_mapping_df.filter(
- pl.col("consensus_uid").is_in(list(consensus_uids_set)),
- )
-
- # Join with features_df to get sample_uid information
- existing_features = consensus_mapping_filtered.join(
- self.features_df.select(["feature_uid", "sample_uid"]),
- on="feature_uid",
- how="inner",
+ if not uids:
+ return []
+
+ n_consensus = len(uids)
+ n_samples = len(self.samples_df)
+ total_possible = n_consensus * n_samples
+
+ # Quick early termination check for fully/nearly filled studies
+ # This handles the common case where fill() is run on an already-filled study
+ consensus_counts = (
+ self.consensus_mapping_df
+ .filter(pl.col("consensus_uid").is_in(uids))
+ .group_by("consensus_uid")
+ .agg(pl.count("feature_uid").alias("count"))
  )
-
- for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
- existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
-
- # Find missing combinations
- missing_combinations = []
- for consensus_uid in consensus_uids_set:
- for sample_uid, sample_info in all_sample_info.items():
- if (consensus_uid, sample_uid) not in existing_combinations:
- missing_combinations.append((
- consensus_uid,
- sample_uid,
- sample_info["sample_name"],
- sample_info["sample_path"],
- ))
-
- return missing_combinations
+
+ total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+
+ # If >95% filled, likely no gaps (common case)
+ if total_existing >= total_possible * 0.95:
+ self.logger.debug(f"Study appears {total_existing/total_possible*100:.1f}% filled, using sparse optimization")
+
+ # For sparse missing data, check each consensus feature individually
+ missing_combinations = []
+ uids_set = set(uids)
+
+ # Build efficient lookups
+ feature_to_sample = dict(
+ self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+ )
+
+ # Get existing combinations for target UIDs only
+ existing_by_consensus = {}
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows():
+ if consensus_uid in uids_set and feature_uid in feature_to_sample:
+ if consensus_uid not in existing_by_consensus:
+ existing_by_consensus[consensus_uid] = set()
+ existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
+
+ # Get sample info once
+ all_samples = list(
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+ )
+
+ # Check for missing combinations
+ for consensus_uid in uids:
+ existing_samples = existing_by_consensus.get(consensus_uid, set())
+ for sample_uid, sample_name, sample_path in all_samples:
+ if sample_uid not in existing_samples:
+ missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+
+ return missing_combinations
+
+ else:
+ # For studies with many gaps, use bulk operations
+ self.logger.debug(f"Study {total_existing/total_possible*100:.1f}% filled, using bulk optimization")
+
+ # Build efficient lookups
+ uids_set = set(uids)
+ feature_to_sample = dict(
+ self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+ )
+
+ # Build existing combinations set
+ existing_combinations = {
+ (consensus_uid, feature_to_sample[feature_uid])
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows()
+ if consensus_uid in uids_set and feature_uid in feature_to_sample
+ }
+
+ # Get all sample info
+ all_samples = list(
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+ )
+
+ # Generate all missing combinations
+ missing_combinations = [
+ (consensus_uid, sample_uid, sample_name, sample_path)
+ for consensus_uid in uids
+ for sample_uid, sample_name, sample_path in all_samples
+ if (consensus_uid, sample_uid) not in existing_combinations
+ ]
+
+ return missing_combinations


  def sanitize(self):
@@ -1300,3 +1192,506 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
  self.consensus_map = oms.ConsensusMap()
  fh.load(filename, self.consensus_map)
  self.logger.debug(f"Loaded consensus map from {filename}.")
+
+ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+ """
+ Optimized batch addition of samples.
+
+ Args:
+ files (list): List of file paths to process
+ reset (bool): Whether to reset features before processing
+ adducts: Adducts to use for sample loading
+ blacklist (set): Set of filenames already processed
+ fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
+
+ Performance optimizations:
+ 1. No per-sample color reset
+ 2. No schema enforcement during addition
+ 3. Simplified DataFrame operations
+ 4. Batch progress reporting
+ """
+ if not files:
+ return 0
+
+ if blacklist is None:
+ blacklist = set()
+
+ self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
+
+ successful_additions = 0
+ failed_additions = 0
+
+ # Progress reporting setup
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+ for i, file in enumerate(
+ tqdm(
+ files,
+ total=len(files),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
+ disable=tqdm_disable,
+ )
+ ):
+ try:
+ # Choose between optimized and standard loading
+ if fast:
+ success = self._add_sample_optimized(
+ file,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=True, # Skip color reset during batch
+ skip_schema_check=True # Skip schema enforcement
+ )
+ else:
+ success = self._add_sample_standard(
+ file,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=True, # Skip color reset during batch
+ skip_schema_check=True # Skip schema enforcement
+ )
+
+ if success:
+ # Add to blacklist for filename tracking
+ basename = os.path.basename(file)
+ filename_no_ext = os.path.splitext(basename)[0]
+ blacklist.add(filename_no_ext)
+ successful_additions += 1
+
+ except Exception as e:
+ self.logger.warning(f"Failed to add sample {file}: {e}")
+ failed_additions += 1
+ continue
+
+ # Final cleanup operations done once at the end
+ if successful_additions > 0:
+ self.logger.debug("Performing final batch cleanup...")
+
+ # Optional: Only do schema enforcement if specifically needed (usually not required)
+ # self._ensure_features_df_schema_order()
+
+ # Color assignment done once for all samples
+ self._sample_color_reset_optimized()
+
+ self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
+
+ return successful_additions
+
+ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+ """
+ Optimized add_sample with performance improvements integrated.
+
+ Removes:
+ - Schema enforcement (_ensure_features_df_schema_order)
+ - Complex column alignment and type casting
+ - Per-addition color reset
+ - Unnecessary column reordering
+
+ Returns True if successful, False otherwise.
+ """
+ self.logger.debug(f"Adding: {file}")
+
+ # Basic validation
+ basename = os.path.basename(file)
+ sample_name = os.path.splitext(basename)[0]
+
+ if sample_name in self.samples_df["sample_name"].to_list():
+ self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+ return False
+
+ if not os.path.exists(file):
+ self.logger.error(f"File {file} does not exist.")
+ return False
+
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+ self.logger.error(f"Unsupported file type: {file}")
+ return False
+
+ # Load sample
+ ddaobj = Sample()
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+ # Use standard loading method temporarily to test if this fixes the astuple error
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # OPTIMIZED: Use diagonal concatenation without any schema enforcement
+ # This is the fastest concatenation method in Polars and handles type mismatches automatically
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ # REMOVED ALL EXPENSIVE OPERATIONS:
+ # - No _ensure_features_df_schema_order()
+ # - No complex column alignment
+ # - No type casting loops
+ # - No sample_color_reset()
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
+ return True
+
+
+ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+ """
+ Standard add_sample method that uses full sample loading (includes ms1_df).
+
+ This method uses the standard sample.load() method which loads all data
+ including ms1_df, providing full functionality but potentially slower performance
+ for large MS1 datasets.
+
+ Returns True if successful, False otherwise.
+ """
+ self.logger.debug(f"Adding (standard): {file}")
+
+ # Basic validation
+ basename = os.path.basename(file)
+ sample_name = os.path.splitext(basename)[0]
+
+ if sample_name in self.samples_df["sample_name"].to_list():
+ self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+ return False
+
+ if not os.path.exists(file):
+ self.logger.error(f"File {file} does not exist.")
+ return False
+
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+ self.logger.error(f"Unsupported file type: {file}")
+ return False
+
+ # Load sample using standard method (includes ms1_df)
+ ddaobj = Sample()
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+ # Use standard loading method that loads all data including ms1_df
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # Use diagonal concatenation for flexibility
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+ return True
+ # Use standard loading method that loads all data including ms1_df
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # Use diagonal concatenation for flexibility
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+ return True
+
+
+ def _sample_color_reset_optimized(self):
+ """
+ Optimized version of sample_color_reset that caches colormap initialization.
+ """
+ if self.samples_df is None or len(self.samples_df) == 0:
+ self.logger.warning("No samples found in study.")
+ return
+
+ # Cache the colormap if not already cached
+ if not hasattr(self, '_cached_colormap'):
+ try:
+ from cmap import Colormap
+ self._cached_colormap = Colormap('turbo')
+ except ImportError:
+ self.logger.warning("cmap package not available, using default colors")
+ return
+
+ cm = self._cached_colormap
+ n_samples = len(self.samples_df)
+
+ # Pre-allocate colors list for better performance
+ colors = [None] * n_samples
+
+ # Vectorized color generation
+ for i in range(n_samples):
+ normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
+ color_rgba = cm(normalized_value)
+
+ if len(color_rgba) >= 3:
+ r, g, b = color_rgba[:3]
+ if max(color_rgba[:3]) <= 1.0:
+ r, g, b = int(r * 255), int(g * 255), int(b * 255)
+ colors[i] = f"#{r:02x}{g:02x}{b:02x}"
+
+ # Update the sample_color column efficiently
+ self.samples_df = self.samples_df.with_columns(
+ pl.Series("sample_color", colors).alias("sample_color")
+ )
+
+ self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
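For orientation, the diff above changes how samples enter a study: add() now collects files and hands them to _add_samples_batch(), and both add() and add_sample() take a fast flag that skips ms1_df loading and defers color assignment and schema checks. The following is a minimal, hypothetical usage sketch based only on the add() and add_sample() signatures shown in this diff; the Study class name, its import path, and the example file paths are assumptions, not taken from the diff.

# Hypothetical usage sketch; only add(...) and add_sample(...) come from this diff.
from masster import Study  # assumed import path and class name

study = Study()  # assumed constructor

# Batch-add every supported file in a folder. With fast=True (the default),
# loading skips ms1_df and defers color assignment and schema checks until
# the whole batch has been added.
study.add(folder="data/", max_files=10, fast=True)

# Add a single file with full loading (including ms1_df).
study.add_sample(file="data/qc_01.mzML", fast=False)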