masster 0.3.15__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

masster/study/load.py CHANGED
@@ -40,7 +40,21 @@ def add(
     reset=False,
     adducts=None,
     max_files=None,
+    fast=True,
 ):
+    """Add samples from a folder to the study.
+
+    Args:
+        folder (str, optional): Path to folder containing sample files.
+            Defaults to study folder or current working directory.
+        reset (bool, optional): Whether to reset the study before adding samples.
+            Defaults to False.
+        adducts (optional): Adducts to use for sample loading. Defaults to None.
+        max_files (int, optional): Maximum number of files to process.
+            Defaults to None (no limit).
+        fast (bool, optional): Whether to use optimized loading that skips ms1_df
+            for better performance. Defaults to True.
+    """
     if folder is None:
         if self.folder is not None:
             folder = self.folder
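
For orientation, a hypothetical usage sketch of the new `fast` flag. The `Study` import path and constructor are assumptions for illustration, not taken from this diff:

```python
# Hypothetical usage; import path and constructor are assumed.
from masster import Study

study = Study(folder="./my_study")
study.add(folder="./raw_data", max_files=10)  # fast=True by default: skips ms1_df
study.add(folder="./raw_data", fast=False)    # full loading, including ms1_df
```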
@@ -85,39 +99,29 @@ def add(
 
     self.logger.debug(f"Found {len(files)} {ext} files")
 
-    # Process files
-    for i, file in enumerate(
-        tqdm(
-            files,
-            total=len(files),
-            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
-            disable=tdqm_disable,
-        ),
-    ):
+    # Filter out files that were already processed and respect the max_files limit
+    files_to_process = []
+    for file in files:
         if max_files is not None and counter >= max_files:
             break
-
+
         # Get filename without extension for blacklist check
         basename = os.path.basename(file)
         filename_no_ext = os.path.splitext(basename)[0]
-
+
         # Check if this filename (without extension) is already in the blacklist
-        if filename_no_ext in blacklist:
-            self.logger.debug(f"Skipping {file} - filename already processed")
-            continue
-
-        self.logger.debug(f"Add file {counter + 1}: {file}")
-
-        # Try to add the sample
-        try:
-            self.add_sample(file=file, reset=reset, adducts=adducts)
-            # If successful, add to blacklist and increment counter
-            blacklist.add(filename_no_ext)
-            counter += 1
+        if filename_no_ext not in blacklist:
+            files_to_process.append(file)
+            if len(files_to_process) + counter >= (max_files or float('inf')):
+                break
+
+    # Batch process all files of this extension using the optimized method
+    if files_to_process:
+        self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+        successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+        counter += successful
+        if successful > 0:
             not_zero = True
-        except Exception as e:
-            self.logger.warning(f"Failed to add sample {file}: {e}")
-            continue
 
     if max_files is not None and counter >= max_files:
         self.logger.debug(
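
The new filtering loop caps the queue with `(max_files or float('inf'))`. A standalone sketch of that idiom, including the zero-value edge case it inherits from `or`:

```python
# Sketch of the capping idiom above. Because `0 or float('inf')` evaluates
# to infinity, max_files=0 behaves like "no limit" rather than "no files";
# an explicit `is not None` check would distinguish the two.
def queue_full(n_queued, n_done, max_files):
    return n_queued + n_done >= (max_files or float("inf"))

assert queue_full(5, 5, 10) is True     # limit reached
assert queue_full(5, 5, None) is False  # no limit
assert queue_full(5, 5, 0) is False     # 0 falls into the "no limit" branch
```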
@@ -133,198 +137,43 @@ def add(
 
 
     # TODO type is not used
-def add_sample(self, file, type=None, reset=False, adducts=None):
-    self.logger.debug(f"Adding: {file}")
-
-    # Extract sample name by removing any known extension
-    basename = os.path.basename(file)
-    sample_name = os.path.splitext(basename)[0]
-
-    # check if sample_name is already in the samples_df
-    if sample_name in self.samples_df["sample_name"].to_list():
-        self.logger.warning(
-            f"Sample {sample_name} already exists in the study. Skipping.",
-        )
-        return
-
-    # check if file exists
-    if not os.path.exists(file):
-        self.logger.error(f"File {file} does not exist.")
-        return
-
-    # Check for supported file extensions
-    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-        self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
-        return
-
-    # Load the sample based on file type
-    ddaobj = Sample()
-    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
-    if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-        ddaobj.load(file)
-    else:
-        self.logger.error(f"Unsupported file format: {file}")
-        return
-    if ddaobj.features_df is None and not reset:
-        self.logger.debug(
-            f"File {file} will be newly processed.",
-        )
-        ddaobj.features = None
-
-    if ddaobj.features is None or reset:
-        ddaobj.find_features()
-        ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
-
-    self.features_maps.append(ddaobj.features)
-
-    sample_type = "sample" if type is None else type
-    if "qc" in sample_name.lower():
-        sample_type = "qc"
-    if "blank" in sample_name.lower():
-        sample_type = "blank"
+def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+    """
+    Add a single sample to the study.
 
-    # Use the index of the feature map in self.features_maps as map_id
-    map_id_value = len(self.features_maps) - 1
-
-    # Determine the final sample path based on file type
-    if file.endswith(".sample5"):
-        # If input is already .sample5, keep it in original location
-        final_sample_path = file
-        self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
-        # Check if there's a corresponding featureXML file in the same directory
-        featurexml_path = file.replace(".sample5", ".featureXML")
-        if os.path.exists(featurexml_path):
-            self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
-        else:
-            self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
-    else:
-        # For .wiff, .mzML, .raw files, save to study folder (original behavior)
-        if self.folder is not None:
-            if not os.path.exists(self.folder):
-                os.makedirs(self.folder)
-            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
-            ddaobj.save(final_sample_path)
-            self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
-        else:
-            # If no study folder is set, save in current directory
-            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
-            ddaobj.save(final_sample_path)
-            self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
-
-    # Count MS1 and MS2 scans from the loaded sample
-    ms1_count = 0
-    ms2_count = 0
-    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-        ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
-        ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
-
-    # Calculate next sequence number
-    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],  # Use the determined path
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set by set_sample_color below
-            "sample_group": [""],  # Default empty string
-            "sample_batch": [1],  # Default batch 1
-            "sample_sequence": [next_sequence],  # Increasing sequence number
-            "num_features": [int(ddaobj.features.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-        schema={
-            "sample_uid": pl.Int64,
-            "sample_name": pl.Utf8,
-            "sample_path": pl.Utf8,
-            "sample_type": pl.Utf8,
-            "map_id": pl.Int64,
-            "sample_source": pl.Utf8,
-            "sample_color": pl.Utf8,
-            "sample_group": pl.Utf8,
-            "sample_batch": pl.Int64,
-            "sample_sequence": pl.Int64,
-            "num_features": pl.Int64,
-            "num_ms1": pl.Int64,
-            "num_ms2": pl.Int64,
-        },
-    )
-    self.samples_df = pl.concat([self.samples_df, new_sample])
-
-    # Optimized DataFrame operations - chain operations instead of multiple clones
-    columns_to_add = [
-        pl.lit(len(self.samples_df)).alias("sample_uid"),
-        pl.lit(False).alias("filled"),
-        pl.lit(-1.0).alias("chrom_area"),
-    ]
-
-    # Only add rt_original if it doesn't exist
-    if "rt_original" not in ddaobj.features_df.columns:
-        columns_to_add.append(pl.col("rt").alias("rt_original"))
-
-    f_df = ddaobj.features_df.with_columns(columns_to_add)
-
-    if self.features_df.is_empty():
-        # Create new features_df with feature_uid column
-        self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid"),
-        ).select(
-            ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+    Args:
+        file (str): Path to the sample file.
+        type (str, optional): File type to force. Defaults to None (auto-detect).
+        reset (bool, optional): Whether to reset the study. Defaults to False.
+        adducts (optional): Adducts to use for sample loading. Defaults to None.
+        fast (bool, optional): Whether to use optimized loading that skips ms1_df
+            for better performance. Defaults to True.
+
+    Returns:
+        bool: True if successful, False otherwise.
+    """
+    if fast:
+        # Use optimized method for better performance
+        success = self._add_sample_optimized(
+            file,
+            type=type,
+            reset=reset,
+            adducts=adducts,
+            skip_color_reset=False,  # Do color reset for individual calls
+            skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
         )
-        # Ensure column order matches schema from the very beginning
-        self._ensure_features_df_schema_order()
     else:
-        offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
-        # Chain operations and add to existing DataFrame
-        f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
-        ).select(
-            ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+        # Use standard method with full ms1_df loading
+        success = self._add_sample_standard(
+            file,
+            type=type,
+            reset=reset,
+            adducts=adducts,
+            skip_color_reset=False,  # Do color reset for individual calls
+            skip_schema_check=True,  # Skip schema check for performance
         )
-
-        # Reorganize f_df columns to match self.features_df column order and schema
-        target_columns = self.features_df.columns
-        target_schema = self.features_df.schema
-        f_df_columns = f_df.columns
-
-        # Create select expressions for reordering and type casting
-        select_exprs = []
-        for col in target_columns:
-            if col in f_df_columns:
-                # Cast to the expected type
-                expected_dtype = target_schema[col]
-                select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
-            else:
-                # Add missing columns with null values of the correct type
-                expected_dtype = target_schema[col]
-                select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
-
-        # Add any extra columns from f_df that aren't in target_columns (keep their original types)
-        for col in f_df_columns:
-            if col not in target_columns:
-                select_exprs.append(pl.col(col))
-
-        # Reorder and type-cast f_df columns
-        f_df = f_df.select(select_exprs)
-
-        self.features_df = pl.concat([self.features_df, f_df])
-
-    # Ensure features_df column order matches schema
-    self._ensure_features_df_schema_order()
-
-    # Auto-assign colors when new sample is added (reset all colors using turbo colormap based on UID)
-    self.sample_color_reset()
 
-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
-    )
+    return success
 
 
 def load(self, filename=None):
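
Both the removed and the rewritten `add_sample` infer the sample type from the file name. A standalone restatement of that rule; note the `blank` check runs last, so it wins when a name matches both substrings:

```python
# Restates the type-inference rule from the diff as a free function.
def infer_sample_type(sample_name, forced=None):
    sample_type = "sample" if forced is None else forced
    if "qc" in sample_name.lower():
        sample_type = "qc"
    if "blank" in sample_name.lower():  # checked last, so it takes precedence
        sample_type = "blank"
    return sample_type

assert infer_sample_type("QC_pool_01") == "qc"
assert infer_sample_type("Blank_wash_03") == "blank"
assert infer_sample_type("plasma_A1") == "sample"
```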
@@ -942,8 +791,6 @@ def _fill_chrom_impl(
     })
 
     total_missing = len(missing_combinations_df)
-    total_samples = len(samples_to_process)
-
     self.logger.debug(
         f"Gap filling for {total_missing} missing features...",
     )
@@ -1300,3 +1147,506 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
+
+def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+    """
+    Optimized batch addition of samples.
+
+    Args:
+        files (list): List of file paths to process.
+        reset (bool): Whether to reset features before processing.
+        adducts: Adducts to use for sample loading.
+        blacklist (set): Set of filenames already processed.
+        fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading.
+
+    Performance optimizations:
+    1. No per-sample color reset
+    2. No schema enforcement during addition
+    3. Simplified DataFrame operations
+    4. Batch progress reporting
+    """
+    if not files:
+        return 0
+
+    if blacklist is None:
+        blacklist = set()
+
+    self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
+
+    successful_additions = 0
+    failed_additions = 0
+
+    # Progress reporting setup
+    tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    for i, file in enumerate(
+        tqdm(
+            files,
+            total=len(files),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
+            disable=tqdm_disable,
+        )
+    ):
+        try:
+            # Choose between optimized and standard loading
+            if fast:
+                success = self._add_sample_optimized(
+                    file,
+                    reset=reset,
+                    adducts=adducts,
+                    skip_color_reset=True,   # Skip color reset during batch
+                    skip_schema_check=True,  # Skip schema enforcement
+                )
+            else:
+                success = self._add_sample_standard(
+                    file,
+                    reset=reset,
+                    adducts=adducts,
+                    skip_color_reset=True,   # Skip color reset during batch
+                    skip_schema_check=True,  # Skip schema enforcement
+                )
+
+            if success:
+                # Add to blacklist for filename tracking
+                basename = os.path.basename(file)
+                filename_no_ext = os.path.splitext(basename)[0]
+                blacklist.add(filename_no_ext)
+                successful_additions += 1
+
+        except Exception as e:
+            self.logger.warning(f"Failed to add sample {file}: {e}")
+            failed_additions += 1
+            continue
+
+    # Final cleanup operations done once at the end
+    if successful_additions > 0:
+        self.logger.debug("Performing final batch cleanup...")
+
+        # Optional: only enforce the schema if specifically needed (usually not required)
+        # self._ensure_features_df_schema_order()
+
+        # Color assignment done once for all samples
+        self._sample_color_reset_optimized()
+
+    self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
+
+    return successful_additions
+
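
`_add_samples_batch` follows a log-and-continue pattern: failures are counted but do not abort the batch, and the expensive cleanup (color reset) runs once at the end. A generic, self-contained sketch of that shape; all names here are illustrative:

```python
from tqdm import tqdm

def batch_apply(items, fn, cleanup=None):
    """Apply fn to each item, tolerating failures; run cleanup once."""
    ok = failed = 0
    for item in tqdm(items, total=len(items), desc="Batch add"):
        try:
            if fn(item):
                ok += 1
        except Exception as exc:  # log and continue, as in the diff
            print(f"Failed on {item}: {exc}")
            failed += 1
    if ok and cleanup is not None:
        cleanup()  # e.g. one color reset for all added samples
    return ok, failed
```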
+def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+    """
+    Optimized add_sample with performance improvements integrated.
+
+    Removes:
+    - Schema enforcement (_ensure_features_df_schema_order)
+    - Complex column alignment and type casting
+    - Per-addition color reset
+    - Unnecessary column reordering
+
+    Returns True if successful, False otherwise.
+    """
+    self.logger.debug(f"Adding: {file}")
+
+    # Basic validation
+    basename = os.path.basename(file)
+    sample_name = os.path.splitext(basename)[0]
+
+    if sample_name in self.samples_df["sample_name"].to_list():
+        self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+        return False
+
+    if not os.path.exists(file):
+        self.logger.error(f"File {file} does not exist.")
+        return False
+
+    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        self.logger.error(f"Unsupported file type: {file}")
+        return False
+
+    # Load sample
+    ddaobj = Sample()
+    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+    # Use standard loading method temporarily to test if this fixes the astuple error
+    ddaobj.load(file)
+
+    if ddaobj.features_df is None and not reset:
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    # Determine sample type
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+
+    map_id_value = len(self.features_maps) - 1
+
+    # Handle file paths
+    if file.endswith(".sample5"):
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+    else:
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+        else:
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+        ddaobj.save(final_sample_path)
+        self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+    # Efficient scan counting
+    ms1_count = ms2_count = 0
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        ms_levels = scan_counts.get("ms_level", [])
+        counts = scan_counts.get("len", [])
+        for level, count in zip(ms_levels, counts):
+            if level == 1:
+                ms1_count = count
+            elif level == 2:
+                ms2_count = count
+
+    # Create sample entry
+    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj.features.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })
+
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # Simplified feature processing
+    current_sample_uid = len(self.samples_df) - 1
+
+    # Add required columns with minimal operations
+    columns_to_add = [
+        pl.lit(current_sample_uid).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # First sample
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid")
+        )
+    else:
+        # Subsequent samples - minimal overhead
+        offset = self.features_df["feature_uid"].max() + 1
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+        )
+
+        # Optimized: diagonal concatenation without schema enforcement.
+        # This is the fastest concatenation path in Polars and fills in
+        # missing columns automatically.
+        self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+    # Removed expensive operations:
+    # - No _ensure_features_df_schema_order()
+    # - No complex column alignment
+    # - No type casting loops
+    # - No sample_color_reset()
+
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
+    return True
+
+
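
A minimal demonstration of the diagonal concatenation this method relies on. `how="diagonal"` fills columns missing from either frame with nulls; it does not reconcile conflicting dtypes (recent Polars versions provide `how="diagonal_relaxed"` for that):

```python
import polars as pl

a = pl.DataFrame({"feature_uid": [1, 2], "mz": [150.1, 233.2]})
b = pl.DataFrame({"feature_uid": [3], "mz": [301.0], "chrom_area": [-1.0]})

combined = pl.concat([a, b], how="diagonal")
# shape (3, 3): rows from `a` get null in the `chrom_area` column
print(combined)
```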
+def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+    """
+    Standard add_sample method that uses full sample loading (includes ms1_df).
+
+    This method uses the standard sample.load() method, which loads all data
+    including ms1_df, providing full functionality at the cost of potentially
+    slower loading for large MS1 datasets.
+
+    Returns True if successful, False otherwise.
+    """
+    self.logger.debug(f"Adding (standard): {file}")
+
+    # Basic validation
+    basename = os.path.basename(file)
+    sample_name = os.path.splitext(basename)[0]
+
+    if sample_name in self.samples_df["sample_name"].to_list():
+        self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+        return False
+
+    if not os.path.exists(file):
+        self.logger.error(f"File {file} does not exist.")
+        return False
+
+    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+        self.logger.error(f"Unsupported file type: {file}")
+        return False
+
+    # Load sample using standard method (includes ms1_df)
+    ddaobj = Sample()
+    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+    # Use standard loading method that loads all data including ms1_df
+    ddaobj.load(file)
+
+    if ddaobj.features_df is None and not reset:
+        ddaobj.features = None
+
+    if ddaobj.features is None or reset:
+        ddaobj.find_features()
+        ddaobj.find_adducts(adducts=adducts)
+        ddaobj.find_ms2()
+
+    self.features_maps.append(ddaobj.features)
+
+    # Determine sample type
+    sample_type = "sample" if type is None else type
+    if "qc" in sample_name.lower():
+        sample_type = "qc"
+    if "blank" in sample_name.lower():
+        sample_type = "blank"
+
+    map_id_value = len(self.features_maps) - 1
+
+    # Handle file paths
+    if file.endswith(".sample5"):
+        final_sample_path = file
+        self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+    else:
+        if self.folder is not None:
+            if not os.path.exists(self.folder):
+                os.makedirs(self.folder)
+            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+        else:
+            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+        ddaobj.save(final_sample_path)
+        self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+    # Efficient scan counting
+    ms1_count = ms2_count = 0
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        ms_levels = scan_counts.get("ms_level", [])
+        counts = scan_counts.get("len", [])
+        for level, count in zip(ms_levels, counts):
+            if level == 1:
+                ms1_count = count
+            elif level == 2:
+                ms2_count = count
+
+    # Create sample entry
+    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj.features.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })
+
+    self.samples_df = pl.concat([self.samples_df, new_sample])
+
+    # Simplified feature processing
+    current_sample_uid = len(self.samples_df) - 1
+
+    # Add required columns with minimal operations
+    columns_to_add = [
+        pl.lit(current_sample_uid).alias("sample_uid"),
+        pl.lit(False).alias("filled"),
+        pl.lit(-1.0).alias("chrom_area"),
+    ]
+
+    # Only add rt_original if it doesn't exist
+    if "rt_original" not in ddaobj.features_df.columns:
+        columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+    f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+    if self.features_df.is_empty():
+        # First sample
+        self.features_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(1).alias("feature_uid")
+        )
+    else:
+        # Subsequent samples - minimal overhead
+        offset = self.features_df["feature_uid"].max() + 1
+        f_df = f_df.with_columns(
+            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+        )
+
+        # Use diagonal concatenation for flexibility
+        self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+    return True
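
Both code paths replace the old pair of `filter` passes with a single `group_by` over `ms_level`. A standalone sketch of that counting step:

```python
import polars as pl

scans_df = pl.DataFrame({"ms_level": [1, 1, 2, 2, 2, 1]})

counts = scans_df.group_by("ms_level").len().to_dict(as_series=False)
by_level = dict(zip(counts["ms_level"], counts["len"]))
ms1_count, ms2_count = by_level.get(1, 0), by_level.get(2, 0)
assert (ms1_count, ms2_count) == (3, 3)
```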
+
+
+def _sample_color_reset_optimized(self):
+    """
+    Optimized version of sample_color_reset that caches colormap initialization.
+    """
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    # Cache the colormap if not already cached
+    if not hasattr(self, '_cached_colormap'):
+        try:
+            from cmap import Colormap
+            self._cached_colormap = Colormap('turbo')
+        except ImportError:
+            self.logger.warning("cmap package not available, using default colors")
+            return
+
+    cm = self._cached_colormap
+    n_samples = len(self.samples_df)
+
+    # Pre-allocate colors list for better performance
+    colors = [None] * n_samples
+
+    # Generate one evenly spaced color per sample
+    for i in range(n_samples):
+        normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
+        color_rgba = cm(normalized_value)
+
+        if len(color_rgba) >= 3:
+            r, g, b = color_rgba[:3]
+            if max(color_rgba[:3]) <= 1.0:
+                r, g, b = int(r * 255), int(g * 255), int(b * 255)
+            colors[i] = f"#{r:02x}{g:02x}{b:02x}"
+
+    # Update the sample_color column efficiently
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_color", colors).alias("sample_color")
+    )
+
+    self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
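
A self-contained sketch of the color assignment above, with a stub gradient standing in for `Colormap('turbo')` from the `cmap` package so it runs without the dependency:

```python
def stub_cmap(x):
    """Stand-in colormap: black-to-red gradient returning RGBA floats."""
    return (x, 0.0, 0.0, 1.0)

def sample_colors(n_samples, cm=stub_cmap):
    colors = []
    for i in range(n_samples):
        # Squeeze into [0.1, 0.9] to avoid the extreme ends of the colormap
        value = 0.1 + ((i + 0.5) / n_samples) * 0.8
        r, g, b = cm(value)[:3]
        if max((r, g, b)) <= 1.0:  # scale 0-1 floats to 0-255 ints
            r, g, b = int(r * 255), int(g * 255), int(b * 255)
        colors.append(f"#{r:02x}{g:02x}{b:02x}")
    return colors

print(sample_colors(3))  # ['#3b0000', '#7f0000', '#c30000']
```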