masster-0.4.0-py3-none-any.whl → masster-0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py CHANGED
@@ -10,10 +10,10 @@ import pyopenms as oms
 
  from tqdm import tqdm
 
- from master.chromatogram import Chromatogram
- from master.study.defaults import fill_defaults
- from master.sample.sample import Sample
- from master.spectrum import Spectrum
+ from masster.chromatogram import Chromatogram
+ from masster.study.defaults import fill_defaults
+ from masster.sample.sample import Sample
+ from masster.spectrum import Spectrum
 
 
  # Pre-import heavy modules to avoid repeated loading in add_sample()
@@ -94,9 +94,7 @@ def add(
 
  if len(files) > 0:
  # Limit files if max_files is specified
- remaining_slots = (
- max_files - counter if max_files is not None else len(files)
- )
+ remaining_slots = max_files - counter if max_files is not None else len(files)
  files = files[:remaining_slots]
 
  self.logger.debug(f"Found {len(files)} {ext} files")
@@ -119,9 +117,7 @@ def add(
 
  # Batch process all files of this extension using ultra-optimized method
  if files_to_process:
- self.logger.debug(
- f"Batch processing {len(files_to_process)} {ext} files",
- )
+ self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
  successful = self._add_samples_batch(
  files_to_process,
  reset=reset,
@@ -260,8 +256,7 @@ def _fill_chrom_single_impl(
  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number)
- & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(
  f"Features to fill: {original_count} -> {len(uids)}",
@@ -276,15 +271,13 @@ def _fill_chrom_single_impl(
  # Build lookup dictionaries
  self.logger.debug("Building lookup dictionaries...")
  consensus_info = {}
- consensus_subset = self.consensus_df.select(
- [
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- "mz",
- "rt",
- ],
- ).filter(pl.col("consensus_uid").is_in(uids))
+ consensus_subset = self.consensus_df.select([
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ "mz",
+ "rt",
+ ]).filter(pl.col("consensus_uid").is_in(uids))
 
  for row in consensus_subset.iter_rows(named=True):
  consensus_info[row["consensus_uid"]] = {
@@ -451,13 +444,11 @@ def _fill_chrom_single_impl(
  }
 
  new_features.append(new_feature)
- new_mapping.append(
- {
- "consensus_uid": consensus_uid,
- "sample_uid": sample_uid,
- "feature_uid": feature_uid,
- },
- )
+ new_mapping.append({
+ "consensus_uid": consensus_uid,
+ "sample_uid": sample_uid,
+ "feature_uid": feature_uid,
+ })
  counter += 1
 
  # Add new features to DataFrames
@@ -480,10 +471,7 @@ def _fill_chrom_single_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if (
- key in ["mz", "rt", "intensity", "area", "height"]
- and value is not None
- ):
+ if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -530,7 +518,7 @@ def fill_single(self, **kwargs):
  min_samples_abs: Absolute minimum sample threshold (default: 2)
  """
  # parameters initialization
- from master.study.defaults import fill_defaults
+ from masster.study.defaults import fill_defaults
 
  params = fill_defaults()
 
@@ -702,13 +690,11 @@ def _process_sample_for_parallel_fill(
  }
 
  new_features.append(new_feature)
- new_mapping.append(
- {
- "consensus_uid": consensus_uid,
- "sample_uid": sample_uid,
- "feature_uid": feature_uid,
- },
- )
+ new_mapping.append({
+ "consensus_uid": consensus_uid,
+ "sample_uid": sample_uid,
+ "feature_uid": feature_uid,
+ })
  counter += 1
 
  return new_features, new_mapping, counter
@@ -754,8 +740,7 @@ def _fill_chrom_impl(
  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number)
- & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
 
@@ -782,15 +767,13 @@ def _fill_chrom_impl(
  # Build lookup dictionaries
  self.logger.debug("Building lookup dictionaries...")
  consensus_info = {}
- consensus_subset = self.consensus_df.select(
- [
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- "mz",
- "rt",
- ],
- ).filter(pl.col("consensus_uid").is_in(uids))
+ consensus_subset = self.consensus_df.select([
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ "mz",
+ "rt",
+ ]).filter(pl.col("consensus_uid").is_in(uids))
 
  for row in consensus_subset.iter_rows(named=True):
  consensus_info[row["consensus_uid"]] = {
@@ -807,13 +790,11 @@ def _fill_chrom_impl(
  for row in self.samples_df.filter(
  pl.col("sample_uid").is_in(unique_sample_uids),
  ).iter_rows(named=True):
- samples_to_process.append(
- {
- "sample_name": row["sample_name"],
- "sample_uid": row["sample_uid"],
- "sample_path": row["sample_path"],
- },
- )
+ samples_to_process.append({
+ "sample_name": row["sample_name"],
+ "sample_uid": row["sample_uid"],
+ "sample_path": row["sample_path"],
+ })
 
  total_missing = len(missing_combinations_df)
  self.logger.debug(
@@ -821,9 +802,7 @@ def _fill_chrom_impl(
  )
 
  # Calculate current max feature_uid to avoid conflicts
- features_df_max_uid = (
- self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
- )
+ features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
 
  # Process samples in parallel
  all_new_features: list[dict] = []
@@ -897,10 +876,7 @@ def _fill_chrom_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if (
- key in ["mz", "rt", "intensity", "area", "height"]
- and value is not None
- ):
+ if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -949,10 +925,7 @@ def fill(self, **kwargs):
  """
  # parameters initialization
  params = fill_defaults()
- num_workers = kwargs.get(
- "num_workers",
- 4,
- ) # Default parameter not in defaults class
+ num_workers = kwargs.get("num_workers", 4) # Default parameter not in defaults class
 
  for key, value in kwargs.items():
  if isinstance(value, fill_defaults):
@@ -1015,9 +988,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
  .agg(pl.count("feature_uid").alias("count"))
  )
 
- total_existing = (
- consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
- )
+ total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
 
  # If >95% filled, likely no gaps (common case)
  if total_existing >= total_possible * 0.95:
@@ -1036,12 +1007,10 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
  # Get existing combinations for target UIDs only
  existing_by_consensus = {}
- for consensus_uid, feature_uid in self.consensus_mapping_df.select(
- [
- "consensus_uid",
- "feature_uid",
- ],
- ).iter_rows():
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select([
+ "consensus_uid",
+ "feature_uid",
+ ]).iter_rows():
  if consensus_uid in uids_set and feature_uid in feature_to_sample:
  if consensus_uid not in existing_by_consensus:
  existing_by_consensus[consensus_uid] = set()
@@ -1049,9 +1018,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
  # Get sample info once
  all_samples = list(
- self.samples_df.select(
- ["sample_uid", "sample_name", "sample_path"],
- ).iter_rows(),
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
  )
 
  # Check for missing combinations
@@ -1059,17 +1026,13 @@ def _get_missing_consensus_sample_combinations(self, uids):
  existing_samples = existing_by_consensus.get(consensus_uid, set())
  for sample_uid, sample_name, sample_path in all_samples:
  if sample_uid not in existing_samples:
- missing_combinations.append(
- (consensus_uid, sample_uid, sample_name, sample_path),
- )
+ missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
 
  return missing_combinations
 
  else:
  # For studies with many gaps, use bulk operations
- self.logger.debug(
- f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
- )
+ self.logger.debug(f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization")
 
  # Build efficient lookups
  uids_set = set(uids)
@@ -1080,20 +1043,16 @@ def _get_missing_consensus_sample_combinations(self, uids):
  # Build existing combinations set
  existing_combinations = {
  (consensus_uid, feature_to_sample[feature_uid])
- for consensus_uid, feature_uid in self.consensus_mapping_df.select(
- [
- "consensus_uid",
- "feature_uid",
- ],
- ).iter_rows()
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select([
+ "consensus_uid",
+ "feature_uid",
+ ]).iter_rows()
  if consensus_uid in uids_set and feature_uid in feature_to_sample
  }
 
  # Get all sample info
  all_samples = list(
- self.samples_df.select(
- ["sample_uid", "sample_name", "sample_path"],
- ).iter_rows(),
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
  )
 
  # Generate all missing combinations
@@ -1162,10 +1121,7 @@ def sanitize(self):
  for ms2_specs in row_data["ms2_specs"]:
  if not isinstance(ms2_specs, Spectrum):
  try:
- new_ms2_specs = Spectrum(
- mz=np.array([0]),
- inty=np.array([0]),
- )
+ new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
  if hasattr(ms2_specs, "__dict__"):
  new_ms2_specs.from_dict(ms2_specs.__dict__)
  else:
@@ -1204,8 +1160,8 @@ def sanitize(self):
  def load_features(self):
  """
  Load features by reconstructing FeatureMaps from the processed features_df data.
-
- This ensures that the loaded FeatureMaps contain the same processed features
+
+ This ensures that the loaded FeatureMaps contain the same processed features
  as stored in features_df, rather than loading raw features from .featureXML files
  which may not match the processed data after filtering, alignment, etc.
  """
@@ -1213,25 +1169,25 @@ def load_features(self):
  import pyopenms as oms
  from tqdm import tqdm
  from datetime import datetime
-
+
  self.features_maps = []
-
+
  # Check if features_df exists and is not empty
  if self.features_df is None:
  self.logger.warning("features_df is None. Falling back to XML loading.")
  self._load_features_from_xml()
  return
-
+
  if len(self.features_df) == 0:
  self.logger.warning("features_df is empty. Falling back to XML loading.")
  self._load_features_from_xml()
  return
-
+
  # If we get here, we should use the new method
  self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
+
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
  # Process each sample in order
  for sample_index, row_dict in tqdm(
  enumerate(self.samples_df.iter_rows(named=True)),
@@ -1239,39 +1195,37 @@ def load_features(self):
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
  disable=tdqm_disable,
  ):
- sample_uid = row_dict["sample_uid"]
- sample_name = row_dict["sample_name"]
-
+ sample_uid = row_dict['sample_uid']
+ sample_name = row_dict['sample_name']
+
  # Get features for this sample from features_df
- sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+ sample_features = self.features_df.filter(pl.col('sample_uid') == sample_uid)
+
  # Create new FeatureMap
  feature_map = oms.FeatureMap()
-
+
  # Convert DataFrame features to OpenMS Features
  for feature_row in sample_features.iter_rows(named=True):
  feature = oms.Feature()
-
+
  # Set properties from DataFrame (handle missing values gracefully)
  try:
- feature.setUniqueId(int(feature_row["feature_id"]))
- feature.setMZ(float(feature_row["mz"]))
- feature.setRT(float(feature_row["rt"]))
- feature.setIntensity(float(feature_row["inty"]))
- feature.setOverallQuality(float(feature_row["quality"]))
- feature.setCharge(int(feature_row["charge"]))
-
+ feature.setUniqueId(int(feature_row['feature_id']))
+ feature.setMZ(float(feature_row['mz']))
+ feature.setRT(float(feature_row['rt']))
+ feature.setIntensity(float(feature_row['inty']))
+ feature.setOverallQuality(float(feature_row['quality']))
+ feature.setCharge(int(feature_row['charge']))
+
  # Add to feature map
  feature_map.push_back(feature)
  except (ValueError, TypeError) as e:
  self.logger.warning(f"Skipping feature due to conversion error: {e}")
  continue
-
+
  self.features_maps.append(feature_map)
-
- self.logger.debug(
- f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
- )
+
+ self.logger.debug(f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.")
 
 
  def _load_features_from_xml(self):
@@ -1326,14 +1280,7 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
  self.logger.debug(f"Loaded consensus map from {filename}.")
 
 
- def _add_samples_batch(
- self,
- files,
- reset=False,
- adducts=None,
- blacklist=None,
- fast=True,
- ):
+ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
  """
  Optimized batch addition of samples.
 
@@ -1356,9 +1303,7 @@ def _add_samples_batch(
  if blacklist is None:
  blacklist = set()
 
- self.logger.debug(
- f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...",
- )
+ self.logger.debug(f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...")
 
  successful_additions = 0
  failed_additions = 0
@@ -1415,9 +1360,7 @@ def _add_samples_batch(
  # Color assignment done once for all samples
  self._sample_color_reset_optimized()
 
- self.logger.debug(
- f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
- )
+ self.logger.debug(f"Add samples complete: {successful_additions} successful, {failed_additions} failed")
 
  return successful_additions
 
@@ -1463,7 +1406,7 @@ def _add_sample_optimized(
  # Load sample
  ddaobj = Sample()
  ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
  # Try optimized loading first (study-specific, skips ms1_df for better performance)
 
  if file.endswith(".sample5"):
@@ -1499,7 +1442,7 @@ def _add_sample_optimized(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- # self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ #self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1512,14 +1455,8 @@ def _add_sample_optimized(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if (
- hasattr(ddaobj, "scans_df")
- and ddaobj.scans_df is not None
- and not ddaobj.scans_df.is_empty()
- ):
- scan_counts = (
- ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
- )
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1530,23 +1467,21 @@ def _add_sample_optimized(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame(
- {
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- },
- )
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1588,9 +1523,7 @@ def _add_sample_optimized(
  # - No type casting loops
  # - No sample_color_reset()
 
- self.logger.debug(
- f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
- )
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)")
  return True
 
 
@@ -1634,7 +1567,7 @@ def _add_sample_standard(
  ddaobj = Sample()
  ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
  # Use standard loading method that loads all data including ms1_df
-
+
  if file.endswith(".sample5"):
  ddaobj.load(file)
  # restore _oms_features_map
@@ -1668,7 +1601,7 @@ def _add_sample_standard(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+ #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1681,14 +1614,8 @@ def _add_sample_standard(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if (
- hasattr(ddaobj, "scans_df")
- and ddaobj.scans_df is not None
- and not ddaobj.scans_df.is_empty()
- ):
- scan_counts = (
- ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
- )
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1699,23 +1626,21 @@ def _add_sample_standard(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame(
- {
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- },
- )
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1750,9 +1675,7 @@ def _add_sample_standard(
  # Use diagonal concatenation for flexibility
  self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
- self.logger.debug(
- f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
- )
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
  return True
  ## COMMENT AR: Is this intentional?
  # Use standard loading method that loads all data including ms1_df
@@ -1780,7 +1703,7 @@ def _add_sample_standard(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+ #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1793,14 +1716,8 @@ def _add_sample_standard(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if (
- hasattr(ddaobj, "scans_df")
- and ddaobj.scans_df is not None
- and not ddaobj.scans_df.is_empty()
- ):
- scan_counts = (
- ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
- )
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1811,23 +1728,21 @@ def _add_sample_standard(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame(
- {
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- },
- )
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1862,9 +1777,7 @@ def _add_sample_standard(
  # Use diagonal concatenation for flexibility
  self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
- self.logger.debug(
- f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
- )
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
  return True
 