masster 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

Files changed (39)
  1. masster/__init__.py +8 -8
  2. masster/chromatogram.py +1 -1
  3. masster/data/libs/urine.csv +3 -3
  4. masster/logger.py +11 -11
  5. masster/sample/__init__.py +1 -1
  6. masster/sample/adducts.py +338 -264
  7. masster/sample/defaults/find_adducts_def.py +21 -8
  8. masster/sample/h5.py +561 -282
  9. masster/sample/helpers.py +131 -75
  10. masster/sample/lib.py +4 -4
  11. masster/sample/load.py +31 -17
  12. masster/sample/parameters.py +1 -1
  13. masster/sample/plot.py +7 -7
  14. masster/sample/processing.py +117 -87
  15. masster/sample/sample.py +103 -90
  16. masster/sample/sample5_schema.json +44 -44
  17. masster/sample/save.py +35 -12
  18. masster/spectrum.py +1 -1
  19. masster/study/__init__.py +1 -1
  20. masster/study/defaults/align_def.py +5 -1
  21. masster/study/defaults/identify_def.py +3 -1
  22. masster/study/defaults/study_def.py +58 -25
  23. masster/study/export.py +360 -210
  24. masster/study/h5.py +560 -158
  25. masster/study/helpers.py +496 -203
  26. masster/study/helpers_optimized.py +1 -1
  27. masster/study/id.py +538 -349
  28. masster/study/load.py +233 -143
  29. masster/study/plot.py +71 -71
  30. masster/study/processing.py +456 -254
  31. masster/study/save.py +15 -5
  32. masster/study/study.py +213 -131
  33. masster/study/study5_schema.json +149 -149
  34. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/METADATA +3 -1
  35. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/RECORD +39 -39
  36. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  37. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/load.py CHANGED
@@ -10,10 +10,10 @@ import pyopenms as oms
 
  from tqdm import tqdm
 
- from masster.chromatogram import Chromatogram
- from masster.study.defaults import fill_defaults
- from masster.sample.sample import Sample
- from masster.spectrum import Spectrum
+ from master.chromatogram import Chromatogram
+ from master.study.defaults import fill_defaults
+ from master.sample.sample import Sample
+ from master.spectrum import Spectrum
 
 
  # Pre-import heavy modules to avoid repeated loading in add_sample()
@@ -94,7 +94,9 @@ def add(
 
  if len(files) > 0:
  # Limit files if max_files is specified
- remaining_slots = max_files - counter if max_files is not None else len(files)
+ remaining_slots = (
+ max_files - counter if max_files is not None else len(files)
+ )
  files = files[:remaining_slots]
 
  self.logger.debug(f"Found {len(files)} {ext} files")
@@ -117,7 +119,9 @@ def add(
 
  # Batch process all files of this extension using ultra-optimized method
  if files_to_process:
- self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+ self.logger.debug(
+ f"Batch processing {len(files_to_process)} {ext} files",
+ )
  successful = self._add_samples_batch(
  files_to_process,
  reset=reset,
@@ -140,6 +144,9 @@ def add(
  )
  else:
  self.logger.debug(f"Successfully added {counter} samples to the study.")
+
+ # Return a simple summary to suppress marimo's automatic object display
+ return f"Added {counter} samples to study"
 
 
  # TODO type is not used
@@ -256,7 +263,8 @@ def _fill_chrom_single_impl(
  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number)
+ & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(
  f"Features to fill: {original_count} -> {len(uids)}",
@@ -271,13 +279,15 @@ def _fill_chrom_single_impl(
  # Build lookup dictionaries
  self.logger.debug("Building lookup dictionaries...")
  consensus_info = {}
- consensus_subset = self.consensus_df.select([
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- "mz",
- "rt",
- ]).filter(pl.col("consensus_uid").is_in(uids))
+ consensus_subset = self.consensus_df.select(
+ [
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ "mz",
+ "rt",
+ ],
+ ).filter(pl.col("consensus_uid").is_in(uids))
 
  for row in consensus_subset.iter_rows(named=True):
  consensus_info[row["consensus_uid"]] = {
@@ -444,11 +454,13 @@ def _fill_chrom_single_impl(
  }
 
  new_features.append(new_feature)
- new_mapping.append({
- "consensus_uid": consensus_uid,
- "sample_uid": sample_uid,
- "feature_uid": feature_uid,
- })
+ new_mapping.append(
+ {
+ "consensus_uid": consensus_uid,
+ "sample_uid": sample_uid,
+ "feature_uid": feature_uid,
+ },
+ )
  counter += 1
 
  # Add new features to DataFrames
@@ -471,7 +483,10 @@ def _fill_chrom_single_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+ if (
+ key in ["mz", "rt", "intensity", "area", "height"]
+ and value is not None
+ ):
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -518,7 +533,7 @@ def fill_single(self, **kwargs):
  min_samples_abs: Absolute minimum sample threshold (default: 2)
  """
  # parameters initialization
- from masster.study.defaults import fill_defaults
+ from master.study.defaults import fill_defaults
 
  params = fill_defaults()
 
@@ -690,11 +705,13 @@ def _process_sample_for_parallel_fill(
  }
 
  new_features.append(new_feature)
- new_mapping.append({
- "consensus_uid": consensus_uid,
- "sample_uid": sample_uid,
- "feature_uid": feature_uid,
- })
+ new_mapping.append(
+ {
+ "consensus_uid": consensus_uid,
+ "sample_uid": sample_uid,
+ "feature_uid": feature_uid,
+ },
+ )
  counter += 1
 
  return new_features, new_mapping, counter
@@ -740,7 +757,8 @@ def _fill_chrom_impl(
  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number)
+ & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
 
@@ -767,13 +785,15 @@ def _fill_chrom_impl(
  # Build lookup dictionaries
  self.logger.debug("Building lookup dictionaries...")
  consensus_info = {}
- consensus_subset = self.consensus_df.select([
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- "mz",
- "rt",
- ]).filter(pl.col("consensus_uid").is_in(uids))
+ consensus_subset = self.consensus_df.select(
+ [
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ "mz",
+ "rt",
+ ],
+ ).filter(pl.col("consensus_uid").is_in(uids))
 
  for row in consensus_subset.iter_rows(named=True):
  consensus_info[row["consensus_uid"]] = {
@@ -790,11 +810,13 @@ def _fill_chrom_impl(
  for row in self.samples_df.filter(
  pl.col("sample_uid").is_in(unique_sample_uids),
  ).iter_rows(named=True):
- samples_to_process.append({
- "sample_name": row["sample_name"],
- "sample_uid": row["sample_uid"],
- "sample_path": row["sample_path"],
- })
+ samples_to_process.append(
+ {
+ "sample_name": row["sample_name"],
+ "sample_uid": row["sample_uid"],
+ "sample_path": row["sample_path"],
+ },
+ )
 
  total_missing = len(missing_combinations_df)
  self.logger.debug(
@@ -802,7 +824,9 @@ def _fill_chrom_impl(
  )
 
  # Calculate current max feature_uid to avoid conflicts
- features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
+ features_df_max_uid = (
+ self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
+ )
 
  # Process samples in parallel
  all_new_features: list[dict] = []
@@ -876,7 +900,10 @@ def _fill_chrom_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+ if (
+ key in ["mz", "rt", "intensity", "area", "height"]
+ and value is not None
+ ):
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -925,7 +952,10 @@ def fill(self, **kwargs):
  """
  # parameters initialization
  params = fill_defaults()
- num_workers = kwargs.get("num_workers", 4) # Default parameter not in defaults class
+ num_workers = kwargs.get(
+ "num_workers",
+ 4,
+ ) # Default parameter not in defaults class
 
  for key, value in kwargs.items():
  if isinstance(value, fill_defaults):
@@ -988,7 +1018,9 @@ def _get_missing_consensus_sample_combinations(self, uids):
  .agg(pl.count("feature_uid").alias("count"))
  )
 
- total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+ total_existing = (
+ consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+ )
 
  # If >95% filled, likely no gaps (common case)
  if total_existing >= total_possible * 0.95:
@@ -1007,10 +1039,12 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
  # Get existing combinations for target UIDs only
  existing_by_consensus = {}
- for consensus_uid, feature_uid in self.consensus_mapping_df.select([
- "consensus_uid",
- "feature_uid",
- ]).iter_rows():
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(
+ [
+ "consensus_uid",
+ "feature_uid",
+ ],
+ ).iter_rows():
  if consensus_uid in uids_set and feature_uid in feature_to_sample:
  if consensus_uid not in existing_by_consensus:
  existing_by_consensus[consensus_uid] = set()
@@ -1018,7 +1052,9 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
  # Get sample info once
  all_samples = list(
- self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
+ self.samples_df.select(
+ ["sample_uid", "sample_name", "sample_path"],
+ ).iter_rows(),
  )
 
  # Check for missing combinations
@@ -1026,13 +1062,17 @@ def _get_missing_consensus_sample_combinations(self, uids):
  existing_samples = existing_by_consensus.get(consensus_uid, set())
  for sample_uid, sample_name, sample_path in all_samples:
  if sample_uid not in existing_samples:
- missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+ missing_combinations.append(
+ (consensus_uid, sample_uid, sample_name, sample_path),
+ )
 
  return missing_combinations
 
  else:
  # For studies with many gaps, use bulk operations
- self.logger.debug(f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization")
+ self.logger.debug(
+ f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
+ )
 
  # Build efficient lookups
  uids_set = set(uids)
@@ -1043,16 +1083,20 @@ def _get_missing_consensus_sample_combinations(self, uids):
  # Build existing combinations set
  existing_combinations = {
  (consensus_uid, feature_to_sample[feature_uid])
- for consensus_uid, feature_uid in self.consensus_mapping_df.select([
- "consensus_uid",
- "feature_uid",
- ]).iter_rows()
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(
+ [
+ "consensus_uid",
+ "feature_uid",
+ ],
+ ).iter_rows()
  if consensus_uid in uids_set and feature_uid in feature_to_sample
  }
 
  # Get all sample info
  all_samples = list(
- self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
+ self.samples_df.select(
+ ["sample_uid", "sample_name", "sample_path"],
+ ).iter_rows(),
  )
 
  # Generate all missing combinations
@@ -1121,7 +1165,10 @@ def sanitize(self):
  for ms2_specs in row_data["ms2_specs"]:
  if not isinstance(ms2_specs, Spectrum):
  try:
- new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
+ new_ms2_specs = Spectrum(
+ mz=np.array([0]),
+ inty=np.array([0]),
+ )
  if hasattr(ms2_specs, "__dict__"):
  new_ms2_specs.from_dict(ms2_specs.__dict__)
  else:
@@ -1160,8 +1207,8 @@ def sanitize(self):
  def load_features(self):
  """
  Load features by reconstructing FeatureMaps from the processed features_df data.
-
- This ensures that the loaded FeatureMaps contain the same processed features
+
+ This ensures that the loaded FeatureMaps contain the same processed features
  as stored in features_df, rather than loading raw features from .featureXML files
  which may not match the processed data after filtering, alignment, etc.
  """
@@ -1169,25 +1216,25 @@ def load_features(self):
  import pyopenms as oms
  from tqdm import tqdm
  from datetime import datetime
-
+
  self.features_maps = []
-
+
  # Check if features_df exists and is not empty
  if self.features_df is None:
  self.logger.warning("features_df is None. Falling back to XML loading.")
  self._load_features_from_xml()
  return
-
+
  if len(self.features_df) == 0:
  self.logger.warning("features_df is empty. Falling back to XML loading.")
  self._load_features_from_xml()
  return
-
+
  # If we get here, we should use the new method
  self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
+
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
  # Process each sample in order
  for sample_index, row_dict in tqdm(
  enumerate(self.samples_df.iter_rows(named=True)),
@@ -1195,37 +1242,39 @@ def load_features(self):
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
  disable=tdqm_disable,
  ):
- sample_uid = row_dict['sample_uid']
- sample_name = row_dict['sample_name']
-
+ sample_uid = row_dict["sample_uid"]
+ sample_name = row_dict["sample_name"]
+
  # Get features for this sample from features_df
- sample_features = self.features_df.filter(pl.col('sample_uid') == sample_uid)
-
+ sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
+
  # Create new FeatureMap
  feature_map = oms.FeatureMap()
-
+
  # Convert DataFrame features to OpenMS Features
  for feature_row in sample_features.iter_rows(named=True):
  feature = oms.Feature()
-
+
  # Set properties from DataFrame (handle missing values gracefully)
  try:
- feature.setUniqueId(int(feature_row['feature_id']))
- feature.setMZ(float(feature_row['mz']))
- feature.setRT(float(feature_row['rt']))
- feature.setIntensity(float(feature_row['inty']))
- feature.setOverallQuality(float(feature_row['quality']))
- feature.setCharge(int(feature_row['charge']))
-
+ feature.setUniqueId(int(feature_row["feature_id"]))
+ feature.setMZ(float(feature_row["mz"]))
+ feature.setRT(float(feature_row["rt"]))
+ feature.setIntensity(float(feature_row["inty"]))
+ feature.setOverallQuality(float(feature_row["quality"]))
+ feature.setCharge(int(feature_row["charge"]))
+
  # Add to feature map
  feature_map.push_back(feature)
  except (ValueError, TypeError) as e:
  self.logger.warning(f"Skipping feature due to conversion error: {e}")
  continue
-
+
  self.features_maps.append(feature_map)
-
- self.logger.debug(f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.")
+
+ self.logger.debug(
+ f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
+ )
 
 
  def _load_features_from_xml(self):
@@ -1280,7 +1329,14 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
  self.logger.debug(f"Loaded consensus map from {filename}.")
 
 
- def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+ def _add_samples_batch(
+ self,
+ files,
+ reset=False,
+ adducts=None,
+ blacklist=None,
+ fast=True,
+ ):
  """
  Optimized batch addition of samples.
 
@@ -1303,7 +1359,9 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
  if blacklist is None:
  blacklist = set()
 
- self.logger.debug(f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...")
+ self.logger.debug(
+ f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...",
+ )
 
  successful_additions = 0
  failed_additions = 0
@@ -1360,7 +1418,9 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
  # Color assignment done once for all samples
  self._sample_color_reset_optimized()
 
- self.logger.debug(f"Add samples complete: {successful_additions} successful, {failed_additions} failed")
+ self.logger.debug(
+ f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
+ )
 
  return successful_additions
 
@@ -1406,7 +1466,7 @@ def _add_sample_optimized(
  # Load sample
  ddaobj = Sample()
  ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
  # Try optimized loading first (study-specific, skips ms1_df for better performance)
 
  if file.endswith(".sample5"):
@@ -1442,7 +1502,7 @@ def _add_sample_optimized(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- #self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ # self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1455,8 +1515,14 @@ def _add_sample_optimized(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
- scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ if (
+ hasattr(ddaobj, "scans_df")
+ and ddaobj.scans_df is not None
+ and not ddaobj.scans_df.is_empty()
+ ):
+ scan_counts = (
+ ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ )
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1467,21 +1533,23 @@ def _add_sample_optimized(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame({
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- })
+ new_sample = pl.DataFrame(
+ {
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ },
+ )
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1523,7 +1591,9 @@ def _add_sample_optimized(
  # - No type casting loops
  # - No sample_color_reset()
 
- self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)")
+ self.logger.debug(
+ f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
+ )
  return True
 
 
@@ -1567,7 +1637,7 @@ def _add_sample_standard(
  ddaobj = Sample()
  ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
  # Use standard loading method that loads all data including ms1_df
-
+
  if file.endswith(".sample5"):
  ddaobj.load(file)
  # restore _oms_features_map
@@ -1601,7 +1671,7 @@ def _add_sample_standard(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+ # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1614,8 +1684,14 @@ def _add_sample_standard(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
- scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ if (
+ hasattr(ddaobj, "scans_df")
+ and ddaobj.scans_df is not None
+ and not ddaobj.scans_df.is_empty()
+ ):
+ scan_counts = (
+ ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ )
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1626,21 +1702,23 @@ def _add_sample_standard(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame({
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- })
+ new_sample = pl.DataFrame(
+ {
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ },
+ )
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1675,7 +1753,9 @@ def _add_sample_standard(
  # Use diagonal concatenation for flexibility
  self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
- self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
+ self.logger.debug(
+ f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
+ )
  return True
  ## COMMENT AR: Is this intentional?
  # Use standard loading method that loads all data including ms1_df
@@ -1703,7 +1783,7 @@ def _add_sample_standard(
  # Handle file paths
  if file.endswith(".sample5"):
  final_sample_path = file
- #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+ # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
  else:
  if self.folder is not None:
  if not os.path.exists(self.folder):
@@ -1716,8 +1796,14 @@ def _add_sample_standard(
 
  # Efficient scan counting
  ms1_count = ms2_count = 0
- if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
- scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ if (
+ hasattr(ddaobj, "scans_df")
+ and ddaobj.scans_df is not None
+ and not ddaobj.scans_df.is_empty()
+ ):
+ scan_counts = (
+ ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ )
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
@@ -1728,21 +1814,23 @@ def _add_sample_standard(
 
  # Create sample entry
  next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
- new_sample = pl.DataFrame({
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path],
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set in batch at end
- "sample_group": [""],
- "sample_batch": [1],
- "sample_sequence": [next_sequence],
- "num_features": [int(ddaobj._oms_features_map.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- })
+ new_sample = pl.DataFrame(
+ {
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj._oms_features_map.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ },
+ )
 
  self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1777,7 +1865,9 @@ def _add_sample_standard(
  # Use diagonal concatenation for flexibility
  self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
- self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
+ self.logger.debug(
+ f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
+ )
  return True
 