masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/h5.py CHANGED
@@ -7,17 +7,11 @@ import polars as pl
 
 from typing import Any, Dict, List, Optional, Tuple
 
-from master.chromatogram import Chromatogram
-from master.spectrum import Spectrum
+from masster.chromatogram import Chromatogram
+from masster.spectrum import Spectrum
 
 
-def _save_sample5(
-    self,
-    filename=None,
-    include_ms1=True,
-    include_scans=True,
-    save_featurexml=False,
-):
+def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, save_featurexml=False):
     """
     Save the instance data to a sample5 HDF5 file with optimized compression.
 
@@ -62,16 +56,14 @@ def _save_sample5(
         return
 
     # synchronize feature_map if it exists
-    if hasattr(self, "_feature_map") and self._feature_map is not None:
+    if hasattr(self, '_feature_map') and self._feature_map is not None:
         self._features_sync()
 
    # if no extension is given, add .sample5
    if not filename.endswith(".sample5"):
        filename += ".sample5"
 
-    self.logger.debug(
-        f"Saving sample to {filename} with optimized LZF+shuffle compression",
-    )
+    self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
 
    # delete existing file if it exists
    if os.path.exists(filename):
@@ -124,18 +116,12 @@ def _save_sample5(
                 except Exception:
                     try:
                         # Try to convert to numeric using numpy
-                        numeric_data = np.array(
-                            [
-                                float(x)
-                                if x is not None
-                                and str(x)
-                                .replace(".", "")
-                                .replace("-", "")
-                                .isdigit()
-                                else np.nan
-                                for x in data
-                            ],
-                        )
+                        numeric_data = np.array([
+                            float(x)
+                            if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
+                            else np.nan
+                            for x in data
+                        ])
                         if not np.isnan(numeric_data).all():
                             scans_group.create_dataset(
                                 col,
@@ -163,12 +149,7 @@ def _save_sample5(
                             )
                             scans_group[col].attrs["dtype"] = "string_repr"
             else:
-                scans_group.create_dataset(
-                    col,
-                    data=data,
-                    compression="lzf",
-                    shuffle=True,
-                )
+                scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
                 scans_group[col].attrs["dtype"] = "native"
         scans_group.attrs["columns"] = list(scans_df.columns)
 
@@ -245,12 +226,7 @@ def _save_sample5(
                 data = features[col].to_list()
                 # convert None to 'None' strings
                 data = ["None" if x is None else x for x in data]
-                features_group.create_dataset(
-                    col,
-                    data=data,
-                    compression="lzf",
-                    shuffle=True,
-                )
+                features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
             else:
                 try:
                     data = features[col].to_numpy()
@@ -285,18 +261,16 @@ def _save_sample5(
         feature_map = self._get_feature_map()
         if feature_map is not None:
             # Temporarily set features for save operation
-            old_features = getattr(self, "_oms_features_map", None)
+            old_features = getattr(self, '_oms_features_map', None)
             self._oms_features_map = feature_map
             try:
-                self._save_featureXML(
-                    filename=filename.replace(".sample5", ".featureXML"),
-                )
+                self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
             finally:
                 # Restore original features value
                 if old_features is not None:
                     self._oms_features_map = old_features
                 else:
-                    delattr(self, "_oms_features_map")
+                    delattr(self, '_oms_features_map')
         else:
             self.logger.warning("Cannot save featureXML: no feature data available")
 
@@ -335,28 +309,22 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Load metadata
         if "metadata" in f:
             metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(
-                metadata_group.attrs.get("file_path", ""),
-            )
+            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
 
             # Load file_source if it exists, otherwise set it equal to file_path
             if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(
-                    metadata_group.attrs.get("file_source", ""),
-                )
+                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
             else:
                 self.file_source = self.file_path
 
-            self.file_type = decode_metadata_attr(
-                metadata_group.attrs.get("file_type", ""),
-            )
+            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
 
             # Load parameters from JSON in metadata
             loaded_data = load_parameters_from_metadata(metadata_group)
 
             # Always create a fresh sample_defaults object
-            from master.sample.defaults.sample_def import sample_defaults
+            from masster.sample.defaults.sample_def import sample_defaults
 
             self.parameters = sample_defaults()
 
@@ -400,23 +368,19 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Convert "None" strings and NaN values to proper null values
         for col in self.scans_df.columns:
             if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                self.scans_df = self.scans_df.with_columns(
-                    [
-                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                        .then(None)
-                        .otherwise(pl.col(col))
-                        .alias(col),
-                    ],
-                )
+                self.scans_df = self.scans_df.with_columns([
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ])
             elif self.scans_df[col].dtype in [
                 pl.Float64,
                 pl.Float32,
             ]:  # Float columns
-                self.scans_df = self.scans_df.with_columns(
-                    [
-                        pl.col(col).fill_nan(None).alias(col),
-                    ],
-                )
+                self.scans_df = self.scans_df.with_columns([
+                    pl.col(col).fill_nan(None).alias(col),
+                ])
 
         # update all columns with schema types
         for col in self.scans_df.columns:
@@ -434,9 +398,7 @@ def _load_sample5(self, filename: str, map: bool = False):
                         if self.scans_df[col].dtype == pl.Utf8:
                             # String data - convert to integer
                             self.scans_df = self.scans_df.with_columns(
-                                pl.col(col)
-                                .str.to_integer()
-                                .cast(eval(dtype_str)),
+                                pl.col(col).str.to_integer().cast(eval(dtype_str)),
                             )
                         elif self.scans_df[col].dtype in [
                             pl.Float64,
@@ -456,9 +418,7 @@ def _load_sample5(self, filename: str, map: bool = False):
                         if self.scans_df[col].dtype == pl.Utf8:
                             # String data - convert to float
                             self.scans_df = self.scans_df.with_columns(
-                                pl.col(col)
-                                .str.to_decimal()
-                                .cast(eval(dtype_str)),
+                                pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                             )
                         else:
                             # Try direct casting
@@ -482,9 +442,7 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     self.scans_df = self.scans_df.with_columns(
                                         pl.col(col)
                                         .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                             return_dtype=pl.Utf8,
                                         )
                                         .cast(target_dtype),
@@ -493,9 +451,7 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     self.scans_df = self.scans_df.with_columns(
                                         pl.col(col)
                                         .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                             return_dtype=pl.Utf8,
                                         )
                                         .str.to_integer()
@@ -505,9 +461,7 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     self.scans_df = self.scans_df.with_columns(
                                         pl.col(col)
                                         .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                             return_dtype=pl.Utf8,
                                         )
                                         .str.to_decimal()
@@ -536,9 +490,7 @@ def _load_sample5(self, filename: str, map: bool = False):
         if "scans_df" in schema and "columns" in schema["scans_df"]:
             schema_column_order = list(schema["scans_df"]["columns"].keys())
             # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [
-                col for col in schema_column_order if col in self.scans_df.columns
-            ]
+            existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
             if existing_columns:
                 self.scans_df = self.scans_df.select(existing_columns)
 
@@ -665,29 +617,23 @@ def _load_sample5(self, filename: str, map: bool = False):
                if k in schema.get("features_df", {}).get("columns", {})
                and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
            }
-            regular_columns = {
-                k: v for k, v in data.items() if k not in object_columns
-            }
+            regular_columns = {k: v for k, v in data.items() if k not in object_columns}
 
            # Create DataFrame with regular columns first
            if regular_columns:
                self.features_df = pl.DataFrame(regular_columns)
                # Add Object columns one by one
                for col, values in object_columns.items():
-                    self.features_df = self.features_df.with_columns(
-                        [
-                            pl.Series(col, values, dtype=pl.Object),
-                        ],
-                    )
+                    self.features_df = self.features_df.with_columns([
+                        pl.Series(col, values, dtype=pl.Object),
+                    ])
            else:
                # Only Object columns
                self.features_df = pl.DataFrame()
                for col, values in object_columns.items():
-                    self.features_df = self.features_df.with_columns(
-                        [
-                            pl.Series(col, values, dtype=pl.Object),
-                        ],
-                    )
+                    self.features_df = self.features_df.with_columns([
+                        pl.Series(col, values, dtype=pl.Object),
+                    ])
 
        # update all columns with schema types (skip Object columns)
        for col in self.features_df.columns:
@@ -704,25 +650,16 @@ def _load_sample5(self, filename: str, map: bool = False):
                        # Convert to numeric first, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to integer
-                        self.features_df = (
-                            self.features_df.with_columns(
-                                pl.col(col)
-                                .str.to_integer()
-                                .cast(eval(dtype_str)),
-                            )
+                        self.features_df = self.features_df.with_columns(
+                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
                         )
                        elif self.features_df[col].dtype in [
                            pl.Float64,
                            pl.Float32,
                        ]:
                            # Float data - cast to integer with null handling for NaN values
-                        self.features_df = (
-                            self.features_df.with_columns(
-                                pl.col(col).cast(
-                                    eval(dtype_str),
-                                    strict=False,
-                                ),
-                            )
+                        self.features_df = self.features_df.with_columns(
+                            pl.col(col).cast(eval(dtype_str), strict=False),
                         )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -733,70 +670,50 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
                    elif "Float" in dtype_str:
                        # Convert to float, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to float
-                            self.features_df = (
-                                self.features_df.with_columns(
-                                    pl.col(col)
-                                    .str.to_decimal()
-                                    .cast(eval(dtype_str)),
-                                )
+                            self.features_df = self.features_df.with_columns(
+                                pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                             )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -807,59 +724,43 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
                    elif "Utf8" in dtype_str:
                        # Ensure it's string type
@@ -875,59 +776,43 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
            except Exception as e:
                self.logger.warning(
@@ -942,31 +827,23 @@ def _load_sample5(self, filename: str, map: bool = False):
        # This ensures "None" strings introduced by failed conversions are properly handled
        for col in self.features_df.columns:
            if self.features_df[col].dtype == pl.Utf8:  # String columns
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                        .then(None)
-                        .otherwise(pl.col(col))
-                        .alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ])
            # Float columns
            elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.col(col).fill_nan(None).alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.col(col).fill_nan(None).alias(col),
+                ])
 
        # Ensure column order matches schema order
        if "features_df" in schema and "columns" in schema["features_df"]:
            schema_column_order = list(schema["features_df"]["columns"].keys())
            # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [
-                col
-                for col in schema_column_order
-                if col in self.features_df.columns
-            ]
+            existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
            if existing_columns:
                self.features_df = self.features_df.select(existing_columns)
 
@@ -996,17 +873,13 @@ def _load_sample5(self, filename: str, map: bool = False):
                dtype_str = schema_columns[col]["dtype"]
                try:
                    if "Int" in dtype_str:
-                        self.ms1_df = self.ms1_df.with_columns(
-                            [
-                                pl.col(col).cast(pl.Int64, strict=False),
-                            ],
-                        )
+                        self.ms1_df = self.ms1_df.with_columns([
+                            pl.col(col).cast(pl.Int64, strict=False),
+                        ])
                    elif "Float" in dtype_str:
-                        self.ms1_df = self.ms1_df.with_columns(
-                            [
-                                pl.col(col).cast(pl.Float64, strict=False),
-                            ],
-                        )
+                        self.ms1_df = self.ms1_df.with_columns([
+                            pl.col(col).cast(pl.Float64, strict=False),
+                        ])
                except Exception as e:
                    self.logger.warning(
                        f"Failed to apply schema type {dtype_str} to column {col}: {e}",
@@ -1075,28 +948,22 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        # Load metadata
        if "metadata" in f:
            metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(
-                metadata_group.attrs.get("file_path", ""),
-            )
+            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
 
            # Load file_source if it exists, otherwise set it equal to file_path
            if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(
-                    metadata_group.attrs.get("file_source", ""),
-                )
+                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
            else:
                self.file_source = self.file_path
 
-            self.file_type = decode_metadata_attr(
-                metadata_group.attrs.get("file_type", ""),
-            )
+            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
            self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
 
            # Load parameters from JSON in metadata
            loaded_data = load_parameters_from_metadata(metadata_group)
 
            # Always create a fresh sample_defaults object
-            from master.sample.defaults.sample_def import sample_defaults
+            from masster.sample.defaults.sample_def import sample_defaults
 
            self.parameters = sample_defaults()
 
@@ -1140,23 +1007,19 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        # Convert "None" strings and NaN values to proper null values
        for col in self.scans_df.columns:
            if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                self.scans_df = self.scans_df.with_columns(
-                    [
-                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                        .then(None)
-                        .otherwise(pl.col(col))
-                        .alias(col),
-                    ],
-                )
+                self.scans_df = self.scans_df.with_columns([
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ])
            elif self.scans_df[col].dtype in [
                pl.Float64,
                pl.Float32,
            ]:  # Float columns
-                self.scans_df = self.scans_df.with_columns(
-                    [
-                        pl.col(col).fill_nan(None).alias(col),
-                    ],
-                )
+                self.scans_df = self.scans_df.with_columns([
+                    pl.col(col).fill_nan(None).alias(col),
+                ])
 
        # update all columns with schema types
        for col in self.scans_df.columns:
@@ -1174,9 +1037,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        if self.scans_df[col].dtype == pl.Utf8:
                            # String data - convert to integer
                            self.scans_df = self.scans_df.with_columns(
-                                pl.col(col)
-                                .str.to_integer()
-                                .cast(eval(dtype_str)),
+                                pl.col(col).str.to_integer().cast(eval(dtype_str)),
                            )
                        elif self.scans_df[col].dtype in [
                            pl.Float64,
@@ -1196,9 +1057,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        if self.scans_df[col].dtype == pl.Utf8:
                            # String data - convert to float
                            self.scans_df = self.scans_df.with_columns(
-                                pl.col(col)
-                                .str.to_decimal()
-                                .cast(eval(dtype_str)),
+                                pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                            )
                        else:
                            # Try direct casting
@@ -1222,9 +1081,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                    self.scans_df = self.scans_df.with_columns(
                                        pl.col(col)
                                        .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                            return_dtype=pl.Utf8,
                                        )
                                        .cast(target_dtype),
@@ -1233,9 +1090,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                    self.scans_df = self.scans_df.with_columns(
                                        pl.col(col)
                                        .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                            return_dtype=pl.Utf8,
                                        )
                                        .str.to_integer()
@@ -1245,9 +1100,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                    self.scans_df = self.scans_df.with_columns(
                                        pl.col(col)
                                        .map_elements(
-                                            lambda x: x.decode("utf-8")
-                                            if isinstance(x, bytes)
-                                            else str(x),
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
                                            return_dtype=pl.Utf8,
                                        )
                                        .str.to_decimal()
@@ -1276,9 +1129,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        if "scans_df" in schema and "columns" in schema["scans_df"]:
            schema_column_order = list(schema["scans_df"]["columns"].keys())
            # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [
-                col for col in schema_column_order if col in self.scans_df.columns
-            ]
+            existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
            if existing_columns:
                self.scans_df = self.scans_df.select(existing_columns)
 
@@ -1357,18 +1208,12 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        spectrum_list = []
                        for spec_data in json.loads(item):
                            if spec_data is not None:
-                                spectrum = Spectrum.from_json(
-                                    spec_data,
-                                )
+                                spectrum = Spectrum.from_json(spec_data)
                                spectrum_list.append(spectrum)
                            else:
                                spectrum_list.append(None)
                        reconstructed_data.append(spectrum_list)
-                    except (
-                        json.JSONDecodeError,
-                        ValueError,
-                        TypeError,
-                    ):
+                    except (json.JSONDecodeError, ValueError, TypeError):
                        reconstructed_data.append(None)
 
                data[col] = reconstructed_data
@@ -1384,13 +1229,10 @@ def _load_sample5_study(self, filename: str, map: bool = False):
            # Separate Object columns from regular columns to avoid astuple issues
            object_columns = {}
            regular_columns = {}
-
+
            for col, values in data.items():
                if col in schema.get("features_df", {}).get("columns", {}):
-                    if "Object" in schema["features_df"]["columns"][col].get(
-                        "dtype",
-                        "",
-                    ):
+                    if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
                        object_columns[col] = values
                    else:
                        regular_columns[col] = values
@@ -1403,48 +1245,38 @@ def _load_sample5_study(self, filename: str, map: bool = False):
            else:
                # If no regular columns, create empty DataFrame
                self.features_df = pl.DataFrame()
-
+
            # Add Object columns one by one
            for col, values in object_columns.items():
                if not self.features_df.is_empty():
                    self.features_df = self.features_df.with_columns(
-                        pl.Series(col, values, dtype=pl.Object).alias(col),
+                        pl.Series(col, values, dtype=pl.Object).alias(col)
                    )
                else:
                    # Create DataFrame with just this Object column
-                    self.features_df = pl.DataFrame(
-                        {col: values},
-                        schema={col: pl.Object},
-                    )
+                    self.features_df = pl.DataFrame({col: values}, schema={col: pl.Object})
 
        # Convert "None" strings and NaN values to proper null values for regular columns first
        for col in self.features_df.columns:
            # Skip Object columns - they're already properly reconstructed
            if col in schema.get("features_df", {}).get("columns", {}):
-                if "Object" in schema["features_df"]["columns"][col].get(
-                    "dtype",
-                    "",
-                ):
+                if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
                    continue
 
            if self.features_df[col].dtype == pl.Utf8:  # String columns
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                        .then(None)
-                        .otherwise(pl.col(col))
-                        .alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ])
            elif self.features_df[col].dtype in [
                pl.Float64,
                pl.Float32,
            ]:  # Float columns
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.col(col).fill_nan(None).alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.col(col).fill_nan(None).alias(col),
+                ])
 
        # update all columns with schema types
        for col in self.features_df.columns:
@@ -1461,25 +1293,16 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        # Convert to numeric first, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to integer
-                        self.features_df = (
-                            self.features_df.with_columns(
-                                pl.col(col)
-                                .str.to_integer()
-                                .cast(eval(dtype_str)),
-                            )
+                        self.features_df = self.features_df.with_columns(
+                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
                         )
                        elif self.features_df[col].dtype in [
                            pl.Float64,
                            pl.Float32,
                        ]:
                            # Float data - cast to integer with null handling for NaN values
-                        self.features_df = (
-                            self.features_df.with_columns(
-                                pl.col(col).cast(
-                                    eval(dtype_str),
-                                    strict=False,
-                                ),
-                            )
+                        self.features_df = self.features_df.with_columns(
+                            pl.col(col).cast(eval(dtype_str), strict=False),
                         )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -1490,70 +1313,50 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
                    elif "Float" in dtype_str:
                        # Convert to float, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to float
-                            self.features_df = (
-                                self.features_df.with_columns(
-                                    pl.col(col)
-                                    .str.to_decimal()
-                                    .cast(eval(dtype_str)),
-                                )
+                            self.features_df = self.features_df.with_columns(
+                                pl.col(col).str.to_decimal().cast(eval(dtype_str)),
                             )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -1564,59 +1367,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
                    elif "Utf8" in dtype_str:
                        # Ensure it's string type
@@ -1632,59 +1419,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_integer()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_integer()
+                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col)
-                                            .map_elements(
-                                                lambda x: x.decode("utf-8")
-                                                if isinstance(x, bytes)
-                                                else str(x),
-                                                return_dtype=pl.Utf8,
-                                            )
-                                            .str.to_decimal()
-                                            .cast(target_dtype),
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col)
+                                        .map_elements(
+                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                            return_dtype=pl.Utf8,
                                         )
+                                        .str.to_decimal()
+                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = (
-                                        self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
-                                        )
+                                    self.features_df = self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = (
-                                    self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
-                                    )
+                                self.features_df = self.features_df.with_columns(
+                                    pl.col(col).cast(target_dtype),
                                 )
            except Exception as e:
                self.logger.warning(
@@ -1699,31 +1470,23 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        # This ensures "None" strings introduced by failed conversions are properly handled
        for col in self.features_df.columns:
            if self.features_df[col].dtype == pl.Utf8:  # String columns
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                        .then(None)
-                        .otherwise(pl.col(col))
-                        .alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ])
            # Float columns
            elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                self.features_df = self.features_df.with_columns(
-                    [
-                        pl.col(col).fill_nan(None).alias(col),
-                    ],
-                )
+                self.features_df = self.features_df.with_columns([
+                    pl.col(col).fill_nan(None).alias(col),
+                ])
 
        # Ensure column order matches schema order
        if "features_df" in schema and "columns" in schema["features_df"]:
            schema_column_order = list(schema["features_df"]["columns"].keys())
            # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [
-                col
-                for col in schema_column_order
-                if col in self.features_df.columns
-            ]
+            existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
            if existing_columns:
                self.features_df = self.features_df.select(existing_columns)
 
@@ -1753,9 +1516,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
    # set self.label to basename without extension
    if self.label is None or self.label == "":
        self.label = os.path.splitext(os.path.basename(filename))[0]
-    self.logger.info(
-        f"Sample loaded successfully from {filename} (optimized for study)",
-    )
+    self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
 
 
 def load_schema(schema_path: str) -> Dict[str, Any]:
@@ -1803,20 +1564,13 @@ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
    cleaned_df = df
    for col in df.columns:
        if df[col].dtype == pl.Utf8:  # String columns
-            cleaned_df = cleaned_df.with_columns(
-                [
-                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                    .then(None)
-                    .otherwise(pl.col(col))
-                    .alias(col),
-                ],
-            )
+            cleaned_df = cleaned_df.with_columns([
+                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
+            ])
        elif df[col].dtype in [pl.Float64, pl.Float32]:  # Float columns
-            cleaned_df = cleaned_df.with_columns(
-                [
-                    pl.col(col).fill_nan(None).alias(col),
-                ],
-            )
+            cleaned_df = cleaned_df.with_columns([
+                pl.col(col).fill_nan(None).alias(col),
+            ])
    return cleaned_df
 
 
@@ -1852,12 +1606,7 @@ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataF
    return df
 
 
-def _cast_to_int(
-    df: pl.DataFrame,
-    col: str,
-    current_dtype: pl.DataType,
-    target_dtype: pl.DataType,
-) -> pl.DataFrame:
+def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
    """Helper function to cast column to integer type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
@@ -1869,12 +1618,7 @@ def _cast_to_int(
    return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
 
 
-def _cast_to_float(
-    df: pl.DataFrame,
-    col: str,
-    current_dtype: pl.DataType,
-    target_dtype: pl.DataType,
-) -> pl.DataFrame:
+def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
    """Helper function to cast column to float type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
@@ -1895,29 +1639,20 @@ def _cast_with_binary_handling(
    if target_dtype == pl.Utf8:
        return df.with_columns(
            pl.col(col)
-            .map_elements(
-                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                return_dtype=pl.Utf8,
-            )
+            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
            .cast(target_dtype),
        )
    elif "Int" in str(target_dtype):
        return df.with_columns(
            pl.col(col)
-            .map_elements(
-                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                return_dtype=pl.Utf8,
-            )
+            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
            .str.to_integer()
            .cast(target_dtype),
        )
    elif "Float" in str(target_dtype):
        return df.with_columns(
            pl.col(col)
-            .map_elements(
-                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                return_dtype=pl.Utf8,
-            )
+            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
            .str.to_decimal()
            .cast(target_dtype),
        )
@@ -1926,11 +1661,7 @@ def _cast_with_binary_handling(
        return df.with_columns(pl.col(col).cast(target_dtype))
 
 
-def apply_schema_to_dataframe(
-    df: pl.DataFrame,
-    schema: Dict[str, Any],
-    df_name: str,
-) -> pl.DataFrame:
+def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
    """
    Apply schema type casting to a Polars DataFrame.
 
@@ -2088,9 +1819,7 @@ def _create_dataframe_with_object_columns(
    schema_columns = schema.get(df_name, {}).get("columns", {})
 
    object_columns = {
-        k: v
-        for k, v in data.items()
-        if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
+        k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
    }
    regular_columns = {k: v for k, v in data.items() if k not in object_columns}
 
@@ -2145,17 +1874,13 @@ def load_ms1_dataframe_from_h5_group(
            dtype_str = schema_columns[col]["dtype"]
            try:
                if "Int" in dtype_str:
-                    ms1_df = ms1_df.with_columns(
-                        [
-                            pl.col(col).cast(pl.Int64, strict=False),
-                        ],
-                    )
+                    ms1_df = ms1_df.with_columns([
+                        pl.col(col).cast(pl.Int64, strict=False),
+                    ])
                elif "Float" in dtype_str:
-                    ms1_df = ms1_df.with_columns(
-                        [
-                            pl.col(col).cast(pl.Float64, strict=False),
-                        ],
-                    )
+                    ms1_df = ms1_df.with_columns([
+                        pl.col(col).cast(pl.Float64, strict=False),
+                    ])
            except Exception as e:
                if logger:
                    logger.warning(
@@ -2166,9 +1891,7 @@ def load_ms1_dataframe_from_h5_group(
    return clean_null_values_polars(ms1_df)
 
 
-def load_parameters_from_metadata(
-    metadata_group: h5py.Group,
-) -> Optional[Dict[str, Any]]:
+def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
    """
    Load parameters from HDF5 metadata group.
 
@@ -2215,8 +1938,6 @@ def create_h5_metadata_group(
    metadata_group = f.create_group("metadata")
    metadata_group.attrs["format"] = "master-sample5-1"
    metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
-    metadata_group.attrs["file_source"] = (
-        str(file_source) if file_source is not None else ""
-    )
+    metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
    metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
    metadata_group.attrs["label"] = str(label) if label is not None else ""