masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.
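Note: the diff below shows internal imports in masster/sample/h5.py changing from the masster package to master (for example, from masster.chromatogram import Chromatogram becomes from master.chromatogram import Chromatogram), and the metadata format string reads "master-sample5-1". This apparent one-letter rename may be the reason the release was flagged.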

Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +1 -1
  4. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  5. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  6. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  7. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  8. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  9. masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
  10. masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
  11. masster/data/libs/ccm.csv +120 -0
  12. masster/data/libs/urine.csv +4693 -0
  13. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  14. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  15. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  16. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  17. masster/logger.py +11 -11
  18. masster/sample/__init__.py +1 -1
  19. masster/sample/adducts.py +338 -264
  20. masster/sample/defaults/find_adducts_def.py +21 -8
  21. masster/sample/h5.py +561 -282
  22. masster/sample/helpers.py +131 -75
  23. masster/sample/lib.py +4 -4
  24. masster/sample/load.py +31 -17
  25. masster/sample/parameters.py +1 -1
  26. masster/sample/plot.py +7 -7
  27. masster/sample/processing.py +117 -87
  28. masster/sample/sample.py +103 -90
  29. masster/sample/sample5_schema.json +196 -0
  30. masster/sample/save.py +35 -12
  31. masster/spectrum.py +1 -1
  32. masster/study/__init__.py +1 -1
  33. masster/study/defaults/align_def.py +5 -1
  34. masster/study/defaults/identify_def.py +3 -1
  35. masster/study/defaults/study_def.py +58 -25
  36. masster/study/export.py +360 -210
  37. masster/study/h5.py +560 -158
  38. masster/study/helpers.py +496 -203
  39. masster/study/helpers_optimized.py +1 -1
  40. masster/study/id.py +538 -349
  41. masster/study/load.py +233 -143
  42. masster/study/plot.py +71 -71
  43. masster/study/processing.py +456 -254
  44. masster/study/save.py +15 -5
  45. masster/study/study.py +213 -131
  46. masster/study/study5_schema.json +360 -0
  47. masster-0.4.5.dist-info/METADATA +131 -0
  48. masster-0.4.5.dist-info/RECORD +71 -0
  49. masster-0.4.3.dist-info/METADATA +0 -791
  50. masster-0.4.3.dist-info/RECORD +0 -56
  51. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  52. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  53. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  54. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/sample/h5.py CHANGED
@@ -7,11 +7,17 @@ import polars as pl
 
 from typing import Any, Dict, List, Optional, Tuple
 
-from masster.chromatogram import Chromatogram
-from masster.spectrum import Spectrum
+from master.chromatogram import Chromatogram
+from master.spectrum import Spectrum
 
 
-def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, save_featurexml=False):
+def _save_sample5(
+    self,
+    filename=None,
+    include_ms1=True,
+    include_scans=True,
+    save_featurexml=False,
+):
     """
     Save the instance data to a sample5 HDF5 file with optimized compression.
 
@@ -56,14 +62,16 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
         return
 
     # synchronize feature_map if it exists
-    if hasattr(self, '_feature_map') and self._feature_map is not None:
+    if hasattr(self, "_feature_map") and self._feature_map is not None:
         self._features_sync()
 
     # if no extension is given, add .sample5
     if not filename.endswith(".sample5"):
        filename += ".sample5"
 
-    self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
+    self.logger.debug(
+        f"Saving sample to {filename} with optimized LZF+shuffle compression",
+    )
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -116,12 +124,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
             except Exception:
                 try:
                     # Try to convert to numeric using numpy
-                    numeric_data = np.array([
-                        float(x)
-                        if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
-                        else np.nan
-                        for x in data
-                    ])
+                    numeric_data = np.array(
+                        [
+                            float(x)
+                            if x is not None
+                            and str(x)
+                            .replace(".", "")
+                            .replace("-", "")
+                            .isdigit()
+                            else np.nan
+                            for x in data
+                        ],
+                    )
                     if not np.isnan(numeric_data).all():
                         scans_group.create_dataset(
                             col,
@@ -149,7 +163,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
                     )
                     scans_group[col].attrs["dtype"] = "string_repr"
                 else:
-                    scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
+                    scans_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="lzf",
+                        shuffle=True,
+                    )
                     scans_group[col].attrs["dtype"] = "native"
         scans_group.attrs["columns"] = list(scans_df.columns)
 
 
@@ -226,7 +245,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
226
245
  data = features[col].to_list()
227
246
  # convert None to 'None' strings
228
247
  data = ["None" if x is None else x for x in data]
229
- features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
248
+ features_group.create_dataset(
249
+ col,
250
+ data=data,
251
+ compression="lzf",
252
+ shuffle=True,
253
+ )
230
254
  else:
231
255
  try:
232
256
  data = features[col].to_numpy()
@@ -261,16 +285,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
         feature_map = self._get_feature_map()
         if feature_map is not None:
             # Temporarily set features for save operation
-            old_features = getattr(self, '_oms_features_map', None)
+            old_features = getattr(self, "_oms_features_map", None)
             self._oms_features_map = feature_map
             try:
-                self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
+                self._save_featureXML(
+                    filename=filename.replace(".sample5", ".featureXML"),
+                )
             finally:
                 # Restore original features value
                 if old_features is not None:
                     self._oms_features_map = old_features
                 else:
-                    delattr(self, '_oms_features_map')
+                    delattr(self, "_oms_features_map")
         else:
             self.logger.warning("Cannot save featureXML: no feature data available")
 
@@ -309,22 +335,28 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Load metadata
         if "metadata" in f:
             metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
+            self.file_path = decode_metadata_attr(
+                metadata_group.attrs.get("file_path", ""),
+            )
 
             # Load file_source if it exists, otherwise set it equal to file_path
             if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
+                self.file_source = decode_metadata_attr(
+                    metadata_group.attrs.get("file_source", ""),
+                )
             else:
                 self.file_source = self.file_path
 
-            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
+            self.file_type = decode_metadata_attr(
+                metadata_group.attrs.get("file_type", ""),
+            )
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
 
             # Load parameters from JSON in metadata
             loaded_data = load_parameters_from_metadata(metadata_group)
 
             # Always create a fresh sample_defaults object
-            from masster.sample.defaults.sample_def import sample_defaults
+            from master.sample.defaults.sample_def import sample_defaults
 
             self.parameters = sample_defaults()
 
@@ -368,19 +400,23 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Convert "None" strings and NaN values to proper null values
         for col in self.scans_df.columns:
             if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                self.scans_df = self.scans_df.with_columns([
-                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                    .then(None)
-                    .otherwise(pl.col(col))
-                    .alias(col),
-                ])
+                self.scans_df = self.scans_df.with_columns(
+                    [
+                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                        .then(None)
+                        .otherwise(pl.col(col))
+                        .alias(col),
+                    ],
+                )
             elif self.scans_df[col].dtype in [
                 pl.Float64,
                 pl.Float32,
             ]:  # Float columns
-                self.scans_df = self.scans_df.with_columns([
-                    pl.col(col).fill_nan(None).alias(col),
-                ])
+                self.scans_df = self.scans_df.with_columns(
+                    [
+                        pl.col(col).fill_nan(None).alias(col),
+                    ],
+                )
 
         # update all columns with schema types
         for col in self.scans_df.columns:
@@ -398,7 +434,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                     if self.scans_df[col].dtype == pl.Utf8:
                         # String data - convert to integer
                         self.scans_df = self.scans_df.with_columns(
-                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                            pl.col(col)
+                            .str.to_integer()
+                            .cast(eval(dtype_str)),
                         )
                     elif self.scans_df[col].dtype in [
                         pl.Float64,
@@ -418,7 +456,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                     if self.scans_df[col].dtype == pl.Utf8:
                         # String data - convert to float
                         self.scans_df = self.scans_df.with_columns(
-                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                            pl.col(col)
+                            .str.to_decimal()
+                            .cast(eval(dtype_str)),
                         )
                     else:
                         # Try direct casting
@@ -442,7 +482,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                             self.scans_df = self.scans_df.with_columns(
                                 pl.col(col)
                                 .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                     return_dtype=pl.Utf8,
                                 )
                                 .cast(target_dtype),
@@ -451,7 +493,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                             self.scans_df = self.scans_df.with_columns(
                                 pl.col(col)
                                 .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                     return_dtype=pl.Utf8,
                                 )
                                 .str.to_integer()
@@ -461,7 +505,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                             self.scans_df = self.scans_df.with_columns(
                                 pl.col(col)
                                 .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                     return_dtype=pl.Utf8,
                                 )
                                 .str.to_decimal()
@@ -490,7 +536,9 @@ def _load_sample5(self, filename: str, map: bool = False):
         if "scans_df" in schema and "columns" in schema["scans_df"]:
             schema_column_order = list(schema["scans_df"]["columns"].keys())
             # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
+            existing_columns = [
+                col for col in schema_column_order if col in self.scans_df.columns
+            ]
             if existing_columns:
                 self.scans_df = self.scans_df.select(existing_columns)
 
@@ -617,23 +665,29 @@ def _load_sample5(self, filename: str, map: bool = False):
                     if k in schema.get("features_df", {}).get("columns", {})
                     and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
                 }
-                regular_columns = {k: v for k, v in data.items() if k not in object_columns}
+                regular_columns = {
+                    k: v for k, v in data.items() if k not in object_columns
+                }
 
                 # Create DataFrame with regular columns first
                 if regular_columns:
                     self.features_df = pl.DataFrame(regular_columns)
                     # Add Object columns one by one
                     for col, values in object_columns.items():
-                        self.features_df = self.features_df.with_columns([
-                            pl.Series(col, values, dtype=pl.Object),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.Series(col, values, dtype=pl.Object),
+                            ],
+                        )
                 else:
                     # Only Object columns
                     self.features_df = pl.DataFrame()
                     for col, values in object_columns.items():
-                        self.features_df = self.features_df.with_columns([
-                            pl.Series(col, values, dtype=pl.Object),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.Series(col, values, dtype=pl.Object),
+                            ],
+                        )
 
                 # update all columns with schema types (skip Object columns)
                 for col in self.features_df.columns:
@@ -650,16 +704,25 @@ def _load_sample5(self, filename: str, map: bool = False):
                        # Convert to numeric first, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to integer
-                            self.features_df = self.features_df.with_columns(
-                                pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                            self.features_df = (
+                                self.features_df.with_columns(
+                                    pl.col(col)
+                                    .str.to_integer()
+                                    .cast(eval(dtype_str)),
+                                )
                             )
                        elif self.features_df[col].dtype in [
                            pl.Float64,
                            pl.Float32,
                        ]:
                            # Float data - cast to integer with null handling for NaN values
-                            self.features_df = self.features_df.with_columns(
-                                pl.col(col).cast(eval(dtype_str), strict=False),
+                            self.features_df = (
+                                self.features_df.with_columns(
+                                    pl.col(col).cast(
+                                        eval(dtype_str),
+                                        strict=False,
+                                    ),
+                                )
                             )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -670,50 +733,70 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                        elif "Float" in dtype_str:
                            # Convert to float, handling different input types
                            if self.features_df[col].dtype == pl.Utf8:
                                # String data - convert to float
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col)
+                                        .str.to_decimal()
+                                        .cast(eval(dtype_str)),
+                                    )
                                 )
                            else:
@@ -724,43 +807,59 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                        elif "Utf8" in dtype_str:
                            # Ensure it's string type
@@ -776,43 +875,59 @@ def _load_sample5(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                except Exception as e:
                    self.logger.warning(
@@ -827,23 +942,31 @@ def _load_sample5(self, filename: str, map: bool = False):
        # This ensures "None" strings introduced by failed conversions are properly handled
        for col in self.features_df.columns:
            if self.features_df[col].dtype == pl.Utf8:  # String columns
-                self.features_df = self.features_df.with_columns([
-                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                    .then(None)
-                    .otherwise(pl.col(col))
-                    .alias(col),
-                ])
+                self.features_df = self.features_df.with_columns(
+                    [
+                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                        .then(None)
+                        .otherwise(pl.col(col))
+                        .alias(col),
+                    ],
+                )
            # Float columns
            elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                self.features_df = self.features_df.with_columns([
-                    pl.col(col).fill_nan(None).alias(col),
-                ])
+                self.features_df = self.features_df.with_columns(
+                    [
+                        pl.col(col).fill_nan(None).alias(col),
+                    ],
+                )
 
        # Ensure column order matches schema order
        if "features_df" in schema and "columns" in schema["features_df"]:
            schema_column_order = list(schema["features_df"]["columns"].keys())
            # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
+            existing_columns = [
+                col
+                for col in schema_column_order
+                if col in self.features_df.columns
+            ]
            if existing_columns:
                self.features_df = self.features_df.select(existing_columns)
 
@@ -873,13 +996,17 @@ def _load_sample5(self, filename: str, map: bool = False):
                dtype_str = schema_columns[col]["dtype"]
                try:
                    if "Int" in dtype_str:
-                        self.ms1_df = self.ms1_df.with_columns([
-                            pl.col(col).cast(pl.Int64, strict=False),
-                        ])
+                        self.ms1_df = self.ms1_df.with_columns(
+                            [
+                                pl.col(col).cast(pl.Int64, strict=False),
+                            ],
+                        )
                    elif "Float" in dtype_str:
-                        self.ms1_df = self.ms1_df.with_columns([
-                            pl.col(col).cast(pl.Float64, strict=False),
-                        ])
+                        self.ms1_df = self.ms1_df.with_columns(
+                            [
+                                pl.col(col).cast(pl.Float64, strict=False),
+                            ],
+                        )
                except Exception as e:
                    self.logger.warning(
                        f"Failed to apply schema type {dtype_str} to column {col}: {e}",
@@ -948,22 +1075,28 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        # Load metadata
        if "metadata" in f:
            metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
+            self.file_path = decode_metadata_attr(
+                metadata_group.attrs.get("file_path", ""),
+            )
 
            # Load file_source if it exists, otherwise set it equal to file_path
            if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
+                self.file_source = decode_metadata_attr(
+                    metadata_group.attrs.get("file_source", ""),
+                )
            else:
                self.file_source = self.file_path
 
-            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
+            self.file_type = decode_metadata_attr(
+                metadata_group.attrs.get("file_type", ""),
+            )
            self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
 
            # Load parameters from JSON in metadata
            loaded_data = load_parameters_from_metadata(metadata_group)
 
            # Always create a fresh sample_defaults object
-            from masster.sample.defaults.sample_def import sample_defaults
+            from master.sample.defaults.sample_def import sample_defaults
 
            self.parameters = sample_defaults()
 
@@ -1007,19 +1140,23 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        # Convert "None" strings and NaN values to proper null values
        for col in self.scans_df.columns:
            if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                self.scans_df = self.scans_df.with_columns([
-                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                    .then(None)
-                    .otherwise(pl.col(col))
-                    .alias(col),
-                ])
+                self.scans_df = self.scans_df.with_columns(
+                    [
+                        pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                        .then(None)
+                        .otherwise(pl.col(col))
+                        .alias(col),
+                    ],
+                )
            elif self.scans_df[col].dtype in [
                pl.Float64,
                pl.Float32,
            ]:  # Float columns
-                self.scans_df = self.scans_df.with_columns([
-                    pl.col(col).fill_nan(None).alias(col),
-                ])
+                self.scans_df = self.scans_df.with_columns(
+                    [
+                        pl.col(col).fill_nan(None).alias(col),
+                    ],
+                )
 
        # update all columns with schema types
        for col in self.scans_df.columns:
@@ -1037,7 +1174,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                    if self.scans_df[col].dtype == pl.Utf8:
                        # String data - convert to integer
                        self.scans_df = self.scans_df.with_columns(
-                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                            pl.col(col)
+                            .str.to_integer()
+                            .cast(eval(dtype_str)),
                        )
                    elif self.scans_df[col].dtype in [
                        pl.Float64,
@@ -1057,7 +1196,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                    if self.scans_df[col].dtype == pl.Utf8:
                        # String data - convert to float
                        self.scans_df = self.scans_df.with_columns(
-                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                            pl.col(col)
+                            .str.to_decimal()
+                            .cast(eval(dtype_str)),
                        )
                    else:
                        # Try direct casting
@@ -1081,7 +1222,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            self.scans_df = self.scans_df.with_columns(
                                pl.col(col)
                                .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                    return_dtype=pl.Utf8,
                                )
                                .cast(target_dtype),
@@ -1090,7 +1233,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            self.scans_df = self.scans_df.with_columns(
                                pl.col(col)
                                .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                    return_dtype=pl.Utf8,
                                )
                                .str.to_integer()
@@ -1100,7 +1245,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            self.scans_df = self.scans_df.with_columns(
                                pl.col(col)
                                .map_elements(
-                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                    lambda x: x.decode("utf-8")
+                                    if isinstance(x, bytes)
+                                    else str(x),
                                    return_dtype=pl.Utf8,
                                )
                                .str.to_decimal()
@@ -1129,7 +1276,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
        if "scans_df" in schema and "columns" in schema["scans_df"]:
            schema_column_order = list(schema["scans_df"]["columns"].keys())
            # Only reorder columns that exist in both schema and DataFrame
-            existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
+            existing_columns = [
+                col for col in schema_column_order if col in self.scans_df.columns
+            ]
            if existing_columns:
                self.scans_df = self.scans_df.select(existing_columns)
 
@@ -1208,12 +1357,18 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        spectrum_list = []
                        for spec_data in json.loads(item):
                            if spec_data is not None:
-                                spectrum = Spectrum.from_json(spec_data)
+                                spectrum = Spectrum.from_json(
+                                    spec_data,
+                                )
                                spectrum_list.append(spectrum)
                            else:
                                spectrum_list.append(None)
                        reconstructed_data.append(spectrum_list)
-                    except (json.JSONDecodeError, ValueError, TypeError):
+                    except (
+                        json.JSONDecodeError,
+                        ValueError,
+                        TypeError,
+                    ):
                        reconstructed_data.append(None)
 
            data[col] = reconstructed_data
@@ -1229,10 +1384,13 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                # Separate Object columns from regular columns to avoid astuple issues
                object_columns = {}
                regular_columns = {}
-
+
                for col, values in data.items():
                    if col in schema.get("features_df", {}).get("columns", {}):
-                        if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
+                        if "Object" in schema["features_df"]["columns"][col].get(
+                            "dtype",
+                            "",
+                        ):
                            object_columns[col] = values
                        else:
                            regular_columns[col] = values
@@ -1245,38 +1403,48 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                else:
                    # If no regular columns, create empty DataFrame
                    self.features_df = pl.DataFrame()
-
+
                # Add Object columns one by one
                for col, values in object_columns.items():
                    if not self.features_df.is_empty():
                        self.features_df = self.features_df.with_columns(
-                            pl.Series(col, values, dtype=pl.Object).alias(col)
+                            pl.Series(col, values, dtype=pl.Object).alias(col),
                        )
                    else:
                        # Create DataFrame with just this Object column
-                        self.features_df = pl.DataFrame({col: values}, schema={col: pl.Object})
+                        self.features_df = pl.DataFrame(
+                            {col: values},
+                            schema={col: pl.Object},
+                        )
 
                # Convert "None" strings and NaN values to proper null values for regular columns first
                for col in self.features_df.columns:
                    # Skip Object columns - they're already properly reconstructed
                    if col in schema.get("features_df", {}).get("columns", {}):
-                        if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
+                        if "Object" in schema["features_df"]["columns"][col].get(
+                            "dtype",
+                            "",
+                        ):
                            continue
 
                    if self.features_df[col].dtype == pl.Utf8:  # String columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                    elif self.features_df[col].dtype in [
                        pl.Float64,
                        pl.Float32,
                    ]:  # Float columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
 
                # update all columns with schema types
                for col in self.features_df.columns:
@@ -1293,16 +1461,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                        # Convert to numeric first, handling different input types
                        if self.features_df[col].dtype == pl.Utf8:
                            # String data - convert to integer
-                            self.features_df = self.features_df.with_columns(
-                                pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                            self.features_df = (
+                                self.features_df.with_columns(
+                                    pl.col(col)
+                                    .str.to_integer()
+                                    .cast(eval(dtype_str)),
+                                )
                             )
                        elif self.features_df[col].dtype in [
                            pl.Float64,
                            pl.Float32,
                        ]:
                            # Float data - cast to integer with null handling for NaN values
-                            self.features_df = self.features_df.with_columns(
-                                pl.col(col).cast(eval(dtype_str), strict=False),
+                            self.features_df = (
+                                self.features_df.with_columns(
+                                    pl.col(col).cast(
+                                        eval(dtype_str),
+                                        strict=False,
+                                    ),
+                                )
                             )
                        else:
                            # Handle special cases and try direct casting for other types
@@ -1313,50 +1490,70 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                        elif "Float" in dtype_str:
                            # Convert to float, handling different input types
                            if self.features_df[col].dtype == pl.Utf8:
                                # String data - convert to float
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col)
+                                        .str.to_decimal()
+                                        .cast(eval(dtype_str)),
+                                    )
                                 )
                            else:
@@ -1367,43 +1564,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                        elif "Utf8" in dtype_str:
                            # Ensure it's string type
@@ -1419,43 +1632,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                            if "Binary" in str(current_dtype):
                                # Convert binary to string first, then to target type
                                if target_dtype == pl.Utf8:
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .cast(target_dtype),
                                         )
-                                        .cast(target_dtype),
                                     )
                                elif "Int" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_integer()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_integer()
-                                        .cast(target_dtype),
                                     )
                                elif "Float" in str(target_dtype):
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col)
-                                        .map_elements(
-                                            lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                            return_dtype=pl.Utf8,
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col)
+                                            .map_elements(
+                                                lambda x: x.decode("utf-8")
+                                                if isinstance(x, bytes)
+                                                else str(x),
+                                                return_dtype=pl.Utf8,
+                                            )
+                                            .str.to_decimal()
+                                            .cast(target_dtype),
                                         )
-                                        .str.to_decimal()
-                                        .cast(target_dtype),
                                     )
                                else:
                                    # Try direct casting
-                                    self.features_df = self.features_df.with_columns(
-                                        pl.col(col).cast(target_dtype),
+                                    self.features_df = (
+                                        self.features_df.with_columns(
+                                            pl.col(col).cast(target_dtype),
+                                        )
                                     )
                            else:
                                # Try direct casting for non-binary types
-                                self.features_df = self.features_df.with_columns(
-                                    pl.col(col).cast(target_dtype),
+                                self.features_df = (
+                                    self.features_df.with_columns(
+                                        pl.col(col).cast(target_dtype),
+                                    )
                                 )
                except Exception as e:
                    self.logger.warning(
@@ -1470,23 +1699,31 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                # This ensures "None" strings introduced by failed conversions are properly handled
                for col in self.features_df.columns:
                    if self.features_df[col].dtype == pl.Utf8:  # String columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                    # Float columns
                    elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                        self.features_df = self.features_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
 
                # Ensure column order matches schema order
                if "features_df" in schema and "columns" in schema["features_df"]:
                    schema_column_order = list(schema["features_df"]["columns"].keys())
                    # Only reorder columns that exist in both schema and DataFrame
-                    existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
+                    existing_columns = [
+                        col
+                        for col in schema_column_order
+                        if col in self.features_df.columns
+                    ]
                    if existing_columns:
                        self.features_df = self.features_df.select(existing_columns)
 
@@ -1516,7 +1753,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
    # set self.label to basename without extension
    if self.label is None or self.label == "":
        self.label = os.path.splitext(os.path.basename(filename))[0]
-    self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
+    self.logger.info(
+        f"Sample loaded successfully from {filename} (optimized for study)",
+    )
 
 
 def load_schema(schema_path: str) -> Dict[str, Any]:
@@ -1564,13 +1803,20 @@ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
    cleaned_df = df
    for col in df.columns:
        if df[col].dtype == pl.Utf8:  # String columns
-            cleaned_df = cleaned_df.with_columns([
-                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
-            ])
+            cleaned_df = cleaned_df.with_columns(
+                [
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ],
+            )
        elif df[col].dtype in [pl.Float64, pl.Float32]:  # Float columns
-            cleaned_df = cleaned_df.with_columns([
-                pl.col(col).fill_nan(None).alias(col),
-            ])
+            cleaned_df = cleaned_df.with_columns(
+                [
+                    pl.col(col).fill_nan(None).alias(col),
+                ],
+            )
    return cleaned_df
 
 
@@ -1606,7 +1852,12 @@ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataF
    return df
 
 
-def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
+def _cast_to_int(
+    df: pl.DataFrame,
+    col: str,
+    current_dtype: pl.DataType,
+    target_dtype: pl.DataType,
+) -> pl.DataFrame:
    """Helper function to cast column to integer type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
@@ -1618,7 +1869,12 @@ def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_
    return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
 
 
-def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
+def _cast_to_float(
+    df: pl.DataFrame,
+    col: str,
+    current_dtype: pl.DataType,
+    target_dtype: pl.DataType,
+) -> pl.DataFrame:
    """Helper function to cast column to float type."""
    if current_dtype == pl.Utf8:
        return df.with_columns(
@@ -1639,20 +1895,29 @@ def _cast_with_binary_handling(
    if target_dtype == pl.Utf8:
        return df.with_columns(
            pl.col(col)
-            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+            .map_elements(
+                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                return_dtype=pl.Utf8,
+            )
            .cast(target_dtype),
        )
    elif "Int" in str(target_dtype):
        return df.with_columns(
            pl.col(col)
-            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+            .map_elements(
+                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                return_dtype=pl.Utf8,
+            )
            .str.to_integer()
            .cast(target_dtype),
        )
    elif "Float" in str(target_dtype):
        return df.with_columns(
            pl.col(col)
-            .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+            .map_elements(
+                lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                return_dtype=pl.Utf8,
+            )
            .str.to_decimal()
            .cast(target_dtype),
        )
@@ -1661,7 +1926,11 @@ def _cast_with_binary_handling(
    return df.with_columns(pl.col(col).cast(target_dtype))
 
 
-def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
+def apply_schema_to_dataframe(
+    df: pl.DataFrame,
+    schema: Dict[str, Any],
+    df_name: str,
+) -> pl.DataFrame:
    """
    Apply schema type casting to a Polars DataFrame.
 
@@ -1819,7 +2088,9 @@ def _create_dataframe_with_object_columns(
    schema_columns = schema.get(df_name, {}).get("columns", {})
 
    object_columns = {
-        k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
+        k: v
+        for k, v in data.items()
+        if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
    }
    regular_columns = {k: v for k, v in data.items() if k not in object_columns}
 
@@ -1874,13 +2145,17 @@ def load_ms1_dataframe_from_h5_group(
            dtype_str = schema_columns[col]["dtype"]
            try:
                if "Int" in dtype_str:
-                    ms1_df = ms1_df.with_columns([
-                        pl.col(col).cast(pl.Int64, strict=False),
-                    ])
+                    ms1_df = ms1_df.with_columns(
+                        [
+                            pl.col(col).cast(pl.Int64, strict=False),
+                        ],
+                    )
                elif "Float" in dtype_str:
-                    ms1_df = ms1_df.with_columns([
-                        pl.col(col).cast(pl.Float64, strict=False),
-                    ])
+                    ms1_df = ms1_df.with_columns(
+                        [
+                            pl.col(col).cast(pl.Float64, strict=False),
+                        ],
+                    )
            except Exception as e:
                if logger:
                    logger.warning(
@@ -1891,7 +2166,9 @@ def load_ms1_dataframe_from_h5_group(
    return clean_null_values_polars(ms1_df)
 
 
-def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
+def load_parameters_from_metadata(
+    metadata_group: h5py.Group,
+) -> Optional[Dict[str, Any]]:
    """
    Load parameters from HDF5 metadata group.
 
@@ -1938,6 +2215,8 @@ def create_h5_metadata_group(
    metadata_group = f.create_group("metadata")
    metadata_group.attrs["format"] = "master-sample5-1"
    metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
-    metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
+    metadata_group.attrs["file_source"] = (
+        str(file_source) if file_source is not None else ""
+    )
    metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
    metadata_group.attrs["label"] = str(label) if label is not None else ""