masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.

Potentially problematic release: this version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -59,10 +59,10 @@ def _decode_bytes_attr(attr_value):
  def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
  """
  Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
-
+
  This function replaces individual column processing with batch operations for much
  better performance on large datasets (300+ samples).
-
+
  Args:
  df: Polars DataFrame to save
  group: HDF5 group to save to
@@ -73,17 +73,17 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
  """
  if df is None or df.is_empty():
  return
-
+
  try:
  # Reorder columns according to schema
  df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
  total_rows = len(df_ordered)
-
+
  # Group columns by processing type for batch optimization
  numeric_cols = []
  string_cols = []
  object_cols = []
-
+
  for col in df_ordered.columns:
  dtype = str(df_ordered[col].dtype).lower()
  if dtype == "object":
@@ -92,23 +92,25 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
  string_cols.append(col)
  else:
  numeric_cols.append(col)
-
- logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
-
+
+ logger.debug(
+ f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+ )
+
  # Process numeric columns in batch (most efficient)
  if numeric_cols:
  for col in numeric_cols:
  _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
- # Process string columns in batch
+
+ # Process string columns in batch
  if string_cols:
  for col in string_cols:
  _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
  # Process object columns with optimized serialization
  if object_cols:
  _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
  # Fallback to old method for safety
@@ -119,20 +121,20 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  """Fast numeric column saving with optimal compression."""
  try:
  import numpy as np
-
+
  # Get compression settings based on column name
  if col in ["consensus_uid", "feature_uid", "scan_id", "rt", "mz", "intensity"]:
  compression_kwargs = {"compression": "lzf", "shuffle": True}
  else:
  compression_kwargs = {"compression": "lzf"}
-
+
  # Convert to numpy array efficiently
  try:
  data_array = data_series.to_numpy()
  except Exception:
  # Fallback for complex data types
  data_array = np.array(data_series.to_list())
-
+
  # Handle None/null values efficiently
  if data_array.dtype == object:
  # Check if this is actually a list/array column that should be treated as object
@@ -141,13 +143,13 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  if val is not None:
  sample_value = val
  break
-
+
  # If sample value is a list/array, treat as object column
  if isinstance(sample_value, (list, tuple, np.ndarray)):
  logger.debug(f"Column '{col}' contains array-like data, treating as object")
  _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return
-
+
  # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
  try:
  data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
@@ -156,9 +158,9 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  logger.debug(f"Column '{col}' is not numeric, treating as object")
  _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return
-
+
  group.create_dataset(col, data=data_array, **compression_kwargs)
-
+
  except Exception as e:
  logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
  # Fallback to old method
@@ -170,10 +172,10 @@ def _save_string_column_fast(group, col, data_series, logger):
  try:
  # Convert to string array efficiently
  string_data = ["None" if x is None else str(x) for x in data_series.to_list()]
-
+
  compression_kwargs = {"compression": "gzip", "compression_opts": 6}
  group.create_dataset(col, data=string_data, **compression_kwargs)
-
+
  except Exception as e:
  logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
  # Fallback to old method
@@ -183,11 +185,11 @@ def _save_string_column_fast(group, col, data_series, logger):
  def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  """Optimized object column processing with chunking and parallel serialization."""
  import json
-
+
  def serialize_chunk(col_name, chunk_data):
  """Serialize a chunk of object data."""
  serialized_chunk = []
-
+
  if col_name == "chrom":
  # Handle Chromatogram objects
  for item in chunk_data:
@@ -233,19 +235,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  logger.warning(f"Unknown object column '{col_name}', using default serialization")
  for item in chunk_data:
  serialized_chunk.append(str(item) if item is not None else "None")
-
+
  return serialized_chunk
-
+
  # Process each object column
  for col in object_cols:
  try:
  data_list = df[col].to_list()
  total_items = len(data_list)
-
+
  if total_items == 0:
  group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
  continue
-
+
  # For small datasets, process directly
  if total_items <= chunk_size:
  serialized_data = serialize_chunk(col, data_list)
@@ -253,19 +255,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  else:
  # For large datasets, use chunked processing with parallel serialization
  logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
-
+
  all_serialized = []
  num_chunks = (total_items + chunk_size - 1) // chunk_size
-
+
  # Use thread pool for parallel serialization of chunks
  with ThreadPoolExecutor(max_workers=min(4, num_chunks)) as executor:
  futures = {}
-
+
  for i in range(0, total_items, chunk_size):
- chunk = data_list[i:i + chunk_size]
+ chunk = data_list[i : i + chunk_size]
  future = executor.submit(serialize_chunk, col, chunk)
  futures[future] = i
-
+
  # Collect results in order
  results = {}
  for future in as_completed(futures):
@@ -274,18 +276,20 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  chunk_result = future.result()
  results[chunk_start] = chunk_result
  except Exception as e:
- logger.warning(f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}")
+ logger.warning(
+ f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+ )
  # Fallback to simple string conversion for this chunk
- chunk = data_list[chunk_start:chunk_start + chunk_size]
+ chunk = data_list[chunk_start : chunk_start + chunk_size]
  results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
-
+
  # Reassemble in correct order
  for i in range(0, total_items, chunk_size):
  if i in results:
  all_serialized.extend(results[i])
-
+
  group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
-
+
  except Exception as e:
  logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
  # Fallback to old method
@@ -430,7 +434,9 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
  data_as_str.append("None")
  group.create_dataset(col, data=data_as_str, compression=compression)
  else:
- logger.warning(f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.")
+ logger.warning(
+ f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+ )
  elif dtype == "string":
  # Handle string columns
  string_data = ["None" if x is None else str(x) for x in data]
@@ -479,6 +485,7 @@ def _reconstruct_object_column(data_col, col_name: str):
  # Handle non-string data (e.g., float32 NaN from corrupted compression)
  if not isinstance(item, str):
  import numpy as np
+
  if isinstance(item, (float, np.floating)) and np.isnan(item):
  reconstructed_data.append(None)
  continue
@@ -594,16 +601,16 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  expected_length = None
  if regular_data:
  for values in regular_data.values():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  break
-
+
  if expected_length is None and object_data:
  for values in object_data.values():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  break
-
+
  if expected_length is None:
  expected_length = 0

@@ -611,7 +618,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  for col in object_columns:
  if col in object_data:
  values = object_data[col]
- if values is None or (hasattr(values, '__len__') and len(values) == 0):
+ if values is None or (hasattr(values, "__len__") and len(values) == 0):
  object_data[col] = [None] * expected_length
  # print(f"DEBUG: Fixed object column '{col}' to have length {expected_length}")

@@ -624,12 +631,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))])
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -640,12 +655,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))])
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -713,11 +736,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # Determine expected DataFrame length from loaded columns
  expected_length = None
  for col, values in data.items():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
  break
-
+
  # If no data loaded yet, try HDF5 columns directly
  if expected_length is None:
  hdf5_columns = list(group.keys())
@@ -727,7 +750,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  expected_length = len(col_data)
  logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
  break
-
+
  # Default to 0 if no data found
  if expected_length is None:
  expected_length = 0
@@ -747,25 +770,25 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # Check for columns in HDF5 file that are not in schema (for backward compatibility)
  hdf5_columns = list(group.keys())
  extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
-
+
  for col in extra_columns:
  logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
  column_data = group[col][:]
-
+
  # Try to determine if this should be treated as an object column
  # by checking if the data looks like JSON strings
  if len(column_data) > 0 and isinstance(column_data[0], bytes):
  try:
  # Check if it looks like JSON
- test_decode = column_data[0].decode('utf-8')
- if test_decode.startswith('[') or test_decode.startswith('{'):
+ test_decode = column_data[0].decode("utf-8")
+ if test_decode.startswith("[") or test_decode.startswith("{"):
  # Looks like JSON, treat as object column
  data[col] = _reconstruct_object_column(column_data, col)
  if col not in object_columns:
  object_columns.append(col)
  else:
  # Regular string data
- data[col] = [item.decode('utf-8') if isinstance(item, bytes) else item for item in column_data]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
  except Exception:
  # If decoding fails, treat as regular data
  data[col] = column_data
@@ -784,7 +807,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
  dtype_str = schema[df_name]["columns"][col]["dtype"]
  should_be_string = dtype_str == "pl.Utf8"
-
+
  if should_be_string:
  processed_values = []
  for val in values:
@@ -815,11 +838,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  def _save_study5_compressed(self, filename=None):
  """
  Compressed save identical to _save_study5 but skips serialization of chrom and ms2_specs columns in features_df.
-
+
  This version maintains full compatibility with _load_study5() while providing performance benefits
  by skipping the serialization of heavy object columns (chrom and ms2_specs) in features_df.
  """
-
+
  # if no extension is given, add .study5
  if not filename.endswith(".study5"):
  filename += ".study5"
@@ -849,18 +872,17 @@ def _save_study5_compressed(self, filename=None):
  dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
  # Show progress for large saves
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
  with tqdm(
  total=total_steps,
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Fast saving study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Create groups for organization
  metadata_group = f.create_group("metadata")
  features_group = f.create_group("features")
@@ -883,9 +905,11 @@ def _save_study5_compressed(self, filename=None):
  metadata_group.create_dataset("parameters", data="")
  else:
  metadata_group.create_dataset("parameters", data="")
-
+
  pbar.update(1)
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+ )

  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
@@ -896,7 +920,9 @@ def _save_study5_compressed(self, filename=None):

  # Store features_df - use fast method that skips chrom and ms2_specs columns
  if self.features_df is not None and not self.features_df.is_empty():
- self.logger.debug(f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)")
+ self.logger.debug(
+ f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+ )
  _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
  pbar.update(1)

@@ -932,10 +958,10 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
  def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
  """
  Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
-
+
  This function is identical to _save_dataframe_optimized but excludes heavy object columns
  (chrom and ms2_specs) when saving features_df to improve performance.
-
+
  Args:
  df: Polars DataFrame to save
  group: HDF5 group to save to
@@ -946,24 +972,24 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
  """
  if df is None or df.is_empty():
  return
-
+
  try:
  # Reorder columns according to schema
  df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
-
+
  # Skip chrom and ms2_specs columns for features_df
  if df_name == "features_df":
  skip_columns = ["chrom", "ms2_specs"]
  df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
  logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
-
+
  total_rows = len(df_ordered)
-
+
  # Group columns by processing type for batch optimization
  numeric_cols = []
  string_cols = []
  object_cols = []
-
+
  for col in df_ordered.columns:
  dtype = str(df_ordered[col].dtype).lower()
  if dtype == "object":
@@ -972,23 +998,25 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
  string_cols.append(col)
  else:
  numeric_cols.append(col)
-
- logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
-
+
+ logger.debug(
+ f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+ )
+
  # Process numeric columns in batch (most efficient)
  if numeric_cols:
  for col in numeric_cols:
  _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
- # Process string columns in batch
+
+ # Process string columns in batch
  if string_cols:
  for col in string_cols:
  _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
  # Process object columns with optimized serialization
  if object_cols:
  _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
  # Fallback to old method for safety
@@ -1054,18 +1082,17 @@ def _save_study5(self, filename=None):
  dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
  # Show progress for large saves
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
  with tqdm(
  total=total_steps,
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Create groups for organization
  metadata_group = f.create_group("metadata")
  features_group = f.create_group("features")
@@ -1088,9 +1115,11 @@ def _save_study5(self, filename=None):
  metadata_group.create_dataset("parameters", data="")
  else:
  metadata_group.create_dataset("parameters", data="")
-
+
  pbar.update(1)
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+ )

  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
@@ -1099,7 +1128,7 @@ def _save_study5(self, filename=None):
  _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
  pbar.update(1)

- # Store features_df - use optimized batch processing
+ # Store features_df - use optimized batch processing
  if self.features_df is not None and not self.features_df.is_empty():
  self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
  _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
@@ -1154,7 +1183,7 @@ def _load_study5(self, filename=None):
  - Properly handles MS2 scan lists and spectrum lists
  - Restores parameters dictionary from JSON serialization
  """
-
+
  self.logger.info(f"Loading study from {filename}")

  # Handle default filename
@@ -1182,26 +1211,26 @@ def _load_study5(self, filename=None):
  # Define loading steps for progress tracking
  loading_steps = [
  "metadata",
- "samples_df",
+ "samples_df",
  "features_df",
  "consensus_df",
  "consensus_mapping_df",
- "consensus_ms2"
+ "consensus_ms2",
  ]
-
+
  # Check if progress bar should be disabled based on log level
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

  # Define loading steps for progress tracking
  loading_steps = [
  "metadata",
- "samples_df",
+ "samples_df",
  "features_df",
  "consensus_df",
  "consensus_mapping_df",
- "consensus_ms2"
+ "consensus_ms2",
  ]
-
+
  # Check if progress bar should be disabled based on log level
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

@@ -1212,9 +1241,10 @@ def _load_study5(self, filename=None):
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Load metadata
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+ )
  if "metadata" in f:
  metadata = f["metadata"]
  self.folder = _decode_bytes_attr(metadata.attrs.get("folder", ""))
@@ -1240,10 +1270,10 @@ def _load_study5(self, filename=None):

  # Reconstruct self.parameters from loaded history
  from masster.study.defaults.study_def import study_defaults
-
+
  # Always create a fresh study_defaults object to ensure we have all defaults
  self.parameters = study_defaults()
-
+
  # Update parameters from loaded history if available
  if self.history and "study" in self.history:
  study_params = self.history["study"]
@@ -1257,24 +1287,26 @@ def _load_study5(self, filename=None):
  self.logger.debug("Study parameters in history are not a valid dictionary")
  else:
  self.logger.debug("No study parameters found in history, using defaults")
-
+
  # Synchronize instance attributes with parameters (similar to __init__)
  # Note: folder and label are already loaded from metadata attributes above
  # but we ensure they match the parameters for consistency
- if hasattr(self.parameters, 'folder') and self.parameters.folder is not None:
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
  self.folder = self.parameters.folder
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
  self.label = self.parameters.label
- if hasattr(self.parameters, 'log_level'):
+ if hasattr(self.parameters, "log_level"):
  self.log_level = self.parameters.log_level
- if hasattr(self.parameters, 'log_label'):
+ if hasattr(self.parameters, "log_label"):
  self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
- if hasattr(self.parameters, 'log_sink'):
+ if hasattr(self.parameters, "log_sink"):
  self.log_sink = self.parameters.log_sink
  pbar.update(1)

  # Load samples_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+ )
  if "samples" in f and len(f["samples"].keys()) > 0:
  self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
@@ -1306,7 +1338,9 @@ def _load_study5(self, filename=None):
  )
  pbar.update(1)
  # Load samples_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+ )
  if "samples" in f and len(f["samples"].keys()) > 0:
  self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
@@ -1339,66 +1373,92 @@ def _load_study5(self, filename=None):
  pbar.update(1)

  # Load features_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+ )
  if "features" in f and len(f["features"].keys()) > 0:
  object_columns = ["chrom", "ms2_scans", "ms2_specs"]
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+ self.features_df = _load_dataframe_from_group(
+ f["features"], schema, "features_df", self.logger, object_columns
+ )
  else:
  self.features_df = None
  pbar.update(1)

  # Load consensus_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+ )
  if "consensus" in f and len(f["consensus"].keys()) > 0:
  # Only include adducts in object_columns if it actually exists in the file
  object_columns = []
  if "adducts" in f["consensus"]:
  object_columns.append("adducts")
-
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger, object_columns)
-
+
+ self.consensus_df = _load_dataframe_from_group(
+ f["consensus"], schema, "consensus_df", self.logger, object_columns
+ )
+
  # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
  if self.consensus_df is not None:
  if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
  self.logger.info("Adding missing 'adducts' column for backward compatibility")
  empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
-
+
  # If column exists but is Null, drop it first
  if "adducts" in self.consensus_df.columns:
  self.consensus_df = self.consensus_df.drop("adducts")
-
+
  self.consensus_df = self.consensus_df.with_columns([
- pl.Series("adducts", empty_adducts, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))
+ pl.Series(
+ "adducts",
+ empty_adducts,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
  ])
  else:
  self.consensus_df = None
  pbar.update(1)

  # Load consensus_mapping_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+ )
  if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ self.consensus_mapping_df = _load_dataframe_from_group(
+ f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+ )
  else:
  self.consensus_mapping_df = None
  pbar.update(1)
  # Load consensus_mapping_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+ )
  if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ self.consensus_mapping_df = _load_dataframe_from_group(
+ f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+ )
  else:
  self.consensus_mapping_df = None
  pbar.update(1)

  # Load consensus_ms2
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+ )
  if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
  object_columns = ["spec"]
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+ self.consensus_ms2 = _load_dataframe_from_group(
+ f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns
+ )
  else:
  self.consensus_ms2 = None
  pbar.update(1)
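
The changes above are line-wrapping and formatting only, but the functions they touch follow one recurring pattern: columns are grouped by dtype and written to HDF5 with a per-type strategy (LZF-compressed numeric datasets, gzip-compressed string datasets, serialized object columns). The sketch below is a minimal standalone condensation of that pattern for readers who want to see it without the logging, chunking, and fallback paths. It is not the packaged implementation: the function name save_dataframe_grouped, the simplified dtype checks, and the demo.h5 example file are illustrative assumptions.

# Minimal sketch (not the packaged code) of the dtype-grouped HDF5 save pattern
# visible in _save_dataframe_optimized: pick a storage strategy per column.
import json

import h5py
import numpy as np
import polars as pl


def save_dataframe_grouped(df: pl.DataFrame, group: h5py.Group) -> None:
    """Write each DataFrame column to its own HDF5 dataset, chosen by dtype."""
    for col in df.columns:
        dtype = str(df[col].dtype).lower()
        if dtype == "object":
            # Object columns: serialize each value to a string (assumes JSON-friendly data).
            data = [json.dumps(v) if v is not None else "None" for v in df[col].to_list()]
            group.create_dataset(col, data=data, compression="gzip", compression_opts=6)
        elif "str" in dtype or "utf8" in dtype:
            # String columns: store as strings with gzip compression.
            data = ["None" if v is None else str(v) for v in df[col].to_list()]
            group.create_dataset(col, data=data, compression="gzip", compression_opts=6)
        else:
            # Numeric columns: store as a numpy array with fast LZF compression.
            arr = np.asarray(df[col].to_numpy(), dtype=float)
            group.create_dataset(col, data=arr, compression="lzf", shuffle=True)


if __name__ == "__main__":
    # Hypothetical example; column names and the output file are illustrative only.
    df = pl.DataFrame({"mz": [100.05, 200.10], "label": ["feat_1", "feat_2"]})
    with h5py.File("demo.h5", "w") as f:
        save_dataframe_grouped(df, f.create_group("features"))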