masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -109,7 +109,13 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 
  # Process object columns with optimized serialization
  if object_cols:
- _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
+ _save_object_columns_optimized(
+ group,
+ df_ordered,
+ object_cols,
+ logger,
+ chunk_size,
+ )
 
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -146,17 +152,33 @@ def _save_numeric_column_fast(group, col, data_series, logger):
 
  # If sample value is a list/array, treat as object column
  if isinstance(sample_value, (list, tuple, np.ndarray)):
- logger.debug(f"Column '{col}' contains array-like data, treating as object")
- _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
+ logger.debug(
+ f"Column '{col}' contains array-like data, treating as object",
+ )
+ _save_dataframe_column_legacy_single(
+ group,
+ col,
+ data_series.to_list(),
+ "object",
+ logger,
+ )
  return
 
  # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
  try:
- data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
+ data_array = np.array(
+ [(-123 if x is None else float(x)) for x in data_array],
+ )
  except (ValueError, TypeError):
  # If conversion fails, this is not a numeric column
  logger.debug(f"Column '{col}' is not numeric, treating as object")
- _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
+ _save_dataframe_column_legacy_single(
+ group,
+ col,
+ data_series.to_list(),
+ "object",
+ logger,
+ )
  return
 
  group.create_dataset(col, data=data_array, **compression_kwargs)
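The None-to--123 sentinel conversion reformatted in this hunk can be shown on its own. A minimal sketch assuming h5py and numpy; the file, group, and column names below are invented for illustration and are not masster's:

import h5py
import numpy as np

values = [1.5, None, 3.25, None]  # mixed numeric column with missing entries

# Replace None with the -123 sentinel so the column stays a plain float array
data_array = np.array([(-123 if x is None else float(x)) for x in values])

with h5py.File("example.h5", "w") as f:
    grp = f.create_group("features")
    # lzf is a fast, lossless compressor bundled with h5py
    grp.create_dataset("intensity", data=data_array, compression="lzf")

# On load, the sentinel can be mapped back to None
with h5py.File("example.h5", "r") as f:
    loaded = f["features"]["intensity"][:]
restored = [None if v == -123 else float(v) for v in loaded]
print(restored)  # [1.5, None, 3.25, None]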
@@ -164,7 +186,13 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  except Exception as e:
  logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
  # Fallback to old method
- _save_dataframe_column_legacy_single(group, col, data_series.to_list(), str(data_series.dtype), logger)
+ _save_dataframe_column_legacy_single(
+ group,
+ col,
+ data_series.to_list(),
+ str(data_series.dtype),
+ logger,
+ )
 
 
  def _save_string_column_fast(group, col, data_series, logger):
@@ -179,7 +207,13 @@ def _save_string_column_fast(group, col, data_series, logger):
  except Exception as e:
  logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
  # Fallback to old method
- _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "string", logger)
+ _save_dataframe_column_legacy_single(
+ group,
+ col,
+ data_series.to_list(),
+ "string",
+ logger,
+ )
 
 
  def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
@@ -232,7 +266,9 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  else:
  serialized_chunk.append("None")
  else:
- logger.warning(f"Unknown object column '{col_name}', using default serialization")
+ logger.warning(
+ f"Unknown object column '{col_name}', using default serialization",
+ )
  for item in chunk_data:
  serialized_chunk.append(str(item) if item is not None else "None")
 
@@ -245,16 +281,28 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  total_items = len(data_list)
 
  if total_items == 0:
- group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
+ group.create_dataset(
+ col,
+ data=[],
+ compression="gzip",
+ compression_opts=6,
+ )
  continue
 
  # For small datasets, process directly
  if total_items <= chunk_size:
  serialized_data = serialize_chunk(col, data_list)
- group.create_dataset(col, data=serialized_data, compression="gzip", compression_opts=6)
+ group.create_dataset(
+ col,
+ data=serialized_data,
+ compression="gzip",
+ compression_opts=6,
+ )
  else:
  # For large datasets, use chunked processing with parallel serialization
- logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
+ logger.debug(
+ f"Processing large object column '{col}' with {total_items} items in chunks",
+ )
 
  all_serialized = []
  num_chunks = (total_items + chunk_size - 1) // chunk_size
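The chunked path started here serializes fixed-size chunks independently, keys each result by its start offset, and later reassembles them in offset order. A standalone sketch of that idea; the JSON serialization and the tiny chunk size are illustrative choices, not the package's exact code:

import json

def serialize_chunk(items):
    # Per-item serialization; None becomes the literal string "None"
    return [json.dumps(it) if it is not None else "None" for it in items]

data_list = [{"mz": 100.1}, None, {"mz": 200.2}, {"mz": 300.3}, None]
chunk_size = 2

results = {}
for chunk_start in range(0, len(data_list), chunk_size):
    chunk = data_list[chunk_start:chunk_start + chunk_size]
    try:
        results[chunk_start] = serialize_chunk(chunk)
    except Exception:
        # Fallback: plain string conversion for this chunk only
        results[chunk_start] = [str(it) if it is not None else "None" for it in chunk]

# Reassemble in the original order using the start offsets as keys
all_serialized = []
for start in range(0, len(data_list), chunk_size):
    all_serialized.extend(results[start])

print(all_serialized)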
@@ -281,28 +329,58 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  )
  # Fallback to simple string conversion for this chunk
  chunk = data_list[chunk_start : chunk_start + chunk_size]
- results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
+ results[chunk_start] = [
+ str(item) if item is not None else "None"
+ for item in chunk
+ ]
 
  # Reassemble in correct order
  for i in range(0, total_items, chunk_size):
  if i in results:
  all_serialized.extend(results[i])
 
- group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
+ group.create_dataset(
+ col,
+ data=all_serialized,
+ compression="gzip",
+ compression_opts=6,
+ )
 
  except Exception as e:
- logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
+ logger.warning(
+ f"Failed to save object column '{col}' with optimization: {e}",
+ )
  # Fallback to old method
- _save_dataframe_column_legacy_single(group, col, df[col].to_list(), "object", logger)
+ _save_dataframe_column_legacy_single(
+ group,
+ col,
+ df[col].to_list(),
+ "object",
+ logger,
+ )
 
 
- def _save_dataframe_column_legacy_single(group, col: str, data, dtype: str, logger, compression="gzip"):
+ def _save_dataframe_column_legacy_single(
+ group,
+ col: str,
+ data,
+ dtype: str,
+ logger,
+ compression="gzip",
+ ):
  """Legacy single column save method for fallback."""
  # This is the original _save_dataframe_column method for compatibility
  return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)
 
 
- def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, compression="gzip"):
+ def _save_dataframe_column_legacy(
+ group,
+ col: str,
+ data,
+ dtype: str,
+ logger,
+ compression="gzip",
+ ):
  """
  Save a single DataFrame column to an HDF5 group with optimized compression.
 
@@ -327,7 +405,10 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
 
  # Optimized compression configuration
  COMPRESSION_CONFIG = {
- "fast_access": {"compression": "lzf", "shuffle": True}, # Fast I/O for IDs, rt, mz
+ "fast_access": {
+ "compression": "lzf",
+ "shuffle": True,
+ }, # Fast I/O for IDs, rt, mz
  "numeric": {"compression": "lzf"}, # Standard numeric data
  "string": {"compression": "gzip", "compression_opts": 6}, # String data
  "json": {"compression": "gzip", "compression_opts": 6}, # JSON objects
@@ -350,11 +431,22 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
  return COMPRESSION_CONFIG["fast_access"]
 
  # JSON object columns (complex serialized data)
- elif column_name in ["spectrum", "chromatogram", "chromatograms", "ms2_specs", "chrom"]:
+ elif column_name in [
+ "spectrum",
+ "chromatogram",
+ "chromatograms",
+ "ms2_specs",
+ "chrom",
+ ]:
  return COMPRESSION_CONFIG["json"]
 
  # String/text columns
- elif data_type in ["string", "object"] and column_name in ["sample_name", "file_path", "label", "file_type"]:
+ elif data_type in ["string", "object"] and column_name in [
+ "sample_name",
+ "file_path",
+ "label",
+ "file_type",
+ ]:
  return COMPRESSION_CONFIG["string"]
 
  # Large bulk numeric data
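The COMPRESSION_CONFIG lookup reformatted above maps column roles to h5py dataset keyword arguments that are then splatted into create_dataset. A simplified sketch of that selection pattern; the column names and selection rules below are placeholders, not masster's full logic:

import h5py
import numpy as np

COMPRESSION_CONFIG = {
    "fast_access": {"compression": "lzf", "shuffle": True},   # IDs, rt, mz
    "numeric": {"compression": "lzf"},                         # bulk numeric data
    "string": {"compression": "gzip", "compression_opts": 6},  # text columns
    "json": {"compression": "gzip", "compression_opts": 6},    # serialized objects
}

def pick_compression(column_name):
    if column_name in ("feature_uid", "rt", "mz"):
        return COMPRESSION_CONFIG["fast_access"]
    if column_name in ("spectrum", "chrom", "ms2_specs"):
        return COMPRESSION_CONFIG["json"]
    return COMPRESSION_CONFIG["numeric"]

with h5py.File("columns.h5", "w") as f:
    grp = f.create_group("features")
    rt = np.linspace(0.0, 10.0, 1000)
    # The chosen kwargs are passed straight through to create_dataset
    grp.create_dataset("rt", data=rt, **pick_compression("rt"))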
@@ -524,12 +616,16 @@ def _reconstruct_object_column(data_col, col_name: str):
  for adduct_row in adducts_list:
  if len(adduct_row) >= 3:
  # Convert from [adduct, count, percentage] to dict structure
- converted_adducts.append({
- "adduct": str(adduct_row[0]),
- "count": int(float(adduct_row[1])),
- "percentage": float(adduct_row[2]),
- "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0
- })
+ converted_adducts.append(
+ {
+ "adduct": str(adduct_row[0]),
+ "count": int(float(adduct_row[1])),
+ "percentage": float(adduct_row[2]),
+ "mass": float(adduct_row[3])
+ if len(adduct_row) > 3
+ else 0.0,
+ },
+ )
  reconstructed_data.append(converted_adducts)
  else:
  # Unknown object column
@@ -544,9 +640,14 @@ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame:
  """Convert string null representations to proper nulls."""
  for col in df.columns:
  if df[col].dtype == pl.Utf8:
- df = df.with_columns([
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
- ])
+ df = df.with_columns(
+ [
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+ .then(None)
+ .otherwise(pl.col(col))
+ .alias(col),
+ ],
+ )
  return df
 
 
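The reformatted expression in _clean_string_nulls is the usual polars when/then/otherwise idiom for turning placeholder strings into real nulls. A toy example of the same pattern, using an invented column:

import polars as pl

df = pl.DataFrame({"label": ["blank", "None", "", "QC", "NULL"]})

df = df.with_columns(
    pl.when(pl.col("label").is_in(["None", "", "null", "NULL"]))
    .then(None)
    .otherwise(pl.col("label"))
    .alias("label"),
)

print(df["label"].to_list())  # ['blank', None, None, 'QC', None]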
@@ -587,7 +688,11 @@ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.Da
  return df
 
 
- def _reorder_columns_by_schema(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
+ def _reorder_columns_by_schema(
+ df: pl.DataFrame,
+ schema: dict,
+ df_name: str,
+ ) -> pl.DataFrame:
  """Reorder DataFrame columns to match schema order."""
  if df_name not in schema or "columns" not in schema[df_name]:
  return df
@@ -641,20 +746,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([
- pl.Series(
- col,
- values,
- dtype=pl.List(
- pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ]),
+ df = df.with_columns(
+ [
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct(
+ [
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ],
+ ),
+ ),
  ),
- ),
- ])
+ ],
+ )
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
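The adducts column is built as a polars List(Struct) so each row can carry zero or more adduct records. A minimal sketch of constructing such a column from plain dicts; the values are made up:

import polars as pl

adduct_dtype = pl.List(
    pl.Struct(
        [
            pl.Field("adduct", pl.Utf8),
            pl.Field("count", pl.Int64),
            pl.Field("percentage", pl.Float64),
            pl.Field("mass", pl.Float64),
        ],
    ),
)

rows = [
    [{"adduct": "[M+H]+", "count": 3, "percentage": 75.0, "mass": 151.06}],
    [],  # a row with no adduct annotation
]

df = pl.DataFrame({"consensus_uid": [1, 2]}).with_columns(
    pl.Series("adducts", rows, dtype=adduct_dtype),
)
print(df)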
@@ -665,20 +774,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([
- pl.Series(
- col,
- values,
- dtype=pl.List(
- pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ]),
+ df = df.with_columns(
+ [
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct(
+ [
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ],
+ ),
+ ),
  ),
- ),
- ])
+ ],
+ )
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -686,7 +799,13 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  return df
 
 
- def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns: list | None = None) -> pl.DataFrame:
+ def _load_dataframe_from_group(
+ group,
+ schema: dict,
+ df_name: str,
+ logger,
+ object_columns: list | None = None,
+ ) -> pl.DataFrame:
  """Load a DataFrame from HDF5 group using schema."""
  if object_columns is None:
  object_columns = []
@@ -700,7 +819,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  )
  schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
  logger.debug(f"Schema section for {df_name}: {schema_section}")
- schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
+ schema_columns = (
+ schema_section.get("columns", []) if isinstance(schema_section, dict) else []
+ )
  logger.debug(f"Schema columns for {df_name}: {schema_columns}")
  if schema_columns is None:
  schema_columns = []
@@ -723,7 +844,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  effective_columns = hdf5_columns.copy()
  for old_name, new_name in column_migrations.items():
  if old_name in effective_columns:
- logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
+ logger.info(
+ f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility",
+ )
  # Add the new name to effective columns and optionally remove old name
  effective_columns.append(new_name)
 
@@ -788,7 +911,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  for col, values in data.items():
  if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
- logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
+ logger.debug(
+ f"Determined expected_length={expected_length} from loaded column '{col}'",
+ )
  break
 
  # If no data loaded yet, try HDF5 columns directly
@@ -798,7 +923,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  col_data = group[col][:]
  if expected_length is None:
  expected_length = len(col_data)
- logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
+ logger.debug(
+ f"Determined expected_length={expected_length} from HDF5 column '{col}'",
+ )
  break
 
  # Default to 0 if no data found
@@ -812,26 +939,38 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # For missing columns, create appropriately sized array with appropriate defaults
  if col in object_columns:
  data[col] = [None] * expected_length
- logger.debug(f"Created missing object column '{col}' with length {expected_length}")
+ logger.debug(
+ f"Created missing object column '{col}' with length {expected_length}",
+ )
  else:
  # Provide specific default values for new columns for backward compatibility
  if df_name == "samples_df":
  if col == "sample_group":
  data[col] = [""] * expected_length # Empty string default
- logger.debug(f"Created missing column '{col}' with empty string defaults")
+ logger.debug(
+ f"Created missing column '{col}' with empty string defaults",
+ )
  elif col == "sample_batch":
  data[col] = [1] * expected_length # Batch 1 default
- logger.debug(f"Created missing column '{col}' with batch 1 defaults")
+ logger.debug(
+ f"Created missing column '{col}' with batch 1 defaults",
+ )
  elif col == "sample_sequence":
  # Create increasing sequence numbers
  data[col] = list(range(1, expected_length + 1))
- logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
+ logger.debug(
+ f"Created missing column '{col}' with sequence 1-{expected_length}",
+ )
  else:
  data[col] = [None] * expected_length
- logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+ logger.debug(
+ f"Created missing regular column '{col}' with length {expected_length}",
+ )
  else:
  data[col] = [None] * expected_length
- logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+ logger.debug(
+ f"Created missing regular column '{col}' with length {expected_length}",
+ )
 
  # Check for columns in HDF5 file that are not in schema (for backward compatibility)
  # But skip the old column names we already migrated
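The branchy default-filling above amounts to a lookup from column name to a default factory, applied at the length of the columns already loaded. A compact sketch of that idea; the helper itself is hypothetical, while the defaults mirror the ones in the hunk:

def backfill_missing_samples_columns(data, missing_cols, expected_length):
    # Hypothetical helper: fill columns absent from an older file with defaults.
    defaults = {
        "sample_group": lambda n: [""] * n,                 # empty string default
        "sample_batch": lambda n: [1] * n,                  # batch 1 default
        "sample_sequence": lambda n: list(range(1, n + 1)), # sequence 1..n
    }
    for col in missing_cols:
        factory = defaults.get(col, lambda n: [None] * n)
        data[col] = factory(expected_length)

data = {"sample_uid": ["s1", "s2", "s3"]}
backfill_missing_samples_columns(data, ["sample_group", "sample_batch", "sample_sequence"], 3)
print(data["sample_sequence"])  # [1, 2, 3]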
@@ -845,7 +984,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  }
  migrated_old_names = set(column_migrations.keys())
 
- extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
+ extra_columns = [
+ col
+ for col in hdf5_columns
+ if col not in (schema_columns or []) and col not in migrated_old_names
+ ]
 
  for col in extra_columns:
  logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -864,7 +1007,10 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  object_columns.append(col)
  else:
  # Regular string data
- data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ data[col] = [
+ item.decode("utf-8") if isinstance(item, bytes) else item
+ for item in column_data
+ ]
  except Exception:
  # If decoding fails, treat as regular data
  data[col] = column_data
@@ -877,10 +1023,19 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # Handle byte string conversion for non-object columns
  # Only convert to strings for columns that should actually be strings
  for col, values in data.items():
- if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
+ if (
+ col not in object_columns
+ and values is not None
+ and len(values) > 0
+ and isinstance(values[0], bytes)
+ ):
  # Check schema to see if this should be a string column
  should_be_string = False
- if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
+ if (
+ df_name in schema
+ and "columns" in schema[df_name]
+ and col in schema[df_name]["columns"]
+ ):
  dtype_str = schema[df_name]["columns"][col]["dtype"]
  should_be_string = dtype_str == "pl.Utf8"
 
@@ -898,7 +1053,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  logger.debug(f"Creating DataFrame with object columns: {object_columns}")
  for col in object_columns:
  if col in data:
- logger.debug(f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}")
+ logger.debug(
+ f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
+ )
  df = _create_dataframe_with_objects(data, object_columns)
  else:
  df = pl.DataFrame(data)
@@ -944,19 +1101,34 @@ def _save_study5_compressed(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
- dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
+ if (
+ self.consensus_mapping_df is not None
+ and not self.consensus_mapping_df.is_empty()
+ ):
+ dataframes_to_save.append(
+ ("consensus_mapping", len(self.consensus_mapping_df)),
+ )
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
+ if (
+ hasattr(self, "lib_df")
+ and self.lib_df is not None
+ and not self.lib_df.is_empty()
+ ):
  dataframes_to_save.append(("lib", len(self.lib_df)))
- if hasattr(self, 'id_df') and self.id_df is not None and not self.id_df.is_empty():
+ if (
+ hasattr(self, "id_df")
+ and self.id_df is not None
+ and not self.id_df.is_empty()
+ ):
  dataframes_to_save.append(("id", len(self.id_df)))
 
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
 
  # Show progress for large saves
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+ tdqm_disable = (
+ self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+ )
 
  with tqdm(
  total=total_steps,
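The save path first collects which DataFrames are present and non-empty, sizes a tqdm progress bar from that list, and disables the bar for quiet log levels or trivial saves. A simplified, self-contained sketch of the same flow; the DataFrames here are stand-ins:

import polars as pl
from tqdm import tqdm

samples_df = pl.DataFrame({"sample_uid": ["s1", "s2"]})
features_df = pl.DataFrame()  # empty, so it is skipped
log_level = "INFO"

dataframes_to_save = []
if samples_df is not None and not samples_df.is_empty():
    dataframes_to_save.append(("samples", len(samples_df)))
if features_df is not None and not features_df.is_empty():
    dataframes_to_save.append(("features", len(features_df)))

total_steps = len(dataframes_to_save) + 1  # +1 for metadata
disable_bar = log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

with tqdm(total=total_steps, disable=disable_bar, desc="Saving study") as pbar:
    pbar.update(1)  # metadata step
    for name, n_rows in dataframes_to_save:
        # ... write the HDF5 group for `name` here ...
        pbar.update(1)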
@@ -974,8 +1146,14 @@ def _save_study5_compressed(self, filename):
 
  # Store metadata
  metadata_group.attrs["format"] = "master-study-1"
- metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
- metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
+ metadata_group.attrs["folder"] = (
+ str(self.folder) if self.folder is not None else ""
+ )
+ metadata_group.attrs["label"] = (
+ str(self.label)
+ if hasattr(self, "label") and self.label is not None
+ else ""
+ )
 
  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -996,8 +1174,16 @@ def _save_study5_compressed(self, filename):
  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_group = f.create_group("samples")
- self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
- _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
+ self.logger.debug(
+ f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.samples_df,
+ samples_group,
+ schema,
+ "samples_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store features_df - use fast method that skips chrom and ms2_specs columns
@@ -1005,50 +1191,115 @@ def _save_study5_compressed(self, filename):
  self.logger.debug(
  f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
  )
- _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
+ _save_dataframe_optimized_fast(
+ self.features_df,
+ features_group,
+ schema,
+ "features_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store consensus_df - use optimized batch processing
  if self.consensus_df is not None and not self.consensus_df.is_empty():
- self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
- _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
+ self.logger.debug(
+ f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.consensus_df,
+ consensus_group,
+ schema,
+ "consensus_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store consensus_mapping_df - keep existing fast method
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+ if (
+ self.consensus_mapping_df is not None
+ and not self.consensus_mapping_df.is_empty()
+ ):
  consensus_mapping = self.consensus_mapping_df.clone()
- self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
+ self.logger.debug(
+ f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+ )
  for col in consensus_mapping.columns:
  try:
  data = consensus_mapping[col].to_numpy()
  # Use LZF compression for consensus mapping data
- consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
+ consensus_mapping_group.create_dataset(
+ col,
+ data=data,
+ compression="lzf",
+ shuffle=True,
+ )
  except Exception as e:
- self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
+ self.logger.warning(
+ f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+ )
  pbar.update(1)
 
  # Store consensus_ms2 - use optimized batch processing
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
- self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
- _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
+ self.logger.debug(
+ f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.consensus_ms2,
+ consensus_ms2_group,
+ schema,
+ "consensus_ms2",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store lib_df - library data
- if hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
- self.logger.debug(f"Saving lib_df with {len(self.lib_df)} rows using optimized method")
- _save_dataframe_optimized(self.lib_df, lib_group, schema, "lib_df", self.logger)
+ if (
+ hasattr(self, "lib_df")
+ and self.lib_df is not None
+ and not self.lib_df.is_empty()
+ ):
+ self.logger.debug(
+ f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.lib_df,
+ lib_group,
+ schema,
+ "lib_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store id_df - identification results
- if hasattr(self, 'id_df') and self.id_df is not None and not self.id_df.is_empty():
- self.logger.debug(f"Saving id_df with {len(self.id_df)} rows using optimized method")
- _save_dataframe_optimized(self.id_df, id_group, schema, "id_df", self.logger)
+ if (
+ hasattr(self, "id_df")
+ and self.id_df is not None
+ and not self.id_df.is_empty()
+ ):
+ self.logger.debug(
+ f"Saving id_df with {len(self.id_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.id_df,
+ id_group,
+ schema,
+ "id_df",
+ self.logger,
+ )
  pbar.update(1)
 
  self.logger.debug(f"Fast save completed for {filename}")
 
 
- def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
+ def _save_dataframe_optimized_fast(
+ df,
+ group,
+ schema,
+ df_name,
+ logger,
+ chunk_size=10000,
+ ):
  """
  Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
 
@@ -1073,7 +1324,9 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
  # Skip chrom and ms2_specs columns for features_df
  if df_name == "features_df":
  skip_columns = ["chrom", "ms2_specs"]
- df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
+ df_ordered = df_ordered.select(
+ [col for col in df_ordered.columns if col not in skip_columns],
+ )
  logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
 
  total_rows = len(df_ordered)
@@ -1108,7 +1361,13 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
 
  # Process object columns with optimized serialization
  if object_cols:
- _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
+ _save_object_columns_optimized(
+ group,
+ df_ordered,
+ object_cols,
+ logger,
+ chunk_size,
+ )
 
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -1173,19 +1432,34 @@ def _save_study5(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
- dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
+ if (
+ self.consensus_mapping_df is not None
+ and not self.consensus_mapping_df.is_empty()
+ ):
+ dataframes_to_save.append(
+ ("consensus_mapping", len(self.consensus_mapping_df)),
+ )
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
+ if (
+ hasattr(self, "lib_df")
+ and self.lib_df is not None
+ and not self.lib_df.is_empty()
+ ):
  dataframes_to_save.append(("lib", len(self.lib_df)))
- if hasattr(self, 'id_df') and self.id_df is not None and not self.id_df.is_empty():
+ if (
+ hasattr(self, "id_df")
+ and self.id_df is not None
+ and not self.id_df.is_empty()
+ ):
  dataframes_to_save.append(("id", len(self.id_df)))
 
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
 
  # Show progress for large saves
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+ tdqm_disable = (
+ self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+ )
 
  with tqdm(
  total=total_steps,
@@ -1203,8 +1477,14 @@ def _save_study5(self, filename):
 
  # Store metadata
  metadata_group.attrs["format"] = "master-study-1"
- metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
- metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""
+ metadata_group.attrs["folder"] = (
+ str(self.folder) if self.folder is not None else ""
+ )
+ metadata_group.attrs["label"] = (
+ str(self.label)
+ if hasattr(self, "label") and self.label is not None
+ else ""
+ )
 
  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -1225,51 +1505,119 @@ def _save_study5(self, filename):
  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_group = f.create_group("samples")
- self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
- _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
+ self.logger.debug(
+ f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.samples_df,
+ samples_group,
+ schema,
+ "samples_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store features_df - use optimized batch processing
  if self.features_df is not None and not self.features_df.is_empty():
- self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
- _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
+ self.logger.debug(
+ f"Saving features_df with {len(self.features_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.features_df,
+ features_group,
+ schema,
+ "features_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store consensus_df - use optimized batch processing
  if self.consensus_df is not None and not self.consensus_df.is_empty():
- self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
- _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
+ self.logger.debug(
+ f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.consensus_df,
+ consensus_group,
+ schema,
+ "consensus_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store consensus_mapping_df - keep existing fast method
- if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+ if (
+ self.consensus_mapping_df is not None
+ and not self.consensus_mapping_df.is_empty()
+ ):
  consensus_mapping = self.consensus_mapping_df.clone()
- self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
+ self.logger.debug(
+ f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+ )
  for col in consensus_mapping.columns:
  try:
  data = consensus_mapping[col].to_numpy()
  # Use LZF compression for consensus mapping data
- consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
+ consensus_mapping_group.create_dataset(
+ col,
+ data=data,
+ compression="lzf",
+ shuffle=True,
+ )
  except Exception as e:
- self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
+ self.logger.warning(
+ f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+ )
  pbar.update(1)
 
  # Store consensus_ms2 - use optimized batch processing
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
- self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
- _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
+ self.logger.debug(
+ f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.consensus_ms2,
+ consensus_ms2_group,
+ schema,
+ "consensus_ms2",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store lib_df - library data
- if hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
- self.logger.debug(f"Saving lib_df with {len(self.lib_df)} rows using optimized method")
- _save_dataframe_optimized(self.lib_df, lib_group, schema, "lib_df", self.logger)
+ if (
+ hasattr(self, "lib_df")
+ and self.lib_df is not None
+ and not self.lib_df.is_empty()
+ ):
+ self.logger.debug(
+ f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.lib_df,
+ lib_group,
+ schema,
+ "lib_df",
+ self.logger,
+ )
  pbar.update(1)
 
  # Store id_df - identification results
- if hasattr(self, 'id_df') and self.id_df is not None and not self.id_df.is_empty():
- self.logger.debug(f"Saving id_df with {len(self.id_df)} rows using optimized method")
- _save_dataframe_optimized(self.id_df, id_group, schema, "id_df", self.logger)
+ if (
+ hasattr(self, "id_df")
+ and self.id_df is not None
+ and not self.id_df.is_empty()
+ ):
+ self.logger.debug(
+ f"Saving id_df with {len(self.id_df)} rows using optimized method",
+ )
+ _save_dataframe_optimized(
+ self.id_df,
+ id_group,
+ schema,
+ "id_df",
+ self.logger,
+ )
  pbar.update(1)
 
  self.logger.info(f"Study saved successfully to {filename}")
@@ -1319,7 +1667,9 @@ def _load_study5(self, filename=None):
  schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
  schema = _load_schema(schema_path)
  if not schema:
- self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
+ self.logger.warning(
+ f"Schema file {schema_path} not found. Using default types.",
+ )
 
  # Define loading steps for progress tracking
  loading_steps = [
@@ -1393,27 +1743,48 @@ def _load_study5(self, filename=None):
  if self.history and "study" in self.history:
  study_params = self.history["study"]
  if isinstance(study_params, dict):
- failed_params = self.parameters.set_from_dict(study_params, validate=False)
+ failed_params = self.parameters.set_from_dict(
+ study_params,
+ validate=False,
+ )
  if failed_params:
- self.logger.debug(f"Could not set study parameters: {failed_params}")
+ self.logger.debug(
+ f"Could not set study parameters: {failed_params}",
+ )
  else:
- self.logger.debug("Successfully updated parameters from loaded history")
+ self.logger.debug(
+ "Successfully updated parameters from loaded history",
+ )
  else:
- self.logger.debug("Study parameters in history are not a valid dictionary")
+ self.logger.debug(
+ "Study parameters in history are not a valid dictionary",
+ )
  else:
- self.logger.debug("No study parameters found in history, using defaults")
+ self.logger.debug(
+ "No study parameters found in history, using defaults",
+ )
 
  # Synchronize instance attributes with parameters (similar to __init__)
  # Note: folder and label are already loaded from metadata attributes above
  # but we ensure they match the parameters for consistency
- if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
+ if (
+ hasattr(self.parameters, "folder")
+ and self.parameters.folder is not None
+ ):
  self.folder = self.parameters.folder
- if hasattr(self.parameters, "label") and self.parameters.label is not None:
+ if (
+ hasattr(self.parameters, "label")
+ and self.parameters.label is not None
+ ):
  self.label = self.parameters.label
  if hasattr(self.parameters, "log_level"):
  self.log_level = self.parameters.log_level
  if hasattr(self.parameters, "log_label"):
- self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+ self.log_label = (
+ self.parameters.log_label
+ if self.parameters.log_label is not None
+ else ""
+ )
  if hasattr(self.parameters, "log_sink"):
  self.log_sink = self.parameters.log_sink
  pbar.update(1)
@@ -1423,10 +1794,17 @@ def _load_study5(self, filename=None):
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
  )
  if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
+ self.samples_df = _load_dataframe_from_group(
+ f["samples"],
+ schema,
+ "samples_df",
+ self.logger,
+ )
  else:
  # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+ self.logger.debug(
+ "No samples data found in study5 file. Initializing empty samples_df.",
+ )
  self.samples_df = pl.DataFrame(
  {
  "sample_uid": [],
@@ -1463,10 +1841,17 @@ def _load_study5(self, filename=None):
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
  )
  if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
+ self.samples_df = _load_dataframe_from_group(
+ f["samples"],
+ schema,
+ "samples_df",
+ self.logger,
+ )
  else:
  # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+ self.logger.debug(
+ "No samples data found in study5 file. Initializing empty samples_df.",
+ )
  self.samples_df = pl.DataFrame(
  {
  "sample_uid": [],
@@ -1536,28 +1921,39 @@ def _load_study5(self, filename=None):
 
  # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
  if self.consensus_df is not None:
- if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
- self.logger.info("Adding missing 'adducts' column for backward compatibility")
- empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
+ if (
+ "adducts" not in self.consensus_df.columns
+ or self.consensus_df["adducts"].dtype == pl.Null
+ ):
+ self.logger.info(
+ "Adding missing 'adducts' column for backward compatibility",
+ )
+ empty_adducts: list[list] = [
+ [] for _ in range(len(self.consensus_df))
+ ]
 
  # If column exists but is Null, drop it first
  if "adducts" in self.consensus_df.columns:
  self.consensus_df = self.consensus_df.drop("adducts")
 
- self.consensus_df = self.consensus_df.with_columns([
- pl.Series(
- "adducts",
- empty_adducts,
- dtype=pl.List(
- pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ]),
+ self.consensus_df = self.consensus_df.with_columns(
+ [
+ pl.Series(
+ "adducts",
+ empty_adducts,
+ dtype=pl.List(
+ pl.Struct(
+ [
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ],
+ ),
+ ),
  ),
- ),
- ])
+ ],
+ )
  else:
  self.consensus_df = None
  pbar.update(1)
@@ -1641,8 +2037,14 @@ def _load_study5(self, filename=None):
  pbar.update(1)
 
  # Check and migrate old string-based map_id to integer indices
- if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
- self.logger.info("Detected old string-based map_id format, migrating to integer indices")
+ if (
+ self.samples_df is not None
+ and not self.samples_df.is_empty()
+ and self.samples_df["map_id"].dtype == pl.Utf8
+ ):
+ self.logger.info(
+ "Detected old string-based map_id format, migrating to integer indices",
+ )
 
  # Convert string-based map_id to integer indices
  sample_count = len(self.samples_df)
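The final hunk begins migrating legacy string map_id values to integer indices; the function continues beyond this excerpt. One way such a migration can be expressed in polars, sketched on toy data and not necessarily masster's exact mapping:

import polars as pl

samples_df = pl.DataFrame({
    "sample_uid": ["s1", "s2", "s3"],
    "map_id": ["map_a", "map_b", "map_a"],  # old string-based format
})

if samples_df["map_id"].dtype == pl.Utf8:
    # Assign an integer index to each distinct string map_id, in order of first appearance
    mapping = {}
    new_ids = []
    for name in samples_df["map_id"].to_list():
        mapping.setdefault(name, len(mapping))
        new_ids.append(mapping[name])
    samples_df = samples_df.with_columns(pl.Series("map_id", new_ids, dtype=pl.Int64))

print(samples_df)  # map_id is now Int64: [0, 1, 0]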