masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py CHANGED
@@ -35,8 +35,8 @@ import h5py
  import polars as pl
  from tqdm import tqdm

- from master.chromatogram import Chromatogram
- from master.spectrum import Spectrum
+ from masster.chromatogram import Chromatogram
+ from masster.spectrum import Spectrum


  # Helper functions for HDF5 operations
@@ -109,13 +109,7 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100

  # Process object columns with optimized serialization
  if object_cols:
- _save_object_columns_optimized(
- group,
- df_ordered,
- object_cols,
- logger,
- chunk_size,
- )
+ _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)

  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -152,33 +146,17 @@ def _save_numeric_column_fast(group, col, data_series, logger):

  # If sample value is a list/array, treat as object column
  if isinstance(sample_value, (list, tuple, np.ndarray)):
- logger.debug(
- f"Column '{col}' contains array-like data, treating as object",
- )
- _save_dataframe_column_legacy_single(
- group,
- col,
- data_series.to_list(),
- "object",
- logger,
- )
+ logger.debug(f"Column '{col}' contains array-like data, treating as object")
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return

  # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
  try:
- data_array = np.array(
- [(-123 if x is None else float(x)) for x in data_array],
- )
+ data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
  except (ValueError, TypeError):
  # If conversion fails, this is not a numeric column
  logger.debug(f"Column '{col}' is not numeric, treating as object")
- _save_dataframe_column_legacy_single(
- group,
- col,
- data_series.to_list(),
- "object",
- logger,
- )
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return

  group.create_dataset(col, data=data_array, **compression_kwargs)
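As a side note on the sentinel logic retained in the hunk above: below is a minimal, standalone sketch of the None-to--123 conversion using made-up values, not masster's internal state.

import numpy as np

values = [1.5, None, 3.0, None]  # hypothetical mixed column values
try:
    arr = np.array([(-123 if v is None else float(v)) for v in values])
except (ValueError, TypeError):
    arr = None  # a non-numeric column would instead be handled as an object column
# arr now holds floats, with -123.0 wherever the input was None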
@@ -186,13 +164,7 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  except Exception as e:
  logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
  # Fallback to old method
- _save_dataframe_column_legacy_single(
- group,
- col,
- data_series.to_list(),
- str(data_series.dtype),
- logger,
- )
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), str(data_series.dtype), logger)


  def _save_string_column_fast(group, col, data_series, logger):
@@ -207,13 +179,7 @@ def _save_string_column_fast(group, col, data_series, logger):
  except Exception as e:
  logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
  # Fallback to old method
- _save_dataframe_column_legacy_single(
- group,
- col,
- data_series.to_list(),
- "string",
- logger,
- )
+ _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "string", logger)


  def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
@@ -266,9 +232,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  else:
  serialized_chunk.append("None")
  else:
- logger.warning(
- f"Unknown object column '{col_name}', using default serialization",
- )
+ logger.warning(f"Unknown object column '{col_name}', using default serialization")
  for item in chunk_data:
  serialized_chunk.append(str(item) if item is not None else "None")

@@ -281,28 +245,16 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  total_items = len(data_list)

  if total_items == 0:
- group.create_dataset(
- col,
- data=[],
- compression="gzip",
- compression_opts=6,
- )
+ group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
  continue

  # For small datasets, process directly
  if total_items <= chunk_size:
  serialized_data = serialize_chunk(col, data_list)
- group.create_dataset(
- col,
- data=serialized_data,
- compression="gzip",
- compression_opts=6,
- )
+ group.create_dataset(col, data=serialized_data, compression="gzip", compression_opts=6)
  else:
  # For large datasets, use chunked processing with parallel serialization
- logger.debug(
- f"Processing large object column '{col}' with {total_items} items in chunks",
- )
+ logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")

  all_serialized = []
  num_chunks = (total_items + chunk_size - 1) // chunk_size
@@ -329,58 +281,28 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  )
  # Fallback to simple string conversion for this chunk
  chunk = data_list[chunk_start : chunk_start + chunk_size]
- results[chunk_start] = [
- str(item) if item is not None else "None"
- for item in chunk
- ]
+ results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]

  # Reassemble in correct order
  for i in range(0, total_items, chunk_size):
  if i in results:
  all_serialized.extend(results[i])

- group.create_dataset(
- col,
- data=all_serialized,
- compression="gzip",
- compression_opts=6,
- )
+ group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)

  except Exception as e:
- logger.warning(
- f"Failed to save object column '{col}' with optimization: {e}",
- )
+ logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
  # Fallback to old method
- _save_dataframe_column_legacy_single(
- group,
- col,
- df[col].to_list(),
- "object",
- logger,
- )
+ _save_dataframe_column_legacy_single(group, col, df[col].to_list(), "object", logger)


- def _save_dataframe_column_legacy_single(
- group,
- col: str,
- data,
- dtype: str,
- logger,
- compression="gzip",
- ):
+ def _save_dataframe_column_legacy_single(group, col: str, data, dtype: str, logger, compression="gzip"):
  """Legacy single column save method for fallback."""
  # This is the original _save_dataframe_column method for compatibility
  return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)


- def _save_dataframe_column_legacy(
- group,
- col: str,
- data,
- dtype: str,
- logger,
- compression="gzip",
- ):
+ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, compression="gzip"):
  """
  Save a single DataFrame column to an HDF5 group with optimized compression.

@@ -405,10 +327,7 @@ def _save_dataframe_column_legacy(

  # Optimized compression configuration
  COMPRESSION_CONFIG = {
- "fast_access": {
- "compression": "lzf",
- "shuffle": True,
- }, # Fast I/O for IDs, rt, mz
+ "fast_access": {"compression": "lzf", "shuffle": True}, # Fast I/O for IDs, rt, mz
  "numeric": {"compression": "lzf"}, # Standard numeric data
  "string": {"compression": "gzip", "compression_opts": 6}, # String data
  "json": {"compression": "gzip", "compression_opts": 6}, # JSON objects
@@ -431,22 +350,11 @@ def _save_dataframe_column_legacy(
  return COMPRESSION_CONFIG["fast_access"]

  # JSON object columns (complex serialized data)
- elif column_name in [
- "spectrum",
- "chromatogram",
- "chromatograms",
- "ms2_specs",
- "chrom",
- ]:
+ elif column_name in ["spectrum", "chromatogram", "chromatograms", "ms2_specs", "chrom"]:
  return COMPRESSION_CONFIG["json"]

  # String/text columns
- elif data_type in ["string", "object"] and column_name in [
- "sample_name",
- "file_path",
- "label",
- "file_type",
- ]:
+ elif data_type in ["string", "object"] and column_name in ["sample_name", "file_path", "label", "file_type"]:
  return COMPRESSION_CONFIG["string"]

  # Large bulk numeric data
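For orientation, here is a hedged sketch of how the compression choices in the two hunks above map onto h5py create_dataset keyword arguments. The dispatch is a simplified stand-in for the column-name logic shown in the diff, and the file and group names are invented for illustration.

import h5py
import numpy as np

COMPRESSION_CONFIG = {
    "fast_access": {"compression": "lzf", "shuffle": True},
    "numeric": {"compression": "lzf"},
    "string": {"compression": "gzip", "compression_opts": 6},
    "json": {"compression": "gzip", "compression_opts": 6},
}

def pick_kwargs(column_name: str) -> dict:
    # simplified stand-in for the dispatch in _save_dataframe_column_legacy
    if column_name in ("spectrum", "chromatogram", "chromatograms", "ms2_specs", "chrom"):
        return COMPRESSION_CONFIG["json"]
    if column_name in ("sample_name", "file_path", "label", "file_type"):
        return COMPRESSION_CONFIG["string"]
    return COMPRESSION_CONFIG["numeric"]

with h5py.File("example.h5", "w") as f:  # hypothetical file
    grp = f.create_group("features")
    grp.create_dataset("intensity", data=np.arange(100.0), **pick_kwargs("intensity"))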
@@ -626,14 +534,9 @@ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame:
  """Convert string null representations to proper nulls."""
  for col in df.columns:
  if df[col].dtype == pl.Utf8:
- df = df.with_columns(
- [
- pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
- .then(None)
- .otherwise(pl.col(col))
- .alias(col),
- ],
- )
+ df = df.with_columns([
+ pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
+ ])
  return df

@@ -674,11 +577,7 @@ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.Da
  return df


- def _reorder_columns_by_schema(
- df: pl.DataFrame,
- schema: dict,
- df_name: str,
- ) -> pl.DataFrame:
+ def _reorder_columns_by_schema(df: pl.DataFrame, schema: dict, df_name: str) -> pl.DataFrame:
  """Reorder DataFrame columns to match schema order."""
  if df_name not in schema or "columns" not in schema[df_name]:
  return df
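For reference, a standalone sketch of the _clean_string_nulls normalisation from the hunk before last, run on a toy DataFrame; the data is made up and not taken from masster.

import polars as pl

df = pl.DataFrame({"label": ["QC", "None", "", "null", "blank"]})  # toy data
for col in df.columns:
    if df[col].dtype == pl.Utf8:
        df = df.with_columns([
            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
        ])
# "None", "" and "null" become proper nulls; "QC" and "blank" are kept as-is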
@@ -732,24 +631,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns(
- [
- pl.Series(
- col,
- values,
- dtype=pl.List(
- pl.Struct(
- [
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ],
- ),
- ),
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
  ),
- ],
- )
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
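As a sketch of the List(Struct) dtype used for the adducts column in the new code path above: the snippet below builds such a column from scratch. The id column and the adduct rows are invented example values, not masster output.

import polars as pl

adducts_dtype = pl.List(
    pl.Struct([
        pl.Field("adduct", pl.Utf8),
        pl.Field("count", pl.Int64),
        pl.Field("percentage", pl.Float64),
        pl.Field("mass", pl.Float64),
    ])
)

values = [
    [{"adduct": "[M+H]+", "count": 3, "percentage": 75.0, "mass": 180.063}],  # made-up row
    [],  # a feature without adduct evidence gets an empty list
]
df = pl.DataFrame({"consensus_id": [0, 1]}).with_columns([
    pl.Series("adducts", values, dtype=adducts_dtype),
])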
@@ -760,24 +655,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns(
- [
- pl.Series(
- col,
- values,
- dtype=pl.List(
- pl.Struct(
- [
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ],
- ),
- ),
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
  ),
- ],
- )
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -785,13 +676,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  return df


- def _load_dataframe_from_group(
- group,
- schema: dict,
- df_name: str,
- logger,
- object_columns: list | None = None,
- ) -> pl.DataFrame:
+ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object_columns: list | None = None) -> pl.DataFrame:
  """Load a DataFrame from HDF5 group using schema."""
  if object_columns is None:
  object_columns = []
@@ -805,9 +690,7 @@ def _load_dataframe_from_group(
  )
  schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
  logger.debug(f"Schema section for {df_name}: {schema_section}")
- schema_columns = (
- schema_section.get("columns", []) if isinstance(schema_section, dict) else []
- )
+ schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
  logger.debug(f"Schema columns for {df_name}: {schema_columns}")
  if schema_columns is None:
  schema_columns = []
@@ -830,9 +713,7 @@ def _load_dataframe_from_group(
  effective_columns = hdf5_columns.copy()
  for old_name, new_name in column_migrations.items():
  if old_name in effective_columns:
- logger.info(
- f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility",
- )
+ logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
  # Add the new name to effective columns and optionally remove old name
  effective_columns.append(new_name)

@@ -897,9 +778,7 @@ def _load_dataframe_from_group(
  for col, values in data.items():
  if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
- logger.debug(
- f"Determined expected_length={expected_length} from loaded column '{col}'",
- )
+ logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
  break

  # If no data loaded yet, try HDF5 columns directly
@@ -909,9 +788,7 @@ def _load_dataframe_from_group(
  col_data = group[col][:]
  if expected_length is None:
  expected_length = len(col_data)
- logger.debug(
- f"Determined expected_length={expected_length} from HDF5 column '{col}'",
- )
+ logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
  break

  # Default to 0 if no data found
@@ -925,38 +802,26 @@ def _load_dataframe_from_group(
  # For missing columns, create appropriately sized array with appropriate defaults
  if col in object_columns:
  data[col] = [None] * expected_length
- logger.debug(
- f"Created missing object column '{col}' with length {expected_length}",
- )
+ logger.debug(f"Created missing object column '{col}' with length {expected_length}")
  else:
  # Provide specific default values for new columns for backward compatibility
  if df_name == "samples_df":
  if col == "sample_group":
  data[col] = [""] * expected_length # Empty string default
- logger.debug(
- f"Created missing column '{col}' with empty string defaults",
- )
+ logger.debug(f"Created missing column '{col}' with empty string defaults")
  elif col == "sample_batch":
  data[col] = [1] * expected_length # Batch 1 default
- logger.debug(
- f"Created missing column '{col}' with batch 1 defaults",
- )
+ logger.debug(f"Created missing column '{col}' with batch 1 defaults")
  elif col == "sample_sequence":
  # Create increasing sequence numbers
  data[col] = list(range(1, expected_length + 1))
- logger.debug(
- f"Created missing column '{col}' with sequence 1-{expected_length}",
- )
+ logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
  else:
  data[col] = [None] * expected_length
- logger.debug(
- f"Created missing regular column '{col}' with length {expected_length}",
- )
+ logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
  else:
  data[col] = [None] * expected_length
- logger.debug(
- f"Created missing regular column '{col}' with length {expected_length}",
- )
+ logger.debug(f"Created missing regular column '{col}' with length {expected_length}")

  # Check for columns in HDF5 file that are not in schema (for backward compatibility)
  # But skip the old column names we already migrated
@@ -970,11 +835,7 @@ def _load_dataframe_from_group(
  }
  migrated_old_names = set(column_migrations.keys())

- extra_columns = [
- col
- for col in hdf5_columns
- if col not in (schema_columns or []) and col not in migrated_old_names
- ]
+ extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

  for col in extra_columns:
  logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -993,10 +854,7 @@ def _load_dataframe_from_group(
  object_columns.append(col)
  else:
  # Regular string data
- data[col] = [
- item.decode("utf-8") if isinstance(item, bytes) else item
- for item in column_data
- ]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
  except Exception:
  # If decoding fails, treat as regular data
  data[col] = column_data
@@ -1009,19 +867,10 @@ def _load_dataframe_from_group(
  # Handle byte string conversion for non-object columns
  # Only convert to strings for columns that should actually be strings
  for col, values in data.items():
- if (
- col not in object_columns
- and values is not None
- and len(values) > 0
- and isinstance(values[0], bytes)
- ):
+ if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
  # Check schema to see if this should be a string column
  should_be_string = False
- if (
- df_name in schema
- and "columns" in schema[df_name]
- and col in schema[df_name]["columns"]
- ):
+ if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
  dtype_str = schema[df_name]["columns"][col]["dtype"]
  should_be_string = dtype_str == "pl.Utf8"

@@ -1039,9 +888,7 @@ def _load_dataframe_from_group(
  logger.debug(f"Creating DataFrame with object columns: {object_columns}")
  for col in object_columns:
  if col in data:
- logger.debug(
- f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
- )
+ logger.debug(f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}")
  df = _create_dataframe_with_objects(data, object_columns)
  else:
  df = pl.DataFrame(data)
@@ -1087,22 +934,15 @@ def _save_study5_compressed(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
- dataframes_to_save.append(
- ("consensus_mapping", len(self.consensus_mapping_df)),
- )
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+ dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))

  total_steps = len(dataframes_to_save) + 1 # +1 for metadata

  # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

  with tqdm(
  total=total_steps,
@@ -1118,14 +958,8 @@ def _save_study5_compressed(self, filename):

  # Store metadata
  metadata_group.attrs["format"] = "master-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -1146,16 +980,8 @@ def _save_study5_compressed(self, filename):
  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_group = f.create_group("samples")
- self.logger.debug(
- f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.samples_df,
- samples_group,
- schema,
- "samples_df",
- self.logger,
- )
+ self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
+ _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
  pbar.update(1)

  # Store features_df - use fast method that skips chrom and ms2_specs columns
@@ -1163,79 +989,38 @@ def _save_study5_compressed(self, filename):
  self.logger.debug(
  f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
  )
- _save_dataframe_optimized_fast(
- self.features_df,
- features_group,
- schema,
- "features_df",
- self.logger,
- )
+ _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
  pbar.update(1)

  # Store consensus_df - use optimized batch processing
  if self.consensus_df is not None and not self.consensus_df.is_empty():
- self.logger.debug(
- f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.consensus_df,
- consensus_group,
- schema,
- "consensus_df",
- self.logger,
- )
+ self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
+ _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
  pbar.update(1)

  # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  consensus_mapping = self.consensus_mapping_df.clone()
- self.logger.debug(
- f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
- )
+ self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
  for col in consensus_mapping.columns:
  try:
  data = consensus_mapping[col].to_numpy()
  # Use LZF compression for consensus mapping data
- consensus_mapping_group.create_dataset(
- col,
- data=data,
- compression="lzf",
- shuffle=True,
- )
+ consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
  except Exception as e:
- self.logger.warning(
- f"Failed to save column '{col}' in consensus_mapping_df: {e}",
- )
+ self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
  pbar.update(1)

  # Store consensus_ms2 - use optimized batch processing
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
- self.logger.debug(
- f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.consensus_ms2,
- consensus_ms2_group,
- schema,
- "consensus_ms2",
- self.logger,
- )
+ self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
+ _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
  pbar.update(1)

  self.logger.debug(f"Fast save completed for {filename}")


- def _save_dataframe_optimized_fast(
- df,
- group,
- schema,
- df_name,
- logger,
- chunk_size=10000,
- ):
+ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
  """
  Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.

@@ -1260,9 +1045,7 @@ def _save_dataframe_optimized_fast(
  # Skip chrom and ms2_specs columns for features_df
  if df_name == "features_df":
  skip_columns = ["chrom", "ms2_specs"]
- df_ordered = df_ordered.select(
- [col for col in df_ordered.columns if col not in skip_columns],
- )
+ df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
  logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")

  total_rows = len(df_ordered)
@@ -1297,13 +1080,7 @@

  # Process object columns with optimized serialization
  if object_cols:
- _save_object_columns_optimized(
- group,
- df_ordered,
- object_cols,
- logger,
- chunk_size,
- )
+ _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)

  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -1366,22 +1143,15 @@ def _save_study5(self, filename):
  dataframes_to_save.append(("features", len(self.features_df)))
  if self.consensus_df is not None and not self.consensus_df.is_empty():
  dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
- dataframes_to_save.append(
- ("consensus_mapping", len(self.consensus_mapping_df)),
- )
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+ dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))

  total_steps = len(dataframes_to_save) + 1 # +1 for metadata

  # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

  with tqdm(
  total=total_steps,
@@ -1397,14 +1167,8 @@ def _save_study5(self, filename):

  # Store metadata
  metadata_group.attrs["format"] = "master-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

  # Store parameters as JSON
  if hasattr(self, "parameters") and self.history is not None:
@@ -1425,83 +1189,39 @@ def _save_study5(self, filename):
  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
  samples_group = f.create_group("samples")
- self.logger.debug(
- f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.samples_df,
- samples_group,
- schema,
- "samples_df",
- self.logger,
- )
+ self.logger.debug(f"Saving samples_df with {len(self.samples_df)} rows using optimized method")
+ _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
  pbar.update(1)

  # Store features_df - use optimized batch processing
  if self.features_df is not None and not self.features_df.is_empty():
- self.logger.debug(
- f"Saving features_df with {len(self.features_df)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.features_df,
- features_group,
- schema,
- "features_df",
- self.logger,
- )
+ self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
+ _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
  pbar.update(1)

  # Store consensus_df - use optimized batch processing
  if self.consensus_df is not None and not self.consensus_df.is_empty():
- self.logger.debug(
- f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.consensus_df,
- consensus_group,
- schema,
- "consensus_df",
- self.logger,
- )
+ self.logger.debug(f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method")
+ _save_dataframe_optimized(self.consensus_df, consensus_group, schema, "consensus_df", self.logger)
  pbar.update(1)

  # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
  consensus_mapping = self.consensus_mapping_df.clone()
- self.logger.debug(
- f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
- )
+ self.logger.debug(f"Saving consensus_mapping_df with {len(consensus_mapping)} rows")
  for col in consensus_mapping.columns:
  try:
  data = consensus_mapping[col].to_numpy()
  # Use LZF compression for consensus mapping data
- consensus_mapping_group.create_dataset(
- col,
- data=data,
- compression="lzf",
- shuffle=True,
- )
+ consensus_mapping_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
  except Exception as e:
- self.logger.warning(
- f"Failed to save column '{col}' in consensus_mapping_df: {e}",
- )
+ self.logger.warning(f"Failed to save column '{col}' in consensus_mapping_df: {e}")
  pbar.update(1)

  # Store consensus_ms2 - use optimized batch processing
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
- self.logger.debug(
- f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
- )
- _save_dataframe_optimized(
- self.consensus_ms2,
- consensus_ms2_group,
- schema,
- "consensus_ms2",
- self.logger,
- )
+ self.logger.debug(f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method")
+ _save_dataframe_optimized(self.consensus_ms2, consensus_ms2_group, schema, "consensus_ms2", self.logger)
  pbar.update(1)

  self.logger.info(f"Study saved successfully to {filename}")
@@ -1551,9 +1271,7 @@ def _load_study5(self, filename=None):
  schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
  schema = _load_schema(schema_path)
  if not schema:
- self.logger.warning(
- f"Schema file {schema_path} not found. Using default types.",
- )
+ self.logger.warning(f"Schema file {schema_path} not found. Using default types.")

  # Define loading steps for progress tracking
  loading_steps = [
@@ -1616,7 +1334,7 @@ def _load_study5(self, filename=None):
  self.history = {}

  # Reconstruct self.parameters from loaded history
- from master.study.defaults.study_def import study_defaults
+ from masster.study.defaults.study_def import study_defaults

  # Always create a fresh study_defaults object to ensure we have all defaults
  self.parameters = study_defaults()
@@ -1625,48 +1343,27 @@ def _load_study5(self, filename=None):
  if self.history and "study" in self.history:
  study_params = self.history["study"]
  if isinstance(study_params, dict):
- failed_params = self.parameters.set_from_dict(
- study_params,
- validate=False,
- )
+ failed_params = self.parameters.set_from_dict(study_params, validate=False)
  if failed_params:
- self.logger.debug(
- f"Could not set study parameters: {failed_params}",
- )
+ self.logger.debug(f"Could not set study parameters: {failed_params}")
  else:
- self.logger.debug(
- "Successfully updated parameters from loaded history",
- )
+ self.logger.debug("Successfully updated parameters from loaded history")
  else:
- self.logger.debug(
- "Study parameters in history are not a valid dictionary",
- )
+ self.logger.debug("Study parameters in history are not a valid dictionary")
  else:
- self.logger.debug(
- "No study parameters found in history, using defaults",
- )
+ self.logger.debug("No study parameters found in history, using defaults")

  # Synchronize instance attributes with parameters (similar to __init__)
  # Note: folder and label are already loaded from metadata attributes above
  # but we ensure they match the parameters for consistency
- if (
- hasattr(self.parameters, "folder")
- and self.parameters.folder is not None
- ):
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
  self.folder = self.parameters.folder
- if (
- hasattr(self.parameters, "label")
- and self.parameters.label is not None
- ):
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
  self.label = self.parameters.label
  if hasattr(self.parameters, "log_level"):
  self.log_level = self.parameters.log_level
  if hasattr(self.parameters, "log_label"):
- self.log_label = (
- self.parameters.log_label
- if self.parameters.log_label is not None
- else ""
- )
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
  if hasattr(self.parameters, "log_sink"):
  self.log_sink = self.parameters.log_sink
  pbar.update(1)
@@ -1676,17 +1373,10 @@ def _load_study5(self, filename=None):
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
  )
  if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(
- f["samples"],
- schema,
- "samples_df",
- self.logger,
- )
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
  # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug(
- "No samples data found in study5 file. Initializing empty samples_df.",
- )
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
  self.samples_df = pl.DataFrame(
  {
  "sample_uid": [],
@@ -1723,17 +1413,10 @@ def _load_study5(self, filename=None):
  f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
  )
  if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(
- f["samples"],
- schema,
- "samples_df",
- self.logger,
- )
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
  # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug(
- "No samples data found in study5 file. Initializing empty samples_df.",
- )
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
  self.samples_df = pl.DataFrame(
  {
  "sample_uid": [],
@@ -1803,39 +1486,28 @@ def _load_study5(self, filename=None):

  # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
  if self.consensus_df is not None:
- if (
- "adducts" not in self.consensus_df.columns
- or self.consensus_df["adducts"].dtype == pl.Null
- ):
- self.logger.info(
- "Adding missing 'adducts' column for backward compatibility",
- )
- empty_adducts: list[list] = [
- [] for _ in range(len(self.consensus_df))
- ]
+ if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
+ self.logger.info("Adding missing 'adducts' column for backward compatibility")
+ empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]

  # If column exists but is Null, drop it first
  if "adducts" in self.consensus_df.columns:
  self.consensus_df = self.consensus_df.drop("adducts")

- self.consensus_df = self.consensus_df.with_columns(
- [
- pl.Series(
- "adducts",
- empty_adducts,
- dtype=pl.List(
- pl.Struct(
- [
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64),
- ],
- ),
- ),
+ self.consensus_df = self.consensus_df.with_columns([
+ pl.Series(
+ "adducts",
+ empty_adducts,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
  ),
- ],
- )
+ ),
+ ])
  else:
  self.consensus_df = None
  pbar.update(1)
@@ -1887,14 +1559,8 @@ def _load_study5(self, filename=None):
  pbar.update(1)

  # Check and migrate old string-based map_id to integer indices
- if (
- self.samples_df is not None
- and not self.samples_df.is_empty()
- and self.samples_df["map_id"].dtype == pl.Utf8
- ):
- self.logger.info(
- "Detected old string-based map_id format, migrating to integer indices",
- )
+ if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
+ self.logger.info("Detected old string-based map_id format, migrating to integer indices")

  # Convert string-based map_id to integer indices
  sample_count = len(self.samples_df)