masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +90 -94
- masster/sample/defaults/sample_def.py +15 -0
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +137 -136
- masster/sample/lib.py +11 -11
- masster/sample/load.py +13 -9
- masster/sample/plot.py +167 -60
- masster/sample/processing.py +150 -153
- masster/sample/sample.py +4 -4
- masster/sample/sample5_schema.json +62 -62
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +224 -6
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +293 -245
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +51 -25
- masster/study/plot.py +453 -17
- masster/study/processing.py +197 -123
- masster/study/save.py +7 -7
- masster/study/study.py +97 -88
- masster/study/study5_schema.json +82 -82
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/RECORD +34 -32
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -59,10 +59,10 @@ def _decode_bytes_attr(attr_value):
 def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
 """
 Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
-
+
 This function replaces individual column processing with batch operations for much
 better performance on large datasets (300+ samples).
-
+
 Args:
 df: Polars DataFrame to save
 group: HDF5 group to save to
@@ -73,17 +73,17 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 """
 if df is None or df.is_empty():
 return
-
+
 try:
 # Reorder columns according to schema
 df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
 total_rows = len(df_ordered)
-
+
 # Group columns by processing type for batch optimization
 numeric_cols = []
 string_cols = []
 object_cols = []
-
+
 for col in df_ordered.columns:
 dtype = str(df_ordered[col].dtype).lower()
 if dtype == "object":
@@ -92,23 +92,25 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 string_cols.append(col)
 else:
 numeric_cols.append(col)
-
-logger.debug(
-
+
+logger.debug(
+f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+)
+
 # Process numeric columns in batch (most efficient)
 if numeric_cols:
 for col in numeric_cols:
 _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
-# Process string columns in batch
+
+# Process string columns in batch
 if string_cols:
 for col in string_cols:
 _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
 # Process object columns with optimized serialization
 if object_cols:
 _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
 except Exception as e:
 logger.error(f"Failed to save DataFrame {df_name}: {e}")
 # Fallback to old method for safety
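The reworked `_save_dataframe_optimized` above splits the DataFrame's columns into numeric, string, and object buckets and hands each bucket to a type-specific writer. Below is a minimal standalone sketch of that dispatch pattern using polars and h5py; the helper name `save_df_by_dtype` and the simplified writers are illustrative, not masster's API.

```python
# Minimal sketch of the dtype-bucketing pattern used by _save_dataframe_optimized.
# save_df_by_dtype and the simplified writers are made up for illustration.
import h5py
import polars as pl


def save_df_by_dtype(df: pl.DataFrame, group: h5py.Group) -> None:
    numeric_cols, string_cols, object_cols = [], [], []
    for col in df.columns:
        dtype = str(df[col].dtype).lower()
        if dtype == "object":
            object_cols.append(col)  # e.g. chromatograms, spectra
        elif "utf8" in dtype or "str" in dtype:
            string_cols.append(col)
        else:
            numeric_cols.append(col)

    for col in numeric_cols:
        group.create_dataset(col, data=df[col].to_numpy(), compression="lzf")
    for col in string_cols + object_cols:
        # strings and serialized objects both end up as gzip-compressed text
        data = ["None" if x is None else str(x) for x in df[col].to_list()]
        group.create_dataset(col, data=data, compression="gzip", compression_opts=6)


df = pl.DataFrame({"mz": [100.1, 200.2], "label": ["QC_01", None]})
with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
    save_df_by_dtype(df, f.create_group("samples"))
    print(list(f["samples"].keys()))  # ['label', 'mz']
```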
@@ -119,20 +121,20 @@ def _save_numeric_column_fast(group, col, data_series, logger):
 """Fast numeric column saving with optimal compression."""
 try:
 import numpy as np
-
+
 # Get compression settings based on column name
 if col in ["consensus_uid", "feature_uid", "scan_id", "rt", "mz", "intensity"]:
 compression_kwargs = {"compression": "lzf", "shuffle": True}
 else:
 compression_kwargs = {"compression": "lzf"}
-
+
 # Convert to numpy array efficiently
 try:
 data_array = data_series.to_numpy()
 except Exception:
 # Fallback for complex data types
 data_array = np.array(data_series.to_list())
-
+
 # Handle None/null values efficiently
 if data_array.dtype == object:
 # Check if this is actually a list/array column that should be treated as object
@@ -141,13 +143,13 @@ def _save_numeric_column_fast(group, col, data_series, logger):
 if val is not None:
 sample_value = val
 break
-
+
 # If sample value is a list/array, treat as object column
 if isinstance(sample_value, (list, tuple, np.ndarray)):
 logger.debug(f"Column '{col}' contains array-like data, treating as object")
 _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
 return
-
+
 # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
 try:
 data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
@@ -156,9 +158,9 @@ def _save_numeric_column_fast(group, col, data_series, logger):
 logger.debug(f"Column '{col}' is not numeric, treating as object")
 _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
 return
-
+
 group.create_dataset(col, data=data_array, **compression_kwargs)
-
+
 except Exception as e:
 logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
 # Fallback to old method
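`_save_numeric_column_fast` chooses LZF plus byte-shuffle for the frequently read columns, plain LZF otherwise, and maps None to a -123 sentinel so the column can be written as a plain float array. A hedged, self-contained sketch of just that behaviour follows; the helper `save_numeric` is made up, while the sentinel value and the hot-column list are taken from the diff.

```python
# Sketch of the per-column compression choice and the -123 None sentinel used
# by _save_numeric_column_fast; save_numeric is a made-up, simplified helper.
import h5py
import numpy as np

HOT_COLUMNS = {"consensus_uid", "feature_uid", "scan_id", "rt", "mz", "intensity"}


def save_numeric(group, col, values):
    # LZF + byte-shuffle for the frequently read columns, plain LZF otherwise
    if col in HOT_COLUMNS:
        kwargs = {"compression": "lzf", "shuffle": True}
    else:
        kwargs = {"compression": "lzf"}
    # None becomes the -123 sentinel so the data stays a plain float array
    arr = np.array([-123.0 if v is None else float(v) for v in values])
    group.create_dataset(col, data=arr, **kwargs)


with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
    save_numeric(f, "rt", [1.2, None, 3.4])
    print(f["rt"][:])  # [   1.2 -123.    3.4]
```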
@@ -170,10 +172,10 @@ def _save_string_column_fast(group, col, data_series, logger):
 try:
 # Convert to string array efficiently
 string_data = ["None" if x is None else str(x) for x in data_series.to_list()]
-
+
 compression_kwargs = {"compression": "gzip", "compression_opts": 6}
 group.create_dataset(col, data=string_data, **compression_kwargs)
-
+
 except Exception as e:
 logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
 # Fallback to old method
@@ -183,11 +185,11 @@ def _save_string_column_fast(group, col, data_series, logger):
 def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
 """Optimized object column processing with chunking and parallel serialization."""
 import json
-
+
 def serialize_chunk(col_name, chunk_data):
 """Serialize a chunk of object data."""
 serialized_chunk = []
-
+
 if col_name == "chrom":
 # Handle Chromatogram objects
 for item in chunk_data:
@@ -233,19 +235,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
 logger.warning(f"Unknown object column '{col_name}', using default serialization")
 for item in chunk_data:
 serialized_chunk.append(str(item) if item is not None else "None")
-
+
 return serialized_chunk
-
+
 # Process each object column
 for col in object_cols:
 try:
 data_list = df[col].to_list()
 total_items = len(data_list)
-
+
 if total_items == 0:
 group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
 continue
-
+
 # For small datasets, process directly
 if total_items <= chunk_size:
 serialized_data = serialize_chunk(col, data_list)
@@ -253,19 +255,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
 else:
 # For large datasets, use chunked processing with parallel serialization
 logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
-
+
 all_serialized = []
 num_chunks = (total_items + chunk_size - 1) // chunk_size
-
+
 # Use thread pool for parallel serialization of chunks
 with ThreadPoolExecutor(max_workers=min(4, num_chunks)) as executor:
 futures = {}
-
+
 for i in range(0, total_items, chunk_size):
-chunk = data_list[i:i + chunk_size]
+chunk = data_list[i : i + chunk_size]
 future = executor.submit(serialize_chunk, col, chunk)
 futures[future] = i
-
+
 # Collect results in order
 results = {}
 for future in as_completed(futures):
@@ -274,18 +276,20 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
 chunk_result = future.result()
 results[chunk_start] = chunk_result
 except Exception as e:
-logger.warning(
+logger.warning(
+f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+)
 # Fallback to simple string conversion for this chunk
-chunk = data_list[chunk_start:chunk_start + chunk_size]
+chunk = data_list[chunk_start : chunk_start + chunk_size]
 results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
-
+
 # Reassemble in correct order
 for i in range(0, total_items, chunk_size):
 if i in results:
 all_serialized.extend(results[i])
-
+
 group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
-
+
 except Exception as e:
 logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
 # Fallback to old method
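The object-column writer above serializes large columns in fixed-size chunks on a small thread pool, keys each result by its chunk start offset, and reassembles the chunks in their original order rather than completion order. A simplified sketch of that pattern follows; `serialize_in_chunks` is a made-up helper and `json.dumps` stands in for masster's Chromatogram/spectrum serialization.

```python
# Sketch of the chunked, parallel serialization in _save_object_columns_optimized:
# serialize fixed-size chunks on a thread pool, key results by chunk start offset,
# then reassemble in original order.
import json
from concurrent.futures import ThreadPoolExecutor, as_completed


def serialize_chunk(chunk):
    return [json.dumps(item) if item is not None else "None" for item in chunk]


def serialize_in_chunks(data, chunk_size=10000):
    total = len(data)
    if total <= chunk_size:
        return serialize_chunk(data)
    num_chunks = (total + chunk_size - 1) // chunk_size
    results = {}
    with ThreadPoolExecutor(max_workers=min(4, num_chunks)) as executor:
        futures = {
            executor.submit(serialize_chunk, data[i : i + chunk_size]): i
            for i in range(0, total, chunk_size)
        }
        for future in as_completed(futures):
            results[futures[future]] = future.result()
    # Reassemble chunks in their original order, not completion order
    out = []
    for i in range(0, total, chunk_size):
        out.extend(results[i])
    return out


print(len(serialize_in_chunks([{"mz": i} for i in range(25000)])))  # 25000
```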
@@ -430,7 +434,9 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
 data_as_str.append("None")
 group.create_dataset(col, data=data_as_str, compression=compression)
 else:
-logger.warning(
+logger.warning(
+f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+)
 elif dtype == "string":
 # Handle string columns
 string_data = ["None" if x is None else str(x) for x in data]
@@ -479,6 +485,7 @@ def _reconstruct_object_column(data_col, col_name: str):
 # Handle non-string data (e.g., float32 NaN from corrupted compression)
 if not isinstance(item, str):
 import numpy as np
+
 if isinstance(item, (float, np.floating)) and np.isnan(item):
 reconstructed_data.append(None)
 continue
@@ -594,16 +601,16 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 expected_length = None
 if regular_data:
 for values in regular_data.values():
-if values is not None and hasattr(values,
+if values is not None and hasattr(values, "__len__"):
 expected_length = len(values)
 break
-
+
 if expected_length is None and object_data:
 for values in object_data.values():
-if values is not None and hasattr(values,
+if values is not None and hasattr(values, "__len__"):
 expected_length = len(values)
 break
-
+
 if expected_length is None:
 expected_length = 0

@@ -611,7 +618,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 for col in object_columns:
 if col in object_data:
 values = object_data[col]
-if values is None or (hasattr(values,
+if values is None or (hasattr(values, "__len__") and len(values) == 0):
 object_data[col] = [None] * expected_length
 # print(f"DEBUG: Fixed object column '{col}' to have length {expected_length}")

@@ -624,12 +631,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
 if col == "adducts":
 # Handle adducts as List(Struct) - now contains dicts
-df = df.with_columns([
-pl.
-
-
-
-
+df = df.with_columns([
+pl.Series(
+col,
+values,
+dtype=pl.List(
+pl.Struct([
+pl.Field("adduct", pl.Utf8),
+pl.Field("count", pl.Int64),
+pl.Field("percentage", pl.Float64),
+pl.Field("mass", pl.Float64),
+]),
+),
+),
+])
 else:
 # Other object columns stay as Object
 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -640,12 +655,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
 if col == "adducts":
 # Handle adducts as List(Struct) - now contains dicts
-df = df.with_columns([
-pl.
-
-
-
-
+df = df.with_columns([
+pl.Series(
+col,
+values,
+dtype=pl.List(
+pl.Struct([
+pl.Field("adduct", pl.Utf8),
+pl.Field("count", pl.Int64),
+pl.Field("percentage", pl.Float64),
+pl.Field("mass", pl.Float64),
+]),
+),
+),
+])
 else:
 # Other object columns stay as Object
 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
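Both branches above now build the `adducts` column with an explicit `pl.List(pl.Struct([...]))` dtype instead of a generic object column. A small sketch of constructing such a Series is below; the field names mirror the diff, while the example rows are invented.

```python
# Sketch of the explicit List(Struct) dtype the diff now uses for "adducts";
# the example rows are made up for illustration.
import polars as pl

adduct_dtype = pl.List(
    pl.Struct([
        pl.Field("adduct", pl.Utf8),
        pl.Field("count", pl.Int64),
        pl.Field("percentage", pl.Float64),
        pl.Field("mass", pl.Float64),
    ])
)

rows = [
    [{"adduct": "[M+H]+", "count": 3, "percentage": 75.0, "mass": 180.063}],
    [],  # consensus features without annotated adducts get an empty list
]
df = pl.DataFrame({"consensus_uid": [1, 2]}).with_columns(
    pl.Series("adducts", rows, dtype=adduct_dtype)
)
print(df)
```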
@@ -713,11 +736,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
 # Determine expected DataFrame length from loaded columns
 expected_length = None
 for col, values in data.items():
-if values is not None and hasattr(values,
+if values is not None and hasattr(values, "__len__"):
 expected_length = len(values)
 logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
 break
-
+
 # If no data loaded yet, try HDF5 columns directly
 if expected_length is None:
 hdf5_columns = list(group.keys())
@@ -727,7 +750,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
 expected_length = len(col_data)
 logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
 break
-
+
 # Default to 0 if no data found
 if expected_length is None:
 expected_length = 0
@@ -747,25 +770,25 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
 # Check for columns in HDF5 file that are not in schema (for backward compatibility)
 hdf5_columns = list(group.keys())
 extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
-
+
 for col in extra_columns:
 logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
 column_data = group[col][:]
-
+
 # Try to determine if this should be treated as an object column
 # by checking if the data looks like JSON strings
 if len(column_data) > 0 and isinstance(column_data[0], bytes):
 try:
 # Check if it looks like JSON
-test_decode = column_data[0].decode(
-if test_decode.startswith(
+test_decode = column_data[0].decode("utf-8")
+if test_decode.startswith("[") or test_decode.startswith("{"):
 # Looks like JSON, treat as object column
 data[col] = _reconstruct_object_column(column_data, col)
 if col not in object_columns:
 object_columns.append(col)
 else:
 # Regular string data
-data[col] = [item.decode(
+data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
 except Exception:
 # If decoding fails, treat as regular data
 data[col] = column_data
@@ -784,7 +807,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
 if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
 dtype_str = schema[df_name]["columns"][col]["dtype"]
 should_be_string = dtype_str == "pl.Utf8"
-
+
 if should_be_string:
 processed_values = []
 for val in values:
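When `_load_dataframe_from_group` meets a column that is not in the schema, it decodes the first stored value and treats the column as JSON-serialized objects if the text starts with "[" or "{", otherwise as plain strings. Below is a standalone approximation of that sniffing logic; `decode_extra_column` is hypothetical and uses `json.loads` where the real code calls `_reconstruct_object_column`.

```python
# Sketch of the "does this stored column look like JSON?" check used when
# loading extra, schema-less columns; simplified and standalone.
import json


def decode_extra_column(column_data):
    if len(column_data) > 0 and isinstance(column_data[0], bytes):
        text = column_data[0].decode("utf-8")
        if text.startswith("[") or text.startswith("{"):
            # JSON-serialized object column: decode every row
            return [json.loads(item.decode("utf-8")) for item in column_data]
        # plain string column
        return [item.decode("utf-8") for item in column_data]
    return list(column_data)


print(decode_extra_column([b'{"adduct": "[M+H]+"}', b'{"adduct": "[M+Na]+"}']))
print(decode_extra_column([b"QC_01", b"QC_02"]))
```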
@@ -815,11 +838,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
 def _save_study5_compressed(self, filename=None):
 """
 Compressed save identical to _save_study5 but skips serialization of chrom and ms2_specs columns in features_df.
-
+
 This version maintains full compatibility with _load_study5() while providing performance benefits
 by skipping the serialization of heavy object columns (chrom and ms2_specs) in features_df.
 """
-
+
 # if no extension is given, add .study5
 if not filename.endswith(".study5"):
 filename += ".study5"
@@ -849,18 +872,17 @@ def _save_study5_compressed(self, filename=None):
 dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
 if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
 dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
 total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
 # Show progress for large saves
 tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
 with tqdm(
 total=total_steps,
 desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Fast saving study",
 disable=tdqm_disable,
 ) as pbar:
-
 # Create groups for organization
 metadata_group = f.create_group("metadata")
 features_group = f.create_group("features")
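Both save paths wrap their work in a tqdm bar that stays hidden unless the log level is TRACE/DEBUG/INFO and there is more than one step, and they refresh the bar description as each phase starts. A rough sketch of that gating pattern follows; `run_steps` and its toy steps are invented.

```python
# Sketch of the progress-bar gating shared by the save/load routines.
from datetime import datetime

from tqdm import tqdm


def run_steps(steps, log_level="INFO", log_label=""):
    def stamp():
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]

    # hide the bar for quiet log levels or trivial saves
    disable = log_level not in ["TRACE", "DEBUG", "INFO"] or len(steps) < 2
    with tqdm(total=len(steps), desc=f"{stamp()} | INFO | {log_label}Saving study", disable=disable) as pbar:
        for name, work in steps:
            pbar.set_description(f"{stamp()} | INFO | {log_label}{name}")
            work()
            pbar.update(1)


run_steps([("Saving metadata", lambda: None), ("Saving dataframes", lambda: None)])
```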
@@ -883,9 +905,11 @@ def _save_study5_compressed(self, filename=None):
 metadata_group.create_dataset("parameters", data="")
 else:
 metadata_group.create_dataset("parameters", data="")
-
+
 pbar.update(1)
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+)

 # Store samples_df - use optimized batch processing
 if self.samples_df is not None and not self.samples_df.is_empty():
@@ -896,7 +920,9 @@ def _save_study5_compressed(self, filename=None):

 # Store features_df - use fast method that skips chrom and ms2_specs columns
 if self.features_df is not None and not self.features_df.is_empty():
-self.logger.debug(
+self.logger.debug(
+f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+)
 _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
 pbar.update(1)

@@ -932,10 +958,10 @@ def _save_study5_compressed(self, filename=None):
 def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
 """
 Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
-
+
 This function is identical to _save_dataframe_optimized but excludes heavy object columns
 (chrom and ms2_specs) when saving features_df to improve performance.
-
+
 Args:
 df: Polars DataFrame to save
 group: HDF5 group to save to
@@ -946,24 +972,24 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
 """
 if df is None or df.is_empty():
 return
-
+
 try:
 # Reorder columns according to schema
 df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
-
+
 # Skip chrom and ms2_specs columns for features_df
 if df_name == "features_df":
 skip_columns = ["chrom", "ms2_specs"]
 df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
 logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
-
+
 total_rows = len(df_ordered)
-
+
 # Group columns by processing type for batch optimization
 numeric_cols = []
 string_cols = []
 object_cols = []
-
+
 for col in df_ordered.columns:
 dtype = str(df_ordered[col].dtype).lower()
 if dtype == "object":
@@ -972,23 +998,25 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
 string_cols.append(col)
 else:
 numeric_cols.append(col)
-
-logger.debug(
-
+
+logger.debug(
+f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+)
+
 # Process numeric columns in batch (most efficient)
 if numeric_cols:
 for col in numeric_cols:
 _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
-# Process string columns in batch
+
+# Process string columns in batch
 if string_cols:
 for col in string_cols:
 _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
 # Process object columns with optimized serialization
 if object_cols:
 _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
 except Exception as e:
 logger.error(f"Failed to save DataFrame {df_name}: {e}")
 # Fallback to old method for safety
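`_save_dataframe_optimized_fast` above differs from the regular writer only in dropping the heavy `chrom` and `ms2_specs` object columns before features_df is written. A minimal sketch of that column-skipping step; the `strip_heavy_columns` helper and the stand-in values are illustrative.

```python
# Sketch of the column-skipping step in _save_dataframe_optimized_fast:
# heavy object columns are dropped from features_df before the write.
import polars as pl

SKIP_COLUMNS = ["chrom", "ms2_specs"]


def strip_heavy_columns(df: pl.DataFrame, df_name: str) -> pl.DataFrame:
    if df_name != "features_df":
        return df
    return df.select([col for col in df.columns if col not in SKIP_COLUMNS])


features = pl.DataFrame({
    "feature_uid": [1, 2],
    "mz": [150.05, 220.12],
    "chrom": ["<chromatogram>", "<chromatogram>"],  # stand-ins for real objects
    "ms2_specs": ["<spectra>", "<spectra>"],
})
print(strip_heavy_columns(features, "features_df").columns)  # ['feature_uid', 'mz']
```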
@@ -1054,18 +1082,17 @@ def _save_study5(self, filename=None):
 dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
 if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
 dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
 total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
 # Show progress for large saves
 tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
 with tqdm(
 total=total_steps,
 desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
 disable=tdqm_disable,
 ) as pbar:
-
 # Create groups for organization
 metadata_group = f.create_group("metadata")
 features_group = f.create_group("features")
@@ -1088,9 +1115,11 @@ def _save_study5(self, filename=None):
 metadata_group.create_dataset("parameters", data="")
 else:
 metadata_group.create_dataset("parameters", data="")
-
+
 pbar.update(1)
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+)

 # Store samples_df - use optimized batch processing
 if self.samples_df is not None and not self.samples_df.is_empty():
@@ -1099,7 +1128,7 @@ def _save_study5(self, filename=None):
 _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
 pbar.update(1)

-# Store features_df - use optimized batch processing
+# Store features_df - use optimized batch processing
 if self.features_df is not None and not self.features_df.is_empty():
 self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
 _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
@@ -1154,7 +1183,7 @@ def _load_study5(self, filename=None):
 - Properly handles MS2 scan lists and spectrum lists
 - Restores parameters dictionary from JSON serialization
 """
-
+
 self.logger.info(f"Loading study from {filename}")

 # Handle default filename
@@ -1182,26 +1211,26 @@ def _load_study5(self, filename=None):
 # Define loading steps for progress tracking
 loading_steps = [
 "metadata",
-"samples_df",
+"samples_df",
 "features_df",
 "consensus_df",
 "consensus_mapping_df",
-"consensus_ms2"
+"consensus_ms2",
 ]
-
+
 # Check if progress bar should be disabled based on log level
 tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

 # Define loading steps for progress tracking
 loading_steps = [
 "metadata",
-"samples_df",
+"samples_df",
 "features_df",
 "consensus_df",
 "consensus_mapping_df",
-"consensus_ms2"
+"consensus_ms2",
 ]
-
+
 # Check if progress bar should be disabled based on log level
 tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

@@ -1212,9 +1241,10 @@ def _load_study5(self, filename=None):
 desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
 disable=tdqm_disable,
 ) as pbar:
-
 # Load metadata
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+)
 if "metadata" in f:
 metadata = f["metadata"]
 self.folder = _decode_bytes_attr(metadata.attrs.get("folder", ""))
@@ -1240,10 +1270,10 @@ def _load_study5(self, filename=None):

 # Reconstruct self.parameters from loaded history
 from masster.study.defaults.study_def import study_defaults
-
+
 # Always create a fresh study_defaults object to ensure we have all defaults
 self.parameters = study_defaults()
-
+
 # Update parameters from loaded history if available
 if self.history and "study" in self.history:
 study_params = self.history["study"]
@@ -1257,24 +1287,26 @@ def _load_study5(self, filename=None):
 self.logger.debug("Study parameters in history are not a valid dictionary")
 else:
 self.logger.debug("No study parameters found in history, using defaults")
-
+
 # Synchronize instance attributes with parameters (similar to __init__)
 # Note: folder and label are already loaded from metadata attributes above
 # but we ensure they match the parameters for consistency
-if hasattr(self.parameters,
+if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
 self.folder = self.parameters.folder
-if hasattr(self.parameters,
+if hasattr(self.parameters, "label") and self.parameters.label is not None:
 self.label = self.parameters.label
-if hasattr(self.parameters,
+if hasattr(self.parameters, "log_level"):
 self.log_level = self.parameters.log_level
-if hasattr(self.parameters,
+if hasattr(self.parameters, "log_label"):
 self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
-if hasattr(self.parameters,
+if hasattr(self.parameters, "log_sink"):
 self.log_sink = self.parameters.log_sink
 pbar.update(1)

 # Load samples_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+)
 if "samples" in f and len(f["samples"].keys()) > 0:
 self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
 else:
@@ -1306,7 +1338,9 @@ def _load_study5(self, filename=None):
 )
 pbar.update(1)
 # Load samples_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+)
 if "samples" in f and len(f["samples"].keys()) > 0:
 self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
 else:
@@ -1339,66 +1373,92 @@ def _load_study5(self, filename=None):
 pbar.update(1)

 # Load features_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+)
 if "features" in f and len(f["features"].keys()) > 0:
 object_columns = ["chrom", "ms2_scans", "ms2_specs"]
-self.features_df = _load_dataframe_from_group(
+self.features_df = _load_dataframe_from_group(
+f["features"], schema, "features_df", self.logger, object_columns
+)
 else:
 self.features_df = None
 pbar.update(1)

 # Load consensus_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+)
 if "consensus" in f and len(f["consensus"].keys()) > 0:
 # Only include adducts in object_columns if it actually exists in the file
 object_columns = []
 if "adducts" in f["consensus"]:
 object_columns.append("adducts")
-
-self.consensus_df = _load_dataframe_from_group(
-
+
+self.consensus_df = _load_dataframe_from_group(
+f["consensus"], schema, "consensus_df", self.logger, object_columns
+)
+
 # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
 if self.consensus_df is not None:
 if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
 self.logger.info("Adding missing 'adducts' column for backward compatibility")
 empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
-
+
 # If column exists but is Null, drop it first
 if "adducts" in self.consensus_df.columns:
 self.consensus_df = self.consensus_df.drop("adducts")
-
+
 self.consensus_df = self.consensus_df.with_columns([
-pl.Series(
-
-
-pl.
-
-
+pl.Series(
+"adducts",
+empty_adducts,
+dtype=pl.List(
+pl.Struct([
+pl.Field("adduct", pl.Utf8),
+pl.Field("count", pl.Int64),
+pl.Field("percentage", pl.Float64),
+pl.Field("mass", pl.Float64),
+]),
+),
+),
 ])
 else:
 self.consensus_df = None
 pbar.update(1)

 # Load consensus_mapping_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+)
 if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
-self.consensus_mapping_df = _load_dataframe_from_group(
+self.consensus_mapping_df = _load_dataframe_from_group(
+f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+)
 else:
 self.consensus_mapping_df = None
 pbar.update(1)
 # Load consensus_mapping_df
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+)
 if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
-self.consensus_mapping_df = _load_dataframe_from_group(
+self.consensus_mapping_df = _load_dataframe_from_group(
+f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+)
 else:
 self.consensus_mapping_df = None
 pbar.update(1)

 # Load consensus_ms2
-pbar.set_description(
+pbar.set_description(
+f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+)
 if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
 object_columns = ["spec"]
-self.consensus_ms2 = _load_dataframe_from_group(
+self.consensus_ms2 = _load_dataframe_from_group(
+f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns
+)
 else:
 self.consensus_ms2 = None
 pbar.update(1)
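For the loading code above, the essential move is pulling each dataset of an HDF5 group back into a column and letting Polars assemble the DataFrame, decoding stored strings from bytes. The sketch below is a deliberately simplified stand-in for `_load_dataframe_from_group`, with no schema handling, sentinel restoration, or object-column reconstruction.

```python
# Rough sketch of the read side: each dataset of a group becomes a column,
# bytes are decoded back to str, and Polars builds the DataFrame.
import h5py
import numpy as np
import polars as pl


def load_group_as_df(group: h5py.Group) -> pl.DataFrame:
    data = {}
    for col in group.keys():
        values = group[col][:]
        if values.dtype.kind in ("S", "O"):  # stored strings come back as bytes
            values = [v.decode("utf-8") if isinstance(v, bytes) else v for v in values]
        data[col] = values
    return pl.DataFrame(data)


with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
    g = f.create_group("samples")
    g.create_dataset("rt", data=np.array([1.2, 3.4]), compression="lzf")
    g.create_dataset("label", data=["QC_01", "QC_02"], compression="gzip", compression_opts=6)
    print(load_group_as_df(g))
```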