masster 0.5.1-py3-none-any.whl → 0.5.4-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +11 -11
- masster/sample/helpers.py +2 -2
- masster/sample/load.py +10 -8
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +7 -3
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +230 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +95 -73
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
             else:
                 data_as_str.append("None")
         group.create_dataset(col, data=data_as_str, compression=compression)
+    elif col == "iso":
+        # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+    elif col == "ms1_spec":
+        # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
     else:
         logger.warning(
             f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
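
Both hunks above use the same encoding scheme for the new `iso` and `ms1_spec` object columns: a numpy array of [mz, intensity] rows is stored as `json.dumps(arr.tolist())`, a plain iterable falls back to `json.dumps(list(item))`, and a missing value is stored as the sentinel string "None". A minimal standalone sketch of that encoder (illustrative only; `encode_spectrum` is not part of masster):

import json
import numpy as np

def encode_spectrum(item):
    # Mirrors the serialization in the two hunks above:
    # numpy array -> JSON nested list, iterable -> JSON list, None -> "None".
    if item is None:
        return "None"
    try:
        return json.dumps(item.tolist())
    except (AttributeError, TypeError):
        return json.dumps(list(item) if hasattr(item, "__iter__") else [])

print(encode_spectrum(np.array([[152.0706, 1.00], [153.0739, 0.08]])))
# -> [[152.0706, 1.0], [153.0739, 0.08]]
print(encode_spectrum(None))
# -> None  (the sentinel string, not a null)

Storing the variable-length arrays as JSON strings keeps each HDF5 dataset one-dimensional, at the cost of parse overhead on load.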
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                 },
             )
             reconstructed_data.append(converted_adducts)
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                iso_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(iso_data) if iso_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                ms1_spec_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
         else:
             # Unknown object column
             reconstructed_data.append(None)
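
The reconstruction hunk is the inverse of the encoder: `json.loads` the stored string and rebuild the numpy array, mapping empty lists and undecodable values to None. A matching sketch of the decode side (again illustrative, not the package API):

import json
import numpy as np

def decode_spectrum(text):
    # Inverse of encode_spectrum above; the "None" sentinel, empty lists,
    # and malformed JSON all come back as None, as in the diff.
    try:
        data = json.loads(text)
    except (json.JSONDecodeError, ValueError):
        return None  # also covers the "None" sentinel, which is not valid JSON
    return np.array(data) if data else None

iso = decode_spectrum('[[152.0706, 1.0], [153.0739, 0.08]]')
assert iso.shape == (2, 2)
assert decode_spectrum("None") is None
assert decode_spectrum("[]") is None

One consequence of this scheme: an empty stored pattern and a missing one are indistinguishable after a round trip; both load as None.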
@@ -974,7 +1044,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.
+        logger.debug(f"Column '{col}' not found in {df_name}.")
        # For missing columns, create appropriately sized array with appropriate defaults
        if col in object_columns:
            data[col] = [None] * expected_length
@@ -1857,6 +1927,26 @@ def _load_study5(self, filename=None):
                 self.logger,
                 object_columns,
             )
+
+            # Sanity check: replace any missing rt_original with rt values
+            if self.features_df is not None and not self.features_df.is_empty():
+                if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                    null_rt_original_count = self.features_df.filter(pl.col("rt_original").is_null()).height
+                    if null_rt_original_count > 0:
+                        self.logger.info(f"Replacing {null_rt_original_count} missing rt_original values with rt")
+                        self.features_df = self.features_df.with_columns(
+                            pl.when(pl.col("rt_original").is_null())
+                            .then(pl.col("rt"))
+                            .otherwise(pl.col("rt_original"))
+                            .alias("rt_original")
+                        )
+                    else:
+                        self.logger.debug("All rt_original values are present")
+                else:
+                    if "rt_original" not in self.features_df.columns:
+                        self.logger.debug("rt_original column not found in features_df")
+                    if "rt" not in self.features_df.columns:
+                        self.logger.debug("rt column not found in features_df")
         else:
             self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
         pbar.update(1)
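
The rt_original back-fill above is a standard polars conditional column rewrite. A self-contained sketch of the same pattern on toy data (not the study schema):

import polars as pl

df = pl.DataFrame({"rt": [10.2, 11.5, 12.9], "rt_original": [10.1, None, None]})

# Keep rt_original where present; substitute rt where it is null
# (the same when/then/otherwise shape as the hunk above).
df = df.with_columns(
    pl.when(pl.col("rt_original").is_null())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
print(df["rt_original"].to_list())  # [10.1, 11.5, 12.9]

`pl.coalesce("rt_original", "rt")` would express the same null fill more compactly; the explicit form used in the diff also extends naturally to the NaN-aware variant that appears later in `_sanitize_nulls`.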
@@ -2008,12 +2098,12 @@ def _load_study5(self, filename=None):
         )
 
     # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
-    self
+    _sanitize_nulls(self)
 
     self.logger.debug("Study loaded")
+
 
-
-def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+def _load_ms1(self, filename: str) -> pl.DataFrame:
     """
     Optimized method to load only MS1 data from a sample5 file for isotope detection.
 
@@ -2030,48 +2120,146 @@ def _load_ms1(self, sample_path: str) -> pl.DataFrame:
     Note:
         Used by find_iso() for efficient isotope pattern detection without full sample loading
     """
-    try:
-        ...  (remainder of the previous implementation is elided in the diff view)
+    #try:
+    # add .sample5 extension if not provided
+    if not filename.endswith(".sample5"):
+        filename += ".sample5"
+    with h5py.File(filename, "r") as f:
+        # Check if ms1 group exists
+        if "ms1" not in f:
+            self.logger.debug(f"No MS1 data found in {filename}")
+            return pl.DataFrame()
+
+        ms1_group = f["ms1"]
+
+        # Load MS1 data efficiently
+        ms1_data = {}
+        for col in ms1_group.keys():
+            ms1_data[col] = ms1_group[col][:]
+
+        if not ms1_data:
+            self.logger.debug(f"Empty MS1 data in {filename}")
+            return pl.DataFrame()
+
+        # Create DataFrame with proper schema
+        ms1_df = pl.DataFrame(ms1_data)
+
+        # Apply expected schema for MS1 data
+        expected_schema = {
+            "cycle": pl.Int64,
+            "scan_uid": pl.Int64,
+            "rt": pl.Float64,
+            "mz": pl.Float64,
+            "inty": pl.Float64
+        }
+
+        # Cast columns to expected types if they exist
+        cast_expressions = []
+        for col, dtype in expected_schema.items():
+            if col in ms1_df.columns:
+                cast_expressions.append(pl.col(col).cast(dtype))
+
+        if cast_expressions:
+            ms1_df = ms1_df.with_columns(cast_expressions)
+
+        self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
+        return ms1_df
+
+    #except Exception as e:
+    #    self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+    #    return pl.DataFrame()
+
+
+def _sanitize_nulls(self):
+    """
+    Sanitize null feature_id and consensus_id values by replacing them with new integer IDs.
+    For feature_id: generates large sequential integers that can be converted by merge/align functions.
+    For consensus_id: uses 16-character UUID strings (as expected by merge function).
+    """
+    import uuid
+    import polars as pl
+    import time
+
+    # Sanitize features_df feature_id column
+    if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+        # Check for null feature_ids
+        null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
+        if null_feature_ids > 0:
+            self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
+
+            # Find the maximum existing feature_id (convert strings to int if possible)
+            max_existing_id = 0
+            existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
+            for fid in existing_ids:
+                try:
+                    int_id = int(fid)
+                    max_existing_id = max(max_existing_id, int_id)
+                except (ValueError, TypeError):
+                    # Skip non-integer IDs
+                    pass
+
+            # Generate new sequential integer IDs starting from max + timestamp offset
+            # Use timestamp to ensure uniqueness across different sanitization runs
+            base_id = max(max_existing_id + 1, int(time.time() * 1000000))  # Microsecond timestamp
+            new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
+            uid_index = 0
+
+            # Create a list to store all feature_ids
+            feature_ids = []
+            for feature_id in self.features_df["feature_id"].to_list():
+                if feature_id is None:
+                    feature_ids.append(new_int_ids[uid_index])
+                    uid_index += 1
+                else:
+                    feature_ids.append(feature_id)
+
+            # Update the DataFrame with sanitized feature_ids
+            self.features_df = self.features_df.with_columns(
+                pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
+            )
+
+            self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+
+    # Sanitize consensus_df consensus_id column
+    if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+        if "consensus_id" in self.consensus_df.columns:
+            null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
+            if null_consensus_ids > 0:
+                self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
+
+                # Generate new UIDs for null values using the same method as merge()
+                new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+                uid_index = 0
+
+                # Create a list to store all consensus_ids
+                consensus_ids = []
+                for consensus_id in self.consensus_df["consensus_id"].to_list():
+                    if consensus_id is None:
+                        consensus_ids.append(new_uids[uid_index])
+                        uid_index += 1
+                    else:
+                        consensus_ids.append(consensus_id)
+
+                # Update the DataFrame with sanitized consensus_ids
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
+                )
+
+                self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
+    # Sanitize rt_original in features_df by replacing null or NaN values with rt values
+    if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+        if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+            # Check for null or NaN values in rt_original
+            null_or_nan_rt_original = self.features_df.filter(
+                pl.col("rt_original").is_null() | pl.col("rt_original").is_nan()
+            ).shape[0]
+            if null_or_nan_rt_original > 0:
+                self.logger.debug(f"Sanitizing {null_or_nan_rt_original} null or NaN rt_original values with rt values")
+                self.features_df = self.features_df.with_columns(
+                    pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
+                    .then(pl.col("rt"))
+                    .otherwise(pl.col("rt_original"))
+                    .alias("rt_original")
+                )
+                self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")