masster 0.5.1-py3-none-any.whl → 0.5.3-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release.
- masster/_version.py +1 -1
- masster/sample/load.py +5 -4
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +160 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +43 -38
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
```diff
@@ -974,7 +974,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.
+        logger.debug(f"Column '{col}' not found in {df_name}.")
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
```
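For readers unfamiliar with the pattern in this hunk: columns missing from the HDF5 group are padded to the expected row count so the DataFrame constructor does not fail on ragged inputs. A minimal standalone sketch under that assumption (the helper name `fill_missing_columns` is illustrative, not part of masster's API):

```python
import polars as pl

def fill_missing_columns(data: dict, expected_cols: list, expected_length: int) -> pl.DataFrame:
    """Pad absent columns with nulls so every column has the same length."""
    for col in expected_cols:
        if col not in data:
            data[col] = [None] * expected_length
    return pl.DataFrame(data)

df = fill_missing_columns({"rt": [1.0, 2.0]}, ["rt", "mz"], expected_length=2)
print(df)  # "mz" comes back as a column of two nulls
```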
```diff
@@ -1857,6 +1857,26 @@ def _load_study5(self, filename=None):
                 self.logger,
                 object_columns,
             )
+
+            # Sanity check: replace any missing rt_original with rt values
+            if self.features_df is not None and not self.features_df.is_empty():
+                if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                    null_rt_original_count = self.features_df.filter(pl.col("rt_original").is_null()).height
+                    if null_rt_original_count > 0:
+                        self.logger.info(f"Replacing {null_rt_original_count} missing rt_original values with rt")
+                        self.features_df = self.features_df.with_columns(
+                            pl.when(pl.col("rt_original").is_null())
+                            .then(pl.col("rt"))
+                            .otherwise(pl.col("rt_original"))
+                            .alias("rt_original")
+                        )
+                    else:
+                        self.logger.debug("All rt_original values are present")
+                else:
+                    if "rt_original" not in self.features_df.columns:
+                        self.logger.debug("rt_original column not found in features_df")
+                    if "rt" not in self.features_df.columns:
+                        self.logger.debug("rt column not found in features_df")
         else:
             self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
         pbar.update(1)
```
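The added block is the standard polars idiom for back-filling one column from another. A self-contained sketch of the same when/then/otherwise expression:

```python
import polars as pl

df = pl.DataFrame({"rt": [1.2, 3.4, 5.6], "rt_original": [1.1, None, 5.5]})

# Replace null rt_original values with the corresponding rt value.
df = df.with_columns(
    pl.when(pl.col("rt_original").is_null())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
print(df["rt_original"].to_list())  # [1.1, 3.4, 5.5]
```

`pl.coalesce("rt_original", "rt")` would express the same back-fill more compactly; the explicit form used in the diff extends naturally to the NaN-aware variant that appears later in `_sanitize_nulls`.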
```diff
@@ -2008,12 +2028,12 @@ def _load_study5(self, filename=None):
     )
 
     # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
-    self
+    _sanitize_nulls(self)
 
     self.logger.debug("Study loaded")
+
 
-
-def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+def _load_ms1(self, filename: str) -> pl.DataFrame:
     """
     Optimized method to load only MS1 data from a sample5 file for isotope detection.
 
```
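Note the calling convention: `_sanitize_nulls` is invoked as a module-level function that receives the study object explicitly, matching how `_load_study5` itself is defined in h5.py. A toy illustration of that pattern (the class and helper names here are stand-ins, not masster's API):

```python
import polars as pl

class Study:
    def __init__(self) -> None:
        self.features_df = pl.DataFrame({"feature_id": ["100", None, "101"]})

def count_null_ids(self) -> int:
    # Module-level function whose first parameter is named `self`,
    # so it can be called directly or later attached to the class.
    return self.features_df.filter(pl.col("feature_id").is_null()).height

print(count_null_ids(Study()))   # 1
Study.count_null_ids = count_null_ids  # optionally bind it as a method
```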
```diff
@@ -2030,48 +2050,146 @@ def _load_ms1(self, sample_path: str) -> pl.DataFrame:
     Note:
         Used by find_iso() for efficient isotope pattern detection without full sample loading
     """
-    try:
-
-
-
-
-
-
+    #try:
+    # add .sample5 extension if not provided
+    if not filename.endswith(".sample5"):
+        filename += ".sample5"
+    with h5py.File(filename, "r") as f:
+        # Check if ms1 group exists
+        if "ms1" not in f:
+            self.logger.debug(f"No MS1 data found in {filename}")
+            return pl.DataFrame()
+
+        ms1_group = f["ms1"]
+
+        # Load MS1 data efficiently
+        ms1_data = {}
+        for col in ms1_group.keys():
+            ms1_data[col] = ms1_group[col][:]
+
+        if not ms1_data:
+            self.logger.debug(f"Empty MS1 data in {filename}")
+            return pl.DataFrame()
 
-
-
-
-
+        # Create DataFrame with proper schema
+        ms1_df = pl.DataFrame(ms1_data)
+
+        # Apply expected schema for MS1 data
+        expected_schema = {
+            "cycle": pl.Int64,
+            "scan_uid": pl.Int64,
+            "rt": pl.Float64,
+            "mz": pl.Float64,
+            "inty": pl.Float64
+        }
+
+        # Cast columns to expected types if they exist
+        cast_expressions = []
+        for col, dtype in expected_schema.items():
+            if col in ms1_df.columns:
+                cast_expressions.append(pl.col(col).cast(dtype))
+
+        if cast_expressions:
+            ms1_df = ms1_df.with_columns(cast_expressions)
+
+        self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
+        return ms1_df
 
-
-
-
-
-
-
+    #except Exception as e:
+    #    self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+    #    return pl.DataFrame()
+
+
+def _sanitize_nulls(self):
+    """
+    Sanitize null feature_id and consensus_id values by replacing them with new integer IDs.
+    For feature_id: generates large sequential integers that can be converted by merge/align functions.
+    For consensus_id: uses 16-character UUID strings (as expected by merge function).
+    """
+    import uuid
+    import polars as pl
+    import time
+
+    # Sanitize features_df feature_id column
+    if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+        # Check for null feature_ids
+        null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
+        if null_feature_ids > 0:
+            self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
 
-    #
-
-
-
-
-
-
-
+            # Find the maximum existing feature_id (convert strings to int if possible)
+            max_existing_id = 0
+            existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
+            for fid in existing_ids:
+                try:
+                    int_id = int(fid)
+                    max_existing_id = max(max_existing_id, int_id)
+                except (ValueError, TypeError):
+                    # Skip non-integer IDs
+                    pass
 
-    #
-
-
-
-
+            # Generate new sequential integer IDs starting from max + timestamp offset
+            # Use timestamp to ensure uniqueness across different sanitization runs
+            base_id = max(max_existing_id + 1, int(time.time() * 1000000))  # Microsecond timestamp
+            new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
+            uid_index = 0
 
-
-
+            # Create a list to store all feature_ids
+            feature_ids = []
+            for feature_id in self.features_df["feature_id"].to_list():
+                if feature_id is None:
+                    feature_ids.append(new_int_ids[uid_index])
+                    uid_index += 1
+                else:
+                    feature_ids.append(feature_id)
 
-
-
+            # Update the DataFrame with sanitized feature_ids
+            self.features_df = self.features_df.with_columns(
+                pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
+            )
 
-
-
-
+            self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+
+    # Sanitize consensus_df consensus_id column
+    if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+        if "consensus_id" in self.consensus_df.columns:
+            null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
+            if null_consensus_ids > 0:
+                self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
+
+                # Generate new UIDs for null values using the same method as merge()
+                new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+                uid_index = 0
+
+                # Create a list to store all consensus_ids
+                consensus_ids = []
+                for consensus_id in self.consensus_df["consensus_id"].to_list():
+                    if consensus_id is None:
+                        consensus_ids.append(new_uids[uid_index])
+                        uid_index += 1
+                    else:
+                        consensus_ids.append(consensus_id)
+
+                # Update the DataFrame with sanitized consensus_ids
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
+                )
+
+                self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
+    # Sanitize rt_original in features_df by replacing null or NaN values with rt values
+    if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+        if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+            # Check for null or NaN values in rt_original
+            null_or_nan_rt_original = self.features_df.filter(
+                pl.col("rt_original").is_null() | pl.col("rt_original").is_nan()
+            ).shape[0]
+            if null_or_nan_rt_original > 0:
+                self.logger.debug(f"Sanitizing {null_or_nan_rt_original} null or NaN rt_original values with rt values")
+                self.features_df = self.features_df.with_columns(
+                    pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
+                    .then(pl.col("rt"))
+                    .otherwise(pl.col("rt_original"))
+                    .alias("rt_original")
+                )
+                self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")
```