masster 0.5.1 (masster-0.5.1-py3-none-any.whl) → 0.5.3 (masster-0.5.3-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/h5.py CHANGED
@@ -974,7 +974,7 @@ def _load_dataframe_from_group(
974
974
 
975
975
  # Second pass: handle missing columns
976
976
  for col in missing_columns:
977
- logger.info(f"Column '{col}' not found in {df_name}.")
977
+ logger.debug(f"Column '{col}' not found in {df_name}.")
978
978
  # For missing columns, create appropriately sized array with appropriate defaults
979
979
  if col in object_columns:
980
980
  data[col] = [None] * expected_length
@@ -1857,6 +1857,26 @@ def _load_study5(self, filename=None):
1857
1857
  self.logger,
1858
1858
  object_columns,
1859
1859
  )
1860
+
1861
+ # Sanity check: replace any missing rt_original with rt values
1862
+ if self.features_df is not None and not self.features_df.is_empty():
1863
+ if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
1864
+ null_rt_original_count = self.features_df.filter(pl.col("rt_original").is_null()).height
1865
+ if null_rt_original_count > 0:
1866
+ self.logger.info(f"Replacing {null_rt_original_count} missing rt_original values with rt")
1867
+ self.features_df = self.features_df.with_columns(
1868
+ pl.when(pl.col("rt_original").is_null())
1869
+ .then(pl.col("rt"))
1870
+ .otherwise(pl.col("rt_original"))
1871
+ .alias("rt_original")
1872
+ )
1873
+ else:
1874
+ self.logger.debug("All rt_original values are present")
1875
+ else:
1876
+ if "rt_original" not in self.features_df.columns:
1877
+ self.logger.debug("rt_original column not found in features_df")
1878
+ if "rt" not in self.features_df.columns:
1879
+ self.logger.debug("rt column not found in features_df")
1860
1880
  else:
1861
1881
  self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
1862
1882
  pbar.update(1)
@@ -2008,12 +2028,12 @@ def _load_study5(self, filename=None):
2008
2028
  )
2009
2029
 
2010
2030
  # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
2011
- self._sanitize_null_ids()
2031
+ _sanitize_nulls(self)
2012
2032
 
2013
2033
  self.logger.debug("Study loaded")
2034
+
2014
2035
 
2015
-
2016
- def _load_ms1(self, sample_path: str) -> pl.DataFrame:
2036
+ def _load_ms1(self, filename: str) -> pl.DataFrame:
2017
2037
  """
2018
2038
  Optimized method to load only MS1 data from a sample5 file for isotope detection.
2019
2039
 
@@ -2030,48 +2050,146 @@ def _load_ms1(self, sample_path: str) -> pl.DataFrame:
2030
2050
  Note:
2031
2051
  Used by find_iso() for efficient isotope pattern detection without full sample loading
2032
2052
  """
2033
- try:
2034
- with h5py.File(sample_path, "r") as f:
2035
- # Check if ms1 group exists
2036
- if "ms1" not in f:
2037
- self.logger.debug(f"No MS1 data found in {sample_path}")
2038
- return pl.DataFrame()
2039
-
2040
- ms1_group = f["ms1"]
2053
+ #try:
2054
+ # add .sample5 extension if not provided
2055
+ if not filename.endswith(".sample5"):
2056
+ filename += ".sample5"
2057
+ with h5py.File(filename, "r") as f:
2058
+ # Check if ms1 group exists
2059
+ if "ms1" not in f:
2060
+ self.logger.debug(f"No MS1 data found in {filename}")
2061
+ return pl.DataFrame()
2062
+
2063
+ ms1_group = f["ms1"]
2064
+
2065
+ # Load MS1 data efficiently
2066
+ ms1_data = {}
2067
+ for col in ms1_group.keys():
2068
+ ms1_data[col] = ms1_group[col][:]
2069
+
2070
+ if not ms1_data:
2071
+ self.logger.debug(f"Empty MS1 data in {filename}")
2072
+ return pl.DataFrame()
2041
2073
 
2042
- # Load MS1 data efficiently
2043
- ms1_data = {}
2044
- for col in ms1_group.keys():
2045
- ms1_data[col] = ms1_group[col][:]
2074
+ # Create DataFrame with proper schema
2075
+ ms1_df = pl.DataFrame(ms1_data)
2076
+
2077
+ # Apply expected schema for MS1 data
2078
+ expected_schema = {
2079
+ "cycle": pl.Int64,
2080
+ "scan_uid": pl.Int64,
2081
+ "rt": pl.Float64,
2082
+ "mz": pl.Float64,
2083
+ "inty": pl.Float64
2084
+ }
2085
+
2086
+ # Cast columns to expected types if they exist
2087
+ cast_expressions = []
2088
+ for col, dtype in expected_schema.items():
2089
+ if col in ms1_df.columns:
2090
+ cast_expressions.append(pl.col(col).cast(dtype))
2091
+
2092
+ if cast_expressions:
2093
+ ms1_df = ms1_df.with_columns(cast_expressions)
2094
+
2095
+ self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
2096
+ return ms1_df
2046
2097
 
2047
- if not ms1_data:
2048
- self.logger.debug(f"Empty MS1 data in {sample_path}")
2049
- return pl.DataFrame()
2050
-
2051
- # Create DataFrame with proper schema
2052
- ms1_df = pl.DataFrame(ms1_data)
2098
+ #except Exception as e:
2099
+ # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
2100
+ # return pl.DataFrame()
2101
+
2102
+
2103
+ def _sanitize_nulls(self):
2104
+ """
2105
+ Sanitize null feature_id and consensus_id values by replacing them with new integer IDs.
2106
+ For feature_id: generates large sequential integers that can be converted by merge/align functions.
2107
+ For consensus_id: uses 16-character UUID strings (as expected by merge function).
2108
+ """
2109
+ import uuid
2110
+ import polars as pl
2111
+ import time
2112
+
2113
+ # Sanitize features_df feature_id column
2114
+ if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
2115
+ # Check for null feature_ids
2116
+ null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
2117
+ if null_feature_ids > 0:
2118
+ self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
2053
2119
 
2054
- # Apply expected schema for MS1 data
2055
- expected_schema = {
2056
- "cycle": pl.Int64,
2057
- "scan_uid": pl.Int64,
2058
- "rt": pl.Float64,
2059
- "mz": pl.Float64,
2060
- "inty": pl.Float64
2061
- }
2120
+ # Find the maximum existing feature_id (convert strings to int if possible)
2121
+ max_existing_id = 0
2122
+ existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
2123
+ for fid in existing_ids:
2124
+ try:
2125
+ int_id = int(fid)
2126
+ max_existing_id = max(max_existing_id, int_id)
2127
+ except (ValueError, TypeError):
2128
+ # Skip non-integer IDs
2129
+ pass
2062
2130
 
2063
- # Cast columns to expected types if they exist
2064
- cast_expressions = []
2065
- for col, dtype in expected_schema.items():
2066
- if col in ms1_df.columns:
2067
- cast_expressions.append(pl.col(col).cast(dtype))
2131
+ # Generate new sequential integer IDs starting from max + timestamp offset
2132
+ # Use timestamp to ensure uniqueness across different sanitization runs
2133
+ base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
2134
+ new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
2135
+ uid_index = 0
2068
2136
 
2069
- if cast_expressions:
2070
- ms1_df = ms1_df.with_columns(cast_expressions)
2137
+ # Create a list to store all feature_ids
2138
+ feature_ids = []
2139
+ for feature_id in self.features_df["feature_id"].to_list():
2140
+ if feature_id is None:
2141
+ feature_ids.append(new_int_ids[uid_index])
2142
+ uid_index += 1
2143
+ else:
2144
+ feature_ids.append(feature_id)
2071
2145
 
2072
- self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
2073
- return ms1_df
2146
+ # Update the DataFrame with sanitized feature_ids
2147
+ self.features_df = self.features_df.with_columns(
2148
+ pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
2149
+ )
2074
2150
 
2075
- except Exception as e:
2076
- self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
2077
- return pl.DataFrame()
2151
+ self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
2152
+
2153
+ # Sanitize consensus_df consensus_id column
2154
+ if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
2155
+ if "consensus_id" in self.consensus_df.columns:
2156
+ null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
2157
+ if null_consensus_ids > 0:
2158
+ self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
2159
+
2160
+ # Generate new UIDs for null values using the same method as merge()
2161
+ new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
2162
+ uid_index = 0
2163
+
2164
+ # Create a list to store all consensus_ids
2165
+ consensus_ids = []
2166
+ for consensus_id in self.consensus_df["consensus_id"].to_list():
2167
+ if consensus_id is None:
2168
+ consensus_ids.append(new_uids[uid_index])
2169
+ uid_index += 1
2170
+ else:
2171
+ consensus_ids.append(consensus_id)
2172
+
2173
+ # Update the DataFrame with sanitized consensus_ids
2174
+ self.consensus_df = self.consensus_df.with_columns(
2175
+ pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
2176
+ )
2177
+
2178
+ self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
2179
+
2180
+ # Sanitize rt_original in features_df by replacing null or NaN values with rt values
2181
+ if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
2182
+ if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
2183
+ # Check for null or NaN values in rt_original
2184
+ null_or_nan_rt_original = self.features_df.filter(
2185
+ pl.col("rt_original").is_null() | pl.col("rt_original").is_nan()
2186
+ ).shape[0]
2187
+ if null_or_nan_rt_original > 0:
2188
+ self.logger.debug(f"Sanitizing {null_or_nan_rt_original} null or NaN rt_original values with rt values")
2189
+ self.features_df = self.features_df.with_columns(
2190
+ pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
2191
+ .then(pl.col("rt"))
2192
+ .otherwise(pl.col("rt_original"))
2193
+ .alias("rt_original")
2194
+ )
2195
+ self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")