masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
             else:
                 data_as_str.append("None")
         group.create_dataset(col, data=data_as_str, compression=compression)
+    elif col == "iso":
+        # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+    elif col == "ms1_spec":
+        # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
     else:
         logger.warning(
             f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                     },
                 )
                 reconstructed_data.append(converted_adducts)
+            elif col_name == "iso":
+                # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    iso_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(iso_data) if iso_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
+            elif col_name == "ms1_spec":
+                # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    ms1_spec_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
             else:
                 # Unknown object column
                 reconstructed_data.append(None)
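This hunk is the read-side inverse of the serialization above: the stored JSON string is parsed and rebuilt as a numpy array, and anything unparseable (including the "None" sentinel, which is not valid JSON) collapses to None. A hedged round-trip sketch (json_to_iso is an illustrative name, not a masster function):

import json
import numpy as np

def json_to_iso(s):
    # JSON string -> numpy array; None for the "None" sentinel or bad data
    try:
        data = json.loads(s)
        return np.array(data) if data else None
    except (json.JSONDecodeError, ValueError):
        return None

restored = json_to_iso('[[200.05, 100000.0], [201.05, 32000.0]]')
print(restored.shape)      # (2, 2): rows of [mz, intensity]
print(json_to_iso("None")) # None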
@@ -974,7 +1044,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.info(f"Column '{col}' not found in {df_name}.")
+        logger.debug(f"Column '{col}' not found in {df_name}.")
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
@@ -1857,6 +1927,26 @@ def _load_study5(self, filename=None):
                     self.logger,
                     object_columns,
                 )
+
+                # Sanity check: replace any missing rt_original with rt values
+                if self.features_df is not None and not self.features_df.is_empty():
+                    if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                        null_rt_original_count = self.features_df.filter(pl.col("rt_original").is_null()).height
+                        if null_rt_original_count > 0:
+                            self.logger.info(f"Replacing {null_rt_original_count} missing rt_original values with rt")
+                            self.features_df = self.features_df.with_columns(
+                                pl.when(pl.col("rt_original").is_null())
+                                .then(pl.col("rt"))
+                                .otherwise(pl.col("rt_original"))
+                                .alias("rt_original")
+                            )
+                        else:
+                            self.logger.debug("All rt_original values are present")
+                    else:
+                        if "rt_original" not in self.features_df.columns:
+                            self.logger.debug("rt_original column not found in features_df")
+                        if "rt" not in self.features_df.columns:
+                            self.logger.debug("rt column not found in features_df")
             else:
                 self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
             pbar.update(1)
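The backfill added here is the standard polars conditional-replace idiom. A self-contained sketch of the same pattern on toy data (column names match the diff; the DataFrame content is invented for illustration):

import polars as pl

features_df = pl.DataFrame({
    "rt": [10.2, 11.5, 12.9],
    "rt_original": [10.0, None, 12.7],
})

# where rt_original is null, fall back to rt; otherwise keep the original value
features_df = features_df.with_columns(
    pl.when(pl.col("rt_original").is_null())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
print(features_df["rt_original"].to_list())  # [10.0, 11.5, 12.7]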
@@ -2008,12 +2098,12 @@ def _load_study5(self, filename=None):
         )
 
         # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
-        self._sanitize_null_ids()
+        _sanitize_nulls(self)
 
         self.logger.debug("Study loaded")
+
 
-
-    def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+    def _load_ms1(self, filename: str) -> pl.DataFrame:
         """
         Optimized method to load only MS1 data from a sample5 file for isotope detection.
 
@@ -2030,48 +2120,146 @@ def _load_ms1(self, sample_path: str) -> pl.DataFrame:
         Note:
             Used by find_iso() for efficient isotope pattern detection without full sample loading
         """
-        try:
-            with h5py.File(sample_path, "r") as f:
-                # Check if ms1 group exists
-                if "ms1" not in f:
-                    self.logger.debug(f"No MS1 data found in {sample_path}")
-                    return pl.DataFrame()
-
-                ms1_group = f["ms1"]
+        #try:
+        # add .sample5 extension if not provided
+        if not filename.endswith(".sample5"):
+            filename += ".sample5"
+        with h5py.File(filename, "r") as f:
+            # Check if ms1 group exists
+            if "ms1" not in f:
+                self.logger.debug(f"No MS1 data found in {filename}")
+                return pl.DataFrame()
+
+            ms1_group = f["ms1"]
+
+            # Load MS1 data efficiently
+            ms1_data = {}
+            for col in ms1_group.keys():
+                ms1_data[col] = ms1_group[col][:]
+
+            if not ms1_data:
+                self.logger.debug(f"Empty MS1 data in {filename}")
+                return pl.DataFrame()
 
-                # Load MS1 data efficiently
-                ms1_data = {}
-                for col in ms1_group.keys():
-                    ms1_data[col] = ms1_group[col][:]
+            # Create DataFrame with proper schema
+            ms1_df = pl.DataFrame(ms1_data)
+
+            # Apply expected schema for MS1 data
+            expected_schema = {
+                "cycle": pl.Int64,
+                "scan_uid": pl.Int64,
+                "rt": pl.Float64,
+                "mz": pl.Float64,
+                "inty": pl.Float64
+            }
+
+            # Cast columns to expected types if they exist
+            cast_expressions = []
+            for col, dtype in expected_schema.items():
+                if col in ms1_df.columns:
+                    cast_expressions.append(pl.col(col).cast(dtype))
+
+            if cast_expressions:
+                ms1_df = ms1_df.with_columns(cast_expressions)
+
+            self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
+            return ms1_df
 
-                if not ms1_data:
-                    self.logger.debug(f"Empty MS1 data in {sample_path}")
-                    return pl.DataFrame()
-
-                # Create DataFrame with proper schema
-                ms1_df = pl.DataFrame(ms1_data)
+        #except Exception as e:
+        #    self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+        #    return pl.DataFrame()
+
+
+    def _sanitize_nulls(self):
+        """
+        Sanitize null feature_id and consensus_id values by replacing them with new integer IDs.
+        For feature_id: generates large sequential integers that can be converted by merge/align functions.
+        For consensus_id: uses 16-character UUID strings (as expected by merge function).
+        """
+        import uuid
+        import polars as pl
+        import time
+
+        # Sanitize features_df feature_id column
+        if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+            # Check for null feature_ids
+            null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
+            if null_feature_ids > 0:
+                self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
 
-                # Apply expected schema for MS1 data
-                expected_schema = {
-                    "cycle": pl.Int64,
-                    "scan_uid": pl.Int64,
-                    "rt": pl.Float64,
-                    "mz": pl.Float64,
-                    "inty": pl.Float64
-                }
+                # Find the maximum existing feature_id (convert strings to int if possible)
+                max_existing_id = 0
+                existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
+                for fid in existing_ids:
+                    try:
+                        int_id = int(fid)
+                        max_existing_id = max(max_existing_id, int_id)
+                    except (ValueError, TypeError):
+                        # Skip non-integer IDs
+                        pass
 
-                # Cast columns to expected types if they exist
-                cast_expressions = []
-                for col, dtype in expected_schema.items():
-                    if col in ms1_df.columns:
-                        cast_expressions.append(pl.col(col).cast(dtype))
+                # Generate new sequential integer IDs starting from max + timestamp offset
+                # Use timestamp to ensure uniqueness across different sanitization runs
+                base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
+                new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
+                uid_index = 0
 
-                if cast_expressions:
-                    ms1_df = ms1_df.with_columns(cast_expressions)
+                # Create a list to store all feature_ids
+                feature_ids = []
+                for feature_id in self.features_df["feature_id"].to_list():
+                    if feature_id is None:
+                        feature_ids.append(new_int_ids[uid_index])
+                        uid_index += 1
+                    else:
+                        feature_ids.append(feature_id)
 
-                self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
-                return ms1_df
+                # Update the DataFrame with sanitized feature_ids
+                self.features_df = self.features_df.with_columns(
+                    pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
+                )
 
-        except Exception as e:
-            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
-            return pl.DataFrame()
+                self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+
+        # Sanitize consensus_df consensus_id column
+        if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+            if "consensus_id" in self.consensus_df.columns:
+                null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
+                if null_consensus_ids > 0:
+                    self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
+
+                    # Generate new UIDs for null values using the same method as merge()
+                    new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+                    uid_index = 0
+
+                    # Create a list to store all consensus_ids
+                    consensus_ids = []
+                    for consensus_id in self.consensus_df["consensus_id"].to_list():
+                        if consensus_id is None:
+                            consensus_ids.append(new_uids[uid_index])
+                            uid_index += 1
+                        else:
+                            consensus_ids.append(consensus_id)
+
+                    # Update the DataFrame with sanitized consensus_ids
+                    self.consensus_df = self.consensus_df.with_columns(
+                        pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
+                    )
+
+                    self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
+        # Sanitize rt_original in features_df by replacing null or NaN values with rt values
+        if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+            if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                # Check for null or NaN values in rt_original
+                null_or_nan_rt_original = self.features_df.filter(
+                    pl.col("rt_original").is_null() | pl.col("rt_original").is_nan()
+                ).shape[0]
+                if null_or_nan_rt_original > 0:
+                    self.logger.debug(f"Sanitizing {null_or_nan_rt_original} null or NaN rt_original values with rt values")
+                    self.features_df = self.features_df.with_columns(
+                        pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
+                        .then(pl.col("rt"))
+                        .otherwise(pl.col("rt_original"))
+                        .alias("rt_original")
+                    )
+                    self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")