masster 0.5.9-py3-none-any.whl → 0.5.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

@@ -1264,7 +1264,7 @@ def find_ms2(self, **kwargs):
 
     # Log completion
     self.logger.success(
-        f"MS2 linking completed. Total features with MS2 data: {c}",
+        f"MS2 linking completed. Features with MS2 data: {c}.",
    )
    self.features_df = features_df
 
masster/sample/sample.py CHANGED
@@ -1,35 +1,98 @@
 """
-sample.py
+sample.py - Mass Spectrometry Sample Analysis Module
 
-This module provides tools for processing and analyzing Data-Dependent Acquisition (DDA) mass spectrometry data.
-It defines the `Sample` class, which offers methods to load, process, analyze, and visualize mass spectrometry data
-from various file formats, including mzML, Thermo RAW, and Sciex WIFF formats.
+This module provides comprehensive tools for processing and analyzing Data-Dependent Acquisition (DDA)
+mass spectrometry data. It defines the `Sample` class, which offers methods to load, process, analyze,
+and visualize mass spectrometry data from various file formats.
+
+Supported File Formats:
+- mzML (standard XML format for mass spectrometry data)
+- Thermo RAW (native Thermo Fisher Scientific format)
+- Sciex WIFF (native Sciex format)
+- Sample5 (MASSter's native HDF5-based format for optimized storage)
 
 Key Features:
-- **File Handling**: Load and save data in multiple formats.
-- **Feature Detection**: Detect and process mass spectrometry features.
-- **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra.
-- **Visualization**: Generate interactive and static plots for spectra and chromatograms.
-- **Statistics**: Compute and export detailed DDA run statistics.
-
-Dependencies:
-- `pyopenms`: For file handling and feature detection.
-- `polars` and `pandas`: For data manipulation.
-- `numpy`: For numerical computations.
-- `bokeh`, `panel`, `holoviews`, `datashader`: For interactive visualizations.
+- **File Handling**: Load and save data in multiple formats with automatic format detection
+- **Feature Detection**: Detect and process mass spectrometry features using advanced algorithms
+- **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra with comprehensive metadata
+- **Adduct Detection**: Find and annotate adducts and in-source fragments
+- **Isotope Analysis**: Detect and process isotopic patterns
+- **Chromatogram Extraction**: Extract and analyze chromatograms (EIC, BPC, TIC)
+- **Visualization**: Generate interactive and static plots for spectra, chromatograms, and 2D maps
+- **Statistics**: Compute and export detailed DDA run statistics and quality metrics
+- **Data Export**: Export processed data to various formats (XLSX, MGF, etc.)
+- **Memory Management**: Efficient handling of large datasets with on-disk storage options
+
+Core Dependencies:
+- `pyopenms`: OpenMS library for file handling and feature detection algorithms
+- `polars`: High-performance data manipulation and analysis
+- `numpy`: Numerical computations and array operations
+- `bokeh`, `panel`, `holoviews`, `datashader`: Interactive visualizations and dashboards
+- `h5py`: HDF5 file format support for Sample5 files
 
 Classes:
-- `Sample`: Main class for handling DDA data, providing methods for data import, processing, and visualization.
-
-Example Usage:
-```python
-from masster.sample import Sample
+    Sample: Main class for handling DDA mass spectrometry data, providing methods for
+        data import, processing, analysis, and visualization.
 
-sample = Sample(file="example.mzML")
-sample.find_features()
-sample.plot_2d()
-```
+Typical Workflow:
+    1. Load mass spectrometry data file
+    2. Detect features using find_features()
+    3. Optionally find MS2 spectra with find_ms2()
+    4. Analyze and visualize results
+    5. Export processed data
 
+Example Usage:
+    Basic analysis workflow:
+
+    ```python
+    from masster.sample import Sample
+
+    # Load a mass spectrometry file
+    sample = Sample(filename="experiment.mzML")
+
+    # Detect features
+    sample.find_features()
+
+    # Find MS2 spectra for features
+    sample.find_ms2()
+
+    # Generate 2D visualization
+    sample.plot_2d()
+
+    # Export results
+    sample.export_features("features.xlsx")
+    ```
+
+    Advanced usage with custom parameters:
+
+    ```python
+    from masster.sample import Sample
+    from masster.sample.defaults import sample_defaults, find_features_defaults
+
+    # Create custom parameters
+    params = sample_defaults(log_level="DEBUG", label="My Experiment")
+    ff_params = find_features_defaults(noise_threshold_int=1000)
+
+    # Initialize with custom parameters
+    sample = Sample(params=params)
+    sample.load("data.raw")
+
+    # Feature detection with custom parameters
+    sample.find_features(params=ff_params)
+
+    # Generate comprehensive statistics
+    stats = sample.get_dda_stats()
+    sample.plot_dda_stats()
+    ```
+
+Notes:
+    - The Sample class maintains processing history and parameters for reproducibility
+    - Large files can be processed with on-disk storage to manage memory usage
+    - All visualizations are interactive by default and can be exported as static images
+    - The module supports both individual sample analysis and batch processing workflows
+
+Version: Part of the MASSter mass spectrometry analysis framework
+Author: Zamboni Lab, ETH Zurich
 """
 
 import importlib
@@ -49,16 +112,12 @@ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 
 # Sample-specific imports - keeping these private, only for internal use
 from masster.sample.h5 import _load_sample5
-# from masster.sample.h5 import _load_sample5_study
 from masster.sample.h5 import _save_sample5
-# from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
 from masster.sample.helpers import _get_feature_uids
-# from masster.sample.helpers import _features_sync - made internal only
 from masster.sample.adducts import find_adducts
 from masster.sample.adducts import _get_adducts
-# Removed _get_adducts - only used in study modules
 from masster.sample.helpers import features_delete
 from masster.sample.helpers import features_filter
 from masster.sample.helpers import features_select
@@ -70,23 +129,17 @@ from masster.sample.helpers import get_eic
 from masster.sample.helpers import set_source
 from masster.sample.helpers import _recreate_feature_map
 from masster.sample.helpers import _get_feature_map
-# Load functions - keeping only specific ones needed for external API
-# from masster.sample.load import _load_featureXML - made internal only
-# from masster.sample.load import _load_ms2data - made internal only
-# from masster.sample.load import _load_mzML - made internal only
-# from masster.sample.load import _load_raw - made internal only
-# from masster.sample.load import _load_wiff - made internal only
 from masster.sample.load import chrom_extract
 from masster.sample.load import _index_file
 from masster.sample.load import load
 from masster.sample.load import load_noms1
-from masster.sample.load import _load_ms1  # Renamed from load_study
+from masster.sample.load import _load_ms1
 from masster.sample.load import sanitize
 from masster.sample.plot import plot_2d
 from masster.sample.plot import plot_2d_oracle
 from masster.sample.plot import plot_dda_stats
 from masster.sample.plot import plot_chrom
-from masster.sample.plot import plot_features_stats  # Renamed from plot_feature_stats
+from masster.sample.plot import plot_features_stats
 from masster.sample.plot import plot_ms2_cycle
 from masster.sample.plot import plot_ms2_eic
 from masster.sample.plot import plot_ms2_q1
@@ -113,7 +166,6 @@ from masster.sample.save import export_features
 from masster.sample.save import export_mgf
 from masster.sample.save import export_xlsx
 from masster.sample.save import save
-# Removed internal-only import: _save_featureXML
 
 
 class Sample:
@@ -402,6 +454,7 @@ class Sample:
             f"{base_modname}.chromatogram",
             f"{base_modname}.spectrum",
             f"{base_modname}.logger",
+            f"{base_modname}.lib",
         ]
 
         # Add study submodules
@@ -414,17 +467,9 @@ class Sample:
             ):
                 study_modules.append(module_name)
 
-        """ # Add parameters submodules
-        parameters_modules = []
-        parameters_module_prefix = f"{base_modname}.parameters."
-        for module_name in sys.modules:
-            if module_name.startswith(parameters_module_prefix) and module_name != current_module:
-                parameters_modules.append(module_name)
-        """
-
         all_modules_to_reload = (
             core_modules + sample_modules + study_modules
-        )  # + parameters_modules
+        )
 
         # Reload all discovered modules
         for full_module_name in all_modules_to_reload:
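For context, the reload logic this hunk tidies up is the standard importlib hot-reload pattern: collect module names, then re-execute whichever ones are already in `sys.modules`. A minimal standalone sketch (module names are illustrative, not masster's exact code):

```python
import importlib
import sys

base_modname = "masster"

# Core submodules to refresh; ".lib" is the entry this release adds.
core_modules = [
    f"{base_modname}.chromatogram",
    f"{base_modname}.spectrum",
    f"{base_modname}.logger",
    f"{base_modname}.lib",
]

# Discover already-imported study submodules dynamically.
study_modules = [
    name for name in sys.modules if name.startswith(f"{base_modname}.study.")
]

for full_module_name in core_modules + study_modules:
    module = sys.modules.get(full_module_name)
    if module is not None:
        # Re-executes the module's source in place; existing references
        # keep pointing at the same (now refreshed) module object.
        importlib.reload(module)
```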
@@ -466,8 +511,6 @@ class Sample:
         else:
             str += "Features: 0\n"
             str += "Features with MS2 spectra: 0\n"
-
-        # estimate memory usage
         mem_usage = self._estimate_memory_usage()
         str += f"Estimated memory usage: {mem_usage:.2f} MB\n"
 
masster/study/export.py CHANGED
@@ -496,7 +496,7 @@ def export_mgf(self, **kwargs):
                 # Write END IONS
                 f.write("END IONS\n\n")
 
-    self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
+    self.logger.success(f"Exported {len(mgf_data)} spectra to {filename}")
 
 
 def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs) -> None:
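The `info` → `success` switches in this file (and below) suggest a loguru-style logger: loguru defines a SUCCESS level (severity 25) between INFO (20) and WARNING (30), so completion messages can be filtered and colored apart from routine progress. A minimal sketch with plain loguru, assuming masster's logger wrapper behaves the same way:

```python
from loguru import logger

logger.info("Writing MGF blocks...")     # routine progress, INFO (20)
logger.success("Exported 1234 spectra")  # completed milestone, SUCCESS (25)
```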
@@ -1183,7 +1183,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
             for line in mgf_lines:
                 f.write(line + "\n")
 
-    self.logger.info(f"Exported mzTab-M to {filename}")
+    self.logger.success(f"Exported mzTab-M to {filename}")
 
 
 def export_xlsx(self, filename: str | None = None) -> None:
@@ -1311,7 +1311,7 @@ def export_xlsx(self, filename: str | None = None) -> None:
                     f"Written worksheet '{sheet_name}' with shape {data.shape}",
                 )
 
-        self.logger.info(f"Study exported to {filename}")
+        self.logger.success(f"Study exported to {filename}")
 
     except Exception as e:
         self.logger.error(f"Error writing Excel file: {e}")
@@ -1424,8 +1424,6 @@ def export_parquet(self, filename: str | None = None) -> None:
 
     # Report results
     if exported_files:
-        self.logger.info(f"Study exported to {len(exported_files)} Parquet files:")
-        for file_path in exported_files:
-            self.logger.info(f"  - {file_path}")
+        self.logger.success(f"Study exported to {len(exported_files)} Parquet files.")
     else:
         self.logger.error("No Parquet files were created - no data available to export")
masster/study/h5.py CHANGED
@@ -818,9 +818,35 @@ def _reorder_columns_by_schema(
 
 
 def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
     """Create DataFrame handling Object columns properly."""
+    # First check all data for numpy object arrays and move them to object columns
+    additional_object_cols = []
+    for k, v in data.items():
+        if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+            # This is a numpy object array that should be treated as object
+            additional_object_cols.append(k)
+            object_columns.append(k)
+
+    if additional_object_cols:
+        # Re-run reconstruction for these columns
+        for col in additional_object_cols:
+            data[col] = _reconstruct_object_column(data[col], col)
+
     object_data = {k: v for k, v in data.items() if k in object_columns}
     regular_data = {k: v for k, v in data.items() if k not in object_columns}
 
+    # Final check: ensure no numpy object arrays in regular_data
+    problematic_cols = []
+    for k, v in regular_data.items():
+        if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+            problematic_cols.append(k)
+
+    if problematic_cols:
+        # Move these to object_data
+        for col in problematic_cols:
+            object_data[col] = _reconstruct_object_column(regular_data[col], col)
+            del regular_data[col]
+            object_columns.append(col)
+
     # Determine expected length from regular data or first object column
     expected_length = None
     if regular_data:
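The dtype check added here guards against ragged per-feature payloads (spectra, chromatograms) that come back from HDF5 as numpy object arrays, which polars cannot ingest as regular columns. A standalone sketch of the failure mode and the routing, with a plain list-wrap standing in for `_reconstruct_object_column`:

```python
import numpy as np
import polars as pl

data = {
    "mz": np.array([100.05, 200.10]),                      # regular float column
    "chrom": np.array([[1.0, 2.0], [3.0]], dtype=object),  # ragged -> object dtype
}

# Same test as the hunk: numpy object arrays get routed to object columns.
object_columns = [
    k for k, v in data.items() if hasattr(v, "dtype") and str(v.dtype) == "object"
]
assert object_columns == ["chrom"]

# Object columns must be built explicitly as pl.Object series; handing the
# raw object array to pl.DataFrame would raise or mis-infer the type.
df = pl.DataFrame({
    "mz": data["mz"],
    "chrom": pl.Series("chrom", list(data["chrom"]), dtype=pl.Object),
})
```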
@@ -1103,11 +1129,18 @@ def _load_dataframe_from_group(
                 logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
                 column_data = group[col][:]
 
-                # Try to determine if this should be treated as an object column
-                # by checking if the data looks like JSON strings
-                if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                # Check if this is a known object column by name
+                known_object_columns = {"ms1_spec", "chrom", "ms2_scans", "ms2_specs", "spec", "adducts", "iso"}
+                is_known_object = col in known_object_columns
+
+                if is_known_object:
+                    # Known object column, always reconstruct
+                    data[col] = _reconstruct_object_column(column_data, col)
+                    if col not in object_columns:
+                        object_columns.append(col)
+                elif len(column_data) > 0 and isinstance(column_data[0], bytes):
                     try:
-                        # Check if it looks like JSON
+                        # Check if it looks like JSON for unknown columns
                         test_decode = column_data[0].decode("utf-8")
                         if test_decode.startswith("[") or test_decode.startswith("{"):
                             # Looks like JSON, treat as object column
@@ -1165,9 +1198,29 @@ def _load_dataframe_from_group(
             logger.debug(
                 f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
             )
+
+        # Debug: check for problematic data types in all columns before DataFrame creation
+        for col, values in data.items():
+            if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+                logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
+                if col not in object_columns:
+                    object_columns.append(col)
+
         df = _create_dataframe_with_objects(data, object_columns)
     else:
-        df = pl.DataFrame(data)
+        # Debug: check for problematic data types when no object columns are expected
+        for col, values in data.items():
+            if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+                logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
+                # Treat as object column
+                if object_columns is None:
+                    object_columns = []
+                object_columns.append(col)
+
+        if object_columns:
+            df = _create_dataframe_with_objects(data, object_columns)
+        else:
+            df = pl.DataFrame(data)
 
     # Clean null values and apply schema
     df = _clean_string_nulls(df)
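For unknown columns the loader still falls back to the JSON sniff shown above. Roughly, the heuristic in isolation, with `json.loads` standing in for `_reconstruct_object_column`:

```python
import json

def looks_like_json(column_data) -> bool:
    # Mirrors the hunk's test: first cell is bytes and decodes to text
    # starting with '[' or '{'.
    if len(column_data) == 0 or not isinstance(column_data[0], bytes):
        return False
    try:
        text = column_data[0].decode("utf-8")
    except UnicodeDecodeError:
        return False
    return text.startswith("[") or text.startswith("{")

cells = [b'[{"mz": 100.05, "inty": 120000.0}]', b"[]"]
if looks_like_json(cells):
    decoded = [json.loads(c) for c in cells]
```

Checking only the first cell keeps the sniff cheap, at the cost of misclassifying columns whose first row is atypical; the named allowlist added in this release avoids that for the columns masster knows about.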
@@ -1738,9 +1791,7 @@ def _save_study5(self, filename):
                 )
                 pbar.update(1)
 
-    self.logger.success(f"Study saved successfully to {filename}")
-    self.logger.debug(f"Save completed for {filename}")
-    self.logger.debug(f"Save completed for {filename}")
+    self.logger.success(f"Study saved to {filename}")
 
 
 def _load_study5(self, filename=None):
@@ -1859,7 +1910,7 @@ def _load_study5(self, filename=None):
                 )
             else:
                 self.logger.debug(
-                    "Successfully updated parameters from loaded history",
+                    "Updated parameters from loaded history",
                 )
         else:
             self.logger.debug(
@@ -2093,8 +2144,8 @@ def _load_study5(self, filename=None):
         # Ensure the column is Int64 type
         self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
 
-        self.logger.info(
-            f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
+        self.logger.debug(
+            f"Sanitized {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
         )
 
         # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
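The migration being logged here maps each sample onto a sequential Int64 map_id. A hypothetical sketch of that reindexing in polars (not masster's exact code; `with_row_index` is the recent-polars name for row numbering):

```python
import polars as pl

samples_df = pl.DataFrame({"sample_name": ["s1", "s2", "s3"]})

# Assign 0..n-1 as map_id and store it as Int64, matching the schema cast
# shown in the hunk above.
samples_df = (
    samples_df
    .with_row_index("map_id")   # UInt32 by default
    .cast({"map_id": pl.Int64})
)
```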
@@ -2218,7 +2269,7 @@ def _sanitize_nulls(self):
             pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
         )
 
-        self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+        self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
 
     # Sanitize consensus_df consensus_id column
     if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
@@ -2244,8 +2295,8 @@ def _sanitize_nulls(self):
         self.consensus_df = self.consensus_df.with_columns(
             pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
         )
-
-        self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
+        self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")
 
     # Sanitize rt_original in features_df by replacing null or NaN values with rt values
     if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
@@ -2262,4 +2313,4 @@ def _sanitize_nulls(self):
             .otherwise(pl.col("rt_original"))
             .alias("rt_original")
         )
-        self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")
+        self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
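The surrounding context visible in this hunk is a standard polars when/then/otherwise backfill. Made self-contained, it looks like this:

```python
import polars as pl

features_df = pl.DataFrame({
    "rt": [10.0, 20.0, 30.0],
    "rt_original": [None, float("nan"), 29.5],
})

# Replace null or NaN rt_original values with the aligned rt value.
features_df = features_df.with_columns(
    pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
# rt_original is now [10.0, 20.0, 29.5]
```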
masster/study/helpers.py CHANGED
@@ -1630,7 +1630,7 @@ def restore_features(self, samples=None, maps=False):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
 
-    self.logger.info(
+    self.logger.success(
         f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
     )
 
@@ -2663,7 +2663,7 @@ def features_filter(
     removed_count = initial_count - final_count
 
     self.logger.info(
-        f"Filtered features: kept {final_count:,}, removed {removed_count:,}"
+        f"Filtered features. Kept: {final_count:,}. Removed: {removed_count:,}."
     )
 
 
@@ -2940,6 +2940,7 @@ def features_delete(self, features):
 
 def consensus_select(
     self,
+    uid=None,
     mz=None,
     rt=None,
     inty_mean=None,
@@ -2956,14 +2957,12 @@ def consensus_select(
     rt_delta_mean=None,
     id_top_score=None,
     identified=None,
-    # New adduct filter parameters
     adduct_top=None,
     adduct_charge_top=None,
     adduct_mass_neutral_top=None,
     adduct_mass_shift_top=None,
     adduct_group=None,
     adduct_of=None,
-    # New identification filter parameters
     id_top_name=None,
     id_top_class=None,
     id_top_adduct=None,
@@ -2976,6 +2975,11 @@ def consensus_select(
     OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.
 
     Parameters:
+        uid: consensus UID filter with flexible formats:
+            - None: include all consensus features (default)
+            - int: single specific consensus_uid
+            - tuple: range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            - list: specific list of consensus_uid values
         mz: m/z filter with flexible formats:
             - float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
             - tuple (mz_min, mz_max): range where mz_max > mz_min
@@ -3023,7 +3027,7 @@ def consensus_select(
         return pl.DataFrame()
 
     # Early return optimization - check if any filters are provided
-    filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
+    filter_params = [uid, mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
                      number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
                      chrom_prominence_scaled_mean, chrom_height_scaled_mean,
                      rt_delta_mean, id_top_score, identified,
@@ -3044,6 +3048,21 @@ def consensus_select(
     warnings = []
 
     # Build all filter conditions efficiently
+    # Handle uid parameter first (consensus_uid filter with flexible formats)
+    if uid is not None:
+        if isinstance(uid, int):
+            # Single specific consensus_uid
+            filter_conditions.append(pl.col("consensus_uid") == uid)
+        elif isinstance(uid, tuple) and len(uid) == 2:
+            # Range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            min_uid, max_uid = uid
+            filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+        elif isinstance(uid, list):
+            # Specific list of consensus_uid values
+            filter_conditions.append(pl.col("consensus_uid").is_in(uid))
+        else:
+            self.logger.warning(f"Invalid uid parameter type: {type(uid)}. Expected int, tuple, or list.")
+
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             if mz[1] < mz[0]:
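The new uid branch builds one polars condition per accepted shape. A standalone sketch of the same dispatch against a toy consensus table:

```python
import polars as pl

consensus_df = pl.DataFrame({
    "consensus_uid": [1, 42, 150, 300],
    "mz": [101.0, 202.0, 303.0, 404.0],
})

def uid_condition(uid):
    # Same three accepted shapes as consensus_select's uid parameter.
    if isinstance(uid, int):
        return pl.col("consensus_uid") == uid
    if isinstance(uid, tuple) and len(uid) == 2:
        lo, hi = uid
        return (pl.col("consensus_uid") >= lo) & (pl.col("consensus_uid") <= hi)
    if isinstance(uid, list):
        return pl.col("consensus_uid").is_in(uid)
    raise TypeError(f"Expected int, tuple, or list, got {type(uid)}")

print(consensus_df.filter(uid_condition((100, 300))))  # uids 150 and 300
print(consensus_df.filter(uid_condition([1, 42])))     # uids 1 and 42
```

In the study API this corresponds to calls like `study.consensus_select(uid=(100, 300))`.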
masster/study/load.py CHANGED
@@ -139,7 +139,7 @@ def add(
             f"No files found in {folder}. Please check the folder path or file patterns.",
         )
     else:
-        self.logger.debug(f"Successfully added {counter} samples to the study.")
+        self.logger.debug(f"Added {counter} samples to the study.")
 
     # Return a simple summary to suppress marimo's automatic object display
     return f"Added {counter} samples to study"
@@ -2055,169 +2055,6 @@ def _sanitize(self):
     except Exception as e:
         self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
 
-'''
-def _load_features(self):
-    """
-    Load features by reconstructing FeatureMaps from the processed features_df data.
-
-    This ensures that the loaded FeatureMaps contain the same processed features
-    as stored in features_df, rather than loading raw features from .featureXML files
-    which may not match the processed data after filtering, alignment, etc.
-    """
-    import polars as pl
-    import pyopenms as oms
-    from tqdm import tqdm
-    from datetime import datetime
-
-    self.features_maps = []
-
-    # Check if features_df exists and is not empty
-    if self.features_df is None:
-        self.logger.warning("features_df is None. Falling back to XML loading.")
-        self._load_features_from_xml()
-        return
-
-    if len(self.features_df) == 0:
-        self.logger.warning("features_df is empty. Falling back to XML loading.")
-        self._load_features_from_xml()
-        return
-
-    # If we get here, we should use the new method
-    self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
-    # Process each sample in order
-    for sample_index, row_dict in tqdm(
-        enumerate(self.samples_df.iter_rows(named=True)),
-        total=len(self.samples_df),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
-        disable=tdqm_disable,
-    ):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
-        # Get features for this sample from features_df
-        sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
-        # Create new FeatureMap
-        feature_map = oms.FeatureMap()
-
-        # Convert DataFrame features to OpenMS Features
-        # Keep track of next available feature_id for this sample
-        next_feature_id = 1
-        used_feature_ids = set()
-
-        # First pass: collect existing feature_ids to avoid conflicts
-        for feature_row in sample_features.iter_rows(named=True):
-            if feature_row["feature_id"] is not None:
-                used_feature_ids.add(int(feature_row["feature_id"]))
-
-        # Find the next available feature_id
-        while next_feature_id in used_feature_ids:
-            next_feature_id += 1
-
-        for feature_row in sample_features.iter_rows(named=True):
-            feature = oms.Feature()
-
-            # Set properties from DataFrame (handle missing values gracefully)
-            try:
-                # Skip features with missing critical data
-                if feature_row["mz"] is None:
-                    self.logger.warning("Skipping feature due to missing mz")
-                    continue
-                if feature_row["rt"] is None:
-                    self.logger.warning("Skipping feature due to missing rt")
-                    continue
-                if feature_row["inty"] is None:
-                    self.logger.warning("Skipping feature due to missing inty")
-                    continue
-
-                # Handle missing feature_id by generating a new one
-                if feature_row["feature_id"] is None:
-                    feature_id = next_feature_id
-                    next_feature_id += 1
-                    self.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID")
-                else:
-                    feature_id = int(feature_row["feature_id"])
-
-                feature.setUniqueId(feature_id)
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-
-                # Handle optional fields that might be None
-                if feature_row.get("quality") is not None:
-                    feature.setOverallQuality(float(feature_row["quality"]))
-                if feature_row.get("charge") is not None:
-                    feature.setCharge(int(feature_row["charge"]))
-
-                # Add to feature map
-                feature_map.push_back(feature)
-            except (ValueError, TypeError) as e:
-                self.logger.warning(f"Skipping feature due to conversion error: {e}")
-                continue
-
-        self.features_maps.append(feature_map)
-
-    self.logger.debug(
-        f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
-    )
-'''
-
-'''
-def _load_features_from_xml(self):
-    """
-    Original load_features method that loads from .featureXML files.
-    Used as fallback when features_df is not available.
-    """
-    self.features_maps = []
-    self.logger.debug("Loading features from featureXML files.")
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for _index, row_dict in tqdm(
-        enumerate(self.samples_df.iter_rows(named=True)),
-        total=len(self.samples_df),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Load feature maps from XML",
-        disable=tdqm_disable,
-    ):
-        if self.folder is not None:
-            filename = os.path.join(
-                self.folder,
-                row_dict["sample_name"] + ".featureXML",
-            )
-        else:
-            filename = os.path.join(
-                os.getcwd(),
-                row_dict["sample_name"] + ".featureXML",
-            )
-        # check if file exists
-        if not os.path.exists(filename):
-            filename = row_dict["sample_path"].replace(".sample5", ".featureXML")
-
-        if not os.path.exists(filename):
-            self.features_maps.append(None)
-            continue
-
-        fh = oms.FeatureXMLFile()
-        fm = oms.FeatureMap()
-        fh.load(filename, fm)
-        self.features_maps.append(fm)
-    self.logger.debug("Features loaded successfully.")
-'''
-'''
-def _load_consensusXML(self, filename="alignment.consensusXML"):
-    """
-    Load a consensus map from a file.
-    """
-    if not os.path.exists(filename):
-        self.logger.error(f"File {filename} does not exist.")
-        return
-    fh = oms.ConsensusXMLFile()
-    self.consensus_map = oms.ConsensusMap()
-    fh.load(filename, self.consensus_map)
-    self.logger.debug(f"Loaded consensus map from {filename}.")
-'''
-
 def _add_samples_batch(
     self,
     files,