masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -15,6 +15,85 @@ from masster.study.defaults import (
 )


+def _generate_feature_maps_on_demand_for_align(study):
+    """
+    Generate feature maps on-demand from study.features_df for alignment operations.
+    Returns temporary feature maps that are not cached in the study.
+
+    Args:
+        study: Study object containing features_df and samples_df
+
+    Returns:
+        list: List of temporary FeatureMap objects
+    """
+    import polars as pl
+    import pyopenms as oms
+
+    if study.features_df is None or len(study.features_df) == 0:
+        study.logger.error("No features_df available for generating feature maps")
+        return []
+
+    temp_feature_maps = []
+
+    # Process each sample in order
+    for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
+        # Get features for this sample from features_df
+        sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+        # Create new FeatureMap
+        feature_map = oms.FeatureMap()
+
+        # Convert DataFrame features to OpenMS Features
+        for feature_row in sample_features.iter_rows(named=True):
+            feature = oms.Feature()
+
+            # Set properties from DataFrame (handle missing values gracefully)
+            try:
+                # Skip features with missing critical data
+                if feature_row["mz"] is None:
+                    study.logger.warning("Skipping feature due to missing mz")
+                    continue
+                if feature_row["rt"] is None:
+                    study.logger.warning("Skipping feature due to missing rt")
+                    continue
+                if feature_row["inty"] is None:
+                    study.logger.warning("Skipping feature due to missing inty")
+                    continue
+
+                # Handle missing feature_id by generating a new one
+                if feature_row["feature_id"] is None:
+                    # Use a simple incremental ID for alignment purposes
+                    feature_id = len(temp_feature_maps) * 100000 + feature_map.size() + 1
+                    study.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID in sample {sample_name}")
+                else:
+                    feature_id = int(feature_row["feature_id"])
+
+                feature.setUniqueId(feature_id)
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+
+                # Handle optional fields that might be None
+                if feature_row.get("quality") is not None:
+                    feature.setOverallQuality(float(feature_row["quality"]))
+                if feature_row.get("charge") is not None:
+                    feature.setCharge(int(feature_row["charge"]))
+
+                # Add to feature map
+                feature_map.push_back(feature)
+            except (ValueError, TypeError) as e:
+                study.logger.warning(f"Skipping feature due to conversion error: {e}")
+                continue
+
+        temp_feature_maps.append(feature_map)
+
+    study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
+    return temp_feature_maps
+
+
 def align(self, **kwargs):
     """Align feature maps using pose clustering or KD algorithm and update feature RTs.
@@ -59,6 +138,17 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
+
+    # Handle 'params' keyword argument specifically (like merge does)
+    if 'params' in kwargs:
+        provided_params = kwargs.pop('params')
+        if isinstance(provided_params, align_defaults):
+            params = provided_params
+            self.logger.debug("Using provided align_defaults parameters from 'params' argument")
+        else:
+            self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
+
+    # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
             params = value
@@ -72,20 +162,16 @@ def align(self, **kwargs):
                     f"Failed to set parameter {key} = {value} (validation failed)",
                 )
         else:
-            self.logger.
+            self.logger.warning(f"Unknown parameter '{key}' ignored")
     # end of parameter initialization

     # Store parameters in the Study object
     self.store_history(["align"], params.to_dict())
     self.logger.debug("Parameters stored to align")

-
-
-
-
-    # self.logger.debug("Starting alignment")
-
-    fmaps = self.features_maps
+    # Generate temporary feature maps on-demand from features_df instead of using cached data
+    self.logger.debug("Generating feature maps on-demand from features_df for alignment")
+    fmaps = _generate_feature_maps_on_demand_for_align(self)

     # Choose alignment algorithm
     algorithm = params.get("algorithm").lower()
@@ -97,6 +183,9 @@ def align(self, **kwargs):
         _align_kd_algorithm(self, fmaps, params)
     else:
         self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
+        # Clean up temporary feature maps to release memory
+        del fmaps
+        return

     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
@@ -245,6 +334,10 @@ def align(self, **kwargs):
     if params.get("save_features"):
         self.save_samples()

+    # Clean up temporary feature maps to release memory
+    del fmaps
+    self.logger.debug("Temporary feature maps deleted to release memory")
+

 def find_ms2(self, **kwargs):
     """
@@ -776,10 +869,22 @@ def _align_pose_clustering(study_obj, fmaps, params):
             and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
         ):
             continue
-
-
-
-
+
+        # Skip feature maps with insufficient data points for alignment
+        if fm.size() < 2:
+            sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
+            study_obj.logger.warning(f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)")
+            continue
+
+        try:
+            trafo = oms.TransformationDescription()
+            aligner.align(fm, trafo)
+            transformer = oms.MapAlignmentTransformer()
+            transformer.transformRetentionTimes(fm, trafo, True)
+        except RuntimeError as e:
+            sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
+            study_obj.logger.warning(f"Failed to align sample '{sample_name}': {e}")
+            continue

     study_obj.alignment_ref_index = ref_index

@@ -825,6 +930,11 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
     )

+    # Check if feature maps are empty before proceeding
+    if not fmaps:
+        study_obj.logger.error("No feature maps available for alignment. Cannot proceed with alignment.")
+        raise ValueError("No feature maps available for alignment. This usually indicates that all samples failed to load properly.")
+
     # Choose reference map (largest number of features)
     ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
     ref_map = fmaps[ref_index]
@@ -1003,3 +1113,243 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
         transformer.transformRetentionTimes(fm, trafo, True)

     study_obj.alignment_ref_index = ref_index
+
+
+def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+    """
+    Find isotope patterns for consensus features by searching raw MS1 data.
+    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
+
+    For each consensus feature:
+    1. Find the associated feature with highest intensity
+    2. Load the corresponding sample5 file to access raw MS1 data
+    3. Use original_rt (before alignment) to find the correct scan
+    4. Search for isotope patterns in raw MS1 spectra
+    5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
+    6. Store results as numpy arrays with [mz, inty] in the iso column
+
+    Parameters:
+        rt_tol (float): RT tolerance for scan matching in seconds
+        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+    """
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found. Please run merge() first.")
+        return
+
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+        self.logger.error("No consensus mapping found. Please run merge() first.")
+        return
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features found.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples found.")
+        return
+
+    # Add iso column if it doesn't exist
+    if "iso" not in self.consensus_df.columns:
+        self.consensus_df = self.consensus_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("iso")
+        )
+
+    self.logger.info("Extracting isotopomers from raw MS1 data...")
+
+    # Isotope mass shifts to search for (up to 7x 13C isotopes)
+    isotope_shifts = [
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ]
+
+    consensus_iso_data = {}
+
+    # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
+    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
+
+    # Step 1: Join consensus_mapping with features to get intensities in one operation
+    consensus_with_features = self.consensus_mapping_df.join(
+        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
+        on=['feature_uid', 'sample_uid'],
+        how='left'
+    )
+
+    # Step 2: Find the best feature (highest intensity) for each consensus using window functions
+    best_features = consensus_with_features.with_columns(
+        pl.col('inty').fill_null(0)  # Handle null intensities
+    ).with_columns(
+        pl.col('inty').max().over('consensus_uid').alias('max_inty')
+    ).filter(
+        pl.col('inty') == pl.col('max_inty')
+    ).group_by('consensus_uid').first()  # Take first if there are ties
+
+    # Step 3: Join with samples to get sample paths in one operation
+    best_features_with_paths = best_features.join(
+        self.samples_df.select(['sample_uid', 'sample_path']),
+        on='sample_uid',
+        how='left'
+    ).filter(
+        pl.col('sample_path').is_not_null()
+    )
+
+    # Step 4: Group by sample path for batch processing (much faster than nested loops)
+    sample_to_consensus = {}
+    for row in best_features_with_paths.iter_rows(named=True):
+        sample_path = row['sample_path']
+        consensus_uid = row['consensus_uid']
+
+        # Create feature data dictionary for compatibility
+        feature_data = {
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'rt_original': row.get('rt_original', row['rt']),
+            'inty': row['inty']
+        }
+
+        if sample_path not in sample_to_consensus:
+            sample_to_consensus[sample_path] = []
+
+        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
+
+    # Initialize failed consensus features (those not in the mapping)
+    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_uid = consensus_row["consensus_uid"]
+        if consensus_uid not in processed_consensus_uids:
+            consensus_iso_data[consensus_uid] = None
+
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(self.consensus_df)} consensus features")
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    # OPTIMIZATION 2: Process by sample file (load each file only once)
+    for sample_path, consensus_list in tqdm(
+        sample_to_consensus.items(),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Read files",
+        disable=tdqm_disable,
+    ):
+        try:
+            # Load MS1 data once per sample
+            ms1_df = self._load_ms1(sample_path)
+
+            if ms1_df is None or ms1_df.is_empty():
+                # Mark all consensus features from this sample as failed
+                for consensus_uid, _ in consensus_list:
+                    consensus_iso_data[consensus_uid] = None
+                continue
+
+            # Process all consensus features for this sample
+            for consensus_uid, best_feature in consensus_list:
+                # Get the original RT (before alignment correction)
+                base_mz = best_feature["mz"]
+                original_rt = best_feature.get("rt_original", best_feature["rt"])
+
+                # Find MS1 scans near the original RT
+                rt_min = original_rt - rt_tol
+                rt_max = original_rt + rt_tol
+
+                # Filter MS1 data for scans within RT window
+                ms1_window = ms1_df.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+                if ms1_window.is_empty():
+                    consensus_iso_data[consensus_uid] = None
+                    continue
+
+                isotope_matches = []
+
+                # Search for each isotope shift
+                for shift in isotope_shifts:
+                    target_mz = base_mz + shift
+                    mz_min_iso = target_mz - mz_tol
+                    mz_max_iso = target_mz + mz_tol
+
+                    # Find peaks in MS1 data within m/z tolerance
+                    isotope_peaks = ms1_window.filter(
+                        (pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso)
+                    )
+
+                    if not isotope_peaks.is_empty():
+                        # Get the peak with maximum intensity for this isotope
+                        max_peak = isotope_peaks.filter(
+                            pl.col("inty") == pl.col("inty").max()
+                        ).row(0, named=True)
+
+                        # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
+                        mz_formatted = round(float(max_peak["mz"]), 4)
+                        inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
+                        isotope_matches.append([mz_formatted, inty_formatted])
+
+                # Store results as numpy array
+                if isotope_matches:
+                    consensus_iso_data[consensus_uid] = np.array(isotope_matches)
+                else:
+                    consensus_iso_data[consensus_uid] = None
+
+        except Exception as e:
+            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+            # Mark all consensus features from this sample as failed
+            for consensus_uid, _ in consensus_list:
+                consensus_iso_data[consensus_uid] = None
+            continue

+    # Update consensus_df with isotope data
+    # Create mapping function for update
+    def get_iso_data(uid):
+        return consensus_iso_data.get(uid, None)
+
+    # Update the iso column
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.col("consensus_uid").map_elements(
+            lambda uid: get_iso_data(uid),
+            return_dtype=pl.Object
+        ).alias("iso")
+    )
+
+    # Count how many consensus features have isotope data
+    iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
+
+    self.logger.info(f"Optimized isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features.")
+
+
+def reset_iso(self):
+    """
+    Reset the iso column in consensus_df to None, clearing all isotope data.
+
+    This function clears any previously computed isotope patterns from the
+    consensus_df, setting the 'iso' column to None for all features. This
+    is useful before re-running isotope detection with different parameters
+    or to clear isotope data entirely.
+
+    Returns:
+        None
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus_df found. Nothing to reset.")
+        return
+
+    if "iso" not in self.consensus_df.columns:
+        self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
+        return
+
+    # Count how many features currently have isotope data
+    iso_count = self.consensus_df.select(
+        pl.col("iso").is_not_null().sum().alias("count")
+    ).item(0, "count")
+
+    # Reset the iso column to None
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.lit(None, dtype=pl.Object).alias("iso")
+    )
+
+    self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")
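For orientation, a minimal usage sketch of the isotope methods added above. This is not part of the release: it assumes an existing masster Study object named `study` that has already run merge(); the tolerance values simply restate find_iso's defaults.

# Sketch only: assumes `study` is a masster Study with consensus features
# already computed (find_iso logs an error and returns otherwise).
study.find_iso(rt_tol=0.1, mz_tol=0.01)   # per-sample MS1 search, one file load each
print(study.consensus_df["iso"])          # numpy [mz, inty] arrays, or None per feature
study.reset_iso()                         # clear the 'iso' column before re-running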
masster/study/save.py
CHANGED
@@ -154,13 +154,56 @@ def save_samples(self, samples=None):


 def _save_consensusXML(self, filename: str):
-    if self.
-        self.logger.error("No consensus
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found.")
         return
-
+
+    # Build consensus map from consensus_df with proper consensus_id values
+    import pyopenms as oms
+    consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for all samples
+    file_descriptions = consensus_map.getColumnHeaders()
+    if hasattr(self, 'samples_df') and not self.samples_df.is_empty():
+        for i, sample_row in enumerate(self.samples_df.iter_rows(named=True)):
+            file_description = file_descriptions.get(i, oms.ColumnHeader())
+            file_description.filename = sample_row.get("sample_name", f"sample_{i}")
+            file_description.size = 0  # Will be updated if needed
+            file_description.unique_id = i + 1
+            file_descriptions[i] = file_description
+        consensus_map.setColumnHeaders(file_descriptions)
+
+    # Add consensus features to the map (simplified version without individual features)
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_feature = oms.ConsensusFeature()
+
+        # Set basic properties
+        consensus_feature.setRT(float(consensus_row.get("rt", 0.0)))
+        consensus_feature.setMZ(float(consensus_row.get("mz", 0.0)))
+        consensus_feature.setIntensity(float(consensus_row.get("inty_mean", 0.0)))
+        consensus_feature.setQuality(float(consensus_row.get("quality", 1.0)))
+
+        # Set the unique consensus_id as the unique ID
+        consensus_id_str = consensus_row.get("consensus_id", "")
+        if consensus_id_str and len(consensus_id_str) == 16:
+            try:
+                # Convert 16-character hex string to integer for OpenMS
+                consensus_uid = int(consensus_id_str, 16)
+                consensus_feature.setUniqueId(consensus_uid)
+            except ValueError:
+                # Fallback to hash if not hex
+                consensus_feature.setUniqueId(hash(consensus_id_str) & 0x7FFFFFFFFFFFFFFF)
+        else:
+            # Fallback to consensus_uid
+            consensus_feature.setUniqueId(consensus_row.get("consensus_uid", 0))
+
+        consensus_map.push_back(consensus_feature)
+
+    # Save the consensus map
     fh = oms.ConsensusXMLFile()
-    fh.store(filename,
-    self.logger.debug(f"Saved consensus map to {filename}")
+    fh.store(filename, consensus_map)
+    self.logger.debug(f"Saved consensus map with {len(self.consensus_df)} features to {filename}")
+    self.logger.debug("Features use unique 16-character consensus_id strings")


 def save_consensus(self, **kwargs):
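The ID handling in _save_consensusXML above can be checked in isolation: a 16-character hex consensus_id converts losslessly to the integer that OpenMS setUniqueId() expects, while the hash fallback is one-way. A small sketch (the example ID is made up):

# Round-trip of the hex consensus_id mapping (example value is invented).
consensus_id_str = "3f9a2b7c11d4e580"
uid = int(consensus_id_str, 16)             # integer passed to setUniqueId()
assert f"{uid:016x}" == consensus_id_str    # converts back without loss
# Non-hex IDs fall back to a masked hash, which cannot be inverted:
fallback = hash("not-a-hex-id") & 0x7FFFFFFFFFFFFFFF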
masster/study/study.py
CHANGED
@@ -55,6 +55,7 @@ import polars as pl
 from masster.study.h5 import _load_study5
 from masster.study.h5 import _save_study5
 from masster.study.h5 import _save_study5_compressed
+from masster.study.h5 import _load_ms1
 from masster.study.helpers import _get_consensus_uids
 from masster.study.helpers import _get_feature_uids
 from masster.study.helpers import _get_sample_uids
@@ -126,6 +127,8 @@ from masster.study.merge import _finalize_merge
 from masster.study.merge import _count_tight_clusters
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import find_iso
+from masster.study.processing import reset_iso
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -385,6 +388,9 @@ class Study:
     merge = merge
     find_consensus = merge  # Backward compatibility alias
     find_ms2 = find_ms2
+    find_iso = find_iso
+    reset_iso = reset_iso
+    iso_reset = reset_iso
     integrate = integrate
     integrate_chrom = integrate  # Backward compatibility alias
     fill = fill
@@ -421,9 +427,11 @@ class Study:
     set_source = set_source
     sample_color = sample_color
     sample_color_reset = sample_color_reset
+    reset_sample_color = sample_color_reset
     name_replace = sample_name_replace
     name_reset = sample_name_reset
-
+    reset_name = sample_name_reset
+
     # === Data Compression and Storage ===
     compress = compress
     compress_features = compress_features
@@ -436,8 +444,10 @@ class Study:

     # === Reset Operations ===
     fill_reset = fill_reset
+    reset_fill = fill_reset
     align_reset = align_reset
-
+    reset_align = align_reset
+
     # === Plotting and Visualization ===
     plot_alignment = plot_alignment
     plot_chrom = plot_chrom
@@ -461,8 +471,10 @@ class Study:
     identify = identify
     get_id = get_id
     id_reset = id_reset
+    reset_id = id_reset
     lib_reset = lib_reset
-
+    reset_lib = lib_reset
+
     # === Parameter Management ===
     store_history = store_history
     get_parameters = get_parameters
@@ -478,6 +490,7 @@ class Study:
     _load_study5 = _load_study5
     _save_study5 = _save_study5
     _save_study5_compressed = _save_study5_compressed
+    _load_ms1 = _load_ms1
     _get_consensus_uids = _get_consensus_uids
     _get_feature_uids = _get_feature_uids
     _get_sample_uids = _get_sample_uids
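The alias block above gives each reset operation both spellings, `x_reset` and `reset_x`, bound to the same function, so either call is equivalent. A quick sketch (assumes `Study` is imported from masster.study and `study` is an instance):

# Both spellings resolve to the same underlying function object.
assert Study.reset_iso is Study.iso_reset
assert Study.reset_align is Study.align_reset
study.reset_id()   # identical to study.id_reset()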
masster/study/study5_schema.json
CHANGED
masster/wizard/__init__.py
CHANGED
@@ -7,8 +7,11 @@ alignment, merging, plotting, and export.

 The create_script() function allows immediate generation of standalone analysis
 scripts without creating a Wizard instance first.
+
+The execute() function combines create_script() with immediate execution of the
+generated script for fully automated processing.
 """

-from .wizard import Wizard, wizard_def, create_script
+from .wizard import Wizard, wizard_def, create_script, execute

-__all__ = ["Wizard", "wizard_def", "create_script"]
+__all__ = ["Wizard", "wizard_def", "create_script", "execute"]