masster-0.4.19-py3-none-any.whl → masster-0.4.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +6 -1
- masster/_version.py +1 -1
- masster/logger.py +42 -0
- masster/sample/h5.py +58 -1
- masster/sample/load.py +12 -5
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -9
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +55 -14
- masster/study/merge.py +910 -67
- masster/study/plot.py +50 -7
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +34 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +8 -2
- masster/wizard/wizard.py +612 -876
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/RECORD +27 -30
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- masster/wizard.py +0 -1175
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/study/plot.py
CHANGED
@@ -42,9 +42,10 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
             from bokeh.io.export import export_png
             export_png(plot_object, filename=filename)
             logger.info(f"Plot saved to: {abs_filename}")
-        except Exception:
+        except Exception as e:
             # Fall back to HTML if PNG export not available
             html_filename = filename.replace('.png', '.html')
+            abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.png', '.html')
             from bokeh.resources import Resources
             from bokeh.embed import file_html
@@ -54,7 +55,7 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
             with open(html_filename, 'w', encoding='utf-8') as f:
                 f.write(html)

-            logger.warning(f"PNG export not available
+            logger.warning(f"PNG export not available ({str(e)}). Use export_png. Saved as HTML instead: {abs_html_filename}")
     elif filename.endswith(".pdf"):
         # Try to save as PDF, fall back to HTML if not available
         try:
@@ -74,6 +75,26 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
                 f.write(html)

             logger.warning(f"PDF export not available, saved as HTML instead: {html_filename}")
+    elif filename.endswith(".svg"):
+        # Try to save as SVG, fall back to HTML if not available
+        try:
+            from bokeh.io.export import export_svg
+            export_svg(plot_object, filename=filename)
+            logger.info(f"Plot saved to: {abs_filename}")
+        except Exception as e:
+            # Fall back to HTML if SVG export not available
+            html_filename = filename.replace('.svg', '.html')
+            abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.svg', '.html')
+            from bokeh.resources import Resources
+            from bokeh.embed import file_html
+
+            resources = Resources(mode='cdn')
+            html = file_html(plot_object, resources, title=plot_title)
+
+            with open(html_filename, 'w', encoding='utf-8') as f:
+                f.write(html)
+
+            logger.warning(f"SVG export not available ({str(e)}). Saved as HTML instead: {abs_html_filename}")
     else:
         # Default to HTML for unknown extensions using isolated approach
         from bokeh.resources import Resources
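Both new .svg branches lean on Bokeh's export machinery, which needs a headless browser. A minimal sketch of what a successful export requires, using a plain Bokeh figure rather than masster's own plot objects (depending on the Bokeh version, export_svg either switches the backend itself or expects output_backend="svg" up front):

    # illustrative only, not masster code: what bokeh's export_svg needs to succeed
    from bokeh.plotting import figure
    from bokeh.io.export import export_svg  # requires selenium plus a chromedriver/geckodriver

    p = figure(width=400, height=300, output_backend="svg")  # vector backend
    p.scatter([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
    export_svg(p, filename="plot.svg")  # raises when no webdriver/browser is present,
                                        # which is the case the HTML fallback above catches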
@@ -181,6 +202,22 @@ def _isolated_save_panel_plot(panel_obj, filename, abs_filename, logger, plot_ti
                 logger.warning(f"PDF export not available, saved as HTML instead: {abs_html_filename}")
             except Exception as e:
                 logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
+    elif filename.endswith(".svg"):
+        # Try to save as SVG, fall back to HTML if not available
+        try:
+            from bokeh.io.export import export_svg
+            bokeh_layout = panel_obj.get_root()
+            export_svg(bokeh_layout, filename=filename)
+            logger.info(f"{plot_title} saved to: {abs_filename}")
+        except Exception as e:
+            # Fall back to HTML if SVG export not available
+            html_filename = filename.replace('.svg', '.html')
+            abs_html_filename = os.path.abspath(html_filename)
+            try:
+                panel_obj.save(html_filename, embed=True)
+                logger.warning(f"SVG export not available ({str(e)}), saved as HTML instead: {abs_html_filename}")
+            except Exception as e:
+                logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
     else:
         # Default to HTML for unknown extensions
         try:
@@ -512,9 +549,9 @@ def plot_consensus_2d(
     filename=None,
     colorby="number_samples",
     cmap=None,
-    markersize=
+    markersize=8,
     sizeby="inty_mean",
-    scaling="
+    scaling="static",
     alpha=0.7,
     width=600,
     height=450,
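With the previously truncated defaults now pinned (markersize=8, scaling="static"), a call might look like the following; `study` is a hypothetical Study instance and the keyword names are taken from the hunk above:

    # hypothetical usage; keyword names come from the signature above
    study.plot_consensus_2d(
        filename="consensus.svg",    # .svg now routes through the new export_svg branch
        colorby="number_samples",
        sizeby="inty_mean",
        scaling="dynamic",           # zoom-scaling circles instead of fixed-pixel markers
        markersize=8,
        alpha=0.7,
    )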
@@ -529,7 +566,7 @@ def plot_consensus_2d(
         colorby (str): Column name to use for color mapping (default: "number_samples")
         sizeby (str): Column name to use for size mapping (default: "inty_mean")
         markersize (int): Base marker size (default: 6)
-
+        scaling (str): Controls whether points scale with zoom. Options:
             'dynamic' - points use circle() and scale with zoom
             'static' - points use scatter() and maintain fixed pixel size
         alpha (float): Transparency level (default: 0.7)
@@ -553,7 +590,7 @@ def plot_consensus_2d(
     if colorby not in data.columns:
         self.logger.error(f"Column {colorby} not found in consensus_df.")
         return
-    if sizeby not in data.columns:
+    if sizeby is not None and sizeby not in data.columns:
         self.logger.warning(f"Column {sizeby} not found in consensus_df.")
         sizeby = None
     # if sizeby is not None, set markersize to sizeby
@@ -673,10 +710,16 @@ def plot_consensus_2d(
     p.yaxis.axis_label = "m/z"
     scatter_renderer: Any = None
     if scaling.lower() in ["dyn", "dynamic"]:
+        # Calculate appropriate radius for dynamic scaling based on data range
+        rt_range = data["rt"].max() - data["rt"].min()
+        mz_range = data["mz"].max() - data["mz"].min()
+        # Use a fraction of the smaller dimension for radius, similar to sample plotting
+        dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
         scatter_renderer = p.circle(
             x="rt",
             y="mz",
-            radius=
+            radius=dynamic_radius,
             fill_color={"field": colorby, "transform": color_mapper},
             line_color=None,
             alpha=alpha,
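The dynamic radius is expressed in data units (Bokeh's circle radius maps to the x-axis scale), which is why it is derived from the data extent rather than a pixel size. A quick worked example of the formula with toy numbers:

    # toy numbers only: rt spans 600 s, m/z spans 400 Da, markersize = 8
    rt_range, mz_range, markersize = 600.0, 400.0, 8
    dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
    print(dynamic_radius)  # 400 * 0.0005 * 8 = 1.6 data units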
masster/study/processing.py
CHANGED
@@ -59,6 +59,17 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
+
+    # Handle 'params' keyword argument specifically (like merge does)
+    if 'params' in kwargs:
+        provided_params = kwargs.pop('params')
+        if isinstance(provided_params, align_defaults):
+            params = provided_params
+            self.logger.debug("Using provided align_defaults parameters from 'params' argument")
+        else:
+            self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
+
+    # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
             params = value
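This mirrors how merge() already accepts a prebuilt defaults object. A sketch of both call styles, assuming a Study instance `study` (the import location of align_defaults is not shown in this diff):

    # sketch; align_defaults is assumed to be in scope
    params = align_defaults()
    study.align(params=params)   # accepted since 0.4.21, like merge()
    study.align(rt_tol=5.0)      # individual kwargs are still validated one by one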
@@ -72,7 +83,7 @@ def align(self, **kwargs):
                 f"Failed to set parameter {key} = {value} (validation failed)",
             )
         else:
-            self.logger.
+            self.logger.warning(f"Unknown parameter '{key}' ignored")
     # end of parameter initialization

     # Store parameters in the Study object
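Unknown keyword arguments now produce an explicit warning instead of the previously truncated log call, so typos surface immediately:

    # hypothetical call with a misspelled keyword
    study.align(rt_toll=5.0)
    # expected log output, per the warning text above: Unknown parameter 'rt_toll' ignored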
@@ -825,6 +836,11 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
     )

+    # Check if feature maps are empty before proceeding
+    if not fmaps:
+        study_obj.logger.error("No feature maps available for alignment. Cannot proceed with alignment.")
+        raise ValueError("No feature maps available for alignment. This usually indicates that all samples failed to load properly.")
+
     # Choose reference map (largest number of features)
     ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
     ref_map = fmaps[ref_index]
@@ -1003,3 +1019,243 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
         transformer.transformRetentionTimes(fm, trafo, True)

     study_obj.alignment_ref_index = ref_index
+
+
+def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+    """
+    Find isotope patterns for consensus features by searching raw MS1 data.
+    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
+
+    For each consensus feature:
+    1. Find the associated feature with highest intensity
+    2. Load the corresponding sample5 file to access raw MS1 data
+    3. Use original_rt (before alignment) to find the correct scan
+    4. Search for isotope patterns in raw MS1 spectra
+    5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
+    6. Store results as numpy arrays with [mz, inty] in the iso column
+
+    Parameters:
+        rt_tol (float): RT tolerance for scan matching in seconds
+        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+    """
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found. Please run merge() first.")
+        return
+
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+        self.logger.error("No consensus mapping found. Please run merge() first.")
+        return
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features found.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples found.")
+        return
+
+    # Add iso column if it doesn't exist
+    if "iso" not in self.consensus_df.columns:
+        self.consensus_df = self.consensus_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("iso")
+        )
+
+    self.logger.info("Extracting isotopomers from raw MS1 data...")
+
+    # Isotope mass shifts to search for (up to 7x 13C isotopes)
+    isotope_shifts = [
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ]
+
+    consensus_iso_data = {}
+
+    # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
+    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
+
+    # Step 1: Join consensus_mapping with features to get intensities in one operation
+    consensus_with_features = self.consensus_mapping_df.join(
+        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
+        on=['feature_uid', 'sample_uid'],
+        how='left'
+    )
+
+    # Step 2: Find the best feature (highest intensity) for each consensus using window functions
+    best_features = consensus_with_features.with_columns(
+        pl.col('inty').fill_null(0)  # Handle null intensities
+    ).with_columns(
+        pl.col('inty').max().over('consensus_uid').alias('max_inty')
+    ).filter(
+        pl.col('inty') == pl.col('max_inty')
+    ).group_by('consensus_uid').first()  # Take first if there are ties
+
+    # Step 3: Join with samples to get sample paths in one operation
+    best_features_with_paths = best_features.join(
+        self.samples_df.select(['sample_uid', 'sample_path']),
+        on='sample_uid',
+        how='left'
+    ).filter(
+        pl.col('sample_path').is_not_null()
+    )
+
+    # Step 4: Group by sample path for batch processing (much faster than nested loops)
+    sample_to_consensus = {}
+    for row in best_features_with_paths.iter_rows(named=True):
+        sample_path = row['sample_path']
+        consensus_uid = row['consensus_uid']
+
+        # Create feature data dictionary for compatibility
+        feature_data = {
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'rt_original': row.get('rt_original', row['rt']),
+            'inty': row['inty']
+        }
+
+        if sample_path not in sample_to_consensus:
+            sample_to_consensus[sample_path] = []
+
+        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
+
+    # Initialize failed consensus features (those not in the mapping)
+    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_uid = consensus_row["consensus_uid"]
+        if consensus_uid not in processed_consensus_uids:
+            consensus_iso_data[consensus_uid] = None
+
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(self.consensus_df)} consensus features")
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    # OPTIMIZATION 2: Process by sample file (load each file only once)
+    for sample_path, consensus_list in tqdm(
+        sample_to_consensus.items(),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Read files",
+        disable=tdqm_disable,
+    ):
+        try:
+            # Load MS1 data once per sample
+            ms1_df = self._load_ms1(sample_path)
+
+            if ms1_df is None or ms1_df.is_empty():
+                # Mark all consensus features from this sample as failed
+                for consensus_uid, _ in consensus_list:
+                    consensus_iso_data[consensus_uid] = None
+                continue
+
+            # Process all consensus features for this sample
+            for consensus_uid, best_feature in consensus_list:
+                # Get the original RT (before alignment correction)
+                base_mz = best_feature["mz"]
+                original_rt = best_feature.get("rt_original", best_feature["rt"])
+
+                # Find MS1 scans near the original RT
+                rt_min = original_rt - rt_tol
+                rt_max = original_rt + rt_tol
+
+                # Filter MS1 data for scans within RT window
+                ms1_window = ms1_df.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+                if ms1_window.is_empty():
+                    consensus_iso_data[consensus_uid] = None
+                    continue
+
+                isotope_matches = []
+
+                # Search for each isotope shift
+                for shift in isotope_shifts:
+                    target_mz = base_mz + shift
+                    mz_min_iso = target_mz - mz_tol
+                    mz_max_iso = target_mz + mz_tol
+
+                    # Find peaks in MS1 data within m/z tolerance
+                    isotope_peaks = ms1_window.filter(
+                        (pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso)
+                    )
+
+                    if not isotope_peaks.is_empty():
+                        # Get the peak with maximum intensity for this isotope
+                        max_peak = isotope_peaks.filter(
+                            pl.col("inty") == pl.col("inty").max()
+                        ).row(0, named=True)
+
+                        # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
+                        mz_formatted = round(float(max_peak["mz"]), 4)
+                        inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
+                        isotope_matches.append([mz_formatted, inty_formatted])
+
+                # Store results as numpy array
+                if isotope_matches:
+                    consensus_iso_data[consensus_uid] = np.array(isotope_matches)
+                else:
+                    consensus_iso_data[consensus_uid] = None
+
+        except Exception as e:
+            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+            # Mark all consensus features from this sample as failed
+            for consensus_uid, _ in consensus_list:
+                consensus_iso_data[consensus_uid] = None
+            continue
+
+    # Update consensus_df with isotope data
+    # Create mapping function for update
+    def get_iso_data(uid):
+        return consensus_iso_data.get(uid, None)
+
+    # Update the iso column
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.col("consensus_uid").map_elements(
+            lambda uid: get_iso_data(uid),
+            return_dtype=pl.Object
+        ).alias("iso")
+    )
+
+    # Count how many consensus features have isotope data
+    iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
+
+    self.logger.info(f"Optimized isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features.")
+
+
+def reset_iso(self):
+    """
+    Reset the iso column in consensus_df to None, clearing all isotope data.
+
+    This function clears any previously computed isotope patterns from the
+    consensus_df, setting the 'iso' column to None for all features. This
+    is useful before re-running isotope detection with different parameters
+    or to clear isotope data entirely.
+
+    Returns:
+        None
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus_df found. Nothing to reset.")
+        return
+
+    if "iso" not in self.consensus_df.columns:
+        self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
+        return
+
+    # Count how many features currently have isotope data
+    iso_count = self.consensus_df.select(
+        pl.col("iso").is_not_null().sum().alias("count")
+    ).item(0, "count")
+
+    # Reset the iso column to None
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.lit(None, dtype=pl.Object).alias("iso")
+    )
+
+    self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")
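Typical usage after merge(), assuming a Study instance `study`; per the docstring above, each iso entry is a numpy array of [mz, inty] rows or None:

    study.find_iso(rt_tol=0.1, mz_tol=0.01)   # fills the 'iso' column from raw MS1 data

    row = study.consensus_df.row(0, named=True)
    if row["iso"] is not None:
        for mz, inty in row["iso"]:           # numpy array of [mz, inty] pairs
            print(f"{mz:.4f}\t{inty:.0f}")

    study.reset_iso()                          # clear before re-running with new tolerances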
masster/study/save.py
CHANGED
@@ -154,13 +154,56 @@ def save_samples(self, samples=None):


 def _save_consensusXML(self, filename: str):
-    if self.
-        self.logger.error("No consensus
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found.")
         return
-
+
+    # Build consensus map from consensus_df with proper consensus_id values
+    import pyopenms as oms
+    consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for all samples
+    file_descriptions = consensus_map.getColumnHeaders()
+    if hasattr(self, 'samples_df') and not self.samples_df.is_empty():
+        for i, sample_row in enumerate(self.samples_df.iter_rows(named=True)):
+            file_description = file_descriptions.get(i, oms.ColumnHeader())
+            file_description.filename = sample_row.get("sample_name", f"sample_{i}")
+            file_description.size = 0  # Will be updated if needed
+            file_description.unique_id = i + 1
+            file_descriptions[i] = file_description
+    consensus_map.setColumnHeaders(file_descriptions)
+
+    # Add consensus features to the map (simplified version without individual features)
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_feature = oms.ConsensusFeature()
+
+        # Set basic properties
+        consensus_feature.setRT(float(consensus_row.get("rt", 0.0)))
+        consensus_feature.setMZ(float(consensus_row.get("mz", 0.0)))
+        consensus_feature.setIntensity(float(consensus_row.get("inty_mean", 0.0)))
+        consensus_feature.setQuality(float(consensus_row.get("quality", 1.0)))
+
+        # Set the unique consensus_id as the unique ID
+        consensus_id_str = consensus_row.get("consensus_id", "")
+        if consensus_id_str and len(consensus_id_str) == 16:
+            try:
+                # Convert 16-character hex string to integer for OpenMS
+                consensus_uid = int(consensus_id_str, 16)
+                consensus_feature.setUniqueId(consensus_uid)
+            except ValueError:
+                # Fallback to hash if not hex
+                consensus_feature.setUniqueId(hash(consensus_id_str) & 0x7FFFFFFFFFFFFFFF)
+        else:
+            # Fallback to consensus_uid
+            consensus_feature.setUniqueId(consensus_row.get("consensus_uid", 0))
+
+        consensus_map.push_back(consensus_feature)
+
+    # Save the consensus map
     fh = oms.ConsensusXMLFile()
-    fh.store(filename,
-    self.logger.debug(f"Saved consensus map to {filename}")
+    fh.store(filename, consensus_map)
+    self.logger.debug(f"Saved consensus map with {len(self.consensus_df)} features to {filename}")
+    self.logger.debug("Features use unique 16-character consensus_id strings")


 def save_consensus(self, **kwargs):
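The unique-ID handling is plain base-16 parsing with a masked-hash fallback; the same logic as a standalone sketch, outside of OpenMS:

    def to_unique_id(consensus_id_str: str, fallback_uid: int = 0) -> int:
        # mirrors the id logic in the hunk above
        if consensus_id_str and len(consensus_id_str) == 16:
            try:
                return int(consensus_id_str, 16)          # e.g. "000000000000000f" -> 15
            except ValueError:
                return hash(consensus_id_str) & 0x7FFFFFFFFFFFFFFF  # non-negative 63-bit
        return fallback_uid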
masster/study/study.py
CHANGED
@@ -55,6 +55,7 @@ import polars as pl
 from masster.study.h5 import _load_study5
 from masster.study.h5 import _save_study5
 from masster.study.h5 import _save_study5_compressed
+from masster.study.h5 import _load_ms1
 from masster.study.helpers import _get_consensus_uids
 from masster.study.helpers import _get_feature_uids
 from masster.study.helpers import _get_sample_uids
@@ -120,9 +121,14 @@ from masster.study.merge import merge
 from masster.study.merge import _reset_consensus_data
 from masster.study.merge import _extract_consensus_features
 from masster.study.merge import _perform_adduct_grouping
+from masster.study.merge import _consensus_cleanup
+from masster.study.merge import _identify_adduct_by_mass_shift
 from masster.study.merge import _finalize_merge
+from masster.study.merge import _count_tight_clusters
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import find_iso
+from masster.study.processing import reset_iso
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -382,6 +388,9 @@ class Study:
     merge = merge
     find_consensus = merge  # Backward compatibility alias
     find_ms2 = find_ms2
+    find_iso = find_iso
+    reset_iso = reset_iso
+    iso_reset = reset_iso
     integrate = integrate
     integrate_chrom = integrate  # Backward compatibility alias
     fill = fill
@@ -418,9 +427,11 @@ class Study:
     set_source = set_source
     sample_color = sample_color
     sample_color_reset = sample_color_reset
+    reset_sample_color = sample_color_reset
     name_replace = sample_name_replace
     name_reset = sample_name_reset
-
+    reset_name = sample_name_reset
+
     # === Data Compression and Storage ===
     compress = compress
     compress_features = compress_features
@@ -433,8 +444,10 @@ class Study:

     # === Reset Operations ===
     fill_reset = fill_reset
+    reset_fill = fill_reset
     align_reset = align_reset
-
+    reset_align = align_reset
+
     # === Plotting and Visualization ===
     plot_alignment = plot_alignment
     plot_chrom = plot_chrom
@@ -458,8 +471,10 @@ class Study:
     identify = identify
     get_id = get_id
     id_reset = id_reset
+    reset_id = id_reset
     lib_reset = lib_reset
-
+    reset_lib = lib_reset
+
     # === Parameter Management ===
     store_history = store_history
     get_parameters = get_parameters
@@ -475,6 +490,7 @@ class Study:
     _load_study5 = _load_study5
     _save_study5 = _save_study5
     _save_study5_compressed = _save_study5_compressed
+    _load_ms1 = _load_ms1
     _get_consensus_uids = _get_consensus_uids
     _get_feature_uids = _get_feature_uids
     _get_sample_uids = _get_sample_uids
@@ -496,6 +512,8 @@ class Study:
     _reset_consensus_data = _reset_consensus_data
     _extract_consensus_features = _extract_consensus_features
     _perform_adduct_grouping = _perform_adduct_grouping
+    _consensus_cleanup = _consensus_cleanup
+    _identify_adduct_by_mass_shift = _identify_adduct_by_mass_shift
     _finalize_merge = _finalize_merge

     # === Default Parameters ===
@@ -873,6 +891,15 @@ class Study:
             + (self.consensus_mapping_df.estimated_size() if self.consensus_mapping_df is not None else 0)
         )

+        # Calculate tight clusters count
+        tight_clusters_count = 0
+        if consensus_df_len > 0:
+            try:
+                tight_clusters_count = _count_tight_clusters(self, mz_tol=0.04, rt_tol=0.3)
+            except Exception as e:
+                # If tight clusters calculation fails, just use 0
+                tight_clusters_count = 0
+
         # Add warning symbols for out-of-range values
         consensus_warning = f" {_WARNING_SYMBOL}" if consensus_df_len < 50 else ""

@@ -901,6 +928,9 @@ class Study:
         elif max_samples < samples_df_len * 0.8:
             max_samples_warning = f" {_WARNING_SYMBOL}"

+        # Add warning for tight clusters
+        tight_clusters_warning = f" {_WARNING_SYMBOL}" if tight_clusters_count > 10 else ""
+
         summary = (
             f"Study folder: {self.folder}\n"
             f"Last save: {self.filename}\n"
@@ -910,6 +940,7 @@ class Study:
             f"- not in consensus: {ratio_not_in_consensus_to_total:.0f}%\n"
             f"Consensus: {consensus_df_len}{consensus_warning}\n"
             f"- RT spread: {rt_spread_text}{rt_spread_warning}\n"
+            f"- Tight clusters: {tight_clusters_count}{tight_clusters_warning}\n"
             f"- Min samples count: {min_samples:.0f}\n"
             f"- Mean samples count: {mean_samples:.0f}\n"
             f"- Max samples count: {max_samples:.0f}{max_samples_warning}\n"
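The body of _count_tight_clusters is not part of this diff. Given the call _count_tight_clusters(self, mz_tol=0.04, rt_tol=0.3) and the warning threshold of 10, a plausible reading is a count of consensus features sitting within those tolerances of another feature (near-duplicates left over from merging). A rough sketch under that assumption, not the actual implementation:

    import polars as pl

    def count_tight_clusters_sketch(consensus_df: pl.DataFrame,
                                    mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
        """Hypothetical stand-in for _count_tight_clusters: counts consensus
        features with at least one neighbor inside the mz/rt tolerance box."""
        rows = consensus_df.select(["mz", "rt"]).sort("mz").to_dicts()
        count = 0
        for i, a in enumerate(rows):
            for b in rows[i + 1:]:
                if b["mz"] - a["mz"] > mz_tol:
                    break                      # rows are mz-sorted; no later row can match
                if abs(b["rt"] - a["rt"]) <= rt_tol:
                    count += 1
                    break
        return count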
masster/study/study5_schema.json
CHANGED
masster/wizard/__init__.py
CHANGED
@@ -4,8 +4,14 @@ Wizard module for automated processing of mass spectrometry studies.
 This module provides the Wizard class for fully automated processing of MS data
 from raw files to final study results, including batch conversion, assembly,
 alignment, merging, plotting, and export.
+
+The create_script() function allows immediate generation of standalone analysis
+scripts without creating a Wizard instance first.
+
+The execute() function combines create_script() with immediate execution of the
+generated script for fully automated processing.
 """

-from .wizard import Wizard, wizard_def
+from .wizard import Wizard, wizard_def, create_script, execute

-__all__ = ["Wizard", "wizard_def"]
+__all__ = ["Wizard", "wizard_def", "create_script", "execute"]