masster-0.4.20-py3-none-any.whl → masster-0.4.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +39 -2
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +430 -1866
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/RECORD +26 -28
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
@@ -400,7 +400,7 @@ def merge(self, **kwargs) -> None:
     # Feature maps will be generated on-demand within each merge method

     self.logger.info(
-        f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da
+        f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
     )

     # Initialize
@@ -446,7 +446,7 @@ def merge(self, **kwargs) -> None:
     # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

     # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
-    if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked']:
+    if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
         self._consensus_cleanup(params.rt_tol, params.mz_tol)

     # Perform adduct grouping
@@ -705,11 +705,11 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
     optimized_params = params

     # Phase 1: Standard KD clustering
-    self.logger.
+    self.logger.debug("Initial KD clustering")
     consensus_map = _merge_kd(self, optimized_params)

     # Phase 2: Post-processing quality control
-    self.logger.
+    self.logger.debug("Post-processing quality control")
     consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)

     return consensus_map
@@ -911,7 +911,7 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
     final_feature_count = len(self.consensus_df)
     reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0

-    self.logger.info(f"
+    self.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")

     # Create a new consensus map for compatibility (the processed data is in consensus_df)
     processed_consensus_map = oms.ConsensusMap()
@@ -1691,8 +1691,12 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
         mz_min_local = mz_max_local = consensus_mz

         # Store chunk consensus with feature tracking
+        # Generate unique 16-character consensus_id string
+        import uuid
+        consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
+
         chunk_consensus_data = {
-            'consensus_id':
+            'consensus_id': consensus_id_str,
            'chunk_idx': chunk_idx,
            'chunk_start_idx': chunk_start_idx,
            'mz': consensus_mz,
@@ -1710,7 +1714,6 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
         }

         all_chunk_consensus.append(chunk_consensus_data)
-        consensus_id_counter += 1

     if not all_chunk_consensus:
         # No valid consensus features found
@@ -2094,9 +2097,13 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
         ms2_count += len(ms2_scans)

     # Build consensus metadata
+    # Generate unique 16-character consensus_id string
+    import uuid
+    consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
+
     return {
         "consensus_uid": int(consensus_uid),
-        "consensus_id":
+        "consensus_id": consensus_id_str,  # Use unique 16-char string ID
         "quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
         "number_samples": number_samples if number_samples is not None else len(feature_data_list),
         "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
@@ -2118,6 +2125,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
         "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
         "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
         "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
+        "iso": None,  # Will be filled by find_iso() function
         "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
         "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
         "number_ms2": int(ms2_count),
@@ -2509,10 +2517,14 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
         if ms2_scans is not None:
             ms2_count += len(ms2_scans)

+        # Generate unique 16-character consensus_id string (UUID-based)
+        import uuid
+        consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
+
         metadata_list.append(
             {
                 "consensus_uid": int(i),  # "consensus_id": i,
-                "consensus_id":
+                "consensus_id": consensus_id_str,  # Use unique 16-char string ID
                 "quality": round(float(feature.getQuality()), 3),
                 "number_samples": len(feature_data_list),
                 # "number_ext": int(len(features_list)),
@@ -2577,6 +2589,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
                 )
                 if len(height_scaled_values) > 0
                 else 0.0,
+                "iso": None,  # Will be filled by find_iso() function
                 "iso_mean": round(float(np.mean(iso_values)), 2)
                 if len(iso_values) > 0
                 else 0.0,
@@ -3325,7 +3338,9 @@ def _finalize_merge(self, link_ms2, min_samples):
         f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
         f"This may be due to min_samples ({min_samples}) being too high for the available data.",
     )
-
+
+    # add iso data from raw files.
+    self.find_iso()
     if link_ms2:
         self.find_ms2()
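Note on the ID change above: the sequential counter (the removed consensus_id_counter += 1) is replaced by a UUID-derived string. As a standalone sketch, not masster code: a uuid4 string contains only hex digits and dashes, so the resulting 16-character ID always parses as base-16, which is what _save_consensusXML in save.py (further below) relies on.

    import uuid

    # 16-character hex string, exactly as generated in the hunks above
    consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]

    # Round-trip used later when exporting to consensusXML
    consensus_uid = int(consensus_id_str, 16)  # always valid: uuid4 is pure hex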
masster/study/plot.py
CHANGED
@@ -42,9 +42,10 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
             from bokeh.io.export import export_png
             export_png(plot_object, filename=filename)
             logger.info(f"Plot saved to: {abs_filename}")
-        except Exception:
+        except Exception as e:
             # Fall back to HTML if PNG export not available
             html_filename = filename.replace('.png', '.html')
+            abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.png', '.html')
             from bokeh.resources import Resources
             from bokeh.embed import file_html
@@ -54,7 +55,7 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
             with open(html_filename, 'w', encoding='utf-8') as f:
                 f.write(html)

-            logger.warning(f"PNG export not available
+            logger.warning(f"PNG export not available ({str(e)}). Use export_png. Saved as HTML instead: {abs_html_filename}")
     elif filename.endswith(".pdf"):
         # Try to save as PDF, fall back to HTML if not available
         try:
@@ -74,6 +75,26 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
                 f.write(html)

             logger.warning(f"PDF export not available, saved as HTML instead: {html_filename}")
+    elif filename.endswith(".svg"):
+        # Try to save as SVG, fall back to HTML if not available
+        try:
+            from bokeh.io.export import export_svg
+            export_svg(plot_object, filename=filename)
+            logger.info(f"Plot saved to: {abs_filename}")
+        except Exception as e:
+            # Fall back to HTML if SVG export not available
+            html_filename = filename.replace('.svg', '.html')
+            abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.svg', '.html')
+            from bokeh.resources import Resources
+            from bokeh.embed import file_html
+
+            resources = Resources(mode='cdn')
+            html = file_html(plot_object, resources, title=plot_title)
+
+            with open(html_filename, 'w', encoding='utf-8') as f:
+                f.write(html)
+
+            logger.warning(f"SVG export not available ({str(e)}). Saved as HTML instead: {abs_html_filename}")
     else:
         # Default to HTML for unknown extensions using isolated approach
         from bokeh.resources import Resources
@@ -181,6 +202,22 @@ def _isolated_save_panel_plot(panel_obj, filename, abs_filename, logger, plot_ti
             logger.warning(f"PDF export not available, saved as HTML instead: {abs_html_filename}")
         except Exception as e:
             logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
+    elif filename.endswith(".svg"):
+        # Try to save as SVG, fall back to HTML if not available
+        try:
+            from bokeh.io.export import export_svg
+            bokeh_layout = panel_obj.get_root()
+            export_svg(bokeh_layout, filename=filename)
+            logger.info(f"{plot_title} saved to: {abs_filename}")
+        except Exception as e:
+            # Fall back to HTML if SVG export not available
+            html_filename = filename.replace('.svg', '.html')
+            abs_html_filename = os.path.abspath(html_filename)
+            try:
+                panel_obj.save(html_filename, embed=True)
+                logger.warning(f"SVG export not available ({str(e)}), saved as HTML instead: {abs_html_filename}")
+            except Exception as e:
+                logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
     else:
         # Default to HTML for unknown extensions
         try:
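The new .svg branches mirror the existing PNG/PDF pattern: attempt a native export, and on any failure write a self-contained HTML file instead. A minimal standalone sketch of that pattern (the fig name is hypothetical; bokeh's export_svg needs selenium plus a webdriver, which is exactly the failure mode the fallback covers):

    from bokeh.plotting import figure

    fig = figure(title="demo")
    fig.line([1, 2, 3], [4, 6, 5])
    fig.output_backend = "svg"  # SVG rendering backend, required for SVG export

    try:
        from bokeh.io.export import export_svg
        export_svg(fig, filename="demo.svg")  # raises if no webdriver is available
    except Exception:
        # Degrade to an HTML file built against CDN-hosted BokehJS
        from bokeh.resources import Resources
        from bokeh.embed import file_html

        with open("demo.html", "w", encoding="utf-8") as f:
            f.write(file_html(fig, Resources(mode="cdn"), title="demo"))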
masster/study/processing.py
CHANGED
@@ -59,6 +59,17 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
+
+    # Handle 'params' keyword argument specifically (like merge does)
+    if 'params' in kwargs:
+        provided_params = kwargs.pop('params')
+        if isinstance(provided_params, align_defaults):
+            params = provided_params
+            self.logger.debug("Using provided align_defaults parameters from 'params' argument")
+        else:
+            self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
+
+    # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
             params = value
@@ -72,7 +83,7 @@ def align(self, **kwargs):
                 f"Failed to set parameter {key} = {value} (validation failed)",
             )
         else:
-            self.logger.
+            self.logger.warning(f"Unknown parameter '{key}' ignored")
     # end of parameter initialization

     # Store parameters in the Study object
@@ -825,6 +836,11 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
     )

+    # Check if feature maps are empty before proceeding
+    if not fmaps:
+        study_obj.logger.error("No feature maps available for alignment. Cannot proceed with alignment.")
+        raise ValueError("No feature maps available for alignment. This usually indicates that all samples failed to load properly.")
+
     # Choose reference map (largest number of features)
     ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
     ref_map = fmaps[ref_index]
@@ -1003,3 +1019,243 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
         transformer.transformRetentionTimes(fm, trafo, True)

     study_obj.alignment_ref_index = ref_index
+
+
+def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+    """
+    Find isotope patterns for consensus features by searching raw MS1 data.
+    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
+
+    For each consensus feature:
+    1. Find the associated feature with highest intensity
+    2. Load the corresponding sample5 file to access raw MS1 data
+    3. Use original_rt (before alignment) to find the correct scan
+    4. Search for isotope patterns in raw MS1 spectra
+    5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
+    6. Store results as numpy arrays with [mz, inty] in the iso column
+
+    Parameters:
+        rt_tol (float): RT tolerance for scan matching in seconds
+        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+    """
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found. Please run merge() first.")
+        return
+
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+        self.logger.error("No consensus mapping found. Please run merge() first.")
+        return
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features found.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples found.")
+        return
+
+    # Add iso column if it doesn't exist
+    if "iso" not in self.consensus_df.columns:
+        self.consensus_df = self.consensus_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("iso")
+        )
+
+    self.logger.info("Extracting isotopomers from raw MS1 data...")
+
+    # Isotope mass shifts to search for (up to 7x 13C isotopes)
+    isotope_shifts = [
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ]
+
+    consensus_iso_data = {}
+
+    # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
+    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
+
+    # Step 1: Join consensus_mapping with features to get intensities in one operation
+    consensus_with_features = self.consensus_mapping_df.join(
+        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
+        on=['feature_uid', 'sample_uid'],
+        how='left'
+    )
+
+    # Step 2: Find the best feature (highest intensity) for each consensus using window functions
+    best_features = consensus_with_features.with_columns(
+        pl.col('inty').fill_null(0)  # Handle null intensities
+    ).with_columns(
+        pl.col('inty').max().over('consensus_uid').alias('max_inty')
+    ).filter(
+        pl.col('inty') == pl.col('max_inty')
+    ).group_by('consensus_uid').first()  # Take first if there are ties
+
+    # Step 3: Join with samples to get sample paths in one operation
+    best_features_with_paths = best_features.join(
+        self.samples_df.select(['sample_uid', 'sample_path']),
+        on='sample_uid',
+        how='left'
+    ).filter(
+        pl.col('sample_path').is_not_null()
+    )
+
+    # Step 4: Group by sample path for batch processing (much faster than nested loops)
+    sample_to_consensus = {}
+    for row in best_features_with_paths.iter_rows(named=True):
+        sample_path = row['sample_path']
+        consensus_uid = row['consensus_uid']
+
+        # Create feature data dictionary for compatibility
+        feature_data = {
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'rt_original': row.get('rt_original', row['rt']),
+            'inty': row['inty']
+        }
+
+        if sample_path not in sample_to_consensus:
+            sample_to_consensus[sample_path] = []
+
+        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
+
+    # Initialize failed consensus features (those not in the mapping)
+    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_uid = consensus_row["consensus_uid"]
+        if consensus_uid not in processed_consensus_uids:
+            consensus_iso_data[consensus_uid] = None
+
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(self.consensus_df)} consensus features")
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    # OPTIMIZATION 2: Process by sample file (load each file only once)
+    for sample_path, consensus_list in tqdm(
+        sample_to_consensus.items(),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Read files",
+        disable=tdqm_disable,
+    ):
+        try:
+            # Load MS1 data once per sample
+            ms1_df = self._load_ms1(sample_path)
+
+            if ms1_df is None or ms1_df.is_empty():
+                # Mark all consensus features from this sample as failed
+                for consensus_uid, _ in consensus_list:
+                    consensus_iso_data[consensus_uid] = None
+                continue
+
+            # Process all consensus features for this sample
+            for consensus_uid, best_feature in consensus_list:
+                # Get the original RT (before alignment correction)
+                base_mz = best_feature["mz"]
+                original_rt = best_feature.get("rt_original", best_feature["rt"])
+
+                # Find MS1 scans near the original RT
+                rt_min = original_rt - rt_tol
+                rt_max = original_rt + rt_tol
+
+                # Filter MS1 data for scans within RT window
+                ms1_window = ms1_df.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+                if ms1_window.is_empty():
+                    consensus_iso_data[consensus_uid] = None
+                    continue
+
+                isotope_matches = []
+
+                # Search for each isotope shift
+                for shift in isotope_shifts:
+                    target_mz = base_mz + shift
+                    mz_min_iso = target_mz - mz_tol
+                    mz_max_iso = target_mz + mz_tol
+
+                    # Find peaks in MS1 data within m/z tolerance
+                    isotope_peaks = ms1_window.filter(
+                        (pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso)
+                    )
+
+                    if not isotope_peaks.is_empty():
+                        # Get the peak with maximum intensity for this isotope
+                        max_peak = isotope_peaks.filter(
+                            pl.col("inty") == pl.col("inty").max()
+                        ).row(0, named=True)
+
+                        # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
+                        mz_formatted = round(float(max_peak["mz"]), 4)
+                        inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
+                        isotope_matches.append([mz_formatted, inty_formatted])

+                # Store results as numpy array
+                if isotope_matches:
+                    consensus_iso_data[consensus_uid] = np.array(isotope_matches)
+                else:
+                    consensus_iso_data[consensus_uid] = None
+
+        except Exception as e:
+            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+            # Mark all consensus features from this sample as failed
+            for consensus_uid, _ in consensus_list:
+                consensus_iso_data[consensus_uid] = None
+            continue
+
+    # Update consensus_df with isotope data
+    # Create mapping function for update
+    def get_iso_data(uid):
+        return consensus_iso_data.get(uid, None)
+
+    # Update the iso column
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.col("consensus_uid").map_elements(
+            lambda uid: get_iso_data(uid),
+            return_dtype=pl.Object
+        ).alias("iso")
+    )
+
+    # Count how many consensus features have isotope data
+    iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
+
+    self.logger.info(f"Optimized isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features.")
+
+
+def reset_iso(self):
+    """
+    Reset the iso column in consensus_df to None, clearing all isotope data.
+
+    This function clears any previously computed isotope patterns from the
+    consensus_df, setting the 'iso' column to None for all features. This
+    is useful before re-running isotope detection with different parameters
+    or to clear isotope data entirely.
+
+    Returns:
+        None
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus_df found. Nothing to reset.")
+        return
+
+    if "iso" not in self.consensus_df.columns:
+        self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
+        return
+
+    # Count how many features currently have isotope data
+    iso_count = self.consensus_df.select(
+        pl.col("iso").is_not_null().sum().alias("count")
+    ).item(0, "count")
+
+    # Reset the iso column to None
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.lit(None, dtype=pl.Object).alias("iso")
+    )
+
+    self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")
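Taken together with the _finalize_merge hunk above (merge() now ends by calling find_iso()), a typical use of the new methods might look like the sketch below; the Study setup is a placeholder and the tolerances are only illustrative:

    from masster import Study  # assumed public export; see masster/__init__.py

    study = Study()            # placeholder: a Study that has been loaded and aligned
    study.merge()              # builds consensus_df and now also runs find_iso()

    # Re-run isotope extraction with wider tolerances
    study.reset_iso()                        # clear the 'iso' column first
    study.find_iso(rt_tol=0.2, mz_tol=0.02)  # search raw MS1 data again

    # Each non-null 'iso' cell holds a numpy array of [mz, inty] rows
    print(study.consensus_df.select(["consensus_uid", "iso"]).head())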
masster/study/save.py
CHANGED
@@ -154,13 +154,56 @@ def save_samples(self, samples=None):


 def _save_consensusXML(self, filename: str):
-    if self.
-        self.logger.error("No consensus
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found.")
         return
-
+
+    # Build consensus map from consensus_df with proper consensus_id values
+    import pyopenms as oms
+    consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for all samples
+    file_descriptions = consensus_map.getColumnHeaders()
+    if hasattr(self, 'samples_df') and not self.samples_df.is_empty():
+        for i, sample_row in enumerate(self.samples_df.iter_rows(named=True)):
+            file_description = file_descriptions.get(i, oms.ColumnHeader())
+            file_description.filename = sample_row.get("sample_name", f"sample_{i}")
+            file_description.size = 0  # Will be updated if needed
+            file_description.unique_id = i + 1
+            file_descriptions[i] = file_description
+        consensus_map.setColumnHeaders(file_descriptions)
+
+    # Add consensus features to the map (simplified version without individual features)
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_feature = oms.ConsensusFeature()
+
+        # Set basic properties
+        consensus_feature.setRT(float(consensus_row.get("rt", 0.0)))
+        consensus_feature.setMZ(float(consensus_row.get("mz", 0.0)))
+        consensus_feature.setIntensity(float(consensus_row.get("inty_mean", 0.0)))
+        consensus_feature.setQuality(float(consensus_row.get("quality", 1.0)))
+
+        # Set the unique consensus_id as the unique ID
+        consensus_id_str = consensus_row.get("consensus_id", "")
+        if consensus_id_str and len(consensus_id_str) == 16:
+            try:
+                # Convert 16-character hex string to integer for OpenMS
+                consensus_uid = int(consensus_id_str, 16)
+                consensus_feature.setUniqueId(consensus_uid)
+            except ValueError:
+                # Fallback to hash if not hex
+                consensus_feature.setUniqueId(hash(consensus_id_str) & 0x7FFFFFFFFFFFFFFF)
+        else:
+            # Fallback to consensus_uid
+            consensus_feature.setUniqueId(consensus_row.get("consensus_uid", 0))
+
+        consensus_map.push_back(consensus_feature)
+
+    # Save the consensus map
     fh = oms.ConsensusXMLFile()
-    fh.store(filename,
-    self.logger.debug(f"Saved consensus map to {filename}")
+    fh.store(filename, consensus_map)
+    self.logger.debug(f"Saved consensus map with {len(self.consensus_df)} features to {filename}")
+    self.logger.debug("Features use unique 16-character consensus_id strings")


 def save_consensus(self, **kwargs):
masster/study/study.py
CHANGED
@@ -55,6 +55,7 @@ import polars as pl
 from masster.study.h5 import _load_study5
 from masster.study.h5 import _save_study5
 from masster.study.h5 import _save_study5_compressed
+from masster.study.h5 import _load_ms1
 from masster.study.helpers import _get_consensus_uids
 from masster.study.helpers import _get_feature_uids
 from masster.study.helpers import _get_sample_uids
@@ -126,6 +127,8 @@ from masster.study.merge import _finalize_merge
 from masster.study.merge import _count_tight_clusters
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import find_iso
+from masster.study.processing import reset_iso
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -385,6 +388,9 @@ class Study:
     merge = merge
     find_consensus = merge  # Backward compatibility alias
     find_ms2 = find_ms2
+    find_iso = find_iso
+    reset_iso = reset_iso
+    iso_reset = reset_iso
     integrate = integrate
     integrate_chrom = integrate  # Backward compatibility alias
     fill = fill
@@ -421,9 +427,11 @@ class Study:
     set_source = set_source
     sample_color = sample_color
     sample_color_reset = sample_color_reset
+    reset_sample_color = sample_color_reset
     name_replace = sample_name_replace
     name_reset = sample_name_reset
-
+    reset_name = sample_name_reset
+
     # === Data Compression and Storage ===
     compress = compress
     compress_features = compress_features
@@ -436,8 +444,10 @@ class Study:

     # === Reset Operations ===
     fill_reset = fill_reset
+    reset_fill = fill_reset
     align_reset = align_reset
-
+    reset_align = align_reset
+
     # === Plotting and Visualization ===
     plot_alignment = plot_alignment
     plot_chrom = plot_chrom
@@ -461,8 +471,10 @@ class Study:
     identify = identify
     get_id = get_id
     id_reset = id_reset
+    reset_id = id_reset
     lib_reset = lib_reset
-
+    reset_lib = lib_reset
+
     # === Parameter Management ===
     store_history = store_history
     get_parameters = get_parameters
@@ -478,6 +490,7 @@ class Study:
     _load_study5 = _load_study5
     _save_study5 = _save_study5
     _save_study5_compressed = _save_study5_compressed
+    _load_ms1 = _load_ms1
     _get_consensus_uids = _get_consensus_uids
     _get_feature_uids = _get_feature_uids
     _get_sample_uids = _get_sample_uids
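The new reset_* names are plain class-level aliases: assigning an already-defined function to a second attribute binds both names to the same underlying method. A minimal illustration of the pattern (not masster code):

    class Example:
        def fill_reset(self):
            return "reset"

        reset_fill = fill_reset  # verb-first alias; same function object

    e = Example()
    assert e.fill_reset() == e.reset_fill() == "reset"
    assert Example.reset_fill is Example.fill_reset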
masster/study/study5_schema.json
CHANGED
masster/wizard/__init__.py
CHANGED
@@ -7,8 +7,11 @@ alignment, merging, plotting, and export.

 The create_script() function allows immediate generation of standalone analysis
 scripts without creating a Wizard instance first.
+
+The execute() function combines create_script() with immediate execution of the
+generated script for fully automated processing.
 """

-from .wizard import Wizard, wizard_def, create_script
+from .wizard import Wizard, wizard_def, create_script, execute

-__all__ = ["Wizard", "wizard_def", "create_script"]
+__all__ = ["Wizard", "wizard_def", "create_script", "execute"]
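With the new export, execute() is importable directly from the subpackage. Its signature is not shown in this diff, so only the import below is verifiable; consult masster/wizard/wizard.py for the accepted arguments:

    from masster.wizard import create_script, execute  # both public as of 0.4.21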