masster 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/study/defaults/merge_def.py +10 -9
- masster/study/merge.py +85 -17
- masster/study/processing.py +0 -1
- {masster-0.4.17.dist-info → masster-0.4.18.dist-info}/METADATA +1 -1
- {masster-0.4.17.dist-info → masster-0.4.18.dist-info}/RECORD +9 -9
- {masster-0.4.17.dist-info → masster-0.4.18.dist-info}/WHEEL +0 -0
- {masster-0.4.17.dist-info → masster-0.4.18.dist-info}/entry_points.txt +0 -0
- {masster-0.4.17.dist-info → masster-0.4.18.dist-info}/licenses/LICENSE +0 -0
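masster/_version.py
CHANGED
(hunk not captured in this extract; +1 -1 per the file list above)
masster/study/defaults/merge_def.py
CHANGED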
|
@@ -25,25 +25,25 @@ class merge_defaults:
|
|
|
25
25
|
link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
method: str = "
|
|
29
|
-
min_samples: int =
|
|
28
|
+
method: str = "qt"
|
|
29
|
+
min_samples: int = 2
|
|
30
30
|
rt_tol: float = 5.0
|
|
31
31
|
mz_tol: float = 0.01
|
|
32
|
-
chunk_size: int =
|
|
32
|
+
chunk_size: int = 500
|
|
33
33
|
nr_partitions: int = 1000
|
|
34
|
-
min_rel_cc_size: float = 0.
|
|
34
|
+
min_rel_cc_size: float = 0.1
|
|
35
35
|
max_pairwise_log_fc: float = -1.0
|
|
36
36
|
max_nr_conflicts: int = 0
|
|
37
37
|
link_ms2: bool = True
|
|
38
38
|
|
|
39
39
|
# KD-Strict specific parameters
|
|
40
40
|
optimize_rt_tol: bool = False
|
|
41
|
-
rt_tol_range: tuple = (0.
|
|
42
|
-
rt_tol_steps: int =
|
|
43
|
-
secondary_merge_rt_tol: float = 0
|
|
41
|
+
rt_tol_range: tuple = (0.5, 4.0)
|
|
42
|
+
rt_tol_steps: int = 7
|
|
43
|
+
secondary_merge_rt_tol: float = 1.0
|
|
44
44
|
secondary_merge_mz_tol: float = 0.005
|
|
45
45
|
min_sample_overlap: float = 0.8
|
|
46
|
-
max_rt_spread: float =
|
|
46
|
+
max_rt_spread: float = 2.0 # Will default to 2x rt_tol
|
|
47
47
|
min_coherence: float = 0.0
|
|
48
48
|
|
|
49
49
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
@@ -53,7 +53,8 @@ class merge_defaults:
|
|
|
53
53
|
"description": "Merge method (algorithm) to use",
|
|
54
54
|
"default": "quality",
|
|
55
55
|
"allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
|
|
56
|
-
"kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict"
|
|
56
|
+
"kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
|
|
57
|
+
"kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
|
|
57
58
|
},
|
|
58
59
|
"min_samples": {
|
|
59
60
|
"dtype": int,
|
masster/study/merge.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Unified merge module for the Study class.
|
|
3
|
-
Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', '
|
|
3
|
+
Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'kd_chunked', 'qt_chunked'
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
import time
|
|
@@ -25,7 +25,7 @@ def merge(self, **kwargs) -> None:
|
|
|
25
25
|
**kwargs : dict
|
|
26
26
|
Parameters from merge_defaults class:
|
|
27
27
|
- method : str, default 'quality'
|
|
28
|
-
Merge algorithm: 'sensitivity', 'qt', 'nowarp', '
|
|
28
|
+
Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality'
|
|
29
29
|
- min_samples : int, default 10
|
|
30
30
|
Minimum number of samples for consensus feature
|
|
31
31
|
- rt_tol : float, default 2.0
|
|
@@ -52,9 +52,11 @@ def merge(self, **kwargs) -> None:
|
|
|
52
52
|
- Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
|
|
53
53
|
- QT: Thorough but slow O(n²), good for <1000 samples
|
|
54
54
|
- NoWarp: Memory efficient KD without RT warping for large datasets
|
|
55
|
-
- Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
|
|
55
|
+
- KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
|
|
56
56
|
Uses optimized partitioning for better memory management while maintaining
|
|
57
57
|
full cross-sample consensus feature detection.
|
|
58
|
+
- QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
|
|
59
|
+
Uses QT clustering in first stage with optimized cross-chunk consensus building.
|
|
58
60
|
"""
|
|
59
61
|
start_time = time.time()
|
|
60
62
|
|
|
@@ -76,7 +78,12 @@ def merge(self, **kwargs) -> None:
|
|
|
76
78
|
'kd_nowarp': 'nowarp',
|
|
77
79
|
'kd-strict': 'quality',
|
|
78
80
|
'kd_strict': 'quality',
|
|
79
|
-
'kdstrict': 'quality'
|
|
81
|
+
'kdstrict': 'quality',
|
|
82
|
+
'chunked': 'kd_chunked', # Map old 'chunked' to 'kd_chunked'
|
|
83
|
+
'qtchunked': 'qt_chunked', # QT chunked variants
|
|
84
|
+
'qt-chunked': 'qt_chunked',
|
|
85
|
+
'kdchunked': 'kd_chunked', # KD chunked variants
|
|
86
|
+
'kd-chunked': 'kd_chunked'
|
|
80
87
|
}
|
|
81
88
|
|
|
82
89
|
if params.method in method_mapping:
|
|
@@ -85,8 +92,8 @@ def merge(self, **kwargs) -> None:
|
|
|
85
92
|
self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
|
|
86
93
|
|
|
87
94
|
# Validate method
|
|
88
|
-
if params.method not in ['sensitivity', 'qt', 'nowarp', '
|
|
89
|
-
raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', '
|
|
95
|
+
if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
|
|
96
|
+
raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
|
|
90
97
|
|
|
91
98
|
# Persist last used params for diagnostics
|
|
92
99
|
try:
|
|
@@ -147,9 +154,12 @@ def merge(self, **kwargs) -> None:
|
|
|
147
154
|
elif params.method == 'quality':
|
|
148
155
|
consensus_map = _merge_kd_strict(self, params)
|
|
149
156
|
# Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
|
|
150
|
-
elif params.method == '
|
|
151
|
-
consensus_map =
|
|
152
|
-
# Note:
|
|
157
|
+
elif params.method == 'kd_chunked':
|
|
158
|
+
consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
|
|
159
|
+
# Note: _merge_kd_chunked populates consensus_df directly, no need to extract
|
|
160
|
+
elif params.method == 'qt_chunked':
|
|
161
|
+
consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
|
|
162
|
+
# Note: _merge_qt_chunked populates consensus_df directly, no need to extract
|
|
153
163
|
|
|
154
164
|
# Perform adduct grouping
|
|
155
165
|
self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
|
|
@@ -189,9 +199,9 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
189
199
|
params_oms.setValue("warp:mz_tol", params.mz_tol)
|
|
190
200
|
params_oms.setValue("link:rt_tol", params.rt_tol)
|
|
191
201
|
params_oms.setValue("link:mz_tol", params.mz_tol)
|
|
192
|
-
params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
|
|
193
|
-
params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
194
|
-
params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
|
|
202
|
+
#params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
|
|
203
|
+
#params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
204
|
+
#params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
|
|
195
205
|
#params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
|
|
196
206
|
|
|
197
207
|
grouper.setParameters(params_oms)
|
|
@@ -227,9 +237,9 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
227
237
|
params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
228
238
|
params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
|
|
229
239
|
params_oms.setValue("ignore_charge", "true")
|
|
230
|
-
params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
|
|
231
|
-
params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
232
|
-
params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
|
|
240
|
+
#params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
|
|
241
|
+
#params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
242
|
+
#params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
|
|
233
243
|
params_oms.setValue("nr_partitions", params.nr_partitions)
|
|
234
244
|
|
|
235
245
|
grouper.setParameters(params_oms)
|
|
@@ -763,8 +773,8 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
763
773
|
return consensus_map
|
|
764
774
|
|
|
765
775
|
|
|
766
|
-
def
|
|
767
|
-
"""
|
|
776
|
+
def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
777
|
+
"""KD-based chunked merge with proper cross-chunk consensus building"""
|
|
768
778
|
|
|
769
779
|
n_samples = len(self.features_maps)
|
|
770
780
|
if n_samples <= params.chunk_size:
|
|
@@ -826,6 +836,64 @@ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_
|
|
|
826
836
|
return consensus_map
|
|
827
837
|
|
|
828
838
|
|
|
839
|
+
def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
840
|
+
"""QT-based chunked merge with proper cross-chunk consensus building"""
|
|
841
|
+
|
|
842
|
+
n_samples = len(self.features_maps)
|
|
843
|
+
if n_samples <= params.chunk_size:
|
|
844
|
+
self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
|
|
845
|
+
consensus_map = _merge_qt(self, params)
|
|
846
|
+
# Extract consensus features to populate consensus_df for chunked method consistency
|
|
847
|
+
self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
848
|
+
return consensus_map
|
|
849
|
+
|
|
850
|
+
# Process in chunks
|
|
851
|
+
chunks = []
|
|
852
|
+
for i in range(0, n_samples, params.chunk_size):
|
|
853
|
+
chunk_end = min(i + params.chunk_size, n_samples)
|
|
854
|
+
chunks.append((i, self.features_maps[i:chunk_end]))
|
|
855
|
+
|
|
856
|
+
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
|
|
857
|
+
|
|
858
|
+
# Process each chunk to create chunk consensus maps
|
|
859
|
+
chunk_consensus_maps = []
|
|
860
|
+
|
|
861
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
862
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
863
|
+
|
|
864
|
+
# Set up file descriptions for chunk
|
|
865
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
866
|
+
for j, feature_map in enumerate(chunk_maps):
|
|
867
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
868
|
+
file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
869
|
+
file_description.size = feature_map.size()
|
|
870
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
871
|
+
file_descriptions[j] = file_description
|
|
872
|
+
|
|
873
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
874
|
+
|
|
875
|
+
# Use QT algorithm for chunk (main difference from KD chunked)
|
|
876
|
+
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
877
|
+
chunk_params = grouper.getParameters()
|
|
878
|
+
chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
|
|
879
|
+
chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
880
|
+
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
881
|
+
chunk_params.setValue("ignore_charge", "true")
|
|
882
|
+
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
883
|
+
|
|
884
|
+
grouper.setParameters(chunk_params)
|
|
885
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
886
|
+
|
|
887
|
+
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
888
|
+
|
|
889
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
890
|
+
_merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
891
|
+
|
|
892
|
+
# Create a dummy consensus map for compatibility (since other functions expect it)
|
|
893
|
+
consensus_map = oms.ConsensusMap()
|
|
894
|
+
return consensus_map
|
|
895
|
+
|
|
896
|
+
|
|
829
897
|
def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
|
|
830
898
|
"""
|
|
831
899
|
Scalable aggregation of chunk consensus maps into final consensus_df.
|
masster/study/processing.py
CHANGED
|
@@ -97,7 +97,6 @@ def align(self, **kwargs):
|
|
|
97
97
|
_align_kd_algorithm(self, fmaps, params)
|
|
98
98
|
else:
|
|
99
99
|
self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
|
|
100
|
-
self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
|
|
101
100
|
|
|
102
101
|
# check if rt_original exists in features_df, if not, add it after rt
|
|
103
102
|
if "rt_original" not in self.features_df.columns:
|
{masster-0.4.17.dist-info → masster-0.4.18.dist-info}/RECORD
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
masster/__init__.py,sha256=HHjKhCjkAc98LhoQfu4C6L-W2vfTEc1iXaPTxxcl_4A,800
|
|
2
|
-
masster/_version.py,sha256=
|
|
2
|
+
masster/_version.py,sha256=OUcHIwT4wa5AqV46S88edNYE4u4sKsoESNk3lFdxs_c,257
|
|
3
3
|
masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
|
|
4
4
|
masster/logger.py,sha256=W50V_uh8RSYwGxDrDFhOuj5jpu2tKJyt_16lMw9kQwA,14755
|
|
5
5
|
masster/spectrum.py,sha256=_upC_g2N9gwTaflXAugs9pSXpKUmzbIehofDordk7WI,47718
|
|
@@ -43,10 +43,10 @@ masster/study/h5.py,sha256=LiVGUAtULyPpZIUmKVJSaV38huJb8FsKOUWBOqiv0QU,82363
|
|
|
43
43
|
masster/study/helpers.py,sha256=M5_q8O5tuFchKPW04PTuj3X335lDA2VZqcs4D8ZQJEk,158604
|
|
44
44
|
masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
|
|
45
45
|
masster/study/load.py,sha256=CQQY_7BzagE3oQTdDlqNyfuMdVWIAft-M4a2WCFnxp0,70695
|
|
46
|
-
masster/study/merge.py,sha256
|
|
46
|
+
masster/study/merge.py,sha256=2Vqj0OaTZxwtjYu1l5PmRpMmT8_cHh-R761FUvBE_Sk,95741
|
|
47
47
|
masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
|
|
48
48
|
masster/study/plot.py,sha256=SimX-IlqISEItAnTBsx4xsdYHRAevfN41cCENVns1lw,88236
|
|
49
|
-
masster/study/processing.py,sha256=
|
|
49
|
+
masster/study/processing.py,sha256=u1MSRKTzcqHNz_dClSUSfgTxkNRdBLXtVyO5LXuW_uk,41031
|
|
50
50
|
masster/study/save.py,sha256=YCvp4xhnG16sNXaT2mFDBoCrIMub0Es61B97qLo0maw,6705
|
|
51
51
|
masster/study/study.py,sha256=LO_hbJOOCZzeA3uterPKImFgPG6fCNQKMSVMtEwW3DU,38815
|
|
52
52
|
masster/study/study5_schema.json,sha256=c0w24QdHak01m04I1VPu97KvF2468FcaqROhf6pmLk4,7507
|
|
@@ -60,7 +60,7 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
|
|
|
60
60
|
masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
|
|
61
61
|
masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
|
|
62
62
|
masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
|
|
63
|
-
masster/study/defaults/merge_def.py,sha256=
|
|
63
|
+
masster/study/defaults/merge_def.py,sha256=X7mTCgtQhglOTjwg06oSMFSbLBJSKsHmJeVVfYE2qHE,13272
|
|
64
64
|
masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
|
|
65
65
|
masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
|
|
66
66
|
masster/wizard/__init__.py,sha256=A9GHQvkq4lSRIA8V6AKB-TJy8s_npH8i1baUGdkw_is,364
|
|
@@ -68,8 +68,8 @@ masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,798
|
|
|
68
68
|
masster/wizard/test_structure.py,sha256=h88gsYYCG6iDRjqPZC_r1H1T8y79j0E-K6OrwuHaSCU,1586
|
|
69
69
|
masster/wizard/test_wizard.py,sha256=CMp1cpjH3iYYC5Fy6puF_K0kfwwk3bgOsSbUGW-t7Xk,8986
|
|
70
70
|
masster/wizard/wizard.py,sha256=jMLHy4cXgNEE_-vshFmA7BNEByhfA6tV7O91jhiMYuw,48054
|
|
71
|
-
masster-0.4.
|
|
72
|
-
masster-0.4.
|
|
73
|
-
masster-0.4.
|
|
74
|
-
masster-0.4.
|
|
75
|
-
masster-0.4.
|
|
71
|
+
masster-0.4.18.dist-info/METADATA,sha256=pn-XNHgHqlY1KgiYkQ2Dyke9E1nnCP3mn-ja5W5QPyM,44207
|
|
72
|
+
masster-0.4.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
73
|
+
masster-0.4.18.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
|
|
74
|
+
masster-0.4.18.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
|
|
75
|
+
masster-0.4.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|