masster 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. More details are available on the package's registry page.

@@ -17,7 +17,7 @@ from masster.study.defaults import (
17
17
 
18
18
 
19
19
  def align(self, **kwargs):
20
- """Align feature maps using pose clustering and update feature RTs.
20
+ """Align feature maps using pose clustering or KD algorithm and update feature RTs.
21
21
 
22
22
  Parameters can be provided as an ``align_defaults`` instance or as
23
23
  individual keyword arguments; they are validated against the defaults class.
@@ -30,6 +30,32 @@ def align(self, **kwargs):
30
30
  - num_used_points (int): Number of points to use for alignment estimation.
31
31
  - save_features (bool): If True, save updated features after alignment.
32
32
  - skip_blanks (bool): If True, skip blank samples during alignment.
33
+ - algo (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).
34
+
35
+ KD algorithm specific parameters:
36
+ - nr_partitions (int): Number of partitions in m/z dimension.
37
+ - warp_enabled (bool): Enable non-linear retention time transformation.
38
+ - warp_rt_tol (float): RT tolerance for the LOWESS fit.
39
+ - warp_mz_tol (float): m/z tolerance for the LOWESS fit.
40
+ - warp_max_pairwise_log_fc (float): Maximum absolute log10 fold-change threshold for pairing.
41
+ - warp_min_rel_cc_size (float): Minimum relative connected component size.
42
+ - warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment.
43
+ - link_rt_tol (float): Width of RT tolerance window for linking features.
44
+ - link_mz_tol (float): m/z tolerance for linking features.
45
+ - link_charge_merging (str): Charge merging strategy for linking features.
46
+ - link_adduct_merging (str): Adduct merging strategy for linking features.
47
+ - distance_RT_exponent (float): Exponent for normalized RT differences.
48
+ - distance_RT_weight (float): Weight factor for final RT distances.
49
+ - distance_MZ_exponent (float): Exponent for normalized m/z differences.
50
+ - distance_MZ_weight (float): Weight factor for final m/z distances.
51
+ - distance_intensity_exponent (float): Exponent for differences in relative intensity.
52
+ - distance_intensity_weight (float): Weight factor for final intensity distances.
53
+ - distance_intensity_log_transform (str): Log-transform intensities.
54
+ - LOWESS_span (float): Fraction of datapoints for each local regression.
55
+ - LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting.
56
+ - LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes).
57
+ - LOWESS_interpolation_type (str): Method for interpolation between datapoints.
58
+ - LOWESS_extrapolation_type (str): Method for extrapolation outside data range.
33
59
  """
34
60
  # parameters initialization
35
61
  params = align_defaults()
@@ -57,78 +83,135 @@ def align(self, **kwargs):
57
83
  self.features_maps = []
58
84
  self.load_features()
59
85
 
60
- self.logger.debug("Starting alignment")
86
+ # self.logger.debug("Starting alignment")
61
87
 
62
88
  fmaps = self.features_maps
63
- # set ref_index to feature map index with largest number of features
64
- ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
65
-
66
- self.logger.info(
67
- f"Align on {self.samples_df.row(ref_index, named=True)['sample_name']}",
68
- )
69
-
70
- aligner = oms.MapAlignmentAlgorithmPoseClustering()
71
89
 
90
+ # Initialize OpenMS parameters
72
91
  params_oms = oms.Param()
73
- params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
74
- params_oms.setValue("pairfinder:ignore_charge", "true")
75
- params_oms.setValue("max_num_peaks_considered", 1000)
76
- params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
77
- params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
78
- params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
79
- params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
80
- params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
81
- params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
82
- params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
83
- aligner.setParameters(params_oms)
84
- """
85
- {b'max_num_peaks_considered': 1000,
86
- b'superimposer:mz_pair_max_distance': 0.5,
87
- b'superimposer:rt_pair_distance_fraction': 0.1,
88
- b'superimposer:num_used_points': 2000,
89
- b'superimposer:scaling_bucket_size': 0.005,
90
- b'superimposer:shift_bucket_size': 3.0,
91
- b'superimposer:max_shift': 1000.0,
92
- b'superimposer:max_scaling': 2.0,
93
- b'superimposer:dump_buckets': '',
94
- b'superimposer:dump_pairs': '',
95
- b'pairfinder:second_nearest_gap': 2.0,
96
- b'pairfinder:use_identifications': 'false',
97
- b'pairfinder:ignore_charge': 'false',
98
- b'pairfinder:ignore_adduct': 'true',
99
- b'pairfinder:distance_RT:max_difference': 100.0,
100
- b'pairfinder:distance_RT:exponent': 1.0,
101
- b'pairfinder:distance_RT:weight': 1.0,
102
- b'pairfinder:distance_MZ:max_difference': 0.3,
103
- b'pairfinder:distance_MZ:unit': 'Da',
104
- b'pairfinder:distance_MZ:exponent': 2.0,
105
- b'pairfinder:distance_MZ:weight': 1.0,
106
- b'pairfinder:distance_intensity:exponent': 1.0,
107
- b'pairfinder:distance_intensity:weight': 0.0,
108
- b'pairfinder:distance_intensity:log_transform': 'disabled'} """
109
-
110
- aligner.setReference(fmaps[ref_index])
111
-
112
- self.logger.debug(f"Parameters for alignment: {params}")
113
-
114
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
115
- # perform alignment and transformation of feature maps to the reference map (exclude reference map)
116
- for index, fm in tqdm(
117
- list(enumerate(fmaps)),
118
- total=len(fmaps),
119
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
120
- disable=tdqm_disable,
121
- ):
122
- if index == ref_index:
123
- continue
124
- if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
125
- continue
126
- trafo = oms.TransformationDescription()
127
- aligner.align(fm, trafo)
128
- transformer = oms.MapAlignmentTransformer()
129
- transformer.transformRetentionTimes(fm, trafo, True)
92
+ # Choose alignment algorithm based on parameter
93
+ algo = params.get("algo").lower()
94
+
95
+ # Set common parameters for both algorithms
96
+ if algo == "pc":
97
+ # Parameters specific to PoseClustering
98
+ params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
99
+ params_oms.setValue("pairfinder:ignore_charge", "true")
100
+ params_oms.setValue("max_num_peaks_considered", 1000)
101
+ params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
102
+ params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
103
+ params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
104
+ params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
105
+ params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
106
+ params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
107
+ params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
108
+
109
+ """
110
+ {b'max_num_peaks_considered': 1000,
111
+ b'superimposer:mz_pair_max_distance': 0.5,
112
+ b'superimposer:rt_pair_distance_fraction': 0.1,
113
+ b'superimposer:num_used_points': 2000,
114
+ b'superimposer:scaling_bucket_size': 0.005,
115
+ b'superimposer:shift_bucket_size': 3.0,
116
+ b'superimposer:max_shift': 1000.0,
117
+ b'superimposer:max_scaling': 2.0,
118
+ b'superimposer:dump_buckets': '',
119
+ b'superimposer:dump_pairs': '',
120
+ b'pairfinder:second_nearest_gap': 2.0,
121
+ b'pairfinder:use_identifications': 'false',
122
+ b'pairfinder:ignore_charge': 'false',
123
+ b'pairfinder:ignore_adduct': 'true',
124
+ b'pairfinder:distance_RT:max_difference': 100.0,
125
+ b'pairfinder:distance_RT:exponent': 1.0,
126
+ b'pairfinder:distance_RT:weight': 1.0,
127
+ b'pairfinder:distance_MZ:max_difference': 0.3,
128
+ b'pairfinder:distance_MZ:unit': 'Da',
129
+ b'pairfinder:distance_MZ:exponent': 2.0,
130
+ b'pairfinder:distance_MZ:weight': 1.0,
131
+ b'pairfinder:distance_intensity:exponent': 1.0,
132
+ b'pairfinder:distance_intensity:weight': 0.0,
133
+ b'pairfinder:distance_intensity:log_transform': 'disabled'}
134
+ """
135
+ elif algo == "kd":
136
+ # Parameters specific to KD algorithm
137
+ params_oms.setValue("mz_unit", "Da")
138
+ params_oms.setValue("nr_partitions", params.get("nr_partitions"))
139
+
140
+ # Warp parameters for non-linear RT transformation
141
+ params_oms.setValue("warp:enabled", "true" if params.get("warp_enabled") else "false")
142
+ params_oms.setValue("warp:rt_tol", params.get("warp_rt_tol"))
143
+ params_oms.setValue("warp:mz_tol", params.get("warp_mz_tol"))
144
+ params_oms.setValue("warp:max_pairwise_log_fc", params.get("warp_max_pairwise_log_fc"))
145
+ params_oms.setValue("warp:min_rel_cc_size", params.get("warp_min_rel_cc_size"))
146
+ params_oms.setValue("warp:max_nr_conflicts", params.get("warp_max_nr_conflicts"))
147
+
148
+ # Link parameters
149
+ params_oms.setValue("link:rt_tol", params.get("link_rt_tol"))
150
+ params_oms.setValue("link:mz_tol", params.get("link_mz_tol"))
151
+ params_oms.setValue("link:charge_merging", params.get("link_charge_merging"))
152
+ params_oms.setValue("link:adduct_merging", params.get("link_adduct_merging"))
153
+
154
+ # Distance parameters
155
+ params_oms.setValue("distance_RT:exponent", params.get("distance_RT_exponent"))
156
+ params_oms.setValue("distance_RT:weight", params.get("distance_RT_weight"))
157
+ params_oms.setValue("distance_MZ:exponent", params.get("distance_MZ_exponent"))
158
+ params_oms.setValue("distance_MZ:weight", params.get("distance_MZ_weight"))
159
+ params_oms.setValue("distance_intensity:exponent", params.get("distance_intensity_exponent"))
160
+ params_oms.setValue("distance_intensity:weight", params.get("distance_intensity_weight"))
161
+ params_oms.setValue("distance_intensity:log_transform", params.get("distance_intensity_log_transform"))
162
+
163
+ # LOWESS parameters
164
+ params_oms.setValue("LOWESS:span", params.get("LOWESS_span"))
165
+ params_oms.setValue("LOWESS:num_iterations", params.get("LOWESS_num_iterations"))
166
+ params_oms.setValue("LOWESS:delta", params.get("LOWESS_delta"))
167
+ params_oms.setValue("LOWESS:interpolation_type", params.get("LOWESS_interpolation_type"))
168
+ params_oms.setValue("LOWESS:extrapolation_type", params.get("LOWESS_extrapolation_type"))
169
+
170
+ if algo == "pc":
171
+ aligner = oms.MapAlignmentAlgorithmPoseClustering()
172
+ self.logger.info("Starting alignment with PoseClustering")
173
+ # set ref_index to feature map index with largest number of features
174
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
175
+ self.logger.debug(
176
+ f"Reference map is {self.samples_df.row(ref_index, named=True)['sample_name']}",
177
+ )
178
+ aligner.setParameters(params_oms)
179
+ aligner.setReference(fmaps[ref_index])
180
+ self.logger.debug(f"Parameters for alignment: {params}")
181
+ # perform alignment and transformation of feature maps to the reference map (exclude reference map)
182
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
183
+ for index, fm in tqdm(
184
+ list(enumerate(fmaps)),
185
+ total=len(fmaps),
186
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
187
+ disable=tdqm_disable,
188
+ ):
189
+ if index == ref_index:
190
+ continue
191
+ if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
192
+ continue
193
+ trafo = oms.TransformationDescription()
194
+ aligner.align(fm, trafo)
195
+ transformer = oms.MapAlignmentTransformer()
196
+ transformer.transformRetentionTimes(fm, trafo, True)
197
+
198
+ self.alignment_ref_index = ref_index
199
+
200
+ elif algo == "kd":
201
+ # KD algorithm requires num_maps and Param parameters
202
+ num_maps = len(fmaps)
203
+ aligner = oms.MapAlignmentAlgorithmKD(3, params_oms)
204
+ self.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
205
+
206
+ kdtree = oms.KDTreeFeatureMaps()
207
+ kdtree.addMaps(fmaps) # Add all feature maps to the KDTree
208
+ # kdtree.optimizeTree()
209
+ aligner.addRTFitData(kdtree)
210
+ aligner.fitLOWESS()
211
+ aligner.transform(kdtree)
130
212
 
131
- self.alignment_ref_index = ref_index
213
+ else:
214
+ self.logger.error(f"Unknown alignment algorithm '{algo}'")
132
215
 
133
216
  # check if rt_original exists in features_df, if not, add it after rt
134
217
  if "rt_original" not in self.features_df.columns:
@@ -238,8 +321,8 @@ def merge(self, **kwargs):
238
321
  self.consensus_df = pl.DataFrame()
239
322
  self.consensus_ms2 = pl.DataFrame()
240
323
  self.consensus_mapping_df = pl.DataFrame()
241
-
242
- self.logger.info('Merging...')
324
+
325
+ self.logger.info("Merging...")
243
326
  # parameters initialization
244
327
  params = merge_defaults()
245
328
  for key, value in kwargs.items():
@@ -482,17 +565,17 @@ def merge(self, **kwargs):
482
565
  # Collect all adducts from feature_data_list to create consensus adduct information
483
566
  all_adducts = []
484
567
  adduct_masses = {}
485
-
568
+
486
569
  for fd in feature_data_list:
487
570
  # Get individual adduct and mass from each feature data (fd)
488
571
  adduct = fd.get("adduct")
489
572
  adduct_mass = fd.get("adduct_mass")
490
-
573
+
491
574
  if adduct is not None:
492
575
  all_adducts.append(adduct)
493
576
  if adduct_mass is not None:
494
577
  adduct_masses[adduct] = adduct_mass
495
-
578
+
496
579
  # Calculate adduct_values for the consensus feature
497
580
  adduct_values = []
498
581
  if all_adducts:
@@ -506,9 +589,9 @@ def merge(self, **kwargs):
506
589
  "adduct": str(adduct),
507
590
  "count": int(count),
508
591
  "percentage": float(round(percentage, 2)),
509
- "mass": float(mass) if mass is not None else None
592
+ "mass": float(mass) if mass is not None else None,
510
593
  })
511
-
594
+
512
595
  # Sort adduct_values by count in descending order
513
596
  adduct_values.sort(key=lambda x: x["count"], reverse=True) # type: ignore[arg-type,return-value]
514
597
  # Store adduct_values for use in metadata
@@ -613,7 +696,7 @@ def find_ms2(self, **kwargs):
613
696
  """
614
697
  # Reset consensus_ms2 DataFrame at the start
615
698
  self.consensus_ms2 = pl.DataFrame()
616
-
699
+
617
700
  # parameters initialization
618
701
  params = find_ms2_defaults()
619
702
  for key, value in kwargs.items():
masster/study/save.py CHANGED
@@ -21,7 +21,7 @@ def save(self, filename=None, add_timestamp=True, compress=False):
21
21
  filename (str, optional): Target file name. If None, uses default.
22
22
  add_timestamp (bool, optional): If True, appends timestamp to avoid overwriting.
23
23
  Default True for safety (original behavior).
24
- compress (bool, optional): If True, uses compressed mode and skips
24
+ compress (bool, optional): If True, uses compressed mode and skips
25
25
  some heavy columns for maximum speed. Default False.
26
26
  """
27
27
 
@@ -46,11 +46,11 @@ def save(self, filename=None, add_timestamp=True, compress=False):
46
46
  filename = f"{filename.replace('.study5', '')}_{timestamp}.study5"
47
47
 
48
48
  # Log file size information for performance monitoring
49
- if hasattr(self, 'features_df') and not self.features_df.is_empty():
49
+ if hasattr(self, "features_df") and not self.features_df.is_empty():
50
50
  feature_count = len(self.features_df)
51
- sample_count = len(self.samples_df) if hasattr(self, 'samples_df') and not self.samples_df.is_empty() else 0
51
+ sample_count = len(self.samples_df) if hasattr(self, "samples_df") and not self.samples_df.is_empty() else 0
52
52
  self.logger.info(f"Saving study with {sample_count} samples and {feature_count} features to {filename}")
53
-
53
+
54
54
  # Use compressed mode for large datasets
55
55
  if compress:
56
56
  self._save_study5_compressed(filename)
@@ -106,7 +106,7 @@ def save_samples(self, samples=None):
106
106
  ddaobj.save()
107
107
  sample_name = sample_row.row(0, named=True)["sample_name"]
108
108
  sample_path = sample_row.row(0, named=True)["sample_path"]
109
-
109
+
110
110
  # Find the index of this sample in the original order for features_maps
111
111
  sample_index = next(
112
112
  (
@@ -116,7 +116,7 @@ def save_samples(self, samples=None):
116
116
  ),
117
117
  None,
118
118
  )
119
-
119
+
120
120
  # Determine where to save the featureXML file based on sample_path location
121
121
  if sample_path.endswith(".sample5"):
122
122
  # If sample_path is a .sample5 file, save featureXML in the same directory
@@ -135,7 +135,7 @@ def save_samples(self, samples=None):
135
135
  sample_name + ".featureXML",
136
136
  )
137
137
  self.logger.debug(f"Saving featureXML to default location: {featurexml_filename}")
138
-
138
+
139
139
  fh = oms.FeatureXMLFile()
140
140
  if sample_index is not None and sample_index < len(self.features_maps):
141
141
  fh.store(featurexml_filename, self.features_maps[sample_index])