masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

@@ -158,9 +158,7 @@ class identify_defaults:
158
158
  if not isinstance(value, list):
159
159
  return False
160
160
  # For heteroatoms, ensure all elements are strings
161
- if param_name == "heteroatoms" and not all(
162
- isinstance(item, str) for item in value
163
- ):
161
+ if param_name == "heteroatoms" and not all(isinstance(item, str) for item in value):
164
162
  return False
165
163
 
166
164
  # Range validation for numeric types
@@ -36,10 +36,10 @@ class merge_defaults:
36
36
  max_nr_conflicts: int = 0
37
37
  link_ms2: bool = True
38
38
  extract_ms1: bool = True
39
-
39
+
40
40
  # Cross-chunk merging parameters
41
41
  dechunking: str = "hierarchical"
42
-
42
+
43
43
  # Parallel processing parameters
44
44
  threads: Optional[int] = None
45
45
 
@@ -49,8 +49,7 @@ class merge_defaults:
49
49
  "dtype": str,
50
50
  "description": "Merge method (algorithm) to use",
51
51
  "default": "kd",
52
- "allowed_values": ["kd", "qt",
53
- "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
52
+ "allowed_values": ["kd", "qt", "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
54
53
  },
55
54
  "min_samples": {
56
55
  "dtype": int,
@@ -198,14 +197,14 @@ class merge_defaults:
198
197
  elif dtype is str and isinstance(value, str):
199
198
  valid_type = True
200
199
  break
201
-
200
+
202
201
  if not valid_type:
203
202
  return False
204
-
203
+
205
204
  # For None values, skip further validation
206
205
  if value is None:
207
206
  return True
208
-
207
+
209
208
  # Use the first non-None type for range validation
210
209
  expected_dtype = next((dt for dt in expected_dtype if dt is not type(None)), expected_dtype[0])
211
210
 
@@ -345,11 +345,7 @@ class study_defaults:
345
345
  expected_dtype = self._param_metadata[param_name]["dtype"]
346
346
 
347
347
  # Handle optional types
348
- if (
349
- isinstance(expected_dtype, str)
350
- and expected_dtype.startswith("Optional")
351
- and value is not None
352
- ):
348
+ if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional") and value is not None:
353
349
  if "int" in expected_dtype and not isinstance(value, int):
354
350
  try:
355
351
  value = int(value)
masster/study/export.py CHANGED
@@ -78,7 +78,7 @@ def _get_mgf_df(self, **kwargs):
78
78
  if self.consensus_df is None:
79
79
  self.logger.error("No consensus map found. Please run merge() first.")
80
80
  return None
81
-
81
+
82
82
  # MS2 data is optional - we can generate MS1 data without it
83
83
  ms2_available = self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()
84
84
  if not ms2_available:
@@ -112,11 +112,7 @@ def _get_mgf_df(self, **kwargs):
112
112
  mask = mask & (spec.inty >= inty_min)
113
113
  for attr in spec.__dict__:
114
114
  arr = getattr(spec, attr)
115
- if (
116
- isinstance(arr, list | np.ndarray)
117
- and hasattr(arr, "__len__")
118
- and len(arr) == length
119
- ):
115
+ if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
120
116
  setattr(spec, attr, np.array(arr)[mask])
121
117
  return spec
122
118
 
@@ -132,12 +128,8 @@ def _get_mgf_df(self, **kwargs):
132
128
  return None
133
129
 
134
130
  # Prepare spectrum data
135
- spectrum_mz = (
136
- spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
137
- )
138
- spectrum_inty = (
139
- spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
140
- )
131
+ spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
132
+ spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
141
133
 
142
134
  # Determine MS level
143
135
  ms_level = spect.ms_level if spect.ms_level is not None else 1
@@ -181,7 +173,7 @@ def _get_mgf_df(self, **kwargs):
181
173
  consensus_mz = row["mz"]
182
174
  consensus_rt = row["rt"]
183
175
  consensus_inty_mean = row.get("inty_mean", 0)
184
-
176
+
185
177
  if mz_start is not None and consensus_mz < mz_start:
186
178
  continue
187
179
  if mz_end is not None and consensus_mz > mz_end:
@@ -190,10 +182,10 @@ def _get_mgf_df(self, **kwargs):
190
182
  continue
191
183
  if rt_end is not None and consensus_rt > rt_end:
192
184
  continue
193
-
185
+
194
186
  # Create MS1 spectrum using isotope data
195
187
  iso_data = row.get("iso", None)
196
-
188
+
197
189
  if iso_data is not None and len(iso_data) > 0:
198
190
  # Use isotope data for spectrum
199
191
  spectrum_mz = [float(peak[0]) for peak in iso_data]
@@ -202,10 +194,12 @@ def _get_mgf_df(self, **kwargs):
202
194
  # Use consensus mz and inty_mean as single peak
203
195
  spectrum_mz = [float(consensus_mz)]
204
196
  spectrum_inty = [float(consensus_inty_mean)]
205
-
197
+
206
198
  # Apply intensity minimum filter if specified
207
199
  if inty_min is not None and inty_min > 0:
208
- filtered_pairs = [(mz, inty) for mz, inty in zip(spectrum_mz, spectrum_inty, strict=False) if inty >= inty_min]
200
+ filtered_pairs = [
201
+ (mz, inty) for mz, inty in zip(spectrum_mz, spectrum_inty, strict=False) if inty >= inty_min
202
+ ]
209
203
  if filtered_pairs:
210
204
  spectrum_mz, spectrum_inty = zip(*filtered_pairs, strict=False)
211
205
  spectrum_mz = list(spectrum_mz)
@@ -213,9 +207,9 @@ def _get_mgf_df(self, **kwargs):
213
207
  else:
214
208
  # If all peaks are below threshold, skip this feature
215
209
  continue
216
-
210
+
217
211
  mgf_counter += 1
218
-
212
+
219
213
  # Create MS1 spectrum object to use with create_ion_dict
220
214
  class SimpleSpectrum:
221
215
  def __init__(self, mz_list, inty_list):
@@ -223,9 +217,9 @@ def _get_mgf_df(self, **kwargs):
223
217
  self.inty = np.array(inty_list)
224
218
  self.ms_level = 1
225
219
  self.energy = None
226
-
220
+
227
221
  ms1_spectrum = SimpleSpectrum(spectrum_mz, spectrum_inty)
228
-
222
+
229
223
  # Use create_ion_dict to ensure consistent schema
230
224
  ion_dict = create_ion_dict(
231
225
  f"uid:{consensus_uid}, rt:{consensus_rt:.2f}, mz:{consensus_mz:.4f}, MS1",
@@ -237,7 +231,7 @@ def _get_mgf_df(self, **kwargs):
237
231
  ms1_spectrum,
238
232
  mgf_counter,
239
233
  )
240
-
234
+
241
235
  if ion_dict is not None:
242
236
  ion_data.append(ion_dict)
243
237
 
@@ -350,11 +344,7 @@ def _get_mgf_df(self, **kwargs):
350
344
 
351
345
  elif selection == "all":
352
346
  if merge:
353
- specs = [
354
- row_e["spec"]
355
- for row_e in cons_ms2.iter_rows(named=True)
356
- if row_e["spec"] is not None
357
- ]
347
+ specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
358
348
  if not specs:
359
349
  continue
360
350
  spect = combine_peaks(specs)
@@ -443,6 +433,7 @@ def export_mgf(self, **kwargs):
443
433
  """
444
434
  # Get mgf data as DataFrame
445
435
  from masster.study.export import _get_mgf_df
436
+
446
437
  mgf_data = _get_mgf_df(self, **kwargs)
447
438
 
448
439
  if mgf_data is None or len(mgf_data) == 0:
@@ -543,11 +534,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
543
534
  .sort("consensus_uid")
544
535
  )
545
536
  # Keep raw id_data for backward compatibility (if needed elsewhere)
546
- id_data = (
547
- self.id_df
548
- if hasattr(self, "id_df") and self.id_df is not None
549
- else None
550
- )
537
+ id_data = self.id_df if hasattr(self, "id_df") and self.id_df is not None else None
551
538
  else:
552
539
  self.logger.info("No identification data available for mzTab export")
553
540
  except Exception as e:
@@ -561,6 +548,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
561
548
  mgf_mapping: dict[str, list[int]] = {}
562
549
  if include_mgf:
563
550
  from masster.study.export import _get_mgf_df
551
+
564
552
  mgf_data = _get_mgf_df(self, **kwargs)
565
553
  # Create mapping from feature_uid to MGF indexes
566
554
  if mgf_data is not None and len(mgf_data) > 0:
@@ -616,12 +604,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
616
604
  mtd_lines.append("")
617
605
 
618
606
  # Database information - updated based on identification data
619
- if (
620
- full_id_data is not None
621
- and hasattr(self, "lib_df")
622
- and self.lib_df is not None
623
- and not self.lib_df.is_empty()
624
- ):
607
+ if full_id_data is not None and hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
625
608
  mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
626
609
  mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
627
610
  mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
@@ -688,11 +671,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
688
671
  # round to int - handle both Polars and Pandas DataFrames
689
672
  if hasattr(abundance_matrix, "with_columns"):
690
673
  # Polars DataFrame
691
- numeric_cols = [
692
- col
693
- for col in abundance_matrix.columns
694
- if abundance_matrix[col].dtype.is_numeric()
695
- ]
674
+ numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
696
675
  abundance_matrix = abundance_matrix.with_columns(
697
676
  [abundance_matrix[col].round(0) for col in numeric_cols],
698
677
  )
@@ -738,9 +717,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
738
717
  best_id_confidence_measure = "null"
739
718
  best_id_confidence_value = "null"
740
719
  reliability = "4" # Default: unknown compound
741
- theoretical_neutral_mass = (
742
- "null" # Only set when we have database identification
743
- )
720
+ theoretical_neutral_mass = "null" # Only set when we have database identification
744
721
 
745
722
  if id_info:
746
723
  # Use cmpd_uid as database identifier with prefix
@@ -817,27 +794,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
817
794
  # Get the first (and should be only) matching row
818
795
  abundance_row = filtered_matrix.row(0, named=True)
819
796
  # Extract values excluding the consensus_uid column
820
- abundance_values = [
821
- abundance_row[col]
822
- for col in abundance_matrix.columns
823
- if col != "consensus_uid"
824
- ]
825
- sml_row += [
826
- safe_str(val) if val is not None else "null" for val in abundance_values
827
- ]
797
+ abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
798
+ sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
828
799
 
829
800
  # Calculate study variable statistics
830
801
  non_null_values = [val for val in abundance_values if val is not None]
831
802
  if non_null_values:
832
803
  abundance_study_variable = sum(non_null_values) / len(non_null_values)
833
804
  abundance_variation_study_variable = (
834
- (
835
- sum(
836
- (x - abundance_study_variable) ** 2 for x in non_null_values
837
- )
838
- / len(non_null_values)
839
- )
840
- ** 0.5
805
+ (sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)) ** 0.5
841
806
  if len(non_null_values) > 1
842
807
  else 0
843
808
  )
@@ -896,9 +861,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
896
861
  some_ids = []
897
862
  for i, some_row in enumerate(some_matches.iter_rows(named=True)):
898
863
  # Create a unique SOME ID based on consensus_uid and position
899
- some_id_base = (
900
- consensus_uid * 1000
901
- ) # Ensure uniqueness across consensus features
864
+ some_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
902
865
  some_id = some_id_base + i + 1
903
866
  some_ids.append(str(some_id))
904
867
 
@@ -950,14 +913,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
950
913
  # Get the first (and should be only) matching row
951
914
  abundance_row = filtered_matrix.row(0, named=True)
952
915
  # Extract values excluding the consensus_uid column
953
- abundance_values = [
954
- abundance_row[col]
955
- for col in abundance_matrix.columns
956
- if col != "consensus_uid"
957
- ]
958
- abundance_strings = [
959
- safe_str(val) if val is not None else "null" for val in abundance_values
960
- ]
916
+ abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
917
+ abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
961
918
  smf_row += abundance_strings
962
919
 
963
920
  # Calculate study variable statistics (same as in SML section)
@@ -965,13 +922,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
965
922
  if non_null_values:
966
923
  abundance_study_variable = sum(non_null_values) / len(non_null_values)
967
924
  abundance_variation_study_variable = (
968
- (
969
- sum(
970
- (x - abundance_study_variable) ** 2 for x in non_null_values
971
- )
972
- / len(non_null_values)
973
- )
974
- ** 0.5
925
+ (sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)) ** 0.5
975
926
  if len(non_null_values) > 1
976
927
  else 0
977
928
  )
@@ -1023,9 +974,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
1023
974
  some_lines.append("\t".join(some_header))
1024
975
 
1025
976
  # Create SOME entries for all identification results using enriched data
1026
- for consensus_uid in (
1027
- self.consensus_df.select("consensus_uid").to_series().unique()
1028
- ):
977
+ for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
1029
978
  # Get consensus feature data for this consensus_uid
1030
979
  consensus_feature_data = self.consensus_df.filter(
1031
980
  pl.col("consensus_uid") == consensus_uid,
@@ -1081,9 +1030,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
1081
1030
 
1082
1031
  # Theoretical mass-to-charge from lib_df
1083
1032
  theoretical_mz = "null"
1084
- if (
1085
- some_row.get("mz") is not None
1086
- ): # This comes from lib_df via get_id() join
1033
+ if some_row.get("mz") is not None: # This comes from lib_df via get_id() join
1087
1034
  theoretical_mz = safe_str(some_row["mz"])
1088
1035
 
1089
1036
  some_line = [
@@ -1148,23 +1095,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
1148
1095
  spec_len = row["spec_len"] if row["spec_len"] is not None else 0
1149
1096
 
1150
1097
  # Format spectrum data as pipe-separated strings
1151
- spec_mz_str = (
1152
- "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
1153
- )
1154
- spec_int_str = (
1155
- "|".join([f"{int(inty)}" for inty in spectrum_inty])
1156
- if spectrum_inty
1157
- else ""
1158
- )
1098
+ spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
1099
+ spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
1159
1100
 
1160
1101
  mgf_row = [
1161
1102
  "COM",
1162
1103
  "MGF",
1163
1104
  str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
1164
1105
  str(row["feature_id"]) if row["feature_id"] is not None else "null",
1165
- f"{row['rtinseconds']:.2f}"
1166
- if row["rtinseconds"] is not None
1167
- else "null",
1106
+ f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
1168
1107
  f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
1169
1108
  "null", # prec_int - not available in current data
1170
1109
  str(row["energy"]) if row["energy"] is not None else "null",