masster 0.5.27-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

masster/lib/lib.py CHANGED
@@ -772,7 +772,7 @@ class Lib:
                 skipped_compounds += 1
                 continue
 
-            formula = compound_record.get("formula", "")
+            formula = compound_record.get("formula", compound_record.get("Formula", ""))
             if not formula or not isinstance(formula, str):
                 skipped_compounds += 1
                 continue
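
The new fallback makes the compound lookup tolerant of records that capitalize the key as "Formula". A minimal sketch of the changed line's behavior, with hypothetical records:

    record_a = {"formula": "C6H12O6"}
    record_b = {"Formula": "C6H12O6"}  # capitalized key

    for rec in (record_a, record_b, {}):
        # Try "formula" first, fall back to "Formula", else empty string
        formula = rec.get("formula", rec.get("Formula", ""))
        print(repr(formula))  # 'C6H12O6', 'C6H12O6', ''
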
masster/logger.py CHANGED
@@ -136,10 +136,8 @@ class MassterLogger:
             f"\x1b[90m{module_name}:{func_name}:{line_no}\x1b[0m | "  # dim gray for location info
         )
 
-        # Universal format: timestamp | level | location | label - message
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
@@ -200,7 +198,6 @@ class MassterLogger:
 
         level_str = record.levelname.ljust(8)
         level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
-        level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
         label_part = self.label + " | " if self.label else ""
 
         # For DEBUG and TRACE levels, add module/location information
@@ -221,7 +218,6 @@ class MassterLogger:
 
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
@@ -267,7 +263,6 @@ class MassterLogger:
 
         level_str = record.levelname.ljust(8)
         level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
-        level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
         label_part = self.label + " | " if self.label else ""
 
         # For DEBUG and TRACE levels, add module/location information
@@ -288,7 +283,6 @@ class MassterLogger:
 
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
masster/sample/adducts.py CHANGED
@@ -137,7 +137,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
                     "formatted_name": formatted_name,
                     "total_mass_shift": spec["mass_shift"] * multiplier,
                     "total_charge": total_charge,
-                    "combined_probability": spec["probability"] ** multiplier,
+                    "combined_probability": (spec["probability"] ** multiplier) / 2.0,
                     "complexity": multiplier,
                 },
             )
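
The added / 2.0 halves the combined probability that _get_adducts assigns to multiplied adduct combinations. A quick worked example with hypothetical inputs:

    probability = 0.9  # spec["probability"] for the base adduct
    multiplier = 2     # e.g. a dimer combination

    old_score = probability ** multiplier           # 0.81 in 0.5.27
    new_score = (probability ** multiplier) / 2.0   # 0.405 in 0.6.0
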
@@ -356,7 +356,7 @@ class find_adducts_defaults:
             ]
         elif adducts in ["neg", "negative"]:
             return [
-                "-H:-1:0.9",
+                "-H:-1:0.90",
                 "+Cl:-1:0.1",
                 "+CH2O2:0:0.15",
                 "-H2O:0:0.15",
masster/sample/h5.py CHANGED
@@ -319,7 +319,78 @@ def _save_sample5(
         params_json = json.dumps(save_data, indent=2)
         metadata_group.attrs["parameters"] = params_json
 
-        # Store lib and lib_match - removed (no longer saving lib data)
+        # Store lib_df and id_df (identification DataFrames)
+        if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
+            lib_group = f.create_group("lib")
+            for col in self.lib_df.columns:
+                data = self.lib_df[col].to_numpy()
+                # Handle different data types safely
+                if data.dtype == object:
+                    try:
+                        str_data = np.array(
+                            ["" if x is None else str(x) for x in data],
+                            dtype="S",
+                        )
+                        lib_group.create_dataset(
+                            col,
+                            data=str_data,
+                            compression="gzip",
+                        )
+                        lib_group[col].attrs["dtype"] = "string_converted"
+                    except Exception:
+                        json_data = np.array(
+                            [json.dumps(x, default=str) for x in data],
+                            dtype="S",
+                        )
+                        lib_group.create_dataset(
+                            col,
+                            data=json_data,
+                            compression="gzip",
+                        )
+                        lib_group[col].attrs["dtype"] = "json"
+                else:
+                    lib_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="gzip",
+                    )
+            lib_group.attrs["columns"] = list(self.lib_df.columns)
+
+        if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
+            id_group = f.create_group("id")
+            for col in self.id_df.columns:
+                data = self.id_df[col].to_numpy()
+                # Handle different data types safely
+                if data.dtype == object:
+                    try:
+                        str_data = np.array(
+                            ["" if x is None else str(x) for x in data],
+                            dtype="S",
+                        )
+                        id_group.create_dataset(
+                            col,
+                            data=str_data,
+                            compression="gzip",
+                        )
+                        id_group[col].attrs["dtype"] = "string_converted"
+                    except Exception:
+                        json_data = np.array(
+                            [json.dumps(x, default=str) for x in data],
+                            dtype="S",
+                        )
+                        id_group.create_dataset(
+                            col,
+                            data=json_data,
+                            compression="gzip",
+                        )
+                        id_group[col].attrs["dtype"] = "json"
+                else:
+                    id_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="gzip",
+                    )
+            id_group.attrs["columns"] = list(self.id_df.columns)
 
         self.logger.success(f"Sample saved to {filename}")
         if save_featurexml:
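
The new save path writes each Polars column as its own HDF5 dataset: plain string conversion first, JSON as a fallback for values str() cannot represent, and raw numeric arrays otherwise. A self-contained sketch of that strategy (file name and data are hypothetical):

    import h5py
    import numpy as np
    import polars as pl

    df = pl.DataFrame({"name": ["glucose", None], "mz": [180.063, 181.071]})

    with h5py.File("example.h5", "w") as f:
        grp = f.create_group("lib")
        for col in df.columns:
            data = df[col].to_numpy()
            if data.dtype == object:
                # Strings (and None) become fixed-width bytes; the real code
                # falls back to json.dumps on failure
                str_data = np.array(["" if x is None else str(x) for x in data], dtype="S")
                grp.create_dataset(col, data=str_data, compression="gzip")
                grp[col].attrs["dtype"] = "string_converted"
            else:
                grp.create_dataset(col, data=data, compression="gzip")
        grp.attrs["columns"] = list(df.columns)
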
@@ -1004,8 +1075,87 @@ def _load_sample5(self, filename: str, map: bool = False):
         else:
             self.ms1_df = None
 
+        # Load lib_df (library DataFrame)
+        if "lib" in f:
+            lib_group = f["lib"]
+            data = {}
+
+            # Get all datasets in the lib group
+            for col in lib_group.keys():
+                data_col = lib_group[col][:]
+                # Handle string data
+                if hasattr(lib_group[col], "attrs") and lib_group[col].attrs.get("dtype") in ["string_converted", "json"]:
+                    data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
+                else:
+                    data[col] = data_col
+
+            if data:
+                # Create DataFrame directly with Polars
+                self.lib_df = pl.DataFrame(data)
+
+                # Apply schema if available
+                if "lib_df" in schema and "columns" in schema["lib_df"]:
+                    schema_columns = schema["lib_df"]["columns"]
+                    for col in self.lib_df.columns:
+                        if col in schema_columns:
+                            dtype_str = schema_columns[col]["dtype"]
+                            try:
+                                self.lib_df = self.lib_df.with_columns(
+                                    [pl.col(col).cast(eval(dtype_str), strict=False)]
+                                )
+                            except Exception as e:
+                                self.logger.warning(
+                                    f"Failed to apply schema type {dtype_str} to column {col}: {e}",
+                                )
+
+                # Convert "None" strings and NaN values to proper null values
+                self.lib_df = clean_null_values_polars(self.lib_df)
+            else:
+                self.lib_df = None
+        else:
+            self.lib_df = None
+
+        # Load id_df (identification results DataFrame)
+        if "id" in f:
+            id_group = f["id"]
+            data = {}
+
+            # Get all datasets in the id group
+            for col in id_group.keys():
+                data_col = id_group[col][:]
+                # Handle string data
+                if hasattr(id_group[col], "attrs") and id_group[col].attrs.get("dtype") in ["string_converted", "json"]:
+                    data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
+                else:
+                    data[col] = data_col
+
+            if data:
+                # Create DataFrame directly with Polars
+                self.id_df = pl.DataFrame(data)
+
+                # Apply schema if available
+                if "id_df" in schema and "columns" in schema["id_df"]:
+                    schema_columns = schema["id_df"]["columns"]
+                    for col in self.id_df.columns:
+                        if col in schema_columns:
+                            dtype_str = schema_columns[col]["dtype"]
+                            try:
+                                self.id_df = self.id_df.with_columns(
+                                    [pl.col(col).cast(eval(dtype_str), strict=False)]
+                                )
+                            except Exception as e:
+                                self.logger.warning(
+                                    f"Failed to apply schema type {dtype_str} to column {col}: {e}",
+                                )
+
+                # Convert "None" strings and NaN values to proper null values
+                self.id_df = clean_null_values_polars(self.id_df)
+            else:
+                self.id_df = None
+        else:
+            self.id_df = None
+
         # Parameters are now loaded from metadata JSON (see above)
-        # Lib and lib_match are no longer saved/loaded
 
         # if map:
         #     featureXML = filename.replace(".sample5", ".featureXML")
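
On load, byte strings are decoded back to UTF-8 and column dtypes are restored from a stored schema whose entries are strings such as "pl.Float64" (turned back into Polars types via eval in the code above). A minimal decode sketch against the file written by the previous example:

    import h5py
    import polars as pl

    with h5py.File("example.h5", "r") as f:
        grp = f["lib"]
        data = {}
        for col in grp.keys():
            values = grp[col][:]
            if grp[col].attrs.get("dtype") in ["string_converted", "json"]:
                data[col] = [v.decode("utf-8") if isinstance(v, bytes) else v for v in values]
            else:
                data[col] = values

    lib_df = pl.DataFrame(data)
    # Restoring a schema dtype string like "pl.Float64":
    lib_df = lib_df.with_columns(pl.col("mz").cast(pl.Float64, strict=False))
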
masster/sample/helpers.py CHANGED
@@ -359,17 +359,22 @@ def features_select(
     uid=None,
     mz=None,
     rt=None,
-    coherence=None,
-    inty=None,
     rt_delta=None,
-    iso=None,
-    iso_of=None,
-    has_MS2=None,
+    inty=None,
+    coherence=None,
     prominence_scaled=None,
     prominence=None,
     height_scaled=None,
     height=None,
+    iso=None,
+    iso_of=None,
+    has_MS2=None,
     adduct_group=None,
+    id=None,
+    id_top_name=None,
+    id_top_class=None,
+    id_top_adduct=None,
+    id_top_score=None,
 ):
     """
     Select features based on specified criteria and return the filtered DataFrame.
@@ -389,6 +394,11 @@
         prominence: prominence filter (tuple for range, single value for minimum)
         height: height filter (tuple for range, single value for minimum)
         adduct_group: adduct group filter (single value for exact match, list of values for multiple groups, tuple for range, or None for all)
+        id: filter for features with/without identification (bool: True for identified, False for unidentified)
+        id_top_name: filter by top identification name using regex (str for regex pattern, list of str for multiple patterns combined with OR)
+        id_top_class: filter by top identification class using regex (str for regex pattern, list of str for multiple patterns combined with OR)
+        id_top_adduct: filter by top identification adduct (str for exact match, list of str for multiple adducts)
+        id_top_score: filter by top identification score (tuple for range, single value for minimum)
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
@@ -600,6 +610,82 @@
             f"Selected features by adduct_group. Features removed: {feats_len_before_filter - len(feats)}",
         )
 
+    if id is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_name" not in feats.columns:
+            self.logger.warning("No identification data found in features.")
+        else:
+            if id:
+                # Filter for features with identification (non-null id_top_name)
+                feats = feats.filter(pl.col("id_top_name").is_not_null())
+            else:
+                # Filter for features without identification (null id_top_name)
+                feats = feats.filter(pl.col("id_top_name").is_null())
+            self.logger.debug(
+                f"Selected features by identification presence. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_name is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_name" not in feats.columns:
+            self.logger.warning("No id_top_name data found in features.")
+        else:
+            if isinstance(id_top_name, list):
+                # Use regex matching for each pattern in the list (OR logic)
+                pattern = "|".join(id_top_name)
+                feats = feats.filter(pl.col("id_top_name").str.contains(pattern))
+            else:
+                # Use regex matching for single pattern
+                feats = feats.filter(pl.col("id_top_name").str.contains(id_top_name))
+            self.logger.debug(
+                f"Selected features by id_top_name (regex). Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_class is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_class" not in feats.columns:
+            self.logger.warning("No id_top_class data found in features.")
+        else:
+            if isinstance(id_top_class, list):
+                # Use regex matching for each pattern in the list (OR logic)
+                pattern = "|".join(id_top_class)
+                feats = feats.filter(pl.col("id_top_class").str.contains(pattern))
+            else:
+                # Use regex matching for single pattern
+                feats = feats.filter(pl.col("id_top_class").str.contains(id_top_class))
+            self.logger.debug(
+                f"Selected features by id_top_class (regex). Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_adduct is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_adduct" not in feats.columns:
+            self.logger.warning("No id_top_adduct data found in features.")
+        else:
+            if isinstance(id_top_adduct, list):
+                feats = feats.filter(pl.col("id_top_adduct").is_in(id_top_adduct))
+            else:
+                feats = feats.filter(pl.col("id_top_adduct") == id_top_adduct)
+            self.logger.debug(
+                f"Selected features by id_top_adduct. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_score is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_score" not in feats.columns:
+            self.logger.warning("No id_top_score data found in features.")
+        else:
+            if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
+                min_score, max_score = id_top_score
+                feats = feats.filter(
+                    (pl.col("id_top_score") >= min_score) & (pl.col("id_top_score") <= max_score)
+                )
+            else:
+                feats = feats.filter(pl.col("id_top_score") >= id_top_score)
+            self.logger.debug(
+                f"Selected features by id_top_score. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
     if len(feats) == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
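
Taken together, the new parameters let identification results drive feature selection. A hedged usage sketch, assuming a Sample object whose features table already carries the id_top_* columns (the class and adduct values are hypothetical):

    hits = sample.features_select(
        id=True,                    # keep only identified features
        id_top_class="Flavonoid",   # regex match on the top hit's class
        id_top_adduct="[M-H]-",     # exact match on the top hit's adduct
        id_top_score=0.7,           # minimum top identification score
    )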