masster 0.5.27__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +316 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +18 -3
- masster/sample/sample5_schema.json +99 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/id.py +59 -12
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +3 -1
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +11 -12
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/METADATA +99 -60
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/RECORD +27 -26
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/WHEEL +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/entry_points.txt +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/licenses/LICENSE +0 -0
masster/lib/lib.py
CHANGED
|
@@ -772,7 +772,7 @@ class Lib:
|
|
|
772
772
|
skipped_compounds += 1
|
|
773
773
|
continue
|
|
774
774
|
|
|
775
|
-
formula = compound_record.get("formula", "")
|
|
775
|
+
formula = compound_record.get("formula", compound_record.get("Formula", ""))
|
|
776
776
|
if not formula or not isinstance(formula, str):
|
|
777
777
|
skipped_compounds += 1
|
|
778
778
|
continue
|
masster/logger.py
CHANGED
|
@@ -136,10 +136,8 @@ class MassterLogger:
|
|
|
136
136
|
f"\x1b[90m{module_name}:{func_name}:{line_no}\x1b[0m | " # dim gray for location info
|
|
137
137
|
)
|
|
138
138
|
|
|
139
|
-
# Universal format: timestamp | level | location | label - message
|
|
140
139
|
# Universal format: timestamp | level | location | label - message
|
|
141
140
|
return (
|
|
142
|
-
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
143
141
|
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
144
142
|
f"{level_color}{level_str}\x1b[0m | " # colored level
|
|
145
143
|
f"{location_info}" # location info for DEBUG/TRACE
|
|
@@ -200,7 +198,6 @@ class MassterLogger:
|
|
|
200
198
|
|
|
201
199
|
level_str = record.levelname.ljust(8)
|
|
202
200
|
level_color = level_colors.get(record.levelname, "\x1b[90m") # default to gray instead of white
|
|
203
|
-
level_color = level_colors.get(record.levelname, "\x1b[90m") # default to gray instead of white
|
|
204
201
|
label_part = self.label + " | " if self.label else ""
|
|
205
202
|
|
|
206
203
|
# For DEBUG and TRACE levels, add module/location information
|
|
@@ -221,7 +218,6 @@ class MassterLogger:
|
|
|
221
218
|
|
|
222
219
|
# Universal format: timestamp | level | location | label - message
|
|
223
220
|
return (
|
|
224
|
-
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
225
221
|
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
226
222
|
f"{level_color}{level_str}\x1b[0m | " # colored level
|
|
227
223
|
f"{location_info}" # location info for DEBUG/TRACE
|
|
@@ -267,7 +263,6 @@ class MassterLogger:
|
|
|
267
263
|
|
|
268
264
|
level_str = record.levelname.ljust(8)
|
|
269
265
|
level_color = level_colors.get(record.levelname, "\x1b[90m") # default to gray instead of white
|
|
270
|
-
level_color = level_colors.get(record.levelname, "\x1b[90m") # default to gray instead of white
|
|
271
266
|
label_part = self.label + " | " if self.label else ""
|
|
272
267
|
|
|
273
268
|
# For DEBUG and TRACE levels, add module/location information
|
|
@@ -288,7 +283,6 @@ class MassterLogger:
|
|
|
288
283
|
|
|
289
284
|
# Universal format: timestamp | level | location | label - message
|
|
290
285
|
return (
|
|
291
|
-
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
292
286
|
f"\x1b[90m{timestamp}\x1b[0m | " # gray timestamp (universal for both themes)
|
|
293
287
|
f"{level_color}{level_str}\x1b[0m | " # colored level
|
|
294
288
|
f"{location_info}" # location info for DEBUG/TRACE
|
masster/sample/adducts.py
CHANGED
|
@@ -137,7 +137,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
137
137
|
"formatted_name": formatted_name,
|
|
138
138
|
"total_mass_shift": spec["mass_shift"] * multiplier,
|
|
139
139
|
"total_charge": total_charge,
|
|
140
|
-
"combined_probability": spec["probability"] ** multiplier,
|
|
140
|
+
"combined_probability": (spec["probability"] ** multiplier) / 2.0,
|
|
141
141
|
"complexity": multiplier,
|
|
142
142
|
},
|
|
143
143
|
)
|
masster/sample/h5.py
CHANGED
|
@@ -319,7 +319,78 @@ def _save_sample5(
|
|
|
319
319
|
params_json = json.dumps(save_data, indent=2)
|
|
320
320
|
metadata_group.attrs["parameters"] = params_json
|
|
321
321
|
|
|
322
|
-
# Store
|
|
322
|
+
# Store lib_df and id_df (identification DataFrames)
|
|
323
|
+
if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
|
|
324
|
+
lib_group = f.create_group("lib")
|
|
325
|
+
for col in self.lib_df.columns:
|
|
326
|
+
data = self.lib_df[col].to_numpy()
|
|
327
|
+
# Handle different data types safely
|
|
328
|
+
if data.dtype == object:
|
|
329
|
+
try:
|
|
330
|
+
str_data = np.array(
|
|
331
|
+
["" if x is None else str(x) for x in data],
|
|
332
|
+
dtype="S",
|
|
333
|
+
)
|
|
334
|
+
lib_group.create_dataset(
|
|
335
|
+
col,
|
|
336
|
+
data=str_data,
|
|
337
|
+
compression="gzip",
|
|
338
|
+
)
|
|
339
|
+
lib_group[col].attrs["dtype"] = "string_converted"
|
|
340
|
+
except Exception:
|
|
341
|
+
json_data = np.array(
|
|
342
|
+
[json.dumps(x, default=str) for x in data],
|
|
343
|
+
dtype="S",
|
|
344
|
+
)
|
|
345
|
+
lib_group.create_dataset(
|
|
346
|
+
col,
|
|
347
|
+
data=json_data,
|
|
348
|
+
compression="gzip",
|
|
349
|
+
)
|
|
350
|
+
lib_group[col].attrs["dtype"] = "json"
|
|
351
|
+
else:
|
|
352
|
+
lib_group.create_dataset(
|
|
353
|
+
col,
|
|
354
|
+
data=data,
|
|
355
|
+
compression="gzip",
|
|
356
|
+
)
|
|
357
|
+
lib_group.attrs["columns"] = list(self.lib_df.columns)
|
|
358
|
+
|
|
359
|
+
if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
|
|
360
|
+
id_group = f.create_group("id")
|
|
361
|
+
for col in self.id_df.columns:
|
|
362
|
+
data = self.id_df[col].to_numpy()
|
|
363
|
+
# Handle different data types safely
|
|
364
|
+
if data.dtype == object:
|
|
365
|
+
try:
|
|
366
|
+
str_data = np.array(
|
|
367
|
+
["" if x is None else str(x) for x in data],
|
|
368
|
+
dtype="S",
|
|
369
|
+
)
|
|
370
|
+
id_group.create_dataset(
|
|
371
|
+
col,
|
|
372
|
+
data=str_data,
|
|
373
|
+
compression="gzip",
|
|
374
|
+
)
|
|
375
|
+
id_group[col].attrs["dtype"] = "string_converted"
|
|
376
|
+
except Exception:
|
|
377
|
+
json_data = np.array(
|
|
378
|
+
[json.dumps(x, default=str) for x in data],
|
|
379
|
+
dtype="S",
|
|
380
|
+
)
|
|
381
|
+
id_group.create_dataset(
|
|
382
|
+
col,
|
|
383
|
+
data=json_data,
|
|
384
|
+
compression="gzip",
|
|
385
|
+
)
|
|
386
|
+
id_group[col].attrs["dtype"] = "json"
|
|
387
|
+
else:
|
|
388
|
+
id_group.create_dataset(
|
|
389
|
+
col,
|
|
390
|
+
data=data,
|
|
391
|
+
compression="gzip",
|
|
392
|
+
)
|
|
393
|
+
id_group.attrs["columns"] = list(self.id_df.columns)
|
|
323
394
|
|
|
324
395
|
self.logger.success(f"Sample saved to {filename}")
|
|
325
396
|
if save_featurexml:
|
|
@@ -1004,8 +1075,87 @@ def _load_sample5(self, filename: str, map: bool = False):
|
|
|
1004
1075
|
else:
|
|
1005
1076
|
self.ms1_df = None
|
|
1006
1077
|
|
|
1078
|
+
# Load lib_df (library DataFrame)
|
|
1079
|
+
if "lib" in f:
|
|
1080
|
+
lib_group = f["lib"]
|
|
1081
|
+
data = {}
|
|
1082
|
+
|
|
1083
|
+
# Get all datasets in the lib group
|
|
1084
|
+
for col in lib_group.keys():
|
|
1085
|
+
data_col = lib_group[col][:]
|
|
1086
|
+
# Handle string data
|
|
1087
|
+
if hasattr(lib_group[col], "attrs") and lib_group[col].attrs.get("dtype") in ["string_converted", "json"]:
|
|
1088
|
+
data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
|
|
1089
|
+
else:
|
|
1090
|
+
data[col] = data_col
|
|
1091
|
+
|
|
1092
|
+
if data:
|
|
1093
|
+
# Create DataFrame directly with Polars
|
|
1094
|
+
self.lib_df = pl.DataFrame(data)
|
|
1095
|
+
|
|
1096
|
+
# Apply schema if available
|
|
1097
|
+
if "lib_df" in schema and "columns" in schema["lib_df"]:
|
|
1098
|
+
schema_columns = schema["lib_df"]["columns"]
|
|
1099
|
+
for col in self.lib_df.columns:
|
|
1100
|
+
if col in schema_columns:
|
|
1101
|
+
dtype_str = schema_columns[col]["dtype"]
|
|
1102
|
+
try:
|
|
1103
|
+
self.lib_df = self.lib_df.with_columns(
|
|
1104
|
+
[pl.col(col).cast(eval(dtype_str), strict=False)]
|
|
1105
|
+
)
|
|
1106
|
+
except Exception as e:
|
|
1107
|
+
self.logger.warning(
|
|
1108
|
+
f"Failed to apply schema type {dtype_str} to column {col}: {e}",
|
|
1109
|
+
)
|
|
1110
|
+
|
|
1111
|
+
# Convert "None" strings and NaN values to proper null values
|
|
1112
|
+
self.lib_df = clean_null_values_polars(self.lib_df)
|
|
1113
|
+
else:
|
|
1114
|
+
self.lib_df = None
|
|
1115
|
+
else:
|
|
1116
|
+
self.lib_df = None
|
|
1117
|
+
|
|
1118
|
+
# Load id_df (identification results DataFrame)
|
|
1119
|
+
if "id" in f:
|
|
1120
|
+
id_group = f["id"]
|
|
1121
|
+
data = {}
|
|
1122
|
+
|
|
1123
|
+
# Get all datasets in the id group
|
|
1124
|
+
for col in id_group.keys():
|
|
1125
|
+
data_col = id_group[col][:]
|
|
1126
|
+
# Handle string data
|
|
1127
|
+
if hasattr(id_group[col], "attrs") and id_group[col].attrs.get("dtype") in ["string_converted", "json"]:
|
|
1128
|
+
data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
|
|
1129
|
+
else:
|
|
1130
|
+
data[col] = data_col
|
|
1131
|
+
|
|
1132
|
+
if data:
|
|
1133
|
+
# Create DataFrame directly with Polars
|
|
1134
|
+
self.id_df = pl.DataFrame(data)
|
|
1135
|
+
|
|
1136
|
+
# Apply schema if available
|
|
1137
|
+
if "id_df" in schema and "columns" in schema["id_df"]:
|
|
1138
|
+
schema_columns = schema["id_df"]["columns"]
|
|
1139
|
+
for col in self.id_df.columns:
|
|
1140
|
+
if col in schema_columns:
|
|
1141
|
+
dtype_str = schema_columns[col]["dtype"]
|
|
1142
|
+
try:
|
|
1143
|
+
self.id_df = self.id_df.with_columns(
|
|
1144
|
+
[pl.col(col).cast(eval(dtype_str), strict=False)]
|
|
1145
|
+
)
|
|
1146
|
+
except Exception as e:
|
|
1147
|
+
self.logger.warning(
|
|
1148
|
+
f"Failed to apply schema type {dtype_str} to column {col}: {e}",
|
|
1149
|
+
)
|
|
1150
|
+
|
|
1151
|
+
# Convert "None" strings and NaN values to proper null values
|
|
1152
|
+
self.id_df = clean_null_values_polars(self.id_df)
|
|
1153
|
+
else:
|
|
1154
|
+
self.id_df = None
|
|
1155
|
+
else:
|
|
1156
|
+
self.id_df = None
|
|
1157
|
+
|
|
1007
1158
|
# Parameters are now loaded from metadata JSON (see above)
|
|
1008
|
-
# Lib and lib_match are no longer saved/loaded
|
|
1009
1159
|
|
|
1010
1160
|
# if map:
|
|
1011
1161
|
# featureXML = filename.replace(".sample5", ".featureXML")
|
masster/sample/helpers.py
CHANGED
|
@@ -359,17 +359,22 @@ def features_select(
|
|
|
359
359
|
uid=None,
|
|
360
360
|
mz=None,
|
|
361
361
|
rt=None,
|
|
362
|
-
coherence=None,
|
|
363
|
-
inty=None,
|
|
364
362
|
rt_delta=None,
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
has_MS2=None,
|
|
363
|
+
inty=None,
|
|
364
|
+
coherence=None,
|
|
368
365
|
prominence_scaled=None,
|
|
369
366
|
prominence=None,
|
|
370
367
|
height_scaled=None,
|
|
371
368
|
height=None,
|
|
369
|
+
iso=None,
|
|
370
|
+
iso_of=None,
|
|
371
|
+
has_MS2=None,
|
|
372
372
|
adduct_group=None,
|
|
373
|
+
id=None,
|
|
374
|
+
id_top_name=None,
|
|
375
|
+
id_top_class=None,
|
|
376
|
+
id_top_adduct=None,
|
|
377
|
+
id_top_score=None,
|
|
373
378
|
):
|
|
374
379
|
"""
|
|
375
380
|
Select features based on specified criteria and return the filtered DataFrame.
|
|
@@ -389,6 +394,11 @@ def features_select(
|
|
|
389
394
|
prominence: prominence filter (tuple for range, single value for minimum)
|
|
390
395
|
height: height filter (tuple for range, single value for minimum)
|
|
391
396
|
adduct_group: adduct group filter (single value for exact match, list of values for multiple groups, tuple for range, or None for all)
|
|
397
|
+
id: filter for features with/without identification (bool: True for identified, False for unidentified)
|
|
398
|
+
id_top_name: filter by top identification name using regex (str for regex pattern, list of str for multiple patterns combined with OR)
|
|
399
|
+
id_top_class: filter by top identification class using regex (str for regex pattern, list of str for multiple patterns combined with OR)
|
|
400
|
+
id_top_adduct: filter by top identification adduct (str for exact match, list of str for multiple adducts)
|
|
401
|
+
id_top_score: filter by top identification score (tuple for range, single value for minimum)
|
|
392
402
|
Returns:
|
|
393
403
|
polars.DataFrame: Filtered features DataFrame
|
|
394
404
|
"""
|
|
@@ -600,6 +610,82 @@ def features_select(
|
|
|
600
610
|
f"Selected features by adduct_group. Features removed: {feats_len_before_filter - len(feats)}",
|
|
601
611
|
)
|
|
602
612
|
|
|
613
|
+
if id is not None:
|
|
614
|
+
feats_len_before_filter = len(feats)
|
|
615
|
+
if "id_top_name" not in feats.columns:
|
|
616
|
+
self.logger.warning("No identification data found in features.")
|
|
617
|
+
else:
|
|
618
|
+
if id:
|
|
619
|
+
# Filter for features with identification (non-null id_top_name)
|
|
620
|
+
feats = feats.filter(pl.col("id_top_name").is_not_null())
|
|
621
|
+
else:
|
|
622
|
+
# Filter for features without identification (null id_top_name)
|
|
623
|
+
feats = feats.filter(pl.col("id_top_name").is_null())
|
|
624
|
+
self.logger.debug(
|
|
625
|
+
f"Selected features by identification presence. Features removed: {feats_len_before_filter - len(feats)}",
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
if id_top_name is not None:
|
|
629
|
+
feats_len_before_filter = len(feats)
|
|
630
|
+
if "id_top_name" not in feats.columns:
|
|
631
|
+
self.logger.warning("No id_top_name data found in features.")
|
|
632
|
+
else:
|
|
633
|
+
if isinstance(id_top_name, list):
|
|
634
|
+
# Use regex matching for each pattern in the list (OR logic)
|
|
635
|
+
pattern = "|".join(id_top_name)
|
|
636
|
+
feats = feats.filter(pl.col("id_top_name").str.contains(pattern))
|
|
637
|
+
else:
|
|
638
|
+
# Use regex matching for single pattern
|
|
639
|
+
feats = feats.filter(pl.col("id_top_name").str.contains(id_top_name))
|
|
640
|
+
self.logger.debug(
|
|
641
|
+
f"Selected features by id_top_name (regex). Features removed: {feats_len_before_filter - len(feats)}",
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
if id_top_class is not None:
|
|
645
|
+
feats_len_before_filter = len(feats)
|
|
646
|
+
if "id_top_class" not in feats.columns:
|
|
647
|
+
self.logger.warning("No id_top_class data found in features.")
|
|
648
|
+
else:
|
|
649
|
+
if isinstance(id_top_class, list):
|
|
650
|
+
# Use regex matching for each pattern in the list (OR logic)
|
|
651
|
+
pattern = "|".join(id_top_class)
|
|
652
|
+
feats = feats.filter(pl.col("id_top_class").str.contains(pattern))
|
|
653
|
+
else:
|
|
654
|
+
# Use regex matching for single pattern
|
|
655
|
+
feats = feats.filter(pl.col("id_top_class").str.contains(id_top_class))
|
|
656
|
+
self.logger.debug(
|
|
657
|
+
f"Selected features by id_top_class (regex). Features removed: {feats_len_before_filter - len(feats)}",
|
|
658
|
+
)
|
|
659
|
+
|
|
660
|
+
if id_top_adduct is not None:
|
|
661
|
+
feats_len_before_filter = len(feats)
|
|
662
|
+
if "id_top_adduct" not in feats.columns:
|
|
663
|
+
self.logger.warning("No id_top_adduct data found in features.")
|
|
664
|
+
else:
|
|
665
|
+
if isinstance(id_top_adduct, list):
|
|
666
|
+
feats = feats.filter(pl.col("id_top_adduct").is_in(id_top_adduct))
|
|
667
|
+
else:
|
|
668
|
+
feats = feats.filter(pl.col("id_top_adduct") == id_top_adduct)
|
|
669
|
+
self.logger.debug(
|
|
670
|
+
f"Selected features by id_top_adduct. Features removed: {feats_len_before_filter - len(feats)}",
|
|
671
|
+
)
|
|
672
|
+
|
|
673
|
+
if id_top_score is not None:
|
|
674
|
+
feats_len_before_filter = len(feats)
|
|
675
|
+
if "id_top_score" not in feats.columns:
|
|
676
|
+
self.logger.warning("No id_top_score data found in features.")
|
|
677
|
+
else:
|
|
678
|
+
if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
|
|
679
|
+
min_score, max_score = id_top_score
|
|
680
|
+
feats = feats.filter(
|
|
681
|
+
(pl.col("id_top_score") >= min_score) & (pl.col("id_top_score") <= max_score)
|
|
682
|
+
)
|
|
683
|
+
else:
|
|
684
|
+
feats = feats.filter(pl.col("id_top_score") >= id_top_score)
|
|
685
|
+
self.logger.debug(
|
|
686
|
+
f"Selected features by id_top_score. Features removed: {feats_len_before_filter - len(feats)}",
|
|
687
|
+
)
|
|
688
|
+
|
|
603
689
|
if len(feats) == 0:
|
|
604
690
|
self.logger.warning("No features remaining after applying selection criteria.")
|
|
605
691
|
else:
|