masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +11 -11
- masster/sample/helpers.py +2 -2
- masster/sample/load.py +10 -8
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +7 -3
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +230 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +95 -73
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0
masster/study/plot.py
CHANGED
|
@@ -603,7 +603,7 @@ def plot_consensus_2d(
|
|
|
603
603
|
pl.when(
|
|
604
604
|
(pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
|
|
605
605
|
)
|
|
606
|
-
.then((pl.col(sizeby).log10() * markersize / 12).pow(
|
|
606
|
+
.then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
|
|
607
607
|
.otherwise(markersize)
|
|
608
608
|
.alias("markersize"),
|
|
609
609
|
])
|
|
@@ -1385,6 +1385,7 @@ def plot_rt_correction(
|
|
|
1385
1385
|
"""
|
|
1386
1386
|
Plot RT correction per sample: (rt - rt_original) vs rt overlaid for selected samples.
|
|
1387
1387
|
|
|
1388
|
+
Only features with filled==False are used for the RT correction plot.
|
|
1388
1389
|
This uses the same color mapping as `plot_bpc` so curves for the same samples match.
|
|
1389
1390
|
"""
|
|
1390
1391
|
from bokeh.plotting import figure, show, output_file
|
|
@@ -1420,74 +1421,97 @@ def plot_rt_correction(
|
|
|
1420
1421
|
p.xaxis.axis_label = f"Retention Time ({rt_unit})"
|
|
1421
1422
|
p.yaxis.axis_label = "RT - RT_original (s)"
|
|
1422
1423
|
|
|
1423
|
-
|
|
1424
|
+
# Create sample name lookup dictionary from samples_df (all in Polars)
|
|
1425
|
+
sample_names_dict = {}
|
|
1424
1426
|
if hasattr(self, "samples_df") and self.samples_df is not None:
|
|
1425
1427
|
try:
|
|
1426
|
-
|
|
1428
|
+
sample_name_mapping = (
|
|
1429
|
+
self.samples_df
|
|
1430
|
+
.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
1431
|
+
.select(["sample_uid", "sample_name"])
|
|
1432
|
+
)
|
|
1433
|
+
sample_names_dict = dict(zip(
|
|
1434
|
+
sample_name_mapping["sample_uid"].to_list(),
|
|
1435
|
+
sample_name_mapping["sample_name"].to_list()
|
|
1436
|
+
))
|
|
1427
1437
|
except Exception:
|
|
1428
|
-
|
|
1438
|
+
pass
|
|
1429
1439
|
|
|
1430
1440
|
renderers = []
|
|
1431
1441
|
|
|
1432
|
-
#
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
|
|
1442
|
-
continue
|
|
1443
|
-
except Exception as e:
|
|
1444
|
-
self.logger.debug(f"Error filtering features for sample {uid}: {e}")
|
|
1445
|
-
continue
|
|
1442
|
+
# Check sample identifier column
|
|
1443
|
+
if "sample_uid" not in self.features_df.columns:
|
|
1444
|
+
if "sample_name" in self.features_df.columns:
|
|
1445
|
+
sample_id_col = "sample_name"
|
|
1446
|
+
else:
|
|
1447
|
+
self.logger.debug("No sample identifier column in features_df")
|
|
1448
|
+
return
|
|
1449
|
+
else:
|
|
1450
|
+
sample_id_col = "sample_uid"
|
|
1446
1451
|
|
|
1447
|
-
|
|
1448
|
-
|
|
1452
|
+
# OPTIMIZED: Filter once, group once instead of per-sample filtering
|
|
1453
|
+
try:
|
|
1454
|
+
# Filter all data once for selected samples and required conditions
|
|
1455
|
+
all_sample_feats = self.features_df.filter(
|
|
1456
|
+
pl.col(sample_id_col).is_in(sample_uids)
|
|
1457
|
+
)
|
|
1458
|
+
|
|
1459
|
+
if all_sample_feats.is_empty():
|
|
1460
|
+
self.logger.warning("No features found for the selected samples.")
|
|
1461
|
+
return
|
|
1449
1462
|
|
|
1450
|
-
#
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1463
|
+
# Filter to only use features with filled==False if column exists
|
|
1464
|
+
if "filled" in all_sample_feats.columns:
|
|
1465
|
+
all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
|
|
1466
|
+
if all_sample_feats.is_empty():
|
|
1467
|
+
self.logger.warning("No non-filled features found for the selected samples.")
|
|
1468
|
+
return
|
|
1455
1469
|
|
|
1456
|
-
#
|
|
1457
|
-
if "rt" not in
|
|
1458
|
-
|
|
1470
|
+
# Check required columns
|
|
1471
|
+
if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
|
|
1472
|
+
self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
|
|
1473
|
+
return
|
|
1459
1474
|
|
|
1460
|
-
#
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1475
|
+
# Filter nulls, add delta column, and sort - all in one operation
|
|
1476
|
+
all_sample_feats = (
|
|
1477
|
+
all_sample_feats
|
|
1478
|
+
.filter(
|
|
1479
|
+
pl.col("rt").is_not_null() &
|
|
1480
|
+
pl.col("rt_original").is_not_null()
|
|
1481
|
+
)
|
|
1482
|
+
.with_columns([
|
|
1483
|
+
(pl.col("rt") - pl.col("rt_original")).alias("delta")
|
|
1484
|
+
])
|
|
1485
|
+
.sort([sample_id_col, "rt"])
|
|
1486
|
+
)
|
|
1464
1487
|
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1488
|
+
if all_sample_feats.is_empty():
|
|
1489
|
+
self.logger.warning("No valid RT data found for the selected samples.")
|
|
1490
|
+
return
|
|
1468
1491
|
|
|
1469
|
-
#
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1492
|
+
# Group by sample and process each group (much faster than individual filtering)
|
|
1493
|
+
for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
|
|
1494
|
+
if sample_group.is_empty():
|
|
1495
|
+
continue
|
|
1473
1496
|
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
row = samples_info[samples_info["sample_uid"] == uid]
|
|
1478
|
-
if not row.empty:
|
|
1479
|
-
sample_name = row.iloc[0].get("sample_name", sample_name)
|
|
1480
|
-
except Exception:
|
|
1481
|
-
pass
|
|
1497
|
+
# Extract arrays directly from Polars
|
|
1498
|
+
rt = sample_group["rt"].to_numpy()
|
|
1499
|
+
delta = sample_group["delta"].to_numpy()
|
|
1482
1500
|
|
|
1483
|
-
|
|
1501
|
+
# Get sample name efficiently from pre-built dictionary
|
|
1502
|
+
sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
|
|
1503
|
+
color = color_map.get(sample_uid, "#000000")
|
|
1484
1504
|
|
|
1485
|
-
|
|
1486
|
-
|
|
1505
|
+
data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
|
|
1506
|
+
src = ColumnDataSource(data)
|
|
1487
1507
|
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1508
|
+
r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
|
|
1509
|
+
p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
|
|
1510
|
+
renderers.append(r_line)
|
|
1511
|
+
|
|
1512
|
+
except Exception as e:
|
|
1513
|
+
self.logger.error(f"Error in optimized RT correction plotting: {e}")
|
|
1514
|
+
return
|
|
1491
1515
|
|
|
1492
1516
|
if not renderers:
|
|
1493
1517
|
self.logger.warning("No RT correction curves to plot for the selected samples.")
|
|
@@ -1759,21 +1783,26 @@ def plot_consensus_stats(
|
|
|
1759
1783
|
import polars as pl
|
|
1760
1784
|
import numpy as np
|
|
1761
1785
|
|
|
1762
|
-
#
|
|
1763
|
-
|
|
1764
|
-
|
|
1786
|
+
# Get the consensus statistics data using the new helper method
|
|
1787
|
+
data_df = self.get_consensus_stats()
|
|
1788
|
+
|
|
1789
|
+
if data_df is None or data_df.is_empty():
|
|
1790
|
+
self.logger.error("No consensus statistics data available.")
|
|
1765
1791
|
return
|
|
1766
1792
|
|
|
1767
|
-
#
|
|
1768
|
-
|
|
1793
|
+
# Remove consensus_uid column for plotting (keep only numeric columns)
|
|
1794
|
+
if "consensus_uid" in data_df.columns:
|
|
1795
|
+
data_df_clean = data_df.drop("consensus_uid")
|
|
1796
|
+
else:
|
|
1797
|
+
data_df_clean = data_df
|
|
1769
1798
|
|
|
1770
|
-
# Define specific columns to plot in the exact order requested
|
|
1799
|
+
# Define specific columns to plot in the exact order requested (excluding consensus_uid)
|
|
1771
1800
|
desired_columns = [
|
|
1772
1801
|
"rt",
|
|
1773
1802
|
"rt_delta_mean",
|
|
1774
1803
|
"mz",
|
|
1775
|
-
"mz_range", # mz_max-mz_min
|
|
1776
|
-
"log10_inty_mean", # log10(inty_mean)
|
|
1804
|
+
"mz_range", # mz_max-mz_min
|
|
1805
|
+
"log10_inty_mean", # log10(inty_mean)
|
|
1777
1806
|
"number_samples",
|
|
1778
1807
|
"number_ms2",
|
|
1779
1808
|
"charge_mean",
|
|
@@ -1783,20 +1812,13 @@ def plot_consensus_stats(
|
|
|
1783
1812
|
"chrom_prominence_scaled_mean"
|
|
1784
1813
|
]
|
|
1785
1814
|
|
|
1786
|
-
# Calculate derived columns if they don't exist
|
|
1787
|
-
if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
|
|
1788
|
-
data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
|
|
1789
|
-
|
|
1790
|
-
if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
|
|
1791
|
-
data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
|
|
1792
|
-
|
|
1793
1815
|
# Filter to only include columns that exist in the dataframe, preserving order
|
|
1794
|
-
numeric_columns = [col for col in desired_columns if col in
|
|
1816
|
+
numeric_columns = [col for col in desired_columns if col in data_df_clean.columns]
|
|
1795
1817
|
|
|
1796
1818
|
# Check if the numeric columns are actually numeric
|
|
1797
1819
|
final_numeric_columns = []
|
|
1798
1820
|
for col in numeric_columns:
|
|
1799
|
-
dtype =
|
|
1821
|
+
dtype = data_df_clean[col].dtype
|
|
1800
1822
|
if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
|
|
1801
1823
|
pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
|
|
1802
1824
|
pl.Float32, pl.Float64]:
|
|
@@ -1805,13 +1827,13 @@ def plot_consensus_stats(
|
|
|
1805
1827
|
numeric_columns = final_numeric_columns
|
|
1806
1828
|
|
|
1807
1829
|
if len(numeric_columns) == 0:
|
|
1808
|
-
self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(
|
|
1830
|
+
self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df_clean.columns)}")
|
|
1809
1831
|
return
|
|
1810
1832
|
|
|
1811
1833
|
self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
|
|
1812
1834
|
|
|
1813
|
-
#
|
|
1814
|
-
data_df_clean =
|
|
1835
|
+
# Select only the numeric columns for plotting
|
|
1836
|
+
data_df_clean = data_df_clean.select(numeric_columns)
|
|
1815
1837
|
|
|
1816
1838
|
# Check if all numeric columns are empty
|
|
1817
1839
|
all_columns_empty = True
|