masster 0.4.5__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +2 -2
- masster/logger.py +11 -11
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +7 -7
- masster/sample/lib.py +2 -2
- masster/sample/load.py +8 -8
- masster/sample/parameters.py +1 -1
- masster/sample/plot.py +2 -2
- masster/sample/processing.py +2 -2
- masster/sample/sample.py +86 -86
- masster/sample/save.py +1 -1
- masster/spectrum.py +2 -2
- masster/study/__init__.py +1 -1
- masster/study/export.py +7 -7
- masster/study/h5.py +6 -6
- masster/study/helpers.py +339 -146
- masster/study/id.py +4 -4
- masster/study/load.py +6 -6
- masster/study/plot.py +3 -3
- masster/study/processing.py +3 -3
- masster/study/save.py +1 -1
- masster/study/study.py +98 -98
- masster-0.4.9.dist-info/METADATA +788 -0
- {masster-0.4.5.dist-info → masster-0.4.9.dist-info}/RECORD +30 -36
- {masster-0.4.5.dist-info → masster-0.4.9.dist-info}/WHEEL +1 -2
- masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
- masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
- masster/lib/__init__.py +0 -9
- masster/lib/lib.py +0 -598
- masster/study/helpers_optimized.py +0 -359
- masster-0.4.5.dist-info/METADATA +0 -131
- masster-0.4.5.dist-info/top_level.txt +0 -1
- {masster-0.4.5.dist-info → masster-0.4.9.dist-info}/entry_points.txt +0 -0
- {masster-0.4.5.dist-info → masster-0.4.9.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -22,7 +22,7 @@ import pandas as pd
|
|
|
22
22
|
import polars as pl
|
|
23
23
|
|
|
24
24
|
from tqdm import tqdm
|
|
25
|
-
from
|
|
25
|
+
from masster.chromatogram import Chromatogram
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
# =====================================================================================
|
|
@@ -816,7 +816,7 @@ def get_sample(self, sample):
|
|
|
816
816
|
|
|
817
817
|
This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
|
|
818
818
|
"""
|
|
819
|
-
from
|
|
819
|
+
from masster.sample.sample import Sample
|
|
820
820
|
|
|
821
821
|
if isinstance(sample, Sample):
|
|
822
822
|
return sample
|
|
@@ -942,7 +942,7 @@ def restore_features(self, samples=None, maps=False):
|
|
|
942
942
|
maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
|
|
943
943
|
"""
|
|
944
944
|
import datetime
|
|
945
|
-
from
|
|
945
|
+
from masster.sample.sample import Sample
|
|
946
946
|
|
|
947
947
|
if self.features_df is None or self.features_df.is_empty():
|
|
948
948
|
self.logger.error("No features_df found in study.")
|
|
@@ -1100,8 +1100,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
|
|
|
1100
1100
|
"""
|
|
1101
1101
|
import datetime
|
|
1102
1102
|
import numpy as np
|
|
1103
|
-
from
|
|
1104
|
-
from
|
|
1103
|
+
from masster.sample.sample import Sample
|
|
1104
|
+
from masster.chromatogram import Chromatogram
|
|
1105
1105
|
|
|
1106
1106
|
if self.features_df is None or self.features_df.is_empty():
|
|
1107
1107
|
self.logger.error("No features_df found in study.")
|
|
@@ -1666,11 +1666,20 @@ def features_select(
|
|
|
1666
1666
|
chrom_prominence=None,
|
|
1667
1667
|
chrom_prominence_scaled=None,
|
|
1668
1668
|
chrom_height_scaled=None,
|
|
1669
|
+
chunk_size: int = 100000,
|
|
1670
|
+
use_lazy_streaming: bool = True,
|
|
1669
1671
|
):
|
|
1670
1672
|
"""
|
|
1671
1673
|
Select features from features_df based on specified criteria and return the filtered DataFrame.
|
|
1672
1674
|
|
|
1673
|
-
OPTIMIZED VERSION:
|
|
1675
|
+
FULLY OPTIMIZED VERSION: Enhanced performance with lazy streaming and chunked processing.
|
|
1676
|
+
|
|
1677
|
+
Key optimizations:
|
|
1678
|
+
- Lazy evaluation with streaming execution for memory efficiency
|
|
1679
|
+
- Optimized filter expression building with reduced overhead
|
|
1680
|
+
- Chunked processing for very large datasets
|
|
1681
|
+
- Efficient column existence checking
|
|
1682
|
+
- Enhanced error handling and performance logging
|
|
1674
1683
|
|
|
1675
1684
|
Parameters:
|
|
1676
1685
|
mz: m/z range filter (tuple for range, single value for minimum)
|
|
@@ -1686,70 +1695,96 @@ def features_select(
|
|
|
1686
1695
|
chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1687
1696
|
chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1688
1697
|
chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
1698
|
+
chunk_size: Number of features to process per chunk for large datasets (default: 100000)
|
|
1699
|
+
use_lazy_streaming: Enable lazy evaluation with streaming for memory efficiency (default: True)
|
|
1689
1700
|
|
|
1690
1701
|
Returns:
|
|
1691
1702
|
polars.DataFrame: Filtered features DataFrame
|
|
1692
1703
|
"""
|
|
1693
|
-
# Consolidated optimized implementation (previously in helpers_optimized.py)
|
|
1694
1704
|
if self.features_df is None or self.features_df.is_empty():
|
|
1695
1705
|
self.logger.warning("No features found in study.")
|
|
1696
1706
|
return pl.DataFrame()
|
|
1697
1707
|
|
|
1698
|
-
# Early return
|
|
1699
|
-
filter_params = [
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
sample_uid,
|
|
1704
|
-
sample_name,
|
|
1705
|
-
consensus_uid,
|
|
1706
|
-
feature_uid,
|
|
1707
|
-
filled,
|
|
1708
|
-
quality,
|
|
1709
|
-
chrom_coherence,
|
|
1710
|
-
chrom_prominence,
|
|
1711
|
-
chrom_prominence_scaled,
|
|
1712
|
-
chrom_height_scaled,
|
|
1713
|
-
]
|
|
1708
|
+
# Early return optimization
|
|
1709
|
+
filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
|
|
1710
|
+
feature_uid, filled, quality, chrom_coherence,
|
|
1711
|
+
chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
|
|
1712
|
+
|
|
1714
1713
|
if all(param is None for param in filter_params):
|
|
1715
1714
|
return self.features_df.clone()
|
|
1716
1715
|
|
|
1716
|
+
import time
|
|
1717
|
+
start_time = time.perf_counter()
|
|
1717
1718
|
initial_count = len(self.features_df)
|
|
1718
1719
|
|
|
1720
|
+
# Build optimized filter expression
|
|
1721
|
+
filter_expr = _build_optimized_filter_expression(
|
|
1722
|
+
self, mz, rt, inty, sample_uid, sample_name, consensus_uid,
|
|
1723
|
+
feature_uid, filled, quality, chrom_coherence,
|
|
1724
|
+
chrom_prominence, chrom_prominence_scaled, chrom_height_scaled
|
|
1725
|
+
)
|
|
1726
|
+
|
|
1727
|
+
if filter_expr is None:
|
|
1728
|
+
return pl.DataFrame()
|
|
1729
|
+
|
|
1730
|
+
# Apply filter with optimized execution strategy
|
|
1731
|
+
if use_lazy_streaming and initial_count > chunk_size:
|
|
1732
|
+
result = _apply_chunked_select(self, filter_expr, chunk_size)
|
|
1733
|
+
else:
|
|
1734
|
+
result = (
|
|
1735
|
+
self.features_df
|
|
1736
|
+
.lazy()
|
|
1737
|
+
.filter(filter_expr)
|
|
1738
|
+
.collect(streaming=use_lazy_streaming)
|
|
1739
|
+
)
|
|
1740
|
+
|
|
1741
|
+
# Log performance
|
|
1742
|
+
elapsed_time = time.perf_counter() - start_time
|
|
1743
|
+
final_count = len(result)
|
|
1744
|
+
removed_count = initial_count - final_count
|
|
1745
|
+
throughput = final_count / elapsed_time if elapsed_time > 0 else 0
|
|
1746
|
+
|
|
1747
|
+
if final_count == 0:
|
|
1748
|
+
self.logger.warning("No features remaining after applying selection criteria.")
|
|
1749
|
+
else:
|
|
1750
|
+
self.logger.debug(
|
|
1751
|
+
f"Selected features: {final_count:,} (removed: {removed_count:,})"
|
|
1752
|
+
)
|
|
1753
|
+
|
|
1754
|
+
return result
|
|
1755
|
+
|
|
1756
|
+
|
|
1757
|
+
def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_name,
|
|
1758
|
+
consensus_uid, feature_uid, filled, quality,
|
|
1759
|
+
chrom_coherence, chrom_prominence,
|
|
1760
|
+
chrom_prominence_scaled, chrom_height_scaled):
|
|
1761
|
+
"""
|
|
1762
|
+
Build optimized filter expression with efficient column checking and expression combining.
|
|
1763
|
+
"""
|
|
1719
1764
|
# Pre-check available columns once
|
|
1720
1765
|
available_columns = set(self.features_df.columns)
|
|
1721
|
-
|
|
1722
|
-
# Build all filter conditions
|
|
1723
1766
|
filter_conditions = []
|
|
1724
1767
|
warnings = []
|
|
1725
|
-
|
|
1726
|
-
#
|
|
1768
|
+
|
|
1769
|
+
# Build filter conditions with optimized expressions
|
|
1727
1770
|
if mz is not None:
|
|
1728
1771
|
if isinstance(mz, tuple) and len(mz) == 2:
|
|
1729
1772
|
min_mz, max_mz = mz
|
|
1730
|
-
filter_conditions.append(
|
|
1731
|
-
(pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
|
|
1732
|
-
)
|
|
1773
|
+
filter_conditions.append(pl.col("mz").is_between(min_mz, max_mz, closed="both"))
|
|
1733
1774
|
else:
|
|
1734
1775
|
filter_conditions.append(pl.col("mz") >= mz)
|
|
1735
1776
|
|
|
1736
|
-
# Filter by retention time
|
|
1737
1777
|
if rt is not None:
|
|
1738
1778
|
if isinstance(rt, tuple) and len(rt) == 2:
|
|
1739
1779
|
min_rt, max_rt = rt
|
|
1740
|
-
filter_conditions.append(
|
|
1741
|
-
(pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
|
|
1742
|
-
)
|
|
1780
|
+
filter_conditions.append(pl.col("rt").is_between(min_rt, max_rt, closed="both"))
|
|
1743
1781
|
else:
|
|
1744
1782
|
filter_conditions.append(pl.col("rt") >= rt)
|
|
1745
1783
|
|
|
1746
|
-
# Filter by intensity
|
|
1747
1784
|
if inty is not None:
|
|
1748
1785
|
if isinstance(inty, tuple) and len(inty) == 2:
|
|
1749
1786
|
min_inty, max_inty = inty
|
|
1750
|
-
filter_conditions.append(
|
|
1751
|
-
(pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
|
|
1752
|
-
)
|
|
1787
|
+
filter_conditions.append(pl.col("inty").is_between(min_inty, max_inty, closed="both"))
|
|
1753
1788
|
else:
|
|
1754
1789
|
filter_conditions.append(pl.col("inty") >= inty)
|
|
1755
1790
|
|
|
@@ -1759,10 +1794,7 @@ def features_select(
|
|
|
1759
1794
|
if len(sample_uid) == 2 and not isinstance(sample_uid, list):
|
|
1760
1795
|
# Treat as range
|
|
1761
1796
|
min_uid, max_uid = sample_uid
|
|
1762
|
-
filter_conditions.append(
|
|
1763
|
-
(pl.col("sample_uid") >= min_uid)
|
|
1764
|
-
& (pl.col("sample_uid") <= max_uid),
|
|
1765
|
-
)
|
|
1797
|
+
filter_conditions.append(pl.col("sample_uid").is_between(min_uid, max_uid, closed="both"))
|
|
1766
1798
|
else:
|
|
1767
1799
|
# Treat as list
|
|
1768
1800
|
filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
|
|
@@ -1792,10 +1824,7 @@ def features_select(
|
|
|
1792
1824
|
if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
|
|
1793
1825
|
# Treat as range
|
|
1794
1826
|
min_uid, max_uid = consensus_uid
|
|
1795
|
-
filter_conditions.append(
|
|
1796
|
-
(pl.col("consensus_uid") >= min_uid)
|
|
1797
|
-
& (pl.col("consensus_uid") <= max_uid),
|
|
1798
|
-
)
|
|
1827
|
+
filter_conditions.append(pl.col("consensus_uid").is_between(min_uid, max_uid, closed="both"))
|
|
1799
1828
|
else:
|
|
1800
1829
|
# Treat as list
|
|
1801
1830
|
filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
|
|
@@ -1808,10 +1837,7 @@ def features_select(
|
|
|
1808
1837
|
if len(feature_uid) == 2 and not isinstance(feature_uid, list):
|
|
1809
1838
|
# Treat as range
|
|
1810
1839
|
min_uid, max_uid = feature_uid
|
|
1811
|
-
filter_conditions.append(
|
|
1812
|
-
(pl.col("feature_uid") >= min_uid)
|
|
1813
|
-
& (pl.col("feature_uid") <= max_uid),
|
|
1814
|
-
)
|
|
1840
|
+
filter_conditions.append(pl.col("feature_uid").is_between(min_uid, max_uid, closed="both"))
|
|
1815
1841
|
else:
|
|
1816
1842
|
# Treat as list
|
|
1817
1843
|
filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
|
|
@@ -1833,10 +1859,7 @@ def features_select(
|
|
|
1833
1859
|
if "quality" in available_columns:
|
|
1834
1860
|
if isinstance(quality, tuple) and len(quality) == 2:
|
|
1835
1861
|
min_quality, max_quality = quality
|
|
1836
|
-
filter_conditions.append(
|
|
1837
|
-
(pl.col("quality") >= min_quality)
|
|
1838
|
-
& (pl.col("quality") <= max_quality),
|
|
1839
|
-
)
|
|
1862
|
+
filter_conditions.append(pl.col("quality").is_between(min_quality, max_quality, closed="both"))
|
|
1840
1863
|
else:
|
|
1841
1864
|
filter_conditions.append(pl.col("quality") >= quality)
|
|
1842
1865
|
else:
|
|
@@ -1847,10 +1870,7 @@ def features_select(
|
|
|
1847
1870
|
if "chrom_coherence" in available_columns:
|
|
1848
1871
|
if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
|
|
1849
1872
|
min_coherence, max_coherence = chrom_coherence
|
|
1850
|
-
filter_conditions.append(
|
|
1851
|
-
(pl.col("chrom_coherence") >= min_coherence)
|
|
1852
|
-
& (pl.col("chrom_coherence") <= max_coherence),
|
|
1853
|
-
)
|
|
1873
|
+
filter_conditions.append(pl.col("chrom_coherence").is_between(min_coherence, max_coherence, closed="both"))
|
|
1854
1874
|
else:
|
|
1855
1875
|
filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
|
|
1856
1876
|
else:
|
|
@@ -1861,10 +1881,7 @@ def features_select(
|
|
|
1861
1881
|
if "chrom_prominence" in available_columns:
|
|
1862
1882
|
if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
|
|
1863
1883
|
min_prominence, max_prominence = chrom_prominence
|
|
1864
|
-
filter_conditions.append(
|
|
1865
|
-
(pl.col("chrom_prominence") >= min_prominence)
|
|
1866
|
-
& (pl.col("chrom_prominence") <= max_prominence),
|
|
1867
|
-
)
|
|
1884
|
+
filter_conditions.append(pl.col("chrom_prominence").is_between(min_prominence, max_prominence, closed="both"))
|
|
1868
1885
|
else:
|
|
1869
1886
|
filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
|
|
1870
1887
|
else:
|
|
@@ -1873,19 +1890,12 @@ def features_select(
|
|
|
1873
1890
|
# Filter by scaled chromatogram prominence
|
|
1874
1891
|
if chrom_prominence_scaled is not None:
|
|
1875
1892
|
if "chrom_prominence_scaled" in available_columns:
|
|
1876
|
-
if (
|
|
1877
|
-
isinstance(chrom_prominence_scaled, tuple)
|
|
1878
|
-
and len(chrom_prominence_scaled) == 2
|
|
1879
|
-
):
|
|
1893
|
+
if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
|
|
1880
1894
|
min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
|
|
1881
1895
|
filter_conditions.append(
|
|
1882
|
-
|
|
1883
|
-
& (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
|
|
1884
|
-
)
|
|
1896
|
+
pl.col("chrom_prominence_scaled").is_between(min_prominence_scaled, max_prominence_scaled, closed="both"))
|
|
1885
1897
|
else:
|
|
1886
|
-
filter_conditions.append(
|
|
1887
|
-
pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
|
|
1888
|
-
)
|
|
1898
|
+
filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
|
|
1889
1899
|
else:
|
|
1890
1900
|
warnings.append("'chrom_prominence_scaled' column not found in features_df")
|
|
1891
1901
|
|
|
@@ -1895,13 +1905,9 @@ def features_select(
|
|
|
1895
1905
|
if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
|
|
1896
1906
|
min_height_scaled, max_height_scaled = chrom_height_scaled
|
|
1897
1907
|
filter_conditions.append(
|
|
1898
|
-
|
|
1899
|
-
& (pl.col("chrom_height_scaled") <= max_height_scaled),
|
|
1900
|
-
)
|
|
1908
|
+
pl.col("chrom_height_scaled").is_between(min_height_scaled, max_height_scaled, closed="both"))
|
|
1901
1909
|
else:
|
|
1902
|
-
filter_conditions.append(
|
|
1903
|
-
pl.col("chrom_height_scaled") >= chrom_height_scaled,
|
|
1904
|
-
)
|
|
1910
|
+
filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
|
|
1905
1911
|
else:
|
|
1906
1912
|
warnings.append("'chrom_height_scaled' column not found in features_df")
|
|
1907
1913
|
|
|
@@ -1909,27 +1915,47 @@ def features_select(
|
|
|
1909
1915
|
for warning in warnings:
|
|
1910
1916
|
self.logger.warning(warning)
|
|
1911
1917
|
|
|
1912
|
-
#
|
|
1913
|
-
if filter_conditions:
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1918
|
+
# Combine all conditions efficiently
|
|
1919
|
+
if not filter_conditions:
|
|
1920
|
+
return None
|
|
1921
|
+
|
|
1922
|
+
# Use reduce for efficient expression combination
|
|
1923
|
+
from functools import reduce
|
|
1924
|
+
import operator
|
|
1925
|
+
combined_expr = reduce(operator.and_, filter_conditions)
|
|
1926
|
+
|
|
1927
|
+
return combined_expr
|
|
1928
|
+
|
|
1929
|
+
|
|
1930
|
+
def _apply_chunked_select(self, filter_expr, chunk_size: int):
|
|
1931
|
+
"""
|
|
1932
|
+
Apply selection using chunked processing for large datasets.
|
|
1933
|
+
"""
|
|
1934
|
+
total_features = len(self.features_df)
|
|
1935
|
+
num_chunks = (total_features + chunk_size - 1) // chunk_size
|
|
1936
|
+
|
|
1937
|
+
self.logger.debug(f"Using chunked select with {num_chunks} chunks")
|
|
1938
|
+
|
|
1939
|
+
filtered_chunks = []
|
|
1940
|
+
for i in range(num_chunks):
|
|
1941
|
+
start_idx = i * chunk_size
|
|
1942
|
+
end_idx = min((i + 1) * chunk_size, total_features)
|
|
1943
|
+
|
|
1944
|
+
chunk_result = (
|
|
1945
|
+
self.features_df
|
|
1946
|
+
.lazy()
|
|
1947
|
+
.slice(start_idx, end_idx - start_idx)
|
|
1948
|
+
.filter(filter_expr)
|
|
1949
|
+
.collect(streaming=True)
|
|
1950
|
+
)
|
|
1951
|
+
|
|
1952
|
+
if not chunk_result.is_empty():
|
|
1953
|
+
filtered_chunks.append(chunk_result)
|
|
1954
|
+
|
|
1955
|
+
if filtered_chunks:
|
|
1956
|
+
return pl.concat(filtered_chunks, how="vertical")
|
|
1928
1957
|
else:
|
|
1929
|
-
|
|
1930
|
-
self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
|
|
1931
|
-
|
|
1932
|
-
return feats
|
|
1958
|
+
return pl.DataFrame()
|
|
1933
1959
|
|
|
1934
1960
|
|
|
1935
1961
|
def features_select_benchmarked(
|
|
@@ -2014,7 +2040,7 @@ def monkey_patch_study():
|
|
|
2014
2040
|
as `features_select_original` if not already set, then replaces Study.features_select
|
|
2015
2041
|
with the optimized `features_select` defined above. This function is idempotent.
|
|
2016
2042
|
"""
|
|
2017
|
-
from
|
|
2043
|
+
from masster.study.study import Study
|
|
2018
2044
|
|
|
2019
2045
|
# Only set original if it doesn't exist yet
|
|
2020
2046
|
if not hasattr(Study, "features_select_original"):
|
|
@@ -2026,18 +2052,35 @@ def monkey_patch_study():
|
|
|
2026
2052
|
print("Patched Study.features_select with consolidated optimized implementation")
|
|
2027
2053
|
|
|
2028
2054
|
|
|
2029
|
-
def features_filter(
|
|
2055
|
+
def features_filter(
|
|
2056
|
+
self,
|
|
2057
|
+
features,
|
|
2058
|
+
chunk_size: int = 50000,
|
|
2059
|
+
use_index_based: bool = True,
|
|
2060
|
+
parallel: bool = True
|
|
2061
|
+
):
|
|
2030
2062
|
"""
|
|
2031
2063
|
Filter features_df by keeping only features that match the given criteria.
|
|
2032
2064
|
This keeps only the specified features and removes all others.
|
|
2033
2065
|
|
|
2034
|
-
OPTIMIZED VERSION:
|
|
2066
|
+
FULLY OPTIMIZED VERSION: Index-based filtering, chunked processing, and lazy evaluation.
|
|
2067
|
+
|
|
2068
|
+
Performance improvements:
|
|
2069
|
+
- Index-based filtering using sorted arrays (O(n log n) instead of O(n²))
|
|
2070
|
+
- Chunked processing to handle large datasets without memory issues
|
|
2071
|
+
- Enhanced lazy evaluation with streaming operations
|
|
2072
|
+
- Hash-based lookups for optimal performance
|
|
2073
|
+
- Memory-efficient operations
|
|
2035
2074
|
|
|
2036
2075
|
Parameters:
|
|
2037
2076
|
features: Features to keep. Can be:
|
|
2038
2077
|
- polars.DataFrame: Features DataFrame (will use feature_uid column)
|
|
2039
2078
|
- list: List of feature_uids to keep
|
|
2079
|
+
- tuple: Tuple of feature_uids to keep
|
|
2040
2080
|
- int: Single feature_uid to keep
|
|
2081
|
+
chunk_size: Number of features to process per chunk (default: 50000)
|
|
2082
|
+
use_index_based: Use index-based filtering for better performance (default: True)
|
|
2083
|
+
parallel: Enable parallel processing when beneficial (default: True)
|
|
2041
2084
|
|
|
2042
2085
|
Returns:
|
|
2043
2086
|
None (modifies self.features_df in place)
|
|
@@ -2046,69 +2089,219 @@ def features_filter(self, features):
|
|
|
2046
2089
|
self.logger.warning("No features found in study.")
|
|
2047
2090
|
return
|
|
2048
2091
|
|
|
2049
|
-
# Early return if no features provided
|
|
2050
2092
|
if features is None:
|
|
2051
2093
|
self.logger.warning("No features provided for filtering.")
|
|
2052
2094
|
return
|
|
2053
2095
|
|
|
2096
|
+
import time
|
|
2097
|
+
start_time = time.perf_counter()
|
|
2054
2098
|
initial_count = len(self.features_df)
|
|
2099
|
+
|
|
2100
|
+
# Extract feature UIDs efficiently
|
|
2101
|
+
feature_uids_to_keep = _extract_feature_uids_optimized(self, features)
|
|
2102
|
+
if not feature_uids_to_keep:
|
|
2103
|
+
self.logger.warning("No feature UIDs provided for filtering.")
|
|
2104
|
+
return
|
|
2105
|
+
|
|
2106
|
+
# Choose optimal filtering strategy based on data size and characteristics
|
|
2107
|
+
if use_index_based and len(self.features_df) > 10000:
|
|
2108
|
+
_apply_index_based_filter(self, feature_uids_to_keep, chunk_size, parallel)
|
|
2109
|
+
else:
|
|
2110
|
+
_apply_standard_filter(self, feature_uids_to_keep)
|
|
2111
|
+
|
|
2112
|
+
# Calculate results and log performance
|
|
2113
|
+
final_count = len(self.features_df)
|
|
2114
|
+
removed_count = initial_count - final_count
|
|
2115
|
+
|
|
2116
|
+
self.logger.info(
|
|
2117
|
+
f"Filtered features: kept {final_count:,}, removed {removed_count:,}"
|
|
2118
|
+
)
|
|
2055
2119
|
|
|
2056
|
-
|
|
2120
|
+
|
|
2121
|
+
def _extract_feature_uids_optimized(self, features):
|
|
2122
|
+
"""
|
|
2123
|
+
Efficiently extract feature UIDs from various input types.
|
|
2124
|
+
Returns a set for O(1) lookup performance.
|
|
2125
|
+
"""
|
|
2057
2126
|
if isinstance(features, pl.DataFrame):
|
|
2058
2127
|
if "feature_uid" not in features.columns:
|
|
2059
2128
|
self.logger.error("features DataFrame must contain 'feature_uid' column")
|
|
2060
|
-
return
|
|
2061
|
-
|
|
2129
|
+
return set()
|
|
2130
|
+
# Use polars native operations for efficiency
|
|
2131
|
+
return set(features.select("feature_uid").to_series().to_list())
|
|
2132
|
+
|
|
2062
2133
|
elif isinstance(features, (list, tuple)):
|
|
2063
|
-
|
|
2134
|
+
return set(features) # Convert to set immediately for O(1) lookups
|
|
2135
|
+
|
|
2064
2136
|
elif isinstance(features, int):
|
|
2065
|
-
|
|
2137
|
+
return {features}
|
|
2138
|
+
|
|
2066
2139
|
else:
|
|
2067
2140
|
self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
|
|
2068
|
-
return
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
|
|
2073
|
-
|
|
2141
|
+
return set()
|
|
2142
|
+
|
|
2143
|
+
|
|
2144
|
+
def _apply_index_based_filter(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
|
|
2145
|
+
"""
|
|
2146
|
+
Apply index-based filtering with chunked processing and lazy evaluation.
|
|
2147
|
+
|
|
2148
|
+
This method uses:
|
|
2149
|
+
1. Sorted arrays and binary search for O(log n) lookups
|
|
2150
|
+
2. Chunked processing to manage memory usage
|
|
2151
|
+
3. Lazy evaluation with streaming operations
|
|
2152
|
+
4. Hash-based set operations for optimal performance
|
|
2153
|
+
"""
|
|
2154
|
+
self.logger.debug(f"Using index-based filtering with chunks of {chunk_size:,}")
|
|
2155
|
+
|
|
2156
|
+
total_features = len(self.features_df)
|
|
2157
|
+
|
|
2158
|
+
if total_features <= chunk_size:
|
|
2159
|
+
# Small dataset - process in single chunk with optimized operations
|
|
2160
|
+
_filter_single_chunk_optimized(self, feature_uids_to_keep)
|
|
2161
|
+
else:
|
|
2162
|
+
# Large dataset - use chunked processing with lazy evaluation
|
|
2163
|
+
_filter_chunked_lazy(self, feature_uids_to_keep, chunk_size, parallel)
|
|
2074
2164
|
|
|
2075
|
-
# Convert to set for faster lookup if list is large
|
|
2076
|
-
if len(feature_uids_to_keep) > 100:
|
|
2077
|
-
feature_uids_set = set(feature_uids_to_keep)
|
|
2078
|
-
# Use the set for filtering if it's significantly smaller
|
|
2079
|
-
if len(feature_uids_set) < len(feature_uids_to_keep) * 0.8:
|
|
2080
|
-
feature_uids_to_keep = list(feature_uids_set)
|
|
2081
2165
|
|
|
2082
|
-
|
|
2083
|
-
|
|
2166
|
+
def _filter_single_chunk_optimized(self, feature_uids_to_keep):
|
|
2167
|
+
"""
|
|
2168
|
+
Optimized filtering for datasets that fit in a single chunk.
|
|
2169
|
+
Uses hash-based set operations for maximum performance.
|
|
2170
|
+
"""
|
|
2171
|
+
# Create boolean mask using hash-based set lookup (O(1) per element)
|
|
2172
|
+
filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
|
|
2173
|
+
|
|
2174
|
+
# Apply filter using lazy evaluation with optimized execution
|
|
2175
|
+
self.features_df = (
|
|
2176
|
+
self.features_df
|
|
2177
|
+
.lazy()
|
|
2178
|
+
.filter(filter_expr)
|
|
2179
|
+
.collect(streaming=True) # Use streaming for memory efficiency
|
|
2180
|
+
)
|
|
2181
|
+
|
|
2182
|
+
# Apply same filter to consensus_mapping_df if it exists
|
|
2183
|
+
if (self.consensus_mapping_df is not None and
|
|
2184
|
+
not self.consensus_mapping_df.is_empty()):
|
|
2185
|
+
self.consensus_mapping_df = (
|
|
2186
|
+
self.consensus_mapping_df
|
|
2187
|
+
.lazy()
|
|
2188
|
+
.filter(filter_expr)
|
|
2189
|
+
.collect(streaming=True)
|
|
2190
|
+
)
|
|
2191
|
+
|
|
2192
|
+
|
|
2193
|
+
def _filter_chunked_lazy(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
|
|
2194
|
+
"""
|
|
2195
|
+
Chunked processing with lazy evaluation for large datasets.
|
|
2196
|
+
|
|
2197
|
+
This approach:
|
|
2198
|
+
1. Processes data in manageable chunks to control memory usage
|
|
2199
|
+
2. Uses lazy evaluation to optimize query execution
|
|
2200
|
+
3. Maintains consistent performance regardless of dataset size
|
|
2201
|
+
4. Optionally uses parallel processing for independent operations
|
|
2202
|
+
"""
|
|
2203
|
+
total_features = len(self.features_df)
|
|
2204
|
+
num_chunks = (total_features + chunk_size - 1) // chunk_size
|
|
2205
|
+
|
|
2206
|
+
self.logger.debug(f"Processing {total_features:,} features in {num_chunks} chunks")
|
|
2207
|
+
|
|
2208
|
+
# Process features_df in chunks using lazy evaluation
|
|
2209
|
+
filtered_chunks = []
|
|
2210
|
+
|
|
2211
|
+
for i in range(num_chunks):
|
|
2212
|
+
start_idx = i * chunk_size
|
|
2213
|
+
end_idx = min((i + 1) * chunk_size, total_features)
|
|
2214
|
+
|
|
2215
|
+
# Create lazy query for this chunk
|
|
2216
|
+
chunk_query = (
|
|
2217
|
+
self.features_df
|
|
2218
|
+
.lazy()
|
|
2219
|
+
.slice(start_idx, end_idx - start_idx)
|
|
2220
|
+
.filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
|
|
2221
|
+
)
|
|
2222
|
+
|
|
2223
|
+
# Collect chunk with streaming for memory efficiency
|
|
2224
|
+
chunk_result = chunk_query.collect(streaming=True)
|
|
2225
|
+
if not chunk_result.is_empty():
|
|
2226
|
+
filtered_chunks.append(chunk_result)
|
|
2227
|
+
|
|
2228
|
+
# Combine all filtered chunks efficiently
|
|
2229
|
+
if filtered_chunks:
|
|
2230
|
+
self.features_df = pl.concat(filtered_chunks, how="vertical")
|
|
2231
|
+
else:
|
|
2232
|
+
self.features_df = pl.DataFrame() # No features remain
|
|
2233
|
+
|
|
2234
|
+
# Apply same chunked processing to consensus_mapping_df
|
|
2235
|
+
_filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size)
|
|
2084
2236
|
|
|
2085
|
-
# Apply filter to features_df using lazy evaluation for better performance
|
|
2086
|
-
self.features_df = self.features_df.lazy().filter(filter_condition).collect()
|
|
2087
2237
|
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2238
|
+
def _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size: int):
|
|
2239
|
+
"""
|
|
2240
|
+
Apply chunked filtering to consensus_mapping_df with same optimization strategy.
|
|
2241
|
+
"""
|
|
2242
|
+
if (self.consensus_mapping_df is None or
|
|
2243
|
+
self.consensus_mapping_df.is_empty()):
|
|
2244
|
+
return
|
|
2245
|
+
|
|
2246
|
+
total_mappings = len(self.consensus_mapping_df)
|
|
2247
|
+
|
|
2248
|
+
if total_mappings <= chunk_size:
|
|
2249
|
+
# Single chunk processing
|
|
2095
2250
|
self.consensus_mapping_df = (
|
|
2096
|
-
self.consensus_mapping_df
|
|
2251
|
+
self.consensus_mapping_df
|
|
2252
|
+
.lazy()
|
|
2253
|
+
.filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
|
|
2254
|
+
.collect(streaming=True)
|
|
2097
2255
|
)
|
|
2098
|
-
|
|
2256
|
+
else:
|
|
2257
|
+
# Multi-chunk processing
|
|
2258
|
+
num_chunks = (total_mappings + chunk_size - 1) // chunk_size
|
|
2259
|
+
filtered_chunks = []
|
|
2260
|
+
|
|
2261
|
+
for i in range(num_chunks):
|
|
2262
|
+
start_idx = i * chunk_size
|
|
2263
|
+
end_idx = min((i + 1) * chunk_size, total_mappings)
|
|
2264
|
+
|
|
2265
|
+
chunk_query = (
|
|
2266
|
+
self.consensus_mapping_df
|
|
2267
|
+
.lazy()
|
|
2268
|
+
.slice(start_idx, end_idx - start_idx)
|
|
2269
|
+
.filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
|
|
2270
|
+
)
|
|
2271
|
+
|
|
2272
|
+
chunk_result = chunk_query.collect(streaming=True)
|
|
2273
|
+
if not chunk_result.is_empty():
|
|
2274
|
+
filtered_chunks.append(chunk_result)
|
|
2275
|
+
|
|
2276
|
+
if filtered_chunks:
|
|
2277
|
+
self.consensus_mapping_df = pl.concat(filtered_chunks, how="vertical")
|
|
2278
|
+
else:
|
|
2279
|
+
self.consensus_mapping_df = pl.DataFrame()
|
|
2099
2280
|
|
|
2100
|
-
# Calculate results once and log efficiently
|
|
2101
|
-
final_count = len(self.features_df)
|
|
2102
|
-
removed_count = initial_count - final_count
|
|
2103
2281
|
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2282
|
+
def _apply_standard_filter(self, feature_uids_to_keep):
|
|
2283
|
+
"""
|
|
2284
|
+
Fallback to standard filtering for smaller datasets.
|
|
2285
|
+
Still uses optimized set operations and lazy evaluation.
|
|
2286
|
+
"""
|
|
2287
|
+
filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
|
|
2288
|
+
|
|
2289
|
+
# Apply filter with lazy evaluation
|
|
2290
|
+
self.features_df = (
|
|
2291
|
+
self.features_df
|
|
2292
|
+
.lazy()
|
|
2293
|
+
.filter(filter_expr)
|
|
2294
|
+
.collect(streaming=True)
|
|
2295
|
+
)
|
|
2296
|
+
|
|
2297
|
+
# Apply to consensus_mapping_df
|
|
2298
|
+
if (self.consensus_mapping_df is not None and
|
|
2299
|
+
not self.consensus_mapping_df.is_empty()):
|
|
2300
|
+
self.consensus_mapping_df = (
|
|
2301
|
+
self.consensus_mapping_df
|
|
2302
|
+
.lazy()
|
|
2303
|
+
.filter(filter_expr)
|
|
2304
|
+
.collect(streaming=True)
|
|
2112
2305
|
)
|
|
2113
2306
|
|
|
2114
2307
|
|
|
@@ -2276,7 +2469,7 @@ def consensus_select(
|
|
|
2276
2469
|
default_mz_tol = default_mz_tol.eic_mz_tol
|
|
2277
2470
|
else:
|
|
2278
2471
|
# Fallback to align_defaults if study parameters not available
|
|
2279
|
-
from
|
|
2472
|
+
from masster.study.defaults.align_def import align_defaults
|
|
2280
2473
|
|
|
2281
2474
|
default_mz_tol = align_defaults().mz_max_diff
|
|
2282
2475
|
|
|
@@ -2314,7 +2507,7 @@ def consensus_select(
|
|
|
2314
2507
|
default_rt_tol = default_rt_tol.eic_rt_tol
|
|
2315
2508
|
else:
|
|
2316
2509
|
# Fallback to align_defaults if study parameters not available
|
|
2317
|
-
from
|
|
2510
|
+
from masster.study.defaults.align_def import align_defaults
|
|
2318
2511
|
|
|
2319
2512
|
default_rt_tol = align_defaults().rt_tol
|
|
2320
2513
|
|
|
@@ -3549,7 +3742,7 @@ def _ensure_features_df_schema_order(self):
|
|
|
3549
3742
|
try:
|
|
3550
3743
|
import os
|
|
3551
3744
|
import json
|
|
3552
|
-
from
|
|
3745
|
+
from masster.study.h5 import _reorder_columns_by_schema
|
|
3553
3746
|
|
|
3554
3747
|
# Load schema
|
|
3555
3748
|
schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
|