masster 0.4.21__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/save.py +0 -2
- masster/study/export.py +3 -3
- masster/study/load.py +39 -3
- masster/study/plot.py +123 -190
- masster/study/processing.py +105 -11
- masster/wizard/wizard.py +8 -8
- {masster-0.4.21.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.21.dist-info → masster-0.4.22.dist-info}/RECORD +12 -12
- {masster-0.4.21.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.21.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.21.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/_version.py
CHANGED
masster/sample/save.py
CHANGED
|
@@ -344,7 +344,6 @@ def export_mgf(
|
|
|
344
344
|
tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
345
345
|
|
|
346
346
|
# First pass: Export MS1 spectra for ALL features with ms1_spec data
|
|
347
|
-
print("Exporting MS1 spectra...")
|
|
348
347
|
for row in tqdm(
|
|
349
348
|
features_list,
|
|
350
349
|
total=len(features_list),
|
|
@@ -398,7 +397,6 @@ def export_mgf(
|
|
|
398
397
|
ms1_fallback_count += 1
|
|
399
398
|
|
|
400
399
|
# Second pass: Export MS2 spectra for features with MS2 data
|
|
401
|
-
print("Exporting MS2 spectra...")
|
|
402
400
|
for row in tqdm(
|
|
403
401
|
features_list,
|
|
404
402
|
total=len(features_list),
|
masster/study/export.py
CHANGED
|
@@ -498,7 +498,7 @@ def export_mgf(self, **kwargs):
|
|
|
498
498
|
self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
|
|
499
499
|
|
|
500
500
|
|
|
501
|
-
def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None:
|
|
501
|
+
def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs) -> None:
|
|
502
502
|
"""
|
|
503
503
|
Export the study as a fully compliant mzTab-M file.
|
|
504
504
|
|
|
@@ -1184,7 +1184,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
|
|
|
1184
1184
|
self.logger.info(f"Exported mzTab-M to {filename}")
|
|
1185
1185
|
|
|
1186
1186
|
|
|
1187
|
-
def export_xlsx(self, filename: str = None) -> None:
|
|
1187
|
+
def export_xlsx(self, filename: str | None = None) -> None:
|
|
1188
1188
|
"""
|
|
1189
1189
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
1190
1190
|
|
|
@@ -1295,7 +1295,7 @@ def export_xlsx(self, filename: str = None) -> None:
|
|
|
1295
1295
|
self.logger.error(f"Error writing Excel file: {e}")
|
|
1296
1296
|
|
|
1297
1297
|
|
|
1298
|
-
def export_parquet(self, filename: str = None) -> None:
|
|
1298
|
+
def export_parquet(self, filename: str | None = None) -> None:
|
|
1299
1299
|
"""
|
|
1300
1300
|
Export the study data to multiple Parquet files with different suffixes.
|
|
1301
1301
|
|
masster/study/load.py
CHANGED
|
@@ -1257,17 +1257,53 @@ def load_features(self):
|
|
|
1257
1257
|
feature_map = oms.FeatureMap()
|
|
1258
1258
|
|
|
1259
1259
|
# Convert DataFrame features to OpenMS Features
|
|
1260
|
+
# Keep track of next available feature_id for this sample
|
|
1261
|
+
next_feature_id = 1
|
|
1262
|
+
used_feature_ids = set()
|
|
1263
|
+
|
|
1264
|
+
# First pass: collect existing feature_ids to avoid conflicts
|
|
1265
|
+
for feature_row in sample_features.iter_rows(named=True):
|
|
1266
|
+
if feature_row["feature_id"] is not None:
|
|
1267
|
+
used_feature_ids.add(int(feature_row["feature_id"]))
|
|
1268
|
+
|
|
1269
|
+
# Find the next available feature_id
|
|
1270
|
+
while next_feature_id in used_feature_ids:
|
|
1271
|
+
next_feature_id += 1
|
|
1272
|
+
|
|
1260
1273
|
for feature_row in sample_features.iter_rows(named=True):
|
|
1261
1274
|
feature = oms.Feature()
|
|
1262
1275
|
|
|
1263
1276
|
# Set properties from DataFrame (handle missing values gracefully)
|
|
1264
1277
|
try:
|
|
1265
|
-
|
|
1278
|
+
# Skip features with missing critical data
|
|
1279
|
+
if feature_row["mz"] is None:
|
|
1280
|
+
self.logger.warning("Skipping feature due to missing mz")
|
|
1281
|
+
continue
|
|
1282
|
+
if feature_row["rt"] is None:
|
|
1283
|
+
self.logger.warning("Skipping feature due to missing rt")
|
|
1284
|
+
continue
|
|
1285
|
+
if feature_row["inty"] is None:
|
|
1286
|
+
self.logger.warning("Skipping feature due to missing inty")
|
|
1287
|
+
continue
|
|
1288
|
+
|
|
1289
|
+
# Handle missing feature_id by generating a new one
|
|
1290
|
+
if feature_row["feature_id"] is None:
|
|
1291
|
+
feature_id = next_feature_id
|
|
1292
|
+
next_feature_id += 1
|
|
1293
|
+
self.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID")
|
|
1294
|
+
else:
|
|
1295
|
+
feature_id = int(feature_row["feature_id"])
|
|
1296
|
+
|
|
1297
|
+
feature.setUniqueId(feature_id)
|
|
1266
1298
|
feature.setMZ(float(feature_row["mz"]))
|
|
1267
1299
|
feature.setRT(float(feature_row["rt"]))
|
|
1268
1300
|
feature.setIntensity(float(feature_row["inty"]))
|
|
1269
|
-
|
|
1270
|
-
|
|
1301
|
+
|
|
1302
|
+
# Handle optional fields that might be None
|
|
1303
|
+
if feature_row.get("quality") is not None:
|
|
1304
|
+
feature.setOverallQuality(float(feature_row["quality"]))
|
|
1305
|
+
if feature_row.get("charge") is not None:
|
|
1306
|
+
feature.setCharge(int(feature_row["charge"]))
|
|
1271
1307
|
|
|
1272
1308
|
# Add to feature map
|
|
1273
1309
|
feature_map.push_back(feature)
|
masster/study/plot.py
CHANGED
|
@@ -1724,221 +1724,154 @@ def plot_consensus_stats(
|
|
|
1724
1724
|
self,
|
|
1725
1725
|
filename=None,
|
|
1726
1726
|
width=1200,
|
|
1727
|
-
height=
|
|
1727
|
+
height=None,
|
|
1728
1728
|
alpha=0.6,
|
|
1729
|
-
|
|
1729
|
+
bins=30,
|
|
1730
|
+
n_cols=4,
|
|
1730
1731
|
):
|
|
1731
1732
|
"""
|
|
1732
|
-
Plot
|
|
1733
|
-
|
|
1733
|
+
Plot histograms/distributions for all numeric columns in consensus_df.
|
|
1734
|
+
|
|
1734
1735
|
Parameters:
|
|
1735
1736
|
filename (str, optional): Output filename for saving the plot
|
|
1736
1737
|
width (int): Overall width of the plot (default: 1200)
|
|
1737
|
-
height (int): Overall height of the plot (
|
|
1738
|
-
alpha (float):
|
|
1739
|
-
|
|
1738
|
+
height (int, optional): Overall height of the plot (auto-calculated if None)
|
|
1739
|
+
alpha (float): Histogram transparency (default: 0.6)
|
|
1740
|
+
bins (int): Number of histogram bins (default: 30)
|
|
1741
|
+
n_cols (int): Number of columns in the grid layout (default: 4)
|
|
1740
1742
|
"""
|
|
1741
1743
|
from bokeh.layouts import gridplot
|
|
1742
|
-
from bokeh.
|
|
1743
|
-
|
|
1744
|
+
from bokeh.plotting import figure
|
|
1745
|
+
import polars as pl
|
|
1746
|
+
import numpy as np
|
|
1744
1747
|
|
|
1745
1748
|
# Check if consensus_df exists and has data
|
|
1746
1749
|
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
1747
1750
|
self.logger.error("No consensus data available. Run merge/find_consensus first.")
|
|
1748
1751
|
return
|
|
1749
1752
|
|
|
1750
|
-
#
|
|
1751
|
-
columns = [
|
|
1752
|
-
"rt",
|
|
1753
|
-
"mz",
|
|
1754
|
-
"number_samples",
|
|
1755
|
-
"log10_quality",
|
|
1756
|
-
"mz_delta_mean",
|
|
1757
|
-
"rt_delta_mean",
|
|
1758
|
-
"chrom_coherence_mean",
|
|
1759
|
-
"chrom_prominence_scaled_mean",
|
|
1760
|
-
"inty_mean",
|
|
1761
|
-
"number_ms2",
|
|
1762
|
-
]
|
|
1763
|
-
|
|
1764
|
-
# Check which columns exist in the dataframe and compute missing ones
|
|
1765
|
-
available_columns = self.consensus_df.columns
|
|
1753
|
+
# Get all columns and their data types - work with original dataframe
|
|
1766
1754
|
data_df = self.consensus_df.clone()
|
|
1767
1755
|
|
|
1768
|
-
#
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1756
|
+
# Identify numeric columns (excluding ID columns that are typically strings)
|
|
1757
|
+
id_columns = ["consensus_uid", "consensus_id", "uid", "id"]
|
|
1758
|
+
numeric_columns = []
|
|
1759
|
+
|
|
1760
|
+
for col in data_df.columns:
|
|
1761
|
+
if col not in id_columns:
|
|
1762
|
+
dtype = data_df[col].dtype
|
|
1763
|
+
# Check if column is numeric (int, float, or can be converted to numeric)
|
|
1764
|
+
if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
|
|
1765
|
+
pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
|
|
1766
|
+
pl.Float32, pl.Float64]:
|
|
1767
|
+
numeric_columns.append(col)
|
|
1768
|
+
|
|
1769
|
+
if len(numeric_columns) == 0:
|
|
1770
|
+
self.logger.error("No numeric columns found in consensus_df for plotting distributions.")
|
|
1779
1771
|
return
|
|
1780
1772
|
|
|
1781
|
-
self.logger.debug(f"Creating
|
|
1782
|
-
|
|
1783
|
-
# Add important ID columns for tooltips even if not plotting them
|
|
1784
|
-
tooltip_columns = []
|
|
1785
|
-
for id_col in ["consensus_uid", "consensus_id"]:
|
|
1786
|
-
if id_col in data_df.columns and id_col not in final_columns:
|
|
1787
|
-
tooltip_columns.append(id_col)
|
|
1773
|
+
self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} numeric columns: {numeric_columns}")
|
|
1788
1774
|
|
|
1789
|
-
#
|
|
1790
|
-
|
|
1791
|
-
data_pd = data_df.select(all_columns).to_pandas()
|
|
1775
|
+
# Work directly with Polars - no conversion to pandas needed
|
|
1776
|
+
data_df_clean = data_df.select(numeric_columns)
|
|
1792
1777
|
|
|
1793
|
-
#
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1778
|
+
# Check if all numeric columns are empty
|
|
1779
|
+
all_columns_empty = True
|
|
1780
|
+
for col in numeric_columns:
|
|
1781
|
+
# Check if column has any non-null, finite values
|
|
1782
|
+
non_null_count = data_df_clean[col].filter(
|
|
1783
|
+
data_df_clean[col].is_not_null() &
|
|
1784
|
+
(data_df_clean[col].is_finite() if data_df_clean[col].dtype in [pl.Float32, pl.Float64] else pl.lit(True))
|
|
1785
|
+
).len()
|
|
1786
|
+
|
|
1787
|
+
if non_null_count > 0:
|
|
1788
|
+
all_columns_empty = False
|
|
1789
|
+
break
|
|
1790
|
+
|
|
1791
|
+
if all_columns_empty:
|
|
1792
|
+
self.logger.error("All numeric columns contain only NaN/infinite values.")
|
|
1798
1793
|
return
|
|
1799
1794
|
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
#
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
plot_height_normal = 120 # Standard height
|
|
1813
|
-
plot_height_last = 155 # Taller last row to accommodate x-axis labels while keeping inner plot area same size
|
|
1795
|
+
# Calculate grid dimensions
|
|
1796
|
+
n_plots = len(numeric_columns)
|
|
1797
|
+
n_rows = (n_plots + n_cols - 1) // n_cols # Ceiling division
|
|
1798
|
+
|
|
1799
|
+
# Auto-calculate height if not provided
|
|
1800
|
+
if height is None:
|
|
1801
|
+
plot_height = 300
|
|
1802
|
+
height = plot_height * n_rows + 100 # Add some padding
|
|
1803
|
+
else:
|
|
1804
|
+
plot_height = (height - 100) // n_rows # Subtract padding and divide
|
|
1805
|
+
|
|
1806
|
+
plot_width = (width - 100) // n_cols # Subtract padding and divide
|
|
1814
1807
|
|
|
1815
|
-
# Create
|
|
1808
|
+
# Create plots grid
|
|
1816
1809
|
plots = []
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
hist, edges = np.histogram(data_pd[x_var], bins=30)
|
|
1880
|
-
p.quad(
|
|
1881
|
-
top=hist,
|
|
1882
|
-
bottom=0,
|
|
1883
|
-
left=edges[:-1],
|
|
1884
|
-
right=edges[1:],
|
|
1885
|
-
fill_color="green",
|
|
1886
|
-
line_color="white",
|
|
1887
|
-
alpha=alpha,
|
|
1888
|
-
)
|
|
1889
|
-
else:
|
|
1890
|
-
# Off-diagonal: scatter plot
|
|
1891
|
-
scatter = p.scatter(
|
|
1892
|
-
x=x_var,
|
|
1893
|
-
y=y_var,
|
|
1894
|
-
size=markersize,
|
|
1895
|
-
alpha=alpha,
|
|
1896
|
-
color="blue",
|
|
1897
|
-
source=source,
|
|
1898
|
-
)
|
|
1899
|
-
|
|
1900
|
-
# Add hover tool
|
|
1901
|
-
hover = HoverTool(
|
|
1902
|
-
tooltips=[
|
|
1903
|
-
(x_var, f"@{x_var}{{0.0000}}"),
|
|
1904
|
-
(y_var, f"@{y_var}{{0.0000}}"),
|
|
1905
|
-
(
|
|
1906
|
-
"consensus_uid",
|
|
1907
|
-
"@consensus_uid"
|
|
1908
|
-
if "consensus_uid" in data_pd.columns
|
|
1909
|
-
else "@consensus_id"
|
|
1910
|
-
if "consensus_id" in data_pd.columns
|
|
1911
|
-
else "N/A",
|
|
1912
|
-
),
|
|
1913
|
-
("rt", "@rt{0.00}" if "rt" in data_pd.columns else "N/A"),
|
|
1914
|
-
("mz", "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"),
|
|
1915
|
-
],
|
|
1916
|
-
renderers=[scatter],
|
|
1917
|
-
)
|
|
1918
|
-
p.add_tools(hover)
|
|
1919
|
-
|
|
1920
|
-
row.append(p)
|
|
1921
|
-
plots.append(row)
|
|
1922
|
-
|
|
1923
|
-
# Link axes for same variables
|
|
1924
|
-
for i in range(n_vars):
|
|
1925
|
-
for j in range(n_vars):
|
|
1926
|
-
if i != j: # Don't link diagonal plots
|
|
1927
|
-
# Link x-axis to other plots in same column
|
|
1928
|
-
for k in range(n_vars):
|
|
1929
|
-
if k != i and k != j:
|
|
1930
|
-
plots[i][j].x_range = plots[k][j].x_range
|
|
1931
|
-
|
|
1932
|
-
# Link y-axis to other plots in same row
|
|
1933
|
-
for k in range(n_vars):
|
|
1934
|
-
if k != j and k != i:
|
|
1935
|
-
plots[i][j].y_range = plots[i][k].y_range
|
|
1936
|
-
|
|
1937
|
-
# Create grid layout and force overall background/border to white so the outer
|
|
1938
|
-
# container doesn't show dark UI colors in night mode.
|
|
1810
|
+
current_row = []
|
|
1811
|
+
|
|
1812
|
+
for i, col in enumerate(numeric_columns):
|
|
1813
|
+
# Create histogram for this column
|
|
1814
|
+
p = figure(
|
|
1815
|
+
width=plot_width,
|
|
1816
|
+
height=plot_height,
|
|
1817
|
+
title=col,
|
|
1818
|
+
toolbar_location="above",
|
|
1819
|
+
tools="pan,wheel_zoom,box_zoom,reset,save"
|
|
1820
|
+
)
|
|
1821
|
+
|
|
1822
|
+
# Set white background
|
|
1823
|
+
p.background_fill_color = "white"
|
|
1824
|
+
p.border_fill_color = "white"
|
|
1825
|
+
|
|
1826
|
+
# Calculate histogram using Polars
|
|
1827
|
+
# Get valid (non-null, finite) values for this column
|
|
1828
|
+
if data_df_clean[col].dtype in [pl.Float32, pl.Float64]:
|
|
1829
|
+
valid_values = data_df_clean.filter(
|
|
1830
|
+
data_df_clean[col].is_not_null() & data_df_clean[col].is_finite()
|
|
1831
|
+
)[col]
|
|
1832
|
+
else:
|
|
1833
|
+
valid_values = data_df_clean.filter(data_df_clean[col].is_not_null())[col]
|
|
1834
|
+
|
|
1835
|
+
if valid_values.len() == 0:
|
|
1836
|
+
self.logger.warning(f"No valid values for column {col}")
|
|
1837
|
+
continue
|
|
1838
|
+
|
|
1839
|
+
# Convert to numpy for histogram calculation
|
|
1840
|
+
values_array = valid_values.to_numpy()
|
|
1841
|
+
hist, edges = np.histogram(values_array, bins=bins)
|
|
1842
|
+
|
|
1843
|
+
# Create histogram bars
|
|
1844
|
+
p.quad(
|
|
1845
|
+
top=hist,
|
|
1846
|
+
bottom=0,
|
|
1847
|
+
left=edges[:-1],
|
|
1848
|
+
right=edges[1:],
|
|
1849
|
+
fill_color="steelblue",
|
|
1850
|
+
line_color="white",
|
|
1851
|
+
alpha=alpha,
|
|
1852
|
+
)
|
|
1853
|
+
|
|
1854
|
+
# Style the plot
|
|
1855
|
+
p.title.text_font_size = "12pt"
|
|
1856
|
+
p.xaxis.axis_label = col
|
|
1857
|
+
p.yaxis.axis_label = "Count"
|
|
1858
|
+
p.grid.visible = True
|
|
1859
|
+
p.grid.grid_line_color = "#E0E0E0"
|
|
1860
|
+
|
|
1861
|
+
current_row.append(p)
|
|
1862
|
+
|
|
1863
|
+
# If we've filled a row or reached the end, add the row to plots
|
|
1864
|
+
if len(current_row) == n_cols or i == n_plots - 1:
|
|
1865
|
+
# Fill remaining spots in the last row with None if needed
|
|
1866
|
+
while len(current_row) < n_cols and i == n_plots - 1:
|
|
1867
|
+
current_row.append(None)
|
|
1868
|
+
plots.append(current_row)
|
|
1869
|
+
current_row = []
|
|
1870
|
+
|
|
1871
|
+
# Create grid layout
|
|
1939
1872
|
grid = gridplot(plots)
|
|
1940
|
-
|
|
1941
|
-
# Set overall background
|
|
1873
|
+
|
|
1874
|
+
# Set overall background to white
|
|
1942
1875
|
if hasattr(grid, "background_fill_color"):
|
|
1943
1876
|
grid.background_fill_color = "white"
|
|
1944
1877
|
if hasattr(grid, "border_fill_color"):
|
masster/study/processing.py
CHANGED
|
@@ -15,6 +15,85 @@ from masster.study.defaults import (
|
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
def _generate_feature_maps_on_demand_for_align(study):
|
|
19
|
+
"""
|
|
20
|
+
Generate feature maps on-demand from study.features_df for alignment operations.
|
|
21
|
+
Returns temporary feature maps that are not cached in the study.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
study: Study object containing features_df and samples_df
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
list: List of temporary FeatureMap objects
|
|
28
|
+
"""
|
|
29
|
+
import polars as pl
|
|
30
|
+
import pyopenms as oms
|
|
31
|
+
|
|
32
|
+
if study.features_df is None or len(study.features_df) == 0:
|
|
33
|
+
study.logger.error("No features_df available for generating feature maps")
|
|
34
|
+
return []
|
|
35
|
+
|
|
36
|
+
temp_feature_maps = []
|
|
37
|
+
|
|
38
|
+
# Process each sample in order
|
|
39
|
+
for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
|
|
40
|
+
sample_uid = row_dict["sample_uid"]
|
|
41
|
+
sample_name = row_dict["sample_name"]
|
|
42
|
+
|
|
43
|
+
# Get features for this sample from features_df
|
|
44
|
+
sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
|
|
45
|
+
|
|
46
|
+
# Create new FeatureMap
|
|
47
|
+
feature_map = oms.FeatureMap()
|
|
48
|
+
|
|
49
|
+
# Convert DataFrame features to OpenMS Features
|
|
50
|
+
for feature_row in sample_features.iter_rows(named=True):
|
|
51
|
+
feature = oms.Feature()
|
|
52
|
+
|
|
53
|
+
# Set properties from DataFrame (handle missing values gracefully)
|
|
54
|
+
try:
|
|
55
|
+
# Skip features with missing critical data
|
|
56
|
+
if feature_row["mz"] is None:
|
|
57
|
+
study.logger.warning("Skipping feature due to missing mz")
|
|
58
|
+
continue
|
|
59
|
+
if feature_row["rt"] is None:
|
|
60
|
+
study.logger.warning("Skipping feature due to missing rt")
|
|
61
|
+
continue
|
|
62
|
+
if feature_row["inty"] is None:
|
|
63
|
+
study.logger.warning("Skipping feature due to missing inty")
|
|
64
|
+
continue
|
|
65
|
+
|
|
66
|
+
# Handle missing feature_id by generating a new one
|
|
67
|
+
if feature_row["feature_id"] is None:
|
|
68
|
+
# Use a simple incremental ID for alignment purposes
|
|
69
|
+
feature_id = len(temp_feature_maps) * 100000 + feature_map.size() + 1
|
|
70
|
+
study.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID in sample {sample_name}")
|
|
71
|
+
else:
|
|
72
|
+
feature_id = int(feature_row["feature_id"])
|
|
73
|
+
|
|
74
|
+
feature.setUniqueId(feature_id)
|
|
75
|
+
feature.setMZ(float(feature_row["mz"]))
|
|
76
|
+
feature.setRT(float(feature_row["rt"]))
|
|
77
|
+
feature.setIntensity(float(feature_row["inty"]))
|
|
78
|
+
|
|
79
|
+
# Handle optional fields that might be None
|
|
80
|
+
if feature_row.get("quality") is not None:
|
|
81
|
+
feature.setOverallQuality(float(feature_row["quality"]))
|
|
82
|
+
if feature_row.get("charge") is not None:
|
|
83
|
+
feature.setCharge(int(feature_row["charge"]))
|
|
84
|
+
|
|
85
|
+
# Add to feature map
|
|
86
|
+
feature_map.push_back(feature)
|
|
87
|
+
except (ValueError, TypeError) as e:
|
|
88
|
+
study.logger.warning(f"Skipping feature due to conversion error: {e}")
|
|
89
|
+
continue
|
|
90
|
+
|
|
91
|
+
temp_feature_maps.append(feature_map)
|
|
92
|
+
|
|
93
|
+
study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
|
|
94
|
+
return temp_feature_maps
|
|
95
|
+
|
|
96
|
+
|
|
18
97
|
def align(self, **kwargs):
|
|
19
98
|
"""Align feature maps using pose clustering or KD algorithm and update feature RTs.
|
|
20
99
|
|
|
@@ -90,13 +169,9 @@ def align(self, **kwargs):
|
|
|
90
169
|
self.store_history(["align"], params.to_dict())
|
|
91
170
|
self.logger.debug("Parameters stored to align")
|
|
92
171
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
# self.logger.debug("Starting alignment")
|
|
98
|
-
|
|
99
|
-
fmaps = self.features_maps
|
|
172
|
+
# Generate temporary feature maps on-demand from features_df instead of using cached data
|
|
173
|
+
self.logger.debug("Generating feature maps on-demand from features_df for alignment")
|
|
174
|
+
fmaps = _generate_feature_maps_on_demand_for_align(self)
|
|
100
175
|
|
|
101
176
|
# Choose alignment algorithm
|
|
102
177
|
algorithm = params.get("algorithm").lower()
|
|
@@ -108,6 +183,9 @@ def align(self, **kwargs):
|
|
|
108
183
|
_align_kd_algorithm(self, fmaps, params)
|
|
109
184
|
else:
|
|
110
185
|
self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
|
|
186
|
+
# Clean up temporary feature maps to release memory
|
|
187
|
+
del fmaps
|
|
188
|
+
return
|
|
111
189
|
|
|
112
190
|
# check if rt_original exists in features_df, if not, add it after rt
|
|
113
191
|
if "rt_original" not in self.features_df.columns:
|
|
@@ -256,6 +334,10 @@ def align(self, **kwargs):
|
|
|
256
334
|
if params.get("save_features"):
|
|
257
335
|
self.save_samples()
|
|
258
336
|
|
|
337
|
+
# Clean up temporary feature maps to release memory
|
|
338
|
+
del fmaps
|
|
339
|
+
self.logger.debug("Temporary feature maps deleted to release memory")
|
|
340
|
+
|
|
259
341
|
|
|
260
342
|
def find_ms2(self, **kwargs):
|
|
261
343
|
"""
|
|
@@ -787,10 +869,22 @@ def _align_pose_clustering(study_obj, fmaps, params):
|
|
|
787
869
|
and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
|
|
788
870
|
):
|
|
789
871
|
continue
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
872
|
+
|
|
873
|
+
# Skip feature maps with insufficient data points for alignment
|
|
874
|
+
if fm.size() < 2:
|
|
875
|
+
sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
|
|
876
|
+
study_obj.logger.warning(f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)")
|
|
877
|
+
continue
|
|
878
|
+
|
|
879
|
+
try:
|
|
880
|
+
trafo = oms.TransformationDescription()
|
|
881
|
+
aligner.align(fm, trafo)
|
|
882
|
+
transformer = oms.MapAlignmentTransformer()
|
|
883
|
+
transformer.transformRetentionTimes(fm, trafo, True)
|
|
884
|
+
except RuntimeError as e:
|
|
885
|
+
sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
|
|
886
|
+
study_obj.logger.warning(f"Failed to align sample '{sample_name}': {e}")
|
|
887
|
+
continue
|
|
794
888
|
|
|
795
889
|
study_obj.alignment_ref_index = ref_index
|
|
796
890
|
|
masster/wizard/wizard.py
CHANGED
|
@@ -127,7 +127,7 @@ class wizard_def:
|
|
|
127
127
|
|
|
128
128
|
# === Feature Detection ===
|
|
129
129
|
chrom_fwhm: float = 0.5
|
|
130
|
-
|
|
130
|
+
noise: float = 50.0
|
|
131
131
|
chrom_peak_snr: float = 5.0
|
|
132
132
|
tol_ppm: float = 10.0
|
|
133
133
|
detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")
|
|
@@ -307,15 +307,15 @@ class Wizard:
|
|
|
307
307
|
"""
|
|
308
308
|
try:
|
|
309
309
|
# Find first file
|
|
310
|
-
for extension in ['.wiff', '.raw', '.mzML'
|
|
310
|
+
for extension in ['.wiff', '.raw', '.mzML']:
|
|
311
311
|
pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
|
|
312
312
|
files = list(self.source_path.rglob(pattern))
|
|
313
313
|
if files:
|
|
314
314
|
first_file = files[0]
|
|
315
315
|
break
|
|
316
316
|
else:
|
|
317
|
-
return
|
|
318
|
-
|
|
317
|
+
return 'positive'
|
|
318
|
+
|
|
319
319
|
# Only implement for .wiff files initially (most common format)
|
|
320
320
|
if first_file.suffix.lower() == '.wiff':
|
|
321
321
|
from masster.sample.load import _wiff_to_dict
|
|
@@ -337,7 +337,7 @@ class Wizard:
|
|
|
337
337
|
# Silently fall back to default if inference fails
|
|
338
338
|
pass
|
|
339
339
|
|
|
340
|
-
return
|
|
340
|
+
return 'positive'
|
|
341
341
|
|
|
342
342
|
@property
|
|
343
343
|
def polarity(self) -> str:
|
|
@@ -543,9 +543,9 @@ class Wizard:
|
|
|
543
543
|
' sample = Sample(log_label=sample_name)',
|
|
544
544
|
' sample.load(filename=str(raw_file))',
|
|
545
545
|
' sample.find_features(',
|
|
546
|
-
' noise=PARAMS[\'
|
|
547
|
-
' chrom_fwhm=PARAMS[\'
|
|
548
|
-
' chrom_peak_snr=PARAMS[\'
|
|
546
|
+
' noise=PARAMS[\'noise\'],',
|
|
547
|
+
' chrom_fwhm=PARAMS[\'chrom_fwhm\'],',
|
|
548
|
+
' chrom_peak_snr=PARAMS[\'chrom_peak_snr\']',
|
|
549
549
|
' )',
|
|
550
550
|
' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
|
|
551
551
|
' sample.find_ms2()',
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
masster/__init__.py,sha256=ueZ224WPNRRjQEYTaQUol818nwQgJwB93HbEfmtPRmg,1041
|
|
2
|
-
masster/_version.py,sha256=
|
|
2
|
+
masster/_version.py,sha256=vQFUBi9UR5DFflCbwobRTLg-jW9TsSQB1GlM1tfxYuc,257
|
|
3
3
|
masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
|
|
4
4
|
masster/logger.py,sha256=tR65N23zfrNpcZNbZm2ot_Aual9XrGB1MWjLrovZkMs,16749
|
|
5
5
|
masster/spectrum.py,sha256=XJSUrqXZSzfpWnD8v5IMClXMRZLKLYIk014qaMOS9_k,49738
|
|
@@ -28,7 +28,7 @@ masster/sample/processing.py,sha256=A1u5u7lGG0HR_ciUhJFmmwgugher7_AZQopNnbu65Bs,
|
|
|
28
28
|
masster/sample/quant.py,sha256=tHNjvUFTdehKR31BXBZnVsBxMD9XJHgaltITOjr71uE,7562
|
|
29
29
|
masster/sample/sample.py,sha256=uQP5DLdsRSC2YwZZvspsL9rgl_HefB-oxrL2dpgg_fc,19788
|
|
30
30
|
masster/sample/sample5_schema.json,sha256=H5e2T6rHIDzul2kp_yP-ILUUWUpW08wP2pEQjMR0nSk,3977
|
|
31
|
-
masster/sample/save.py,sha256=
|
|
31
|
+
masster/sample/save.py,sha256=2yQtcQcRJjgAKPImTydj7LpyyMop_Q9JKRlNEK4yU6k,36339
|
|
32
32
|
masster/sample/sciex.py,sha256=vnbxsq_qnAQVuzcpziP1o3IC4kM5amGBcPmC2TAuDLw,46319
|
|
33
33
|
masster/sample/defaults/__init__.py,sha256=A09AOP44cxD_oYohyt7XFUho0zndRcrzVD4DUaGnKH4,447
|
|
34
34
|
masster/sample/defaults/find_adducts_def.py,sha256=Bu2KiBJRxD0SAnOPNMm_Nk-6fx6QYoRXjFNGzz-0_o0,13570
|
|
@@ -37,15 +37,15 @@ masster/sample/defaults/find_ms2_def.py,sha256=KTELMAnioGLYbhzAwOgK14TZqboPEvzeB
|
|
|
37
37
|
masster/sample/defaults/get_spectrum_def.py,sha256=o62p31PhGd-LiIkTOzKQhwPtnO2AtQDHcPu-O-YoQPs,11460
|
|
38
38
|
masster/sample/defaults/sample_def.py,sha256=keoXyMyrm_iLgbYqfIbqCpJ3XHBVlNwCNmb5iMQL0iY,14579
|
|
39
39
|
masster/study/__init__.py,sha256=55axdFuqRX4aXtJ8ocnhcLB32fNtmmJpCi58moO0r4g,237
|
|
40
|
-
masster/study/export.py,sha256=
|
|
40
|
+
masster/study/export.py,sha256=c-UQPYRwNBde8E1cYOB-0ZZz2tBDTwglRMlPfSKYB0w,59291
|
|
41
41
|
masster/study/h5.py,sha256=eINlVmcJuntwbkkZHwzm10c63Kg7zib49vkzLDj1PyU,84790
|
|
42
42
|
masster/study/helpers.py,sha256=6nDTNlsZbZWf9L6D5qzK2TUO2y7UBq51Ftj8N4bkIAk,160260
|
|
43
43
|
masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
|
|
44
|
-
masster/study/load.py,sha256=
|
|
44
|
+
masster/study/load.py,sha256=mI6UyErlj3vIzSuG93fOjsxA7IIDCaiKfcuAcc2538o,72425
|
|
45
45
|
masster/study/merge.py,sha256=3R_Dg6l2mnJUu3gFVAgrAN5hFSQyfHbqYPmc2cUfJqQ,159232
|
|
46
46
|
masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
|
|
47
|
-
masster/study/plot.py,sha256=
|
|
48
|
-
masster/study/processing.py,sha256=
|
|
47
|
+
masster/study/plot.py,sha256=OGUa_dDTD2QydbLg-4APRZc7Jx1kk9eXC9-GOLLgI1I,87666
|
|
48
|
+
masster/study/processing.py,sha256=p0d-DyxA0YI6K9OPQZYTEs00DC6obr6-kLHPVWljEO0,56437
|
|
49
49
|
masster/study/save.py,sha256=BANh9F1s-q7MclO1Mq_-v4xQyHeloEgmoPgRDVc-9aE,9037
|
|
50
50
|
masster/study/study.py,sha256=rk-pJNg80N6xbROa9fqPfwVxFgzL_FLoSUNOTYeD5E0,40116
|
|
51
51
|
masster/study/study5_schema.json,sha256=ghBeAXFS4a4Uavdn6TUVs9GaR1QOTnADCjQTOkN0tjU,7563
|
|
@@ -64,9 +64,9 @@ masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7
|
|
|
64
64
|
masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
|
|
65
65
|
masster/wizard/__init__.py,sha256=a2hcZnHASjfuw1lqZhZnvTR58rc33rRnoGAY_JfvGhI,683
|
|
66
66
|
masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,7983
|
|
67
|
-
masster/wizard/wizard.py,sha256=
|
|
68
|
-
masster-0.4.
|
|
69
|
-
masster-0.4.
|
|
70
|
-
masster-0.4.
|
|
71
|
-
masster-0.4.
|
|
72
|
-
masster-0.4.
|
|
67
|
+
masster/wizard/wizard.py,sha256=esgaifLRyaGxytif9qOkTy-21VxlUQxrvl47K-l-BpE,37666
|
|
68
|
+
masster-0.4.22.dist-info/METADATA,sha256=CXrrzzCC5cZ_G9plLZyCtiNpTXevD0wPuUNm0mIy-a4,44207
|
|
69
|
+
masster-0.4.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
70
|
+
masster-0.4.22.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
|
|
71
|
+
masster-0.4.22.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
|
|
72
|
+
masster-0.4.22.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|