masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
|
@@ -1257,17 +1257,53 @@ def load_features(self):
|
|
|
1257
1257
|
feature_map = oms.FeatureMap()
|
|
1258
1258
|
|
|
1259
1259
|
# Convert DataFrame features to OpenMS Features
|
|
1260
|
+
# Keep track of next available feature_id for this sample
|
|
1261
|
+
next_feature_id = 1
|
|
1262
|
+
used_feature_ids = set()
|
|
1263
|
+
|
|
1264
|
+
# First pass: collect existing feature_ids to avoid conflicts
|
|
1265
|
+
for feature_row in sample_features.iter_rows(named=True):
|
|
1266
|
+
if feature_row["feature_id"] is not None:
|
|
1267
|
+
used_feature_ids.add(int(feature_row["feature_id"]))
|
|
1268
|
+
|
|
1269
|
+
# Find the next available feature_id
|
|
1270
|
+
while next_feature_id in used_feature_ids:
|
|
1271
|
+
next_feature_id += 1
|
|
1272
|
+
|
|
1260
1273
|
for feature_row in sample_features.iter_rows(named=True):
|
|
1261
1274
|
feature = oms.Feature()
|
|
1262
1275
|
|
|
1263
1276
|
# Set properties from DataFrame (handle missing values gracefully)
|
|
1264
1277
|
try:
|
|
1265
|
-
|
|
1278
|
+
# Skip features with missing critical data
|
|
1279
|
+
if feature_row["mz"] is None:
|
|
1280
|
+
self.logger.warning("Skipping feature due to missing mz")
|
|
1281
|
+
continue
|
|
1282
|
+
if feature_row["rt"] is None:
|
|
1283
|
+
self.logger.warning("Skipping feature due to missing rt")
|
|
1284
|
+
continue
|
|
1285
|
+
if feature_row["inty"] is None:
|
|
1286
|
+
self.logger.warning("Skipping feature due to missing inty")
|
|
1287
|
+
continue
|
|
1288
|
+
|
|
1289
|
+
# Handle missing feature_id by generating a new one
|
|
1290
|
+
if feature_row["feature_id"] is None:
|
|
1291
|
+
feature_id = next_feature_id
|
|
1292
|
+
next_feature_id += 1
|
|
1293
|
+
self.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID")
|
|
1294
|
+
else:
|
|
1295
|
+
feature_id = int(feature_row["feature_id"])
|
|
1296
|
+
|
|
1297
|
+
feature.setUniqueId(feature_id)
|
|
1266
1298
|
feature.setMZ(float(feature_row["mz"]))
|
|
1267
1299
|
feature.setRT(float(feature_row["rt"]))
|
|
1268
1300
|
feature.setIntensity(float(feature_row["inty"]))
|
|
1269
|
-
|
|
1270
|
-
|
|
1301
|
+
|
|
1302
|
+
# Handle optional fields that might be None
|
|
1303
|
+
if feature_row.get("quality") is not None:
|
|
1304
|
+
feature.setOverallQuality(float(feature_row["quality"]))
|
|
1305
|
+
if feature_row.get("charge") is not None:
|
|
1306
|
+
feature.setCharge(int(feature_row["charge"]))
|
|
1271
1307
|
|
|
1272
1308
|
# Add to feature map
|
|
1273
1309
|
feature_map.push_back(feature)
|
masster/study/merge.py
CHANGED
|
@@ -400,7 +400,7 @@ def merge(self, **kwargs) -> None:
|
|
|
400
400
|
# Feature maps will be generated on-demand within each merge method
|
|
401
401
|
|
|
402
402
|
self.logger.info(
|
|
403
|
-
f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da
|
|
403
|
+
f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
|
|
404
404
|
)
|
|
405
405
|
|
|
406
406
|
# Initialize
|
|
@@ -446,7 +446,7 @@ def merge(self, **kwargs) -> None:
|
|
|
446
446
|
# Note: _merge_qt_chunked populates consensus_df directly, no need to extract
|
|
447
447
|
|
|
448
448
|
# Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
|
|
449
|
-
if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked']:
|
|
449
|
+
if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
|
|
450
450
|
self._consensus_cleanup(params.rt_tol, params.mz_tol)
|
|
451
451
|
|
|
452
452
|
# Perform adduct grouping
|
|
@@ -705,11 +705,11 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
705
705
|
optimized_params = params
|
|
706
706
|
|
|
707
707
|
# Phase 1: Standard KD clustering
|
|
708
|
-
self.logger.
|
|
708
|
+
self.logger.debug("Initial KD clustering")
|
|
709
709
|
consensus_map = _merge_kd(self, optimized_params)
|
|
710
710
|
|
|
711
711
|
# Phase 2: Post-processing quality control
|
|
712
|
-
self.logger.
|
|
712
|
+
self.logger.debug("Post-processing quality control")
|
|
713
713
|
consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
|
|
714
714
|
|
|
715
715
|
return consensus_map
|
|
@@ -911,7 +911,7 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
|
|
|
911
911
|
final_feature_count = len(self.consensus_df)
|
|
912
912
|
reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
|
|
913
913
|
|
|
914
|
-
self.logger.info(f"
|
|
914
|
+
self.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
|
|
915
915
|
|
|
916
916
|
# Create a new consensus map for compatibility (the processed data is in consensus_df)
|
|
917
917
|
processed_consensus_map = oms.ConsensusMap()
|
|
@@ -1691,8 +1691,12 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1691
1691
|
mz_min_local = mz_max_local = consensus_mz
|
|
1692
1692
|
|
|
1693
1693
|
# Store chunk consensus with feature tracking
|
|
1694
|
+
# Generate unique 16-character consensus_id string
|
|
1695
|
+
import uuid
|
|
1696
|
+
consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
|
|
1697
|
+
|
|
1694
1698
|
chunk_consensus_data = {
|
|
1695
|
-
'consensus_id':
|
|
1699
|
+
'consensus_id': consensus_id_str,
|
|
1696
1700
|
'chunk_idx': chunk_idx,
|
|
1697
1701
|
'chunk_start_idx': chunk_start_idx,
|
|
1698
1702
|
'mz': consensus_mz,
|
|
@@ -1710,7 +1714,6 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1710
1714
|
}
|
|
1711
1715
|
|
|
1712
1716
|
all_chunk_consensus.append(chunk_consensus_data)
|
|
1713
|
-
consensus_id_counter += 1
|
|
1714
1717
|
|
|
1715
1718
|
if not all_chunk_consensus:
|
|
1716
1719
|
# No valid consensus features found
|
|
@@ -2094,9 +2097,13 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
2094
2097
|
ms2_count += len(ms2_scans)
|
|
2095
2098
|
|
|
2096
2099
|
# Build consensus metadata
|
|
2100
|
+
# Generate unique 16-character consensus_id string
|
|
2101
|
+
import uuid
|
|
2102
|
+
consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
|
|
2103
|
+
|
|
2097
2104
|
return {
|
|
2098
2105
|
"consensus_uid": int(consensus_uid),
|
|
2099
|
-
"consensus_id":
|
|
2106
|
+
"consensus_id": consensus_id_str, # Use unique 16-char string ID
|
|
2100
2107
|
"quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
|
|
2101
2108
|
"number_samples": number_samples if number_samples is not None else len(feature_data_list),
|
|
2102
2109
|
"rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
|
|
@@ -2118,6 +2125,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
2118
2125
|
"chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
|
|
2119
2126
|
"chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
|
|
2120
2127
|
"chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
|
|
2128
|
+
"iso": None, # Will be filled by find_iso() function
|
|
2121
2129
|
"iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
|
|
2122
2130
|
"charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
|
|
2123
2131
|
"number_ms2": int(ms2_count),
|
|
@@ -2509,10 +2517,14 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
|
|
|
2509
2517
|
if ms2_scans is not None:
|
|
2510
2518
|
ms2_count += len(ms2_scans)
|
|
2511
2519
|
|
|
2520
|
+
# Generate unique 16-character consensus_id string (UUID-based)
|
|
2521
|
+
import uuid
|
|
2522
|
+
consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
|
|
2523
|
+
|
|
2512
2524
|
metadata_list.append(
|
|
2513
2525
|
{
|
|
2514
2526
|
"consensus_uid": int(i), # "consensus_id": i,
|
|
2515
|
-
"consensus_id":
|
|
2527
|
+
"consensus_id": consensus_id_str, # Use unique 16-char string ID
|
|
2516
2528
|
"quality": round(float(feature.getQuality()), 3),
|
|
2517
2529
|
"number_samples": len(feature_data_list),
|
|
2518
2530
|
# "number_ext": int(len(features_list)),
|
|
@@ -2577,6 +2589,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
|
|
|
2577
2589
|
)
|
|
2578
2590
|
if len(height_scaled_values) > 0
|
|
2579
2591
|
else 0.0,
|
|
2592
|
+
"iso": None, # Will be filled by find_iso() function
|
|
2580
2593
|
"iso_mean": round(float(np.mean(iso_values)), 2)
|
|
2581
2594
|
if len(iso_values) > 0
|
|
2582
2595
|
else 0.0,
|
|
@@ -3325,7 +3338,9 @@ def _finalize_merge(self, link_ms2, min_samples):
|
|
|
3325
3338
|
f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
|
|
3326
3339
|
f"This may be due to min_samples ({min_samples}) being too high for the available data.",
|
|
3327
3340
|
)
|
|
3328
|
-
|
|
3341
|
+
|
|
3342
|
+
# add iso data from raw files.
|
|
3343
|
+
self.find_iso()
|
|
3329
3344
|
if link_ms2:
|
|
3330
3345
|
self.find_ms2()
|
|
3331
3346
|
|
masster/study/plot.py
CHANGED
|
@@ -42,9 +42,10 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
|
|
|
42
42
|
from bokeh.io.export import export_png
|
|
43
43
|
export_png(plot_object, filename=filename)
|
|
44
44
|
logger.info(f"Plot saved to: {abs_filename}")
|
|
45
|
-
except Exception:
|
|
45
|
+
except Exception as e:
|
|
46
46
|
# Fall back to HTML if PNG export not available
|
|
47
47
|
html_filename = filename.replace('.png', '.html')
|
|
48
|
+
abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.png', '.html')
|
|
48
49
|
from bokeh.resources import Resources
|
|
49
50
|
from bokeh.embed import file_html
|
|
50
51
|
|
|
@@ -54,7 +55,7 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
|
|
|
54
55
|
with open(html_filename, 'w', encoding='utf-8') as f:
|
|
55
56
|
f.write(html)
|
|
56
57
|
|
|
57
|
-
logger.warning(f"PNG export not available
|
|
58
|
+
logger.warning(f"PNG export not available ({str(e)}). Use export_png. Saved as HTML instead: {abs_html_filename}")
|
|
58
59
|
elif filename.endswith(".pdf"):
|
|
59
60
|
# Try to save as PDF, fall back to HTML if not available
|
|
60
61
|
try:
|
|
@@ -74,6 +75,26 @@ def _isolated_save_plot(plot_object, filename, abs_filename, logger, plot_title=
|
|
|
74
75
|
f.write(html)
|
|
75
76
|
|
|
76
77
|
logger.warning(f"PDF export not available, saved as HTML instead: {html_filename}")
|
|
78
|
+
elif filename.endswith(".svg"):
|
|
79
|
+
# Try to save as SVG, fall back to HTML if not available
|
|
80
|
+
try:
|
|
81
|
+
from bokeh.io.export import export_svg
|
|
82
|
+
export_svg(plot_object, filename=filename)
|
|
83
|
+
logger.info(f"Plot saved to: {abs_filename}")
|
|
84
|
+
except Exception as e:
|
|
85
|
+
# Fall back to HTML if SVG export not available
|
|
86
|
+
html_filename = filename.replace('.svg', '.html')
|
|
87
|
+
abs_html_filename = html_filename if abs_filename == filename else abs_filename.replace('.svg', '.html')
|
|
88
|
+
from bokeh.resources import Resources
|
|
89
|
+
from bokeh.embed import file_html
|
|
90
|
+
|
|
91
|
+
resources = Resources(mode='cdn')
|
|
92
|
+
html = file_html(plot_object, resources, title=plot_title)
|
|
93
|
+
|
|
94
|
+
with open(html_filename, 'w', encoding='utf-8') as f:
|
|
95
|
+
f.write(html)
|
|
96
|
+
|
|
97
|
+
logger.warning(f"SVG export not available ({str(e)}). Saved as HTML instead: {abs_html_filename}")
|
|
77
98
|
else:
|
|
78
99
|
# Default to HTML for unknown extensions using isolated approach
|
|
79
100
|
from bokeh.resources import Resources
|
|
@@ -181,6 +202,22 @@ def _isolated_save_panel_plot(panel_obj, filename, abs_filename, logger, plot_ti
|
|
|
181
202
|
logger.warning(f"PDF export not available, saved as HTML instead: {abs_html_filename}")
|
|
182
203
|
except Exception as e:
|
|
183
204
|
logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
|
|
205
|
+
elif filename.endswith(".svg"):
|
|
206
|
+
# Try to save as SVG, fall back to HTML if not available
|
|
207
|
+
try:
|
|
208
|
+
from bokeh.io.export import export_svg
|
|
209
|
+
bokeh_layout = panel_obj.get_root()
|
|
210
|
+
export_svg(bokeh_layout, filename=filename)
|
|
211
|
+
logger.info(f"{plot_title} saved to: {abs_filename}")
|
|
212
|
+
except Exception as e:
|
|
213
|
+
# Fall back to HTML if SVG export not available
|
|
214
|
+
html_filename = filename.replace('.svg', '.html')
|
|
215
|
+
abs_html_filename = os.path.abspath(html_filename)
|
|
216
|
+
try:
|
|
217
|
+
panel_obj.save(html_filename, embed=True)
|
|
218
|
+
logger.warning(f"SVG export not available ({str(e)}), saved as HTML instead: {abs_html_filename}")
|
|
219
|
+
except Exception as e:
|
|
220
|
+
logger.error(f"Failed to save {plot_title} as HTML fallback: {e}")
|
|
184
221
|
else:
|
|
185
222
|
# Default to HTML for unknown extensions
|
|
186
223
|
try:
|
|
@@ -1687,221 +1724,154 @@ def plot_consensus_stats(
|
|
|
1687
1724
|
self,
|
|
1688
1725
|
filename=None,
|
|
1689
1726
|
width=1200,
|
|
1690
|
-
height=
|
|
1727
|
+
height=None,
|
|
1691
1728
|
alpha=0.6,
|
|
1692
|
-
|
|
1729
|
+
bins=30,
|
|
1730
|
+
n_cols=4,
|
|
1693
1731
|
):
|
|
1694
1732
|
"""
|
|
1695
|
-
Plot
|
|
1696
|
-
|
|
1733
|
+
Plot histograms/distributions for all numeric columns in consensus_df.
|
|
1734
|
+
|
|
1697
1735
|
Parameters:
|
|
1698
1736
|
filename (str, optional): Output filename for saving the plot
|
|
1699
1737
|
width (int): Overall width of the plot (default: 1200)
|
|
1700
|
-
height (int): Overall height of the plot (
|
|
1701
|
-
alpha (float):
|
|
1702
|
-
|
|
1738
|
+
height (int, optional): Overall height of the plot (auto-calculated if None)
|
|
1739
|
+
alpha (float): Histogram transparency (default: 0.6)
|
|
1740
|
+
bins (int): Number of histogram bins (default: 30)
|
|
1741
|
+
n_cols (int): Number of columns in the grid layout (default: 4)
|
|
1703
1742
|
"""
|
|
1704
1743
|
from bokeh.layouts import gridplot
|
|
1705
|
-
from bokeh.
|
|
1706
|
-
|
|
1744
|
+
from bokeh.plotting import figure
|
|
1745
|
+
import polars as pl
|
|
1746
|
+
import numpy as np
|
|
1707
1747
|
|
|
1708
1748
|
# Check if consensus_df exists and has data
|
|
1709
1749
|
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
1710
1750
|
self.logger.error("No consensus data available. Run merge/find_consensus first.")
|
|
1711
1751
|
return
|
|
1712
1752
|
|
|
1713
|
-
#
|
|
1714
|
-
columns = [
|
|
1715
|
-
"rt",
|
|
1716
|
-
"mz",
|
|
1717
|
-
"number_samples",
|
|
1718
|
-
"log10_quality",
|
|
1719
|
-
"mz_delta_mean",
|
|
1720
|
-
"rt_delta_mean",
|
|
1721
|
-
"chrom_coherence_mean",
|
|
1722
|
-
"chrom_prominence_scaled_mean",
|
|
1723
|
-
"inty_mean",
|
|
1724
|
-
"number_ms2",
|
|
1725
|
-
]
|
|
1726
|
-
|
|
1727
|
-
# Check which columns exist in the dataframe and compute missing ones
|
|
1728
|
-
available_columns = self.consensus_df.columns
|
|
1753
|
+
# Get all columns and their data types - work with original dataframe
|
|
1729
1754
|
data_df = self.consensus_df.clone()
|
|
1730
1755
|
|
|
1731
|
-
#
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1756
|
+
# Identify numeric columns (excluding ID columns that are typically strings)
|
|
1757
|
+
id_columns = ["consensus_uid", "consensus_id", "uid", "id"]
|
|
1758
|
+
numeric_columns = []
|
|
1759
|
+
|
|
1760
|
+
for col in data_df.columns:
|
|
1761
|
+
if col not in id_columns:
|
|
1762
|
+
dtype = data_df[col].dtype
|
|
1763
|
+
# Check if column is numeric (int, float, or can be converted to numeric)
|
|
1764
|
+
if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
|
|
1765
|
+
pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
|
|
1766
|
+
pl.Float32, pl.Float64]:
|
|
1767
|
+
numeric_columns.append(col)
|
|
1768
|
+
|
|
1769
|
+
if len(numeric_columns) == 0:
|
|
1770
|
+
self.logger.error("No numeric columns found in consensus_df for plotting distributions.")
|
|
1742
1771
|
return
|
|
1743
1772
|
|
|
1744
|
-
self.logger.debug(f"Creating
|
|
1745
|
-
|
|
1746
|
-
# Add important ID columns for tooltips even if not plotting them
|
|
1747
|
-
tooltip_columns = []
|
|
1748
|
-
for id_col in ["consensus_uid", "consensus_id"]:
|
|
1749
|
-
if id_col in data_df.columns and id_col not in final_columns:
|
|
1750
|
-
tooltip_columns.append(id_col)
|
|
1773
|
+
self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} numeric columns: {numeric_columns}")
|
|
1751
1774
|
|
|
1752
|
-
#
|
|
1753
|
-
|
|
1754
|
-
data_pd = data_df.select(all_columns).to_pandas()
|
|
1775
|
+
# Work directly with Polars - no conversion to pandas needed
|
|
1776
|
+
data_df_clean = data_df.select(numeric_columns)
|
|
1755
1777
|
|
|
1756
|
-
#
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1778
|
+
# Check if all numeric columns are empty
|
|
1779
|
+
all_columns_empty = True
|
|
1780
|
+
for col in numeric_columns:
|
|
1781
|
+
# Check if column has any non-null, finite values
|
|
1782
|
+
non_null_count = data_df_clean[col].filter(
|
|
1783
|
+
data_df_clean[col].is_not_null() &
|
|
1784
|
+
(data_df_clean[col].is_finite() if data_df_clean[col].dtype in [pl.Float32, pl.Float64] else pl.lit(True))
|
|
1785
|
+
).len()
|
|
1786
|
+
|
|
1787
|
+
if non_null_count > 0:
|
|
1788
|
+
all_columns_empty = False
|
|
1789
|
+
break
|
|
1790
|
+
|
|
1791
|
+
if all_columns_empty:
|
|
1792
|
+
self.logger.error("All numeric columns contain only NaN/infinite values.")
|
|
1761
1793
|
return
|
|
1762
1794
|
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
#
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
plot_height_normal = 120 # Standard height
|
|
1776
|
-
plot_height_last = 155 # Taller last row to accommodate x-axis labels while keeping inner plot area same size
|
|
1795
|
+
# Calculate grid dimensions
|
|
1796
|
+
n_plots = len(numeric_columns)
|
|
1797
|
+
n_rows = (n_plots + n_cols - 1) // n_cols # Ceiling division
|
|
1798
|
+
|
|
1799
|
+
# Auto-calculate height if not provided
|
|
1800
|
+
if height is None:
|
|
1801
|
+
plot_height = 300
|
|
1802
|
+
height = plot_height * n_rows + 100 # Add some padding
|
|
1803
|
+
else:
|
|
1804
|
+
plot_height = (height - 100) // n_rows # Subtract padding and divide
|
|
1805
|
+
|
|
1806
|
+
plot_width = (width - 100) // n_cols # Subtract padding and divide
|
|
1777
1807
|
|
|
1778
|
-
# Create
|
|
1808
|
+
# Create plots grid
|
|
1779
1809
|
plots = []
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
hist, edges = np.histogram(data_pd[x_var], bins=30)
|
|
1843
|
-
p.quad(
|
|
1844
|
-
top=hist,
|
|
1845
|
-
bottom=0,
|
|
1846
|
-
left=edges[:-1],
|
|
1847
|
-
right=edges[1:],
|
|
1848
|
-
fill_color="green",
|
|
1849
|
-
line_color="white",
|
|
1850
|
-
alpha=alpha,
|
|
1851
|
-
)
|
|
1852
|
-
else:
|
|
1853
|
-
# Off-diagonal: scatter plot
|
|
1854
|
-
scatter = p.scatter(
|
|
1855
|
-
x=x_var,
|
|
1856
|
-
y=y_var,
|
|
1857
|
-
size=markersize,
|
|
1858
|
-
alpha=alpha,
|
|
1859
|
-
color="blue",
|
|
1860
|
-
source=source,
|
|
1861
|
-
)
|
|
1862
|
-
|
|
1863
|
-
# Add hover tool
|
|
1864
|
-
hover = HoverTool(
|
|
1865
|
-
tooltips=[
|
|
1866
|
-
(x_var, f"@{x_var}{{0.0000}}"),
|
|
1867
|
-
(y_var, f"@{y_var}{{0.0000}}"),
|
|
1868
|
-
(
|
|
1869
|
-
"consensus_uid",
|
|
1870
|
-
"@consensus_uid"
|
|
1871
|
-
if "consensus_uid" in data_pd.columns
|
|
1872
|
-
else "@consensus_id"
|
|
1873
|
-
if "consensus_id" in data_pd.columns
|
|
1874
|
-
else "N/A",
|
|
1875
|
-
),
|
|
1876
|
-
("rt", "@rt{0.00}" if "rt" in data_pd.columns else "N/A"),
|
|
1877
|
-
("mz", "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"),
|
|
1878
|
-
],
|
|
1879
|
-
renderers=[scatter],
|
|
1880
|
-
)
|
|
1881
|
-
p.add_tools(hover)
|
|
1882
|
-
|
|
1883
|
-
row.append(p)
|
|
1884
|
-
plots.append(row)
|
|
1885
|
-
|
|
1886
|
-
# Link axes for same variables
|
|
1887
|
-
for i in range(n_vars):
|
|
1888
|
-
for j in range(n_vars):
|
|
1889
|
-
if i != j: # Don't link diagonal plots
|
|
1890
|
-
# Link x-axis to other plots in same column
|
|
1891
|
-
for k in range(n_vars):
|
|
1892
|
-
if k != i and k != j:
|
|
1893
|
-
plots[i][j].x_range = plots[k][j].x_range
|
|
1894
|
-
|
|
1895
|
-
# Link y-axis to other plots in same row
|
|
1896
|
-
for k in range(n_vars):
|
|
1897
|
-
if k != j and k != i:
|
|
1898
|
-
plots[i][j].y_range = plots[i][k].y_range
|
|
1899
|
-
|
|
1900
|
-
# Create grid layout and force overall background/border to white so the outer
|
|
1901
|
-
# container doesn't show dark UI colors in night mode.
|
|
1810
|
+
current_row = []
|
|
1811
|
+
|
|
1812
|
+
for i, col in enumerate(numeric_columns):
|
|
1813
|
+
# Create histogram for this column
|
|
1814
|
+
p = figure(
|
|
1815
|
+
width=plot_width,
|
|
1816
|
+
height=plot_height,
|
|
1817
|
+
title=col,
|
|
1818
|
+
toolbar_location="above",
|
|
1819
|
+
tools="pan,wheel_zoom,box_zoom,reset,save"
|
|
1820
|
+
)
|
|
1821
|
+
|
|
1822
|
+
# Set white background
|
|
1823
|
+
p.background_fill_color = "white"
|
|
1824
|
+
p.border_fill_color = "white"
|
|
1825
|
+
|
|
1826
|
+
# Calculate histogram using Polars
|
|
1827
|
+
# Get valid (non-null, finite) values for this column
|
|
1828
|
+
if data_df_clean[col].dtype in [pl.Float32, pl.Float64]:
|
|
1829
|
+
valid_values = data_df_clean.filter(
|
|
1830
|
+
data_df_clean[col].is_not_null() & data_df_clean[col].is_finite()
|
|
1831
|
+
)[col]
|
|
1832
|
+
else:
|
|
1833
|
+
valid_values = data_df_clean.filter(data_df_clean[col].is_not_null())[col]
|
|
1834
|
+
|
|
1835
|
+
if valid_values.len() == 0:
|
|
1836
|
+
self.logger.warning(f"No valid values for column {col}")
|
|
1837
|
+
continue
|
|
1838
|
+
|
|
1839
|
+
# Convert to numpy for histogram calculation
|
|
1840
|
+
values_array = valid_values.to_numpy()
|
|
1841
|
+
hist, edges = np.histogram(values_array, bins=bins)
|
|
1842
|
+
|
|
1843
|
+
# Create histogram bars
|
|
1844
|
+
p.quad(
|
|
1845
|
+
top=hist,
|
|
1846
|
+
bottom=0,
|
|
1847
|
+
left=edges[:-1],
|
|
1848
|
+
right=edges[1:],
|
|
1849
|
+
fill_color="steelblue",
|
|
1850
|
+
line_color="white",
|
|
1851
|
+
alpha=alpha,
|
|
1852
|
+
)
|
|
1853
|
+
|
|
1854
|
+
# Style the plot
|
|
1855
|
+
p.title.text_font_size = "12pt"
|
|
1856
|
+
p.xaxis.axis_label = col
|
|
1857
|
+
p.yaxis.axis_label = "Count"
|
|
1858
|
+
p.grid.visible = True
|
|
1859
|
+
p.grid.grid_line_color = "#E0E0E0"
|
|
1860
|
+
|
|
1861
|
+
current_row.append(p)
|
|
1862
|
+
|
|
1863
|
+
# If we've filled a row or reached the end, add the row to plots
|
|
1864
|
+
if len(current_row) == n_cols or i == n_plots - 1:
|
|
1865
|
+
# Fill remaining spots in the last row with None if needed
|
|
1866
|
+
while len(current_row) < n_cols and i == n_plots - 1:
|
|
1867
|
+
current_row.append(None)
|
|
1868
|
+
plots.append(current_row)
|
|
1869
|
+
current_row = []
|
|
1870
|
+
|
|
1871
|
+
# Create grid layout
|
|
1902
1872
|
grid = gridplot(plots)
|
|
1903
|
-
|
|
1904
|
-
# Set overall background
|
|
1873
|
+
|
|
1874
|
+
# Set overall background to white
|
|
1905
1875
|
if hasattr(grid, "background_fill_color"):
|
|
1906
1876
|
grid.background_fill_color = "white"
|
|
1907
1877
|
if hasattr(grid, "border_fill_color"):
|