masster 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/sample/save.py +0 -2
- masster/study/analysis.py +1762 -0
- masster/study/export.py +8 -6
- masster/study/helpers.py +153 -80
- masster/study/id.py +3 -3
- masster/study/load.py +56 -55
- masster/study/merge.py +316 -313
- masster/study/parameters.py +3 -3
- masster/study/plot.py +491 -203
- masster/study/processing.py +109 -15
- masster/study/save.py +8 -4
- masster/study/study.py +97 -139
- masster/wizard/wizard.py +8 -8
- {masster-0.4.21.dist-info → masster-0.5.0.dist-info}/METADATA +54 -14
- {masster-0.4.21.dist-info → masster-0.5.0.dist-info}/RECORD +24 -23
- {masster-0.4.21.dist-info → masster-0.5.0.dist-info}/WHEEL +0 -0
- {masster-0.4.21.dist-info → masster-0.5.0.dist-info}/entry_points.txt +0 -0
- {masster-0.4.21.dist-info → masster-0.5.0.dist-info}/licenses/LICENSE +0 -0
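Most of this release is the new masster/study/analysis.py module shown in the diff below, which adds a study-level analyze_umap() method. As a rough orientation, a call based on the docstring in that diff might look like the following sketch; the `study` object is assumed to be an already loaded masster Study whose consensus matrix has been built, and only the parameter names and result keys documented in the diff below are taken from the package itself.

# Sketch only: `study` is assumed to be a loaded masster Study object with a
# consensus matrix; analyze_umap() and its keyword arguments are documented in
# the analysis.py diff that follows.
results = study.analyze_umap(
    n_neighbors=20,
    min_dist=0.05,
    cluster_methods=["hdbscan", "dbscan"],
    significance_threshold=0.01,
    filename="cluster_analysis.html",
)
if results is not None:  # the method returns None when dependencies or data are missing
    best = results["best_clustering"]
    print(f"{best['method']}: {best['n_clusters']} clusters, silhouette {best['score']:.3f}")
    for assoc in results["text_associations"][:5]:
        print(assoc["cluster_id"], assoc["atom"], assoc["enrichment_ratio"])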
|
@@ -0,0 +1,1762 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analysis.py
|
|
3
|
+
|
|
4
|
+
Advanced analytical methods for mass spectrometry study data including
|
|
5
|
+
UMAP clustering, statistical association testing, and text pattern analysis.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Optimized analysis module for mass spectrometry data.
|
|
12
|
+
"""
|
|
13
|
+
import warnings
|
|
14
|
+
import re
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from scipy import stats
|
|
18
|
+
|
|
19
|
+
# Suppress sklearn deprecation warnings
|
|
20
|
+
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
|
|
21
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning, module="sklearn")
|
|
22
|
+
|
|
23
|
+
# Check for optional dependencies
|
|
24
|
+
UMAP_AVAILABLE = False
|
|
25
|
+
HDBSCAN_AVAILABLE = False
|
|
26
|
+
SKLEARN_AVAILABLE = False
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
import umap
|
|
30
|
+
UMAP_AVAILABLE = True
|
|
31
|
+
except ImportError:
|
|
32
|
+
pass
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
import hdbscan
|
|
36
|
+
HDBSCAN_AVAILABLE = True
|
|
37
|
+
except ImportError:
|
|
38
|
+
pass
|
|
39
|
+
|
|
40
|
+
try:
|
|
41
|
+
from sklearn.preprocessing import StandardScaler
|
|
42
|
+
from sklearn.cluster import KMeans, DBSCAN
|
|
43
|
+
from sklearn.metrics import silhouette_score
|
|
44
|
+
SKLEARN_AVAILABLE = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
# Compiled regex patterns for efficient text processing
|
|
49
|
+
TOKEN_PATTERN = re.compile(r'[_\-\s\|\.]+')
|
|
50
|
+
ALPHANUMERIC_PATTERN = re.compile(r'^[A-Za-z0-9]+$')
|
|
51
|
+
|
|
52
|
+
# Simple cache for tokenization
|
|
53
|
+
_tokenization_cache = {}
|
|
54
|
+
|
|
55
|
+
def tokenize_text_cached(text):
|
|
56
|
+
"""Cached text tokenization for repeated strings - preserves original case."""
|
|
57
|
+
if text in _tokenization_cache:
|
|
58
|
+
return _tokenization_cache[text]
|
|
59
|
+
|
|
60
|
+
if pd.isna(text) or text == '' or not isinstance(text, str):
|
|
61
|
+
result = tuple()
|
|
62
|
+
else:
|
|
63
|
+
# Split by common delimiters to create atoms (same as original)
|
|
64
|
+
atoms = TOKEN_PATTERN.split(str(text).strip())
|
|
65
|
+
# Clean and filter atoms - preserve original case
|
|
66
|
+
meaningful_tokens = []
|
|
67
|
+
for atom in atoms:
|
|
68
|
+
atom = atom.strip() # Remove .lower() to preserve case
|
|
69
|
+
if atom and len(atom) > 1: # Original was > 1, not >= 1
|
|
70
|
+
meaningful_tokens.append(atom)
|
|
71
|
+
|
|
72
|
+
result = tuple(meaningful_tokens)
|
|
73
|
+
|
|
74
|
+
# Prevent cache from growing too large
|
|
75
|
+
if len(_tokenization_cache) < 10000:
|
|
76
|
+
_tokenization_cache[text] = result
|
|
77
|
+
|
|
78
|
+
return result
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Clear cache to ensure fresh start
|
|
82
|
+
_tokenization_cache.clear()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def analyze_umap(
|
|
86
|
+
self,
|
|
87
|
+
n_neighbors=15,
|
|
88
|
+
min_dist=0.1,
|
|
89
|
+
metric="euclidean",
|
|
90
|
+
random_state=42,
|
|
91
|
+
cluster_methods=["hdbscan", "kmeans", "dbscan"],
|
|
92
|
+
n_clusters_range=(2, 8),
|
|
93
|
+
min_cluster_size=3,
|
|
94
|
+
significance_threshold=0.01,
|
|
95
|
+
plot_results=True,
|
|
96
|
+
filename=None,
|
|
97
|
+
markersize=4,
|
|
98
|
+
):
|
|
99
|
+
"""
|
|
100
|
+
Perform UMAP dimensionality reduction followed by clustering analysis with enriched term labeling.
|
|
101
|
+
|
|
102
|
+
This method performs comprehensive cluster analysis on the study's consensus matrix, including:
|
|
103
|
+
- UMAP dimensionality reduction for visualization
|
|
104
|
+
- Automated clustering with multiple algorithms (HDBSCAN, K-means, DBSCAN)
|
|
105
|
+
- Metadata association discovery using statistical tests
|
|
106
|
+
- Text pattern analysis to identify enriched sample characteristics
|
|
107
|
+
- Enhanced visualization with intelligent label positioning for enriched terms
|
|
108
|
+
|
|
109
|
+
The enhanced visualization features cluster-aware enriched term labels with connecting spikes:
|
|
110
|
+
- Terms shared across multiple clusters are positioned at the geometric center with lines to each cluster
|
|
111
|
+
- Terms specific to single clusters are positioned nearby with short spikes
|
|
112
|
+
- Terms are ranked by presence percentage within clusters (favoring common terms)
|
|
113
|
+
- Empty/blank terms are automatically filtered out
|
|
114
|
+
- Label positioning adapts to line direction for optimal text alignment
|
|
115
|
+
- Dashed edges and color-coordinated labels provide visual clarity
|
|
116
|
+
|
|
117
|
+
Unlike plot_samples_umap() which colors by metadata columns, this function performs clustering
|
|
118
|
+
and colors points by cluster assignments, with tooltips showing enrichment information.
|
|
119
|
+
|
|
120
|
+
Parameters
|
|
121
|
+
----------
|
|
122
|
+
n_neighbors : int, default=15
|
|
123
|
+
Number of neighbors for UMAP embedding. Higher values preserve more global structure,
|
|
124
|
+
lower values preserve more local structure.
|
|
125
|
+
|
|
126
|
+
min_dist : float, default=0.1
|
|
127
|
+
Minimum distance parameter for UMAP. Controls how tightly points are packed in the
|
|
128
|
+
embedding. Values closer to 0 result in tighter clusters.
|
|
129
|
+
|
|
130
|
+
metric : str, default="euclidean"
|
|
131
|
+
Distance metric for UMAP. Options include 'euclidean', 'manhattan', 'cosine', etc.
|
|
132
|
+
|
|
133
|
+
random_state : int, default=42
|
|
134
|
+
Random seed for reproducibility of UMAP embedding and clustering.
|
|
135
|
+
|
|
136
|
+
cluster_methods : list, default=["hdbscan", "kmeans", "dbscan"]
|
|
137
|
+
Clustering algorithms to evaluate. Available options:
|
|
138
|
+
- 'hdbscan': Hierarchical density-based clustering (requires hdbscan package)
|
|
139
|
+
- 'kmeans': K-means clustering with multiple k values
|
|
140
|
+
- 'dbscan': Density-based spatial clustering with multiple eps values
|
|
141
|
+
|
|
142
|
+
n_clusters_range : tuple, default=(2, 8)
|
|
143
|
+
Range of cluster numbers to test for K-means (min_clusters, max_clusters).
|
|
144
|
+
|
|
145
|
+
min_cluster_size : int, default=3
|
|
146
|
+
Minimum cluster size for HDBSCAN and DBSCAN algorithms.
|
|
147
|
+
|
|
148
|
+
significance_threshold : float, default=0.05
|
|
149
|
+
P-value threshold for statistical significance of metadata associations.
|
|
150
|
+
|
|
151
|
+
plot_results : bool, default=True
|
|
152
|
+
Whether to generate interactive Bokeh plots with enhanced labeling.
|
|
153
|
+
When False, only returns analysis results without visualization.
|
|
154
|
+
|
|
155
|
+
filename : str, optional
|
|
156
|
+
If provided, saves the interactive plot to this HTML file.
|
|
157
|
+
|
|
158
|
+
markersize : int, default=4
|
|
159
|
+
Size of scatter plot markers representing samples.
|
|
160
|
+
|
|
161
|
+
Returns
|
|
162
|
+
-------
|
|
163
|
+
dict
|
|
164
|
+
Comprehensive results dictionary containing:
|
|
165
|
+
|
|
166
|
+
- **umap_coords** : numpy.ndarray
|
|
167
|
+
2D UMAP coordinates for all samples (n_samples x 2)
|
|
168
|
+
|
|
169
|
+
- **best_clustering** : dict
|
|
170
|
+
Best clustering result based on silhouette score, containing:
|
|
171
|
+
- 'labels': cluster assignments for each sample
|
|
172
|
+
- 'score': silhouette score (quality metric)
|
|
173
|
+
- 'n_clusters': number of identified clusters
|
|
174
|
+
- 'n_noise': number of noise points (outliers)
|
|
175
|
+
- 'method': clustering algorithm used
|
|
176
|
+
|
|
177
|
+
- **all_clustering_results** : dict
|
|
178
|
+
Results from all tested clustering configurations, keyed by method name
|
|
179
|
+
|
|
180
|
+
- **significant_associations** : list
|
|
181
|
+
All statistically significant associations (both numeric and text), sorted by
|
|
182
|
+
cluster presence percentage. Each association includes:
|
|
183
|
+
- Statistical test results (p-value, effect size)
|
|
184
|
+
- Cluster-specific enrichment information
|
|
185
|
+
- Interpretation of effect size magnitude
|
|
186
|
+
|
|
187
|
+
- **text_associations** : list
|
|
188
|
+
Subset of associations specifically for text pattern enrichment, ranked by
|
|
189
|
+
presence percentage within clusters rather than statistical enrichment
|
|
190
|
+
|
|
191
|
+
- **cluster_summaries** : dict
|
|
192
|
+
Summary information for each cluster:
|
|
193
|
+
- 'n_samples': number of samples in cluster
|
|
194
|
+
- 'sample_names': list of sample names in cluster
|
|
195
|
+
|
|
196
|
+
- **analysis_dataframe** : pandas.DataFrame
|
|
197
|
+
Complete dataframe with UMAP coordinates, cluster assignments, and all
|
|
198
|
+
sample metadata used for association analysis
|
|
199
|
+
|
|
200
|
+
Raises
|
|
201
|
+
------
|
|
202
|
+
ImportError
|
|
203
|
+
If required dependencies (umap-learn, scikit-learn) are not installed
|
|
204
|
+
|
|
205
|
+
ValueError
|
|
206
|
+
If consensus matrix is empty or samples data is unavailable
|
|
207
|
+
|
|
208
|
+
Examples
|
|
209
|
+
--------
|
|
210
|
+
Basic UMAP analysis with default parameters:
|
|
211
|
+
|
|
212
|
+
>>> results = study.analyze_umap()
|
|
213
|
+
>>> print(f"Found {results['best_clustering']['n_clusters']} clusters")
|
|
214
|
+
>>> print(f"Silhouette score: {results['best_clustering']['score']:.3f}")
|
|
215
|
+
|
|
216
|
+
Custom analysis with specific clustering and enhanced visualization:
|
|
217
|
+
|
|
218
|
+
>>> results = study.analyze_umap(
|
|
219
|
+
... n_neighbors=20,
|
|
220
|
+
... min_dist=0.05,
|
|
221
|
+
... cluster_methods=["hdbscan", "dbscan"],
|
|
222
|
+
... significance_threshold=0.01,
|
|
223
|
+
... filename="cluster_analysis.html"
|
|
224
|
+
... )
|
|
225
|
+
|
|
226
|
+
Fast analysis for large datasets:
|
|
227
|
+
|
|
228
|
+
>>> results = study.analyze_umap(
|
|
229
|
+
... cluster_methods=["hdbscan"]
|
|
230
|
+
... )
|
|
231
|
+
|
|
232
|
+
Notes
|
|
233
|
+
-----
|
|
234
|
+
The enhanced visualization automatically identifies and labels enriched terms based on:
|
|
235
|
+
|
|
236
|
+
1. **Presence-based ranking**: Terms are ranked by their prevalence within clusters
|
|
237
|
+
rather than statistical enrichment, favoring terms common across cluster members
|
|
238
|
+
|
|
239
|
+
2. **Intelligent positioning**:
|
|
240
|
+
- Shared terms (multiple clusters) positioned at geometric center with connecting lines
|
|
241
|
+
- Individual terms positioned adjacent to their cluster with short spikes
|
|
242
|
+
- Westward lines position labels to the left with right-aligned text
|
|
243
|
+
- Eastward lines position labels to the right with left-aligned text
|
|
244
|
+
|
|
245
|
+
3. **Quality filtering**: Empty terms (variants of 'empty', 'blank', 'qc') are
|
|
246
|
+
automatically excluded from enrichment analysis and visualization
|
|
247
|
+
|
|
248
|
+
4. **Visual styling**: Dashed edges, color-coordinated labels and lines, and
|
|
249
|
+
moderate boundary expansion (5%) create professional, readable plots
|
|
250
|
+
|
|
251
|
+
The method automatically handles missing dependencies by falling back to simplified
|
|
252
|
+
analysis when optional packages (hdbscan) are unavailable.
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
# Check dependencies
|
|
256
|
+
if not UMAP_AVAILABLE:
|
|
257
|
+
self.logger.error("UMAP is required. Install with: pip install umap-learn")
|
|
258
|
+
return None
|
|
259
|
+
|
|
260
|
+
if not SKLEARN_AVAILABLE:
|
|
261
|
+
self.logger.error("scikit-learn is required. Install with: pip install scikit-learn")
|
|
262
|
+
return None
|
|
263
|
+
|
|
264
|
+
self.logger.info("Starting UMAP cluster analysis...")
|
|
265
|
+
|
|
266
|
+
# Get data
|
|
267
|
+
consensus_matrix = self.get_consensus_matrix()
|
|
268
|
+
samples_df = self.samples_df
|
|
269
|
+
|
|
270
|
+
if consensus_matrix is None or consensus_matrix.shape[0] == 0:
|
|
271
|
+
self.logger.error("No consensus matrix available. Run feature detection first.")
|
|
272
|
+
return None
|
|
273
|
+
|
|
274
|
+
if samples_df is None or len(samples_df) == 0:
|
|
275
|
+
self.logger.error("No samples data available.")
|
|
276
|
+
return None
|
|
277
|
+
|
|
278
|
+
# Prepare data for UMAP
|
|
279
|
+
sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
|
|
280
|
+
|
|
281
|
+
if hasattr(consensus_matrix, "select"):
|
|
282
|
+
matrix_data = consensus_matrix.select(sample_cols).to_numpy()
|
|
283
|
+
else:
|
|
284
|
+
matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
|
|
285
|
+
matrix_data = matrix_sample_data.values if hasattr(matrix_sample_data, "values") else np.array(matrix_sample_data)
|
|
286
|
+
|
|
287
|
+
# Transpose so samples are rows
|
|
288
|
+
matrix_data = matrix_data.T
|
|
289
|
+
matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)
|
|
290
|
+
|
|
291
|
+
# Standardize data
|
|
292
|
+
from sklearn.preprocessing import StandardScaler
|
|
293
|
+
scaler = StandardScaler()
|
|
294
|
+
matrix_scaled = scaler.fit_transform(matrix_data)
|
|
295
|
+
|
|
296
|
+
# Perform UMAP with optimizations
|
|
297
|
+
self.logger.debug(f"Computing UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}")
|
|
298
|
+
import umap
|
|
299
|
+
|
|
300
|
+
# UMAP optimization: use limited threads to save memory
|
|
301
|
+
n_jobs = 1
|
|
302
|
+
|
|
303
|
+
reducer = umap.UMAP(
|
|
304
|
+
n_components=2,
|
|
305
|
+
n_neighbors=n_neighbors,
|
|
306
|
+
min_dist=min_dist,
|
|
307
|
+
metric=metric,
|
|
308
|
+
random_state=random_state,
|
|
309
|
+
n_jobs=n_jobs,
|
|
310
|
+
low_memory=False
|
|
311
|
+
)
|
|
312
|
+
umap_coords = reducer.fit_transform(matrix_scaled)
|
|
313
|
+
|
|
314
|
+
# Convert samples_df to pandas for easier analysis
|
|
315
|
+
samples_pd = samples_df.to_pandas() if hasattr(samples_df, 'to_pandas') else samples_df
|
|
316
|
+
|
|
317
|
+
# Get the actual sample columns present in consensus matrix
|
|
318
|
+
sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
|
|
319
|
+
consensus_sample_names = set(sample_cols)
|
|
320
|
+
|
|
321
|
+
# Filter samples_df to only include samples present in consensus matrix
|
|
322
|
+
if 'sample_name' in samples_pd.columns:
|
|
323
|
+
# Create a mask for samples present in consensus matrix
|
|
324
|
+
sample_mask = samples_pd['sample_name'].isin(consensus_sample_names)
|
|
325
|
+
|
|
326
|
+
if sample_mask.sum() != len(samples_pd):
|
|
327
|
+
missing_samples = set(samples_pd['sample_name']) - consensus_sample_names
|
|
328
|
+
self.logger.warning(f"Filtering out {len(missing_samples)} samples not in consensus matrix: {list(missing_samples)}")
|
|
329
|
+
samples_pd = samples_pd[sample_mask].copy()
|
|
330
|
+
|
|
331
|
+
# Reorder samples_pd to match the order in consensus matrix sample_cols
|
|
332
|
+
samples_pd = samples_pd.set_index('sample_name').reindex(sample_cols).reset_index()
|
|
333
|
+
|
|
334
|
+
# Final check - ensure we have the same number of samples
|
|
335
|
+
if len(samples_pd) != len(umap_coords):
|
|
336
|
+
self.logger.error(f"After filtering, still have mismatch: samples_df has {len(samples_pd)} rows, UMAP has {len(umap_coords)} points")
|
|
337
|
+
return None
|
|
338
|
+
|
|
339
|
+
self.logger.info(f"Using {len(samples_pd)} samples for analysis")
|
|
340
|
+
|
|
341
|
+
# Try different clustering methods
|
|
342
|
+
clustering_results = {}
|
|
343
|
+
|
|
344
|
+
for method in cluster_methods:
|
|
345
|
+
self.logger.debug(f"Trying clustering method: {method}")
|
|
346
|
+
|
|
347
|
+
if method == "hdbscan" and HDBSCAN_AVAILABLE:
|
|
348
|
+
import hdbscan
|
|
349
|
+
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')
|
|
350
|
+
cluster_labels = clusterer.fit_predict(umap_coords)
|
|
351
|
+
|
|
352
|
+
# Calculate silhouette score (excluding noise points for HDBSCAN)
|
|
353
|
+
valid_labels = cluster_labels[cluster_labels != -1]
|
|
354
|
+
valid_coords = umap_coords[cluster_labels != -1]
|
|
355
|
+
|
|
356
|
+
if len(np.unique(valid_labels)) > 1:
|
|
357
|
+
from sklearn.metrics import silhouette_score
|
|
358
|
+
score = silhouette_score(valid_coords, valid_labels)
|
|
359
|
+
n_clusters = len(np.unique(valid_labels))
|
|
360
|
+
n_noise = np.sum(cluster_labels == -1)
|
|
361
|
+
|
|
362
|
+
clustering_results[f"{method}"] = {
|
|
363
|
+
'labels': cluster_labels,
|
|
364
|
+
'score': score,
|
|
365
|
+
'n_clusters': n_clusters,
|
|
366
|
+
'n_noise': n_noise,
|
|
367
|
+
'method': method
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
elif method == "kmeans":
|
|
371
|
+
from sklearn.cluster import KMeans
|
|
372
|
+
from sklearn.metrics import silhouette_score
|
|
373
|
+
|
|
374
|
+
for n_clusters in range(n_clusters_range[0], n_clusters_range[1] + 1):
|
|
375
|
+
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
|
|
376
|
+
cluster_labels = kmeans.fit_predict(umap_coords)
|
|
377
|
+
score = silhouette_score(umap_coords, cluster_labels)
|
|
378
|
+
|
|
379
|
+
clustering_results[f"{method}_k{n_clusters}"] = {
|
|
380
|
+
'labels': cluster_labels,
|
|
381
|
+
'score': score,
|
|
382
|
+
'n_clusters': n_clusters,
|
|
383
|
+
'n_noise': 0,
|
|
384
|
+
'method': f"{method} (k={n_clusters})"
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
elif method == "dbscan":
|
|
388
|
+
from sklearn.cluster import DBSCAN
|
|
389
|
+
# Standard DBSCAN eps values for exploration
|
|
390
|
+
eps_values = [0.3, 0.5, 0.7, 1.0, 1.5]
|
|
391
|
+
|
|
392
|
+
for eps in eps_values:
|
|
393
|
+
dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size, n_jobs=-1)
|
|
394
|
+
cluster_labels = dbscan.fit_predict(umap_coords)
|
|
395
|
+
|
|
396
|
+
n_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
|
|
397
|
+
n_noise = np.sum(cluster_labels == -1)
|
|
398
|
+
|
|
399
|
+
# Only consider valid clusterings
|
|
400
|
+
if n_clusters > 1:
|
|
401
|
+
from sklearn.metrics import silhouette_score
|
|
402
|
+
valid_labels = cluster_labels[cluster_labels != -1]
|
|
403
|
+
valid_coords = umap_coords[cluster_labels != -1]
|
|
404
|
+
|
|
405
|
+
if len(valid_coords) > 0 and len(np.unique(valid_labels)) > 1:
|
|
406
|
+
score = silhouette_score(valid_coords, valid_labels)
|
|
407
|
+
|
|
408
|
+
clustering_results[f"{method}_eps{eps}"] = {
|
|
409
|
+
'labels': cluster_labels,
|
|
410
|
+
'score': score,
|
|
411
|
+
'n_clusters': n_clusters,
|
|
412
|
+
'n_noise': n_noise,
|
|
413
|
+
'method': f"{method} (eps={eps})"
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
if not clustering_results:
|
|
417
|
+
self.logger.error("No valid clustering results found")
|
|
418
|
+
return None
|
|
419
|
+
|
|
420
|
+
# Select best clustering based on silhouette score
|
|
421
|
+
best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k]['score'])
|
|
422
|
+
best_clustering = clustering_results[best_key]
|
|
423
|
+
|
|
424
|
+
self.logger.info(f"Best clustering: {best_clustering['method']} with {best_clustering['n_clusters']} clusters, "
|
|
425
|
+
f"silhouette score: {best_clustering['score']:.3f}")
|
|
426
|
+
|
|
427
|
+
# Analyze associations between clusters and sample metadata
|
|
428
|
+
cluster_labels = best_clustering['labels']
|
|
429
|
+
|
|
430
|
+
# Add cluster labels to samples dataframe for analysis
|
|
431
|
+
analysis_df = samples_pd.copy()
|
|
432
|
+
analysis_df['cluster'] = cluster_labels
|
|
433
|
+
|
|
434
|
+
# Remove noise points (label -1) for association analysis
|
|
435
|
+
analysis_df_clean = analysis_df[analysis_df['cluster'] != -1].copy()
|
|
436
|
+
|
|
437
|
+
if len(analysis_df_clean) == 0:
|
|
438
|
+
self.logger.error("No samples assigned to clusters (all noise)")
|
|
439
|
+
return None
|
|
440
|
+
|
|
441
|
+
# Analyze associations with specific columns only
|
|
442
|
+
significant_associations = []
|
|
443
|
+
|
|
444
|
+
# Define which columns to analyze for associations (non-text)
|
|
445
|
+
association_cols = {'sample_sequence', 'num_features'}
|
|
446
|
+
|
|
447
|
+
# Define which columns to analyze for text patterns - include all relevant text columns
|
|
448
|
+
text_pattern_cols = {'sample_name', 'sample_group', 'sample_batch', 'sample_type'}
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
for col in samples_pd.columns:
|
|
452
|
+
if col not in association_cols:
|
|
453
|
+
continue
|
|
454
|
+
|
|
455
|
+
try:
|
|
456
|
+
# Check if column has enough variation
|
|
457
|
+
col_data = analysis_df_clean[col].dropna()
|
|
458
|
+
if len(col_data.unique()) < 2:
|
|
459
|
+
continue
|
|
460
|
+
|
|
461
|
+
# Determine if column is numeric or categorical
|
|
462
|
+
if pd.api.types.is_numeric_dtype(col_data):
|
|
463
|
+
# Numeric variable - use ANOVA or Kruskal-Wallis
|
|
464
|
+
cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby('cluster')]
|
|
465
|
+
cluster_groups = [group for group in cluster_groups if len(group) > 0]
|
|
466
|
+
|
|
467
|
+
if len(cluster_groups) > 1:
|
|
468
|
+
# Try ANOVA first
|
|
469
|
+
try:
|
|
470
|
+
f_stat, p_value = stats.f_oneway(*cluster_groups)
|
|
471
|
+
test_name = "ANOVA"
|
|
472
|
+
except Exception:
|
|
473
|
+
# Fall back to Kruskal-Wallis (non-parametric)
|
|
474
|
+
h_stat, p_value = stats.kruskal(*cluster_groups)
|
|
475
|
+
test_name = "Kruskal-Wallis"
|
|
476
|
+
f_stat = h_stat
|
|
477
|
+
|
|
478
|
+
if p_value < significance_threshold:
|
|
479
|
+
# Calculate effect size (eta-squared approximation)
|
|
480
|
+
ss_between = sum(len(group) * (np.mean(group) - np.mean(col_data))**2 for group in cluster_groups)
|
|
481
|
+
ss_total = np.sum((col_data - np.mean(col_data))**2)
|
|
482
|
+
eta_squared = ss_between / ss_total if ss_total > 0 else 0
|
|
483
|
+
|
|
484
|
+
significant_associations.append({
|
|
485
|
+
'column': col,
|
|
486
|
+
'variable_type': 'numeric',
|
|
487
|
+
'test': test_name,
|
|
488
|
+
'statistic': f_stat,
|
|
489
|
+
'p_value': p_value,
|
|
490
|
+
'effect_size': eta_squared,
|
|
491
|
+
'interpretation': 'Large effect' if eta_squared > 0.14 else 'Medium effect' if eta_squared > 0.06 else 'Small effect'
|
|
492
|
+
})
|
|
493
|
+
|
|
494
|
+
else:
|
|
495
|
+
# Categorical variable - use Chi-square test
|
|
496
|
+
contingency_table = pd.crosstab(analysis_df_clean['cluster'], analysis_df_clean[col])
|
|
497
|
+
|
|
498
|
+
# Only test if we have enough observations
|
|
499
|
+
if contingency_table.sum().sum() > 10 and contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
|
|
500
|
+
try:
|
|
501
|
+
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
|
|
502
|
+
|
|
503
|
+
if p_value < significance_threshold:
|
|
504
|
+
# Calculate Cramer's V (effect size for chi-square)
|
|
505
|
+
n = contingency_table.sum().sum()
|
|
506
|
+
cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
|
|
507
|
+
|
|
508
|
+
significant_associations.append({
|
|
509
|
+
'column': col,
|
|
510
|
+
'variable_type': 'categorical',
|
|
511
|
+
'test': 'Chi-square',
|
|
512
|
+
'statistic': chi2,
|
|
513
|
+
'p_value': p_value,
|
|
514
|
+
'effect_size': cramers_v,
|
|
515
|
+
'interpretation': 'Large effect' if cramers_v > 0.5 else 'Medium effect' if cramers_v > 0.3 else 'Small effect',
|
|
516
|
+
'contingency_table': contingency_table
|
|
517
|
+
})
|
|
518
|
+
except Exception:
|
|
519
|
+
continue
|
|
520
|
+
|
|
521
|
+
except Exception as e:
|
|
522
|
+
self.logger.debug(f"Error analyzing column {col}: {e}")
|
|
523
|
+
continue
|
|
524
|
+
|
|
525
|
+
# Sort by effect size (descending)
|
|
526
|
+
significant_associations.sort(key=lambda x: x['effect_size'], reverse=True)
|
|
527
|
+
|
|
528
|
+
# Enhanced cluster-centric text analysis - analyze what makes each cluster unique
|
|
529
|
+
self.logger.debug("Performing cluster-centric enrichment analysis...")
|
|
530
|
+
|
|
531
|
+
text_associations = []
|
|
532
|
+
|
|
533
|
+
# Optimized text tokenization using cached function
|
|
534
|
+
def tokenize_text_optimized(text):
|
|
535
|
+
"""Optimized text tokenization with caching"""
|
|
536
|
+
return tokenize_text_cached(text)
|
|
537
|
+
|
|
538
|
+
# Collect all atoms from specified string columns only
|
|
539
|
+
string_columns = []
|
|
540
|
+
for col in text_pattern_cols:
|
|
541
|
+
if col in analysis_df_clean.columns:
|
|
542
|
+
col_data = analysis_df_clean[col].dropna()
|
|
543
|
+
if len(col_data) > 0 and not pd.api.types.is_numeric_dtype(col_data):
|
|
544
|
+
if len(col_data.astype(str).unique()) > 1: # Has variation
|
|
545
|
+
string_columns.append(col)
|
|
546
|
+
|
|
547
|
+
if string_columns:
|
|
548
|
+
# Text analysis for string columns
|
|
549
|
+
self.logger.debug(f"Analyzing cluster enrichments in {len(string_columns)} string columns")
|
|
550
|
+
|
|
551
|
+
# Build cluster-centric atom analysis using cached tokenization
|
|
552
|
+
cluster_atoms = {} # cluster_id -> {atom -> count}
|
|
553
|
+
global_atom_counts = {} # atom -> total_count_across_all_samples
|
|
554
|
+
|
|
555
|
+
# Pre-tokenize all text data once for efficiency with column prefixes
|
|
556
|
+
sample_atom_sets = {}
|
|
557
|
+
for idx, row in analysis_df_clean.iterrows():
|
|
558
|
+
sample_atoms = set()
|
|
559
|
+
for col in string_columns:
|
|
560
|
+
atoms = tokenize_text_optimized(row[col])
|
|
561
|
+
# Add column prefix to distinguish where tokens come from
|
|
562
|
+
col_prefix = col.replace('sample_', '') + ':' # e.g., "name:", "group:", "batch:", "type:"
|
|
563
|
+
prefixed_atoms = [f"{col_prefix}{atom}" for atom in atoms]
|
|
564
|
+
sample_atoms.update(prefixed_atoms)
|
|
565
|
+
sample_atom_sets[idx] = sample_atoms
|
|
566
|
+
|
|
567
|
+
# Collect atoms by cluster
|
|
568
|
+
for idx, row in analysis_df_clean.iterrows():
|
|
569
|
+
cluster_id = row['cluster']
|
|
570
|
+
if cluster_id not in cluster_atoms:
|
|
571
|
+
cluster_atoms[cluster_id] = {}
|
|
572
|
+
|
|
573
|
+
# Use pre-tokenized atoms
|
|
574
|
+
sample_atoms = sample_atom_sets[idx]
|
|
575
|
+
|
|
576
|
+
# Count atoms for this cluster and globally
|
|
577
|
+
for atom in sample_atoms:
|
|
578
|
+
cluster_atoms[cluster_id][atom] = cluster_atoms[cluster_id].get(atom, 0) + 1
|
|
579
|
+
global_atom_counts[atom] = global_atom_counts.get(atom, 0) + 1
|
|
580
|
+
|
|
581
|
+
# Calculate cluster enrichments using hypergeometric test (same for both modes)
|
|
582
|
+
if string_columns:
|
|
583
|
+
n_total_samples = len(analysis_df_clean)
|
|
584
|
+
|
|
585
|
+
# For each cluster, find significantly enriched terms
|
|
586
|
+
for cluster_id, cluster_atom_counts in cluster_atoms.items():
|
|
587
|
+
cluster_size = len(analysis_df_clean[analysis_df_clean['cluster'] == cluster_id])
|
|
588
|
+
|
|
589
|
+
for atom, cluster_count in cluster_atom_counts.items():
|
|
590
|
+
global_count = global_atom_counts[atom]
|
|
591
|
+
|
|
592
|
+
# Skip empty terms from enrichment analysis and plotting
|
|
593
|
+
if (atom == '<empty>' or
|
|
594
|
+
atom.lower() == 'empty' or
|
|
595
|
+
atom.strip() == '' or
|
|
596
|
+
':empty' in atom.lower() or
|
|
597
|
+
atom.lower().endswith('empty') or
|
|
598
|
+
':blank' in atom.lower() or
|
|
599
|
+
atom.lower().endswith('blank')):
|
|
600
|
+
continue
|
|
601
|
+
|
|
602
|
+
# Skip atoms with low frequency
|
|
603
|
+
if global_count < 2:
|
|
604
|
+
continue
|
|
605
|
+
|
|
606
|
+
# Skip terms that occur in fewer than 5 samples within this cluster
|
|
607
|
+
if cluster_count < 5:
|
|
608
|
+
continue
|
|
609
|
+
|
|
610
|
+
# IMPORTANT: Skip atoms that appear in too many clusters (not cluster-specific)
|
|
611
|
+
# Count how many clusters this atom appears in
|
|
612
|
+
clusters_with_atom = set()
|
|
613
|
+
for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
|
|
614
|
+
if atom in other_cluster_atom_counts:
|
|
615
|
+
clusters_with_atom.add(other_cluster_id)
|
|
616
|
+
|
|
617
|
+
total_clusters = len(cluster_atoms)
|
|
618
|
+
cluster_specificity = len(clusters_with_atom) / total_clusters if total_clusters > 0 else 1
|
|
619
|
+
|
|
620
|
+
# Skip if atom appears in more than 50% of clusters (not specific enough)
|
|
621
|
+
if cluster_specificity > 0.5:
|
|
622
|
+
# Note: logger not available in standalone function, would need to pass self
|
|
623
|
+
continue
|
|
624
|
+
|
|
625
|
+
# Additional check: ensure this cluster has significantly more of this atom than others
|
|
626
|
+
#max_other_cluster_count = 0
|
|
627
|
+
#for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
|
|
628
|
+
# if other_cluster_id != cluster_id and atom in other_cluster_atom_counts:
|
|
629
|
+
# max_other_cluster_count = max(max_other_cluster_count, other_cluster_atom_counts[atom])
|
|
630
|
+
|
|
631
|
+
# Skip if current cluster doesn't have significantly more instances than the next highest
|
|
632
|
+
#if cluster_count <= max_other_cluster_count * 1.5:
|
|
633
|
+
# Note: logger not available in standalone function, would need to pass self
|
|
634
|
+
# continue
|
|
635
|
+
|
|
636
|
+
# Calculate enrichment using hypergeometric test
|
|
637
|
+
try:
|
|
638
|
+
from scipy.stats import hypergeom
|
|
639
|
+
|
|
640
|
+
M = n_total_samples
|
|
641
|
+
n = global_count
|
|
642
|
+
N = cluster_size
|
|
643
|
+
k = cluster_count
|
|
644
|
+
|
|
645
|
+
# Calculate p-value (probability of observing k or more successes)
|
|
646
|
+
p_value = hypergeom.sf(k-1, M, n, N)
|
|
647
|
+
|
|
648
|
+
# Calculate enrichment ratio
|
|
649
|
+
expected_freq = (n / M) * N
|
|
650
|
+
enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float('inf')
|
|
651
|
+
|
|
652
|
+
# Only consider significantly enriched terms (p < threshold and enrichment > 1.5x)
|
|
653
|
+
if p_value < significance_threshold and enrichment_ratio > 1.5:
|
|
654
|
+
|
|
655
|
+
# Calculate percentage of cluster samples with this atom
|
|
656
|
+
cluster_percentage = (cluster_count / cluster_size) * 100
|
|
657
|
+
global_percentage = (global_count / n_total_samples) * 100
|
|
658
|
+
|
|
659
|
+
text_associations.append({
|
|
660
|
+
'atom': atom,
|
|
661
|
+
'cluster_id': cluster_id,
|
|
662
|
+
'type': 'cluster_enrichment',
|
|
663
|
+
'test': 'Hypergeometric',
|
|
664
|
+
'p_value': p_value,
|
|
665
|
+
'enrichment_ratio': enrichment_ratio,
|
|
666
|
+
'effect_size': enrichment_ratio, # Use enrichment ratio as effect size
|
|
667
|
+
'interpretation': 'Large enrichment' if enrichment_ratio > 3 else 'Medium enrichment' if enrichment_ratio > 2 else 'Small enrichment',
|
|
668
|
+
'cluster_count': cluster_count,
|
|
669
|
+
'cluster_size': cluster_size,
|
|
670
|
+
'cluster_percentage': cluster_percentage,
|
|
671
|
+
'global_count': global_count,
|
|
672
|
+
'global_percentage': global_percentage,
|
|
673
|
+
'cluster_samples_with_atom': cluster_count,
|
|
674
|
+
'total_samples_with_atom': global_count
|
|
675
|
+
})
|
|
676
|
+
|
|
677
|
+
except Exception as e:
|
|
678
|
+
self.logger.debug(f"Error analyzing enrichment of '{atom}' in cluster {cluster_id}: {e}")
|
|
679
|
+
continue
|
|
680
|
+
|
|
681
|
+
# Sort text associations by cluster presence percentage (favors common terms in clusters)
|
|
682
|
+
text_associations.sort(key=lambda x: x['cluster_percentage'], reverse=True)
|
|
683
|
+
|
|
684
|
+
# Combine regular and text associations
|
|
685
|
+
all_associations = significant_associations + text_associations
|
|
686
|
+
# Sort by cluster percentage for text associations, effect size for others
|
|
687
|
+
all_associations.sort(key=lambda x: x.get('cluster_percentage', x.get('effect_size', 0)), reverse=True)
|
|
688
|
+
|
|
689
|
+
# Generate cluster summaries
|
|
690
|
+
cluster_summaries = {}
|
|
691
|
+
for cluster_id in analysis_df_clean['cluster'].unique():
|
|
692
|
+
cluster_data = analysis_df_clean[analysis_df_clean['cluster'] == cluster_id]
|
|
693
|
+
cluster_summaries[cluster_id] = {
|
|
694
|
+
'n_samples': len(cluster_data),
|
|
695
|
+
'sample_names': cluster_data['sample_name'].tolist() if 'sample_name' in cluster_data else [],
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
# Create results dictionary
|
|
699
|
+
results = {
|
|
700
|
+
'umap_coords': umap_coords,
|
|
701
|
+
'best_clustering': best_clustering,
|
|
702
|
+
'all_clustering_results': clustering_results,
|
|
703
|
+
'significant_associations': all_associations,
|
|
704
|
+
'text_associations': text_associations,
|
|
705
|
+
'cluster_summaries': cluster_summaries,
|
|
706
|
+
'analysis_dataframe': analysis_df_clean
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
# Create sample-specific enrichment tooltips with optimization
|
|
710
|
+
sample_enrichments = {}
|
|
711
|
+
|
|
712
|
+
# For each sample, find which text atoms it contains that are significant
|
|
713
|
+
if text_associations:
|
|
714
|
+
max_check_terms = 10 # Standard limit for tooltip calculation
|
|
715
|
+
|
|
716
|
+
for idx, row in analysis_df_clean.iterrows():
|
|
717
|
+
sample_name = row.get('sample_name', f'sample_{idx}')
|
|
718
|
+
sample_enrichments[sample_name] = []
|
|
719
|
+
|
|
720
|
+
# Check which significant atoms this sample contains
|
|
721
|
+
for assoc in text_associations[:max_check_terms]: # Check fewer terms in fast mode
|
|
722
|
+
atom = assoc['atom']
|
|
723
|
+
|
|
724
|
+
# Check if this sample contains this atom in any of the text columns
|
|
725
|
+
sample_has_atom = False
|
|
726
|
+
for col in text_pattern_cols:
|
|
727
|
+
if col in row:
|
|
728
|
+
text_value = str(row[col]) if not pd.isna(row[col]) else ""
|
|
729
|
+
if atom.lower() in text_value.lower():
|
|
730
|
+
sample_has_atom = True
|
|
731
|
+
break
|
|
732
|
+
|
|
733
|
+
if sample_has_atom:
|
|
734
|
+
sample_enrichments[sample_name].append(f"{atom} ({assoc['p_value']:.3f})")
|
|
735
|
+
if len(sample_enrichments[sample_name]) >= 3: # Only show top 3 per sample
|
|
736
|
+
break
|
|
737
|
+
|
|
738
|
+
# Create embedded plots if requested
|
|
739
|
+
if plot_results:
|
|
740
|
+
plots = {}
|
|
741
|
+
|
|
742
|
+
# Plot 1: Enhanced UMAP with clusters and enriched term labels (EMBEDDED PLOTTING)
|
|
743
|
+
from bokeh.models import ColumnDataSource, HoverTool, LabelSet, LegendItem, Legend
|
|
744
|
+
from bokeh.plotting import figure
|
|
745
|
+
from collections import defaultdict
|
|
746
|
+
|
|
747
|
+
# Create cluster plot with enhanced size
|
|
748
|
+
p1 = figure(
|
|
749
|
+
width=900, height=700,
|
|
750
|
+
title=f"UMAP Clusters with Enriched Terms ({best_clustering['method']})",
|
|
751
|
+
tools="pan,wheel_zoom,box_zoom,reset,save"
|
|
752
|
+
)
|
|
753
|
+
p1.xaxis.axis_label = "UMAP1"
|
|
754
|
+
p1.yaxis.axis_label = "UMAP2"
|
|
755
|
+
|
|
756
|
+
# Remove grid
|
|
757
|
+
p1.grid.visible = False
|
|
758
|
+
|
|
759
|
+
# Color points by cluster
|
|
760
|
+
unique_clusters = np.unique(cluster_labels)
|
|
761
|
+
n_clusters = len(unique_clusters)
|
|
762
|
+
|
|
763
|
+
# Handle color mapping for many clusters - use turbo colormap
|
|
764
|
+
if n_clusters <= 10:
|
|
765
|
+
from bokeh.palettes import turbo
|
|
766
|
+
colors = turbo(max(10, n_clusters))[:n_clusters]
|
|
767
|
+
elif n_clusters <= 20:
|
|
768
|
+
from bokeh.palettes import turbo
|
|
769
|
+
colors = turbo(20)[:n_clusters]
|
|
770
|
+
else:
|
|
771
|
+
# For many clusters, use a continuous colormap
|
|
772
|
+
from bokeh.palettes import turbo
|
|
773
|
+
colors = turbo(min(256, n_clusters))
|
|
774
|
+
|
|
775
|
+
# Calculate cluster centers and plot points
|
|
776
|
+
cluster_centers = {}
|
|
777
|
+
for i, cluster_id in enumerate(unique_clusters):
|
|
778
|
+
mask = cluster_labels == cluster_id
|
|
779
|
+
if cluster_id == -1:
|
|
780
|
+
color = "gray"
|
|
781
|
+
label = "Noise"
|
|
782
|
+
else:
|
|
783
|
+
color = colors[i % len(colors)]
|
|
784
|
+
label = f"Cluster {cluster_id}"
|
|
785
|
+
|
|
786
|
+
cluster_coords = umap_coords[mask]
|
|
787
|
+
|
|
788
|
+
# Calculate cluster center
|
|
789
|
+
if len(cluster_coords) > 0:
|
|
790
|
+
center_x = np.mean(cluster_coords[:, 0])
|
|
791
|
+
center_y = np.mean(cluster_coords[:, 1])
|
|
792
|
+
cluster_centers[cluster_id] = (center_x, center_y)
|
|
793
|
+
|
|
794
|
+
cluster_samples = samples_pd[mask] if len(samples_pd) == len(mask) else None
|
|
795
|
+
sample_names = cluster_samples['sample_name'].tolist() if cluster_samples is not None and 'sample_name' in cluster_samples else [f"Sample_{j}" for j in range(np.sum(mask))]
|
|
796
|
+
sample_uids = cluster_samples['sample_uid'].tolist() if cluster_samples is not None and 'sample_uid' in cluster_samples else [f"UID_{j}" for j in range(np.sum(mask))]
|
|
797
|
+
|
|
798
|
+
# Create enrichment tooltip text for this cluster
|
|
799
|
+
cluster_associations = [assoc for assoc in text_associations if assoc.get('cluster_id') == cluster_id]
|
|
800
|
+
|
|
801
|
+
# Get the top enrichments for this cluster (not individual samples)
|
|
802
|
+
cluster_enrichments = []
|
|
803
|
+
for assoc in cluster_associations[:3]: # Top 3 enrichments for this cluster
|
|
804
|
+
atom = assoc['atom']
|
|
805
|
+
# Skip color codes and other non-meaningful atoms
|
|
806
|
+
if not ((atom.startswith('#') and len(atom) == 7) or atom in ['nan', 'None', 'null']):
|
|
807
|
+
cluster_enrichments.append(atom)
|
|
808
|
+
|
|
809
|
+
# Create the same enrichment text for ALL samples in this cluster
|
|
810
|
+
if cluster_enrichments:
|
|
811
|
+
cluster_enrichment_text = "; ".join(cluster_enrichments)
|
|
812
|
+
else:
|
|
813
|
+
cluster_enrichment_text = "No enrichments found"
|
|
814
|
+
|
|
815
|
+
# Apply the same enrichment text to all samples in this cluster
|
|
816
|
+
sample_enrichment_texts = [cluster_enrichment_text] * np.sum(mask)
|
|
817
|
+
|
|
818
|
+
source = ColumnDataSource({
|
|
819
|
+
'x': umap_coords[mask, 0],
|
|
820
|
+
'y': umap_coords[mask, 1],
|
|
821
|
+
'cluster': [cluster_id] * np.sum(mask),
|
|
822
|
+
'sample_name': sample_names[:np.sum(mask)],
|
|
823
|
+
'sample_uid': sample_uids[:np.sum(mask)],
|
|
824
|
+
'enrichments': sample_enrichment_texts[:np.sum(mask)]
|
|
825
|
+
})
|
|
826
|
+
|
|
827
|
+
p1.scatter('x', 'y', size=markersize, color=color, alpha=0.7,
|
|
828
|
+
source=source)
|
|
829
|
+
|
|
830
|
+
# Enhanced enriched term visualization
|
|
831
|
+
max_terms_per_cluster = 2
|
|
832
|
+
min_enrichment = 2.0
|
|
833
|
+
|
|
834
|
+
# Process enriched terms - group by cluster and filter
|
|
835
|
+
cluster_terms = defaultdict(list)
|
|
836
|
+
for assoc in text_associations:
|
|
837
|
+
# Skip empty terms from plotting
|
|
838
|
+
atom = assoc.get('atom', '')
|
|
839
|
+
if (atom == '<empty>' or
|
|
840
|
+
atom.lower() == 'empty' or
|
|
841
|
+
atom.strip() == '' or
|
|
842
|
+
':empty' in atom.lower() or
|
|
843
|
+
atom.lower().endswith('empty') or
|
|
844
|
+
':blank' in atom.lower() or
|
|
845
|
+
atom.lower().endswith('blank')):
|
|
846
|
+
continue
|
|
847
|
+
|
|
848
|
+
if (assoc['enrichment_ratio'] >= min_enrichment and
|
|
849
|
+
assoc['cluster_id'] in cluster_centers):
|
|
850
|
+
cluster_terms[assoc['cluster_id']].append(assoc)
|
|
851
|
+
|
|
852
|
+
# Limit terms per cluster and sort by cluster presence percentage (favors common terms)
|
|
853
|
+
for cluster_id in cluster_terms:
|
|
854
|
+
cluster_terms[cluster_id] = sorted(
|
|
855
|
+
cluster_terms[cluster_id],
|
|
856
|
+
key=lambda x: x['cluster_percentage'],
|
|
857
|
+
reverse=True
|
|
858
|
+
)[:max_terms_per_cluster]
|
|
859
|
+
|
|
860
|
+
# Collect all unique terms for shared term handling
|
|
861
|
+
all_terms = {}
|
|
862
|
+
for cluster_id, terms in cluster_terms.items():
|
|
863
|
+
for term in terms:
|
|
864
|
+
atom = term['atom']
|
|
865
|
+
if atom not in all_terms:
|
|
866
|
+
all_terms[atom] = []
|
|
867
|
+
all_terms[atom].append(cluster_id)
|
|
868
|
+
|
|
869
|
+
# Separate terms into shared vs cluster-specific
|
|
870
|
+
shared_terms = {atom: clusters for atom, clusters in all_terms.items() if len(clusters) > 1}
|
|
871
|
+
specific_terms = {atom: clusters[0] for atom, clusters in all_terms.items() if len(clusters) == 1}
|
|
872
|
+
|
|
873
|
+
# Merge overlapping terms that refer to the same concept
|
|
874
|
+
# E.g., "type:qc" and "name:PooledQC" both refer to QC samples
|
|
875
|
+
def should_merge_terms(term1, term2):
|
|
876
|
+
"""Check if two terms should be merged based on semantic overlap"""
|
|
877
|
+
# Extract the actual values (remove prefixes)
|
|
878
|
+
val1 = term1.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
|
|
879
|
+
val2 = term2.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
|
|
880
|
+
|
|
881
|
+
# Define known overlapping concepts
|
|
882
|
+
qc_terms = {'qc', 'pooledqc', 'pooled_qc', 'quality_control', 'qualitycontrol'}
|
|
883
|
+
blank_terms = {'blank', 'blk', 'empty', 'background'}
|
|
884
|
+
|
|
885
|
+
# Check if both terms belong to the same concept group
|
|
886
|
+
if val1 in qc_terms and val2 in qc_terms:
|
|
887
|
+
return True
|
|
888
|
+
if val1 in blank_terms and val2 in blank_terms:
|
|
889
|
+
return True
|
|
890
|
+
|
|
891
|
+
# Also check for direct string similarity (e.g., case variations)
|
|
892
|
+
if val1 == val2:
|
|
893
|
+
return True
|
|
894
|
+
|
|
895
|
+
return False
|
|
896
|
+
|
|
897
|
+
def merge_overlapping_terms(shared_terms, specific_terms):
|
|
898
|
+
"""Merge terms that refer to the same concept"""
|
|
899
|
+
all_atoms = list(shared_terms.keys()) + list(specific_terms.keys())
|
|
900
|
+
merged_groups = []
|
|
901
|
+
used_atoms = set()
|
|
902
|
+
|
|
903
|
+
for i, atom1 in enumerate(all_atoms):
|
|
904
|
+
if atom1 in used_atoms:
|
|
905
|
+
continue
|
|
906
|
+
|
|
907
|
+
group = [atom1]
|
|
908
|
+
used_atoms.add(atom1)
|
|
909
|
+
|
|
910
|
+
# Find all atoms that should be merged with this one
|
|
911
|
+
for j, atom2 in enumerate(all_atoms[i+1:], i+1):
|
|
912
|
+
if atom2 in used_atoms:
|
|
913
|
+
continue
|
|
914
|
+
if should_merge_terms(atom1, atom2):
|
|
915
|
+
group.append(atom2)
|
|
916
|
+
used_atoms.add(atom2)
|
|
917
|
+
|
|
918
|
+
if len(group) > 1:
|
|
919
|
+
merged_groups.append(group)
|
|
920
|
+
|
|
921
|
+
return merged_groups
|
|
922
|
+
|
|
923
|
+
# Find terms that should be merged
|
|
924
|
+
merged_groups = merge_overlapping_terms(shared_terms, specific_terms)
|
|
925
|
+
|
|
926
|
+
# Apply merging: create new combined terms and remove originals
|
|
927
|
+
for group in merged_groups:
|
|
928
|
+
# Determine the combined clusters for this group
|
|
929
|
+
combined_clusters = set()
|
|
930
|
+
for atom in group:
|
|
931
|
+
if atom in shared_terms:
|
|
932
|
+
combined_clusters.update(shared_terms[atom])
|
|
933
|
+
elif atom in specific_terms:
|
|
934
|
+
combined_clusters.add(specific_terms[atom])
|
|
935
|
+
|
|
936
|
+
# Create a new combined term name using newlines
|
|
937
|
+
# Keep the original prefixes and atom names
|
|
938
|
+
combined_atom = '\n'.join(group)
|
|
939
|
+
|
|
940
|
+
# Remove original terms from both dictionaries
|
|
941
|
+
for atom in group:
|
|
942
|
+
shared_terms.pop(atom, None)
|
|
943
|
+
specific_terms.pop(atom, None)
|
|
944
|
+
|
|
945
|
+
# Add the combined term to appropriate dictionary
|
|
946
|
+
combined_clusters_list = list(combined_clusters)
|
|
947
|
+
if len(combined_clusters_list) > 1:
|
|
948
|
+
shared_terms[combined_atom] = combined_clusters_list
|
|
949
|
+
else:
|
|
950
|
+
specific_terms[combined_atom] = combined_clusters_list[0]
|
|
951
|
+
|
|
952
|
+
# Create label sources for enriched terms
|
|
953
|
+
label_sources = {}
|
|
954
|
+
line_sources = {}
|
|
955
|
+
line_cluster_mapping = {} # Track which cluster each line belongs to
|
|
956
|
+
|
|
957
|
+
# Handle shared terms (place at center of all clusters that share it, but in empty areas)
|
|
958
|
+
for atom, clusters in shared_terms.items():
|
|
959
|
+
if len(clusters) > 1:
|
|
960
|
+
# Calculate center of all clusters sharing this term
|
|
961
|
+
cluster_coords_list = [cluster_centers[cid] for cid in clusters if cid in cluster_centers]
|
|
962
|
+
if cluster_coords_list:
|
|
963
|
+
center_x = np.mean([coord[0] for coord in cluster_coords_list])
|
|
964
|
+
center_y = np.mean([coord[1] for coord in cluster_coords_list])
|
|
965
|
+
|
|
966
|
+
# Calculate data bounds using simple approach
|
|
967
|
+
all_x = [pt[0] for pt in umap_coords]
|
|
968
|
+
all_y = [pt[1] for pt in umap_coords]
|
|
969
|
+
x_min, x_max = min(all_x), max(all_x)
|
|
970
|
+
y_min, y_max = min(all_y), max(all_y)
|
|
971
|
+
data_range_x = x_max - x_min
|
|
972
|
+
data_range_y = y_max - y_min
|
|
973
|
+
|
|
974
|
+
# Find empty area around the center
|
|
975
|
+
best_distance = 0
|
|
976
|
+
best_position = None
|
|
977
|
+
|
|
978
|
+
for distance_factor in [1.0, 1.5, 2.0]:
|
|
979
|
+
offset_distance = distance_factor * max(data_range_x, data_range_y) * 0.1
|
|
980
|
+
|
|
981
|
+
for angle in np.linspace(0, 2*np.pi, 8):
|
|
982
|
+
label_x = center_x + offset_distance * np.cos(angle)
|
|
983
|
+
label_y = center_y + offset_distance * np.sin(angle)
|
|
984
|
+
|
|
985
|
+
# Calculate minimum distance to any data point
|
|
986
|
+
distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
|
|
987
|
+
min_distance = min(distances)
|
|
988
|
+
|
|
989
|
+
if min_distance > best_distance:
|
|
990
|
+
best_distance = min_distance
|
|
991
|
+
best_position = (label_x, label_y)
|
|
992
|
+
|
|
993
|
+
# Use best position or fallback to center
|
|
994
|
+
if best_position is not None:
|
|
995
|
+
label_x, label_y = best_position
|
|
996
|
+
else:
|
|
997
|
+
label_x, label_y = center_x, center_y
|
|
998
|
+
|
|
999
|
+
# Check if label would be outside plot bounds and adjust
|
|
1000
|
+
label_margin = max(data_range_x, data_range_y) * 0.05
|
|
1001
|
+
if label_x < x_min - label_margin:
|
|
1002
|
+
label_x = x_min - label_margin
|
|
1003
|
+
elif label_x > x_max + label_margin:
|
|
1004
|
+
label_x = x_max + label_margin
|
|
1005
|
+
|
|
1006
|
+
if label_y < y_min - label_margin:
|
|
1007
|
+
label_y = y_min - label_margin
|
|
1008
|
+
elif label_y > y_max + label_margin:
|
|
1009
|
+
label_y = y_max + label_margin
|
|
1010
|
+
|
|
1011
|
+
# Keep the original atom name with prefixes for display
|
|
1012
|
+
display_atom = atom # Keep prefixes like name:, group:, batch:, type:
|
|
1013
|
+
|
|
1014
|
+
# Create label source with center alignment for shared terms
|
|
1015
|
+
label_source = ColumnDataSource({
|
|
1016
|
+
'x': [label_x],
|
|
1017
|
+
'y': [label_y],
|
|
1018
|
+
'text': [display_atom],
|
|
1019
|
+
'atom': [atom],
|
|
1020
|
+
'text_align': ['center']
|
|
1021
|
+
})
|
|
1022
|
+
label_sources[atom] = label_source
|
|
1023
|
+
|
|
1024
|
+
# Create lines to each cluster center
|
|
1025
|
+
line_x = []
|
|
1026
|
+
line_y = []
|
|
1027
|
+
for cluster_id in clusters:
|
|
1028
|
+
if cluster_id in cluster_centers:
|
|
1029
|
+
cx, cy = cluster_centers[cluster_id]
|
|
1030
|
+
line_x.extend([label_x, cx, np.nan]) # nan to break line
|
|
1031
|
+
line_y.extend([label_y, cy, np.nan])
|
|
1032
|
+
|
|
1033
|
+
line_source = ColumnDataSource({
|
|
1034
|
+
'x': line_x,
|
|
1035
|
+
'y': line_y
|
|
1036
|
+
})
|
|
1037
|
+
line_sources[atom] = line_source
|
|
1038
|
+
line_cluster_mapping[atom] = 'shared'
|
|
1039
|
+
|
|
1040
|
+
# Handle cluster-specific terms (arrange multiple terms per cluster to avoid overlap)
|
|
1041
|
+
# Group specific terms by cluster to handle multiple terms per cluster
|
|
1042
|
+
cluster_specific_terms = defaultdict(list)
|
|
1043
|
+
for atom, cluster_id in specific_terms.items():
|
|
1044
|
+
cluster_specific_terms[cluster_id].append(atom)
|
|
1045
|
+
|
|
1046
|
+
# Calculate data bounds once
|
|
1047
|
+
all_x = [pt[0] for pt in umap_coords]
|
|
1048
|
+
all_y = [pt[1] for pt in umap_coords]
|
|
1049
|
+
x_min, x_max = min(all_x), max(all_x)
|
|
1050
|
+
y_min, y_max = min(all_y), max(all_y)
|
|
1051
|
+
data_range_x = x_max - x_min
|
|
1052
|
+
data_range_y = y_max - y_min
|
|
1053
|
+
|
|
1054
|
+
# Expand plot ranges to accommodate labels (add 15% margin on all sides)
|
|
1055
|
+
margin = 0.15
|
|
1056
|
+
x_margin = data_range_x * margin
|
|
1057
|
+
y_margin = data_range_y * margin
|
|
1058
|
+
plot_x_min = x_min - x_margin
|
|
1059
|
+
plot_x_max = x_max + x_margin
|
|
1060
|
+
plot_y_min = y_min - y_margin
|
|
1061
|
+
plot_y_max = y_max + y_margin
|
|
1062
|
+
|
|
1063
|
+
# Set expanded plot ranges
|
|
1064
|
+
p1.x_range.start = plot_x_min
|
|
1065
|
+
p1.x_range.end = plot_x_max
|
|
1066
|
+
p1.y_range.start = plot_y_min
|
|
1067
|
+
p1.y_range.end = plot_y_max
|
|
1068
|
+
|
|
1069
|
+
# Process each cluster that has specific terms
|
|
1070
|
+
for cluster_id, cluster_atoms in cluster_specific_terms.items():
|
|
1071
|
+
if cluster_id not in cluster_centers:
|
|
1072
|
+
continue
|
|
1073
|
+
|
|
1074
|
+
cx, cy = cluster_centers[cluster_id]
|
|
1075
|
+
n_terms = len(cluster_atoms)
|
|
1076
|
+
|
|
1077
|
+
if n_terms == 1:
|
|
1078
|
+
# Single term - use smart positioning with shorter distances
|
|
1079
|
+
atom = cluster_atoms[0]
|
|
1080
|
+
|
|
1081
|
+
# Try multiple candidate positions with shorter distances and more angles
|
|
1082
|
+
best_distance = 0
|
|
1083
|
+
best_position = None
|
|
1084
|
+
|
|
1085
|
+
# Use shorter base distance and test many angles
|
|
1086
|
+
base_distance = max(data_range_x, data_range_y) * 0.08 # Much shorter base distance
|
|
1087
|
+
|
|
1088
|
+
# Test positions at different angles and short distances
|
|
1089
|
+
for distance_factor in [0.8, 1.0, 1.3]: # Shorter distance factors
|
|
1090
|
+
offset_distance = base_distance * distance_factor
|
|
1091
|
+
|
|
1092
|
+
for angle in np.linspace(0, 2*np.pi, 24): # More angles (24 directions)
|
|
1093
|
+
label_x = cx + offset_distance * np.cos(angle)
|
|
1094
|
+
label_y = cy + offset_distance * np.sin(angle)
|
|
1095
|
+
|
|
1096
|
+
# Calculate minimum distance to any data point
|
|
1097
|
+
distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
|
|
1098
|
+
min_distance = min(distances)
|
|
1099
|
+
|
|
1100
|
+
# Check distance to other labels to avoid overlap
|
|
1101
|
+
min_label_distance = float('inf')
|
|
1102
|
+
for other_atom, other_source in label_sources.items():
|
|
1103
|
+
if other_atom != atom:
|
|
1104
|
+
other_data = other_source.data
|
|
1105
|
+
if other_data['x'] and other_data['y']:
|
|
1106
|
+
other_x, other_y = other_data['x'][0], other_data['y'][0]
|
|
1107
|
+
label_distance = np.sqrt((label_x - other_x)**2 + (label_y - other_y)**2)
|
|
1108
|
+
min_label_distance = min(min_label_distance, label_distance)
|
|
1109
|
+
|
|
1110
|
+
# Prefer positions that are reasonably far from data points and other labels
|
|
1111
|
+
combined_distance = min(min_distance, min_label_distance if min_label_distance != float('inf') else min_distance)
|
|
1112
|
+
|
|
1113
|
+
if combined_distance > best_distance:
|
|
1114
|
+
best_distance = combined_distance
|
|
1115
|
+
best_position = (label_x, label_y)
|
|
1116
|
+
|
|
1117
|
+
# Use best position found, or fallback to simple short offset
|
|
1118
|
+
if best_position is not None:
|
|
1119
|
+
label_x, label_y = best_position
|
|
1120
|
+
else:
|
|
1121
|
+
# Fallback: simple short radial offset
|
|
1122
|
+
offset_distance = base_distance
|
|
1123
|
+
angle = (cluster_id * 45) % 360 # Deterministic angle based on cluster
|
|
1124
|
+
angle_rad = np.radians(angle)
|
|
1125
|
+
label_x = cx + offset_distance * np.cos(angle_rad)
|
|
1126
|
+
label_y = cy + offset_distance * np.sin(angle_rad)
|
|
1127
|
+
|
|
1128
|
+
# Check if label would be outside plot bounds and adjust
|
|
1129
|
+
label_margin = max(data_range_x, data_range_y) * 0.05
|
|
1130
|
+
|
|
1131
|
+
# Instead of clamping to bounds, let labels go outside and plot bounds will be expanded later
|
|
1132
|
+
# Only apply minimal adjustments to prevent labels from being extremely far out
|
|
1133
|
+
                    extreme_margin = max(data_range_x, data_range_y) * 0.25  # Allow 25% outside data range

                    if label_x < x_min - extreme_margin:
                        label_x = x_min - extreme_margin
                    elif label_x > x_max + extreme_margin:
                        label_x = x_max + extreme_margin

                    if label_y < y_min - extreme_margin:
                        label_y = y_min - extreme_margin
                    elif label_y > y_max + extreme_margin:
                        label_y = y_max + extreme_margin

                    # Determine text alignment based on position relative to cluster
                    text_align = 'right' if label_x > cx else 'left'

                    # Clean up atom name for display but keep prefixes
                    display_atom = atom  # Keep prefixes like name:, group:, batch:, type:

                    # Create label source with alignment
                    label_source = ColumnDataSource({
                        'x': [label_x],
                        'y': [label_y],
                        'text': [display_atom],
                        'atom': [atom],
                        'text_align': [text_align]
                    })
                    label_sources[atom] = label_source

                    # Create spike line from cluster center to label
                    line_source = ColumnDataSource({
                        'x': [cx, label_x],
                        'y': [cy, label_y]
                    })
                    line_sources[atom] = line_source
                    line_cluster_mapping[atom] = cluster_id

                else:
                    # Multiple terms - stack them vertically with one line to cluster center
                    # Determine if this cluster has shared vs non-shared terms to adjust positioning
                    has_shared = any(atom in shared_terms for atom in cluster_atoms)
                    has_specific = any(atom in specific_terms for atom in cluster_atoms)

                    # Adjust base distance: put non-shared (cluster-specific) labels further out
                    if has_specific and not has_shared:
                        # Pure cluster-specific terms - place further from center to reduce overlap
                        base_distance = max(data_range_x, data_range_y) * 0.15  # Further out
                    elif has_shared and not has_specific:
                        # Pure shared terms - place closer to center
                        base_distance = max(data_range_x, data_range_y) * 0.08  # Closer
                    else:
                        # Mixed terms - use intermediate distance
                        base_distance = max(data_range_x, data_range_y) * 0.1  # Standard distance

                    # Calculate a good angle for the stack based on cluster position and available space
                    # For non-shared terms, prefer angles that point away from plot center
                    best_angle = None
                    best_distance = 0

                    # Get plot center for reference
                    plot_center_x = (x_min + x_max) / 2
                    plot_center_y = (y_min + y_max) / 2

                    # Calculate angle from plot center to cluster center
                    center_to_cluster_angle = np.arctan2(cy - plot_center_y, cx - plot_center_x)

                    if has_specific and not has_shared:
                        # For non-shared terms, prefer angles that point away from plot center
                        # Create angles around the center-to-cluster direction
                        base_angle = center_to_cluster_angle
                        preferred_angles = [
                            base_angle,              # Directly away from center
                            base_angle + np.pi/4,    # 45° offset
                            base_angle - np.pi/4,    # -45° offset
                            base_angle + np.pi/6,    # 30° offset
                            base_angle - np.pi/6,    # -30° offset
                            base_angle + np.pi/3,    # 60° offset
                            base_angle - np.pi/3,    # -60° offset
                            base_angle + np.pi/2,    # 90° offset
                            base_angle - np.pi/2     # -90° offset
                        ]
                    else:
                        # For shared terms or mixed, use the original preferred angles
                        preferred_angles = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4,     # 45°, 135°, 225°, 315°
                                            np.pi/6, np.pi/3, 2*np.pi/3, 5*np.pi/6,       # 30°, 60°, 120°, 150°
                                            7*np.pi/6, 4*np.pi/3, 5*np.pi/3, 11*np.pi/6]  # 210°, 240°, 300°, 330°

                    for test_angle in preferred_angles:
                        test_x = cx + base_distance * np.cos(test_angle)
                        test_y = cy + base_distance * np.sin(test_angle)

                        # Calculate minimum distance to any data point
                        distances = [np.sqrt((pt[0] - test_x)**2 + (pt[1] - test_y)**2) for pt in umap_coords]
                        min_distance = min(distances)

                        if min_distance > best_distance:
                            best_distance = min_distance
                            best_angle = test_angle

                    # Use the best angle found, or fallback to 45°
                    if best_angle is not None:
                        stack_angle = best_angle
                    else:
                        # Fallback: use 45° based on cluster
                        angle_options = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4]
                        stack_angle = angle_options[cluster_id % len(angle_options)]

                    # Position for the end of the line (before labels start)
                    line_end_x = cx + base_distance * np.cos(stack_angle)
                    line_end_y = cy + base_distance * np.sin(stack_angle)

                    # Simplified approach: center labels at line end, then add 20pt offset in same direction
                    # Calculate 20pt offset in the same direction as the line
                    label_offset_distance = 20  # 20 points in the same direction

                    # Convert 20 points to data coordinates (approximate)
                    # Assuming typical plot size, 20pt ≈ 1-2% of data range
                    data_range = max(data_range_x, data_range_y)
                    offset_in_data_coords = data_range * 0.02  # 2% of data range for 20pt
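                    # For example, with a UMAP spread of 10 units in the wider direction,
                    # the label lands 0.2 units past the line end, which is the rough
                    # data-space equivalent of the intended 20 pt gap.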

                    # Add offset in direction based on line orientation for better text placement
                    # For westward lines: place label LEFT of endpoint with RIGHT alignment
                    # For eastward lines: place label RIGHT of endpoint with LEFT alignment

                    angle_degrees = (stack_angle * 180 / np.pi) % 360
                    if 90 < angle_degrees < 270:
                        # Line goes LEFT (westward) - place label to the LEFT of line end
                        label_center_x = line_end_x - offset_in_data_coords  # SUBTRACT to go left
                        label_center_y = line_end_y  # Keep same Y position
                        text_align = 'right'  # Right-align so text ends near line endpoint
                    else:
                        # Line goes RIGHT (eastward) - place label to the RIGHT of line end
                        label_center_x = line_end_x + offset_in_data_coords  # ADD to go right
                        label_center_y = line_end_y  # Keep same Y position
                        text_align = 'left'  # Left-align so text starts near line endpoint

                    # Calculate consistent vertical spacing for stacked labels
                    # BETTER APPROACH: Use single LabelSet with newline characters

                    # Create a single multi-line text string with all terms
                    display_atoms = [atom for atom in cluster_atoms]  # Keep original atom names with prefixes
                    combined_text = '\n'.join(display_atoms)

                    # Check if label would be outside plot bounds and adjust
                    label_margin = max(data_range_x, data_range_y) * 0.05
                    label_x = label_center_x
                    label_y = label_center_y

                    if label_x < x_min - label_margin:
                        label_x = x_min - label_margin
                        text_align = 'left'
                    elif label_x > x_max + label_margin:
                        label_x = x_max + label_margin
                        text_align = 'right'

                    if label_y < y_min - label_margin:
                        label_y = y_min - label_margin
                    elif label_y > y_max + label_margin:
                        label_y = y_max + label_margin

                    # Create single label source with multi-line text and alignment
                    label_source = ColumnDataSource({
                        'x': [label_x],
                        'y': [label_y],
                        'text': [combined_text],
                        'atoms': [cluster_atoms],  # Store all atoms for reference
                        'text_align': [text_align]
                    })

                    # Store this single label source using a unique key for the cluster stack
                    stack_label_key = f"cluster_{cluster_id}_labels"
                    label_sources[stack_label_key] = label_source

                    # Create single line from cluster center to line end (before labels)
                    stack_line_source = ColumnDataSource({
                        'x': [cx, line_end_x],
                        'y': [cy, line_end_y]
                    })
                    # Use a unique key for the stack line
                    stack_key = f"cluster_{cluster_id}_stack"
                    line_sources[stack_key] = stack_line_source
                    line_cluster_mapping[stack_key] = cluster_id

            # Add lines (spikes) to plot with matching cluster colors
            line_renderers = {}
            for line_key, line_source in line_sources.items():
                # Get the cluster color for this line
                if line_key in shared_terms:
                    # For shared terms, use the same style as cluster-specific terms
                    # Use a neutral color or the color of the first cluster it appears in
                    first_cluster_id = list(shared_terms[line_key])[0]
                    if first_cluster_id == -1:
                        line_color = 'gray'
                    else:
                        cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
                        line_color = colors[cluster_idx % len(colors)]
                    line_dash = 'dashed'  # Use dashed for all edges
                elif line_key in specific_terms:
                    # For cluster-specific terms, use the cluster's color
                    cluster_id = specific_terms[line_key]
                    if cluster_id == -1:
                        line_color = 'gray'
                    else:
                        cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
                        line_color = colors[cluster_idx % len(colors)]
                    line_dash = 'dashed'  # Use dashed for all edges
                elif line_key in line_cluster_mapping:
                    # For stack lines, use the cluster's color
                    cluster_info = line_cluster_mapping[line_key]
                    if cluster_info == 'shared':
                        # For shared stacks, use a neutral color or first cluster color
                        line_color = 'black'
                        line_dash = 'dashed'  # Use dashed for all edges
                    else:
                        cluster_id = cluster_info
                        if cluster_id == -1:
                            line_color = 'gray'
                        else:
                            cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
                            line_color = colors[cluster_idx % len(colors)]
                        line_dash = 'dashed'  # Use dashed for all edges
                else:
                    # Fallback
                    line_color = 'gray'
                    line_dash = 'dashed'  # Use dashed for all edges

                line_renderer = p1.line('x', 'y', source=line_source,
                                        line_color=line_color, line_width=2,
                                        alpha=0.8, line_dash=line_dash)
                line_renderers[line_key] = line_renderer

            # Add labels to plot (simple and direct approach)
            label_renderers = {}  # Store label renderers for legend control
            for label_key, label_source in label_sources.items():
                # Determine color and style based on label key type
                if label_key.startswith('cluster_') and label_key.endswith('_labels'):
                    # This is a cluster stack with multiple terms
                    cluster_id = int(label_key.split('_')[1])
                    if cluster_id == -1:
                        text_color = 'gray'
                    else:
                        cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
                        text_color = colors[cluster_idx % len(colors)]
                    text_font_style = 'bold'
                elif label_key in shared_terms:
                    # Shared term - use same color as edge (first cluster's color)
                    first_cluster_id = list(shared_terms[label_key])[0]
                    if first_cluster_id == -1:
                        text_color = 'gray'
                    else:
                        cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
                        text_color = colors[cluster_idx % len(colors)]
                    text_font_style = 'bold'
                elif label_key in specific_terms:
                    # Individual cluster-specific term
                    cluster_id = specific_terms[label_key]
                    if cluster_id == -1:
                        text_color = 'gray'
                    else:
                        cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
                        text_color = colors[cluster_idx % len(colors)]
                    text_font_style = 'bold'
                else:
                    # Fallback
                    text_color = 'black'
                    text_font_style = 'bold'

                # Get text alignment from label source, default to center
                label_data = label_source.data
                text_align = label_data.get('text_align', ['center'])[0] if 'text_align' in label_data else 'center'

                label_set = LabelSet(
                    x='x', y='y', text='text',
                    source=label_source,
                    text_font_size='11pt',
                    text_color=text_color,
                    text_font_style=text_font_style,
                    text_align=text_align,
                    text_baseline='middle'
                )
                p1.add_layout(label_set)
                label_renderers[label_key] = label_set  # Store for legend control

            # Check if any labels are close to plot boundaries and expand if needed
            if label_sources:
                # Collect all label positions
                all_label_positions = []
                for source in label_sources.values():
                    data = source.data
                    if 'x' in data and 'y' in data and data['x'] and data['y']:
                        all_label_positions.extend(zip(data['x'], data['y']))

                if all_label_positions:
                    # Check if any labels are close to current plot boundaries
                    current_x_min, current_x_max = p1.x_range.start, p1.x_range.end
                    current_y_min, current_y_max = p1.y_range.start, p1.y_range.end

                    # Define "close to boundary" as within 5% of the plot range
                    x_range = current_x_max - current_x_min
                    y_range = current_y_max - current_y_min
                    boundary_threshold_x = x_range * 0.05
                    boundary_threshold_y = y_range * 0.05

                    needs_expansion = False
                    for label_x, label_y in all_label_positions:
                        if (label_x < current_x_min + boundary_threshold_x or
                                label_x > current_x_max - boundary_threshold_x or
                                label_y < current_y_min + boundary_threshold_y or
                                label_y > current_y_max - boundary_threshold_y):
                            needs_expansion = True
                            break

                    # If labels are close to boundaries, expand plot by 5% (reduced from 10%)
                    if needs_expansion:
                        expansion_factor = 0.05  # 5% expansion (half of previous 10%)
                        x_expansion = x_range * expansion_factor
                        y_expansion = y_range * expansion_factor

                        p1.x_range.start = current_x_min - x_expansion
                        p1.x_range.end = current_x_max + x_expansion
                        p1.y_range.start = current_y_min - y_expansion
                        p1.y_range.end = current_y_max + y_expansion


            # Add hover tool with enrichment information
            hover = HoverTool(tooltips=[
                ("Cluster", "@cluster"),
                ("Sample", "@sample_name"),
                ("Sample UID", "@sample_uid"),
                ("Enrichments", "@enrichments")
            ])
            p1.add_tools(hover)

            # Remove cluster legend labels from scatter plots (already done above)
            # But keep any existing legend structure for now

            # Create custom legend for enrichment terms (line/label pairs) ONLY
            if line_renderers and (shared_terms or specific_terms):
                legend_items = []
                renderer_to_terms = {}  # Group terms by their renderer

                # Get all enriched terms and group them by their line renderer
                all_enriched_atoms = set(shared_terms.keys()) | set(specific_terms.keys())

                # First pass: map each term to its renderer
                for atom in all_enriched_atoms:
                    renderer = None
                    renderer_key = None

                    if atom in shared_terms:
                        # Shared term
                        if atom in line_renderers:
                            renderer = line_renderers[atom]
                            renderer_key = atom
                        else:
                            # Look for any stack renderer from clusters that have this shared term
                            for cluster_id in shared_terms[atom]:
                                stack_key = f"cluster_{cluster_id}_stack"
                                if stack_key in line_renderers:
                                    renderer = line_renderers[stack_key]
                                    renderer_key = stack_key
                                    break

                    elif atom in specific_terms:
                        # Cluster-specific term
                        cluster_id = specific_terms[atom]
                        if atom in line_renderers:
                            renderer = line_renderers[atom]
                            renderer_key = atom
                        else:
                            stack_key = f"cluster_{cluster_id}_stack"
                            if stack_key in line_renderers:
                                renderer = line_renderers[stack_key]
                                renderer_key = stack_key

                    # Group terms by renderer
                    if renderer and renderer_key:
                        if renderer_key not in renderer_to_terms:
                            renderer_to_terms[renderer_key] = {
                                'renderer': renderer,
                                'shared_terms': [],
                                'specific_terms': [],
                                'cluster_id': None
                            }

                        if atom in shared_terms:
                            renderer_to_terms[renderer_key]['shared_terms'].append(atom)
                        else:
                            renderer_to_terms[renderer_key]['specific_terms'].append(atom)
                            renderer_to_terms[renderer_key]['cluster_id'] = specific_terms[atom]

                # Second pass: create legend entries, one per renderer
                for renderer_key, term_info in renderer_to_terms.items():
                    shared_list = term_info['shared_terms']
                    specific_list = term_info['specific_terms']
                    line_renderer = term_info['renderer']

                    # For now, legend can only control the line renderer
                    # Label visibility will be handled via JavaScript callback if needed
                    # (Note: LabelSet cannot be directly controlled by Bokeh legends)

                    # Create combined label text
                    if shared_list:
                        # Shared terms - remove "Shared:" prefix and just show the terms
                        clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
                                       for atom in shared_list]
                        if len(clean_terms) == 1:
                            label_text = clean_terms[0]
                        else:
                            label_text = ', '.join(clean_terms)

                    elif specific_list:
                        # Cluster-specific terms
                        cluster_id = term_info['cluster_id']
                        clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
                                       for atom in specific_list]
                        if len(clean_terms) == 1:
                            label_text = f"C{cluster_id}: {clean_terms[0]}"
                        else:
                            label_text = f"C{cluster_id}: {', '.join(clean_terms)}"

                    # Add single legend entry for the line renderer only
                    # (Labels cannot be controlled by Bokeh legends directly)
                    legend_items.append(
                        LegendItem(label=label_text, renderers=[line_renderer])
                    )

                # Hide cluster legend after we've created our enrichment legend
                if hasattr(p1, 'legend') and p1.legend:
                    if isinstance(p1.legend, list):
                        for legend in p1.legend:
                            legend.visible = False
                    else:
                        p1.legend.visible = False

                # Create and add the custom enrichment legend
                if legend_items:
                    enrichment_legend = Legend(
                        items=legend_items,
                        location="center_right",
                        click_policy="hide"
                    )
                    p1.add_layout(enrichment_legend, 'right')

            plots['cluster_plot'] = p1

            # Save cluster plot if filename provided
            if filename:
                # Handle filename extension properly
                if filename.endswith('.html'):
                    base_filename = filename[:-5]  # Remove .html extension
                    cluster_filename = f"{base_filename}_clusters.html"
                else:
                    cluster_filename = f"{filename}_clusters.html"

                if not filename.startswith('/') and not filename[1:3] == ':\\':
                    cluster_filename = f"{self.folder}/{cluster_filename}"
                _isolated_save_plot(p1, cluster_filename, cluster_filename, self.logger, "UMAP Cluster Plot")
            else:
                _isolated_show_notebook(p1)

            results['plots'] = plots

        # Print summary
        self.logger.debug("\n=== UMAP Cluster Analysis Summary ===")
        self.logger.debug(f"Best clustering: {best_clustering['method']}")
        self.logger.debug(f"Number of clusters: {best_clustering['n_clusters']}")
        self.logger.debug(f"Silhouette score: {best_clustering['score']:.3f}")
        if best_clustering['n_noise'] > 0:
            self.logger.debug(f"Noise points: {best_clustering['n_noise']}")

        self.logger.info(f"\nFound {len(all_associations)} total significant associations:")

        # Show regular column associations
        regular_assocs = [a for a in all_associations if 'column' in a]
        if regular_assocs:
            self.logger.info(f" {len(regular_assocs)} column-level associations:")
            for assoc in regular_assocs[:3]:  # Show top 3
                self.logger.info(f" {assoc['column']} ({assoc['variable_type']}): {assoc['test']} p={assoc['p_value']:.4f}, "
                                 f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']})")

        # Show text atom associations
        text_assocs = [a for a in all_associations if 'atom' in a]
        if text_assocs:
            self.logger.info(f" {len(text_assocs)} text pattern associations:")
            for assoc in text_assocs[:3]:  # Show top 3
                freq = assoc.get('atom_frequency', 0)
                percentage = (freq / len(analysis_df_clean)) * 100 if len(analysis_df_clean) > 0 else 0

                self.logger.info(f" '{assoc['atom']}' ({assoc['type']}): p={assoc['p_value']:.4f}, "
                                 f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']}) "
                                 f"[{freq} samples, {percentage:.1f}%]")

        if len(all_associations) > 20:
            self.logger.info(f" ... and {len(all_associations) - 20} more associations")

        return results
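    # Hypothetical usage sketch (the caller name below is an assumption; only the
    # 'plots' / 'cluster_plot' keys are taken from the code above):
    #
    #     results = study.analyze_umap()            # assumed public entry point
    #     fig = results['plots']['cluster_plot']    # Bokeh figure built above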

    def _analyze_umap_simplified(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        random_state=42,
        cluster_methods=["hdbscan", "kmeans"],
        n_clusters_range=(2, 8),
        min_cluster_size=3,
        significance_threshold=0.05,
        plot_results=True,
        filename=None,
    ):
        """Simplified fallback version of UMAP analysis."""

        self.logger.info("Starting simplified UMAP analysis...")

        # Check dependencies
        if not UMAP_AVAILABLE or not HDBSCAN_AVAILABLE:
            self.logger.error("Required dependencies not available")
            return {
                'umap_coords': None,
                'best_clustering': None,
                'all_clustering_results': {},
                'significant_associations': [],
                'text_associations': [],
                'cluster_summaries': {},
                'analysis_dataframe': None
            }

        try:
            # Get data
            consensus_matrix = self.get_consensus_matrix()
            samples_df = self.samples_df

            if consensus_matrix is None or samples_df is None:
                self.logger.error("No data available")
                return {
                    'umap_coords': None,
                    'best_clustering': None,
                    'all_clustering_results': {},
                    'significant_associations': [],
                    'text_associations': [],
                    'cluster_summaries': {},
                    'analysis_dataframe': None
                }

            # Basic UMAP
            sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]

            if hasattr(consensus_matrix, "select"):
                matrix_data = consensus_matrix.select(sample_cols).to_numpy()
            else:
                matrix_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore").values

            matrix_data = matrix_data.T
            matrix_data = np.nan_to_num(matrix_data)

            scaler = StandardScaler()
            matrix_scaled = scaler.fit_transform(matrix_data)

            # Import dependencies locally
            import umap
            import hdbscan

            reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state)
            umap_coords = reducer.fit_transform(matrix_scaled)

            # Simple clustering
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
            cluster_labels = clusterer.fit_predict(umap_coords)

            best_clustering = {
                'labels': cluster_labels,
                'n_clusters': len(np.unique(cluster_labels[cluster_labels != -1])),
                'n_noise': np.sum(cluster_labels == -1),
                'silhouette_score': 0.5,  # Placeholder
                'method': 'hdbscan'
            }
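            # HDBSCAN marks noise points with the label -1, so the counts above
            # exclude them first: labels such as [0, 0, 1, -1, 1] yield
            # n_clusters == 2 and n_noise == 1.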

            self.logger.info(f"Simplified analysis found {best_clustering['n_clusters']} clusters")

            return {
                'umap_coords': umap_coords,
                'best_clustering': best_clustering,
                'all_clustering_results': {'hdbscan': best_clustering},
                'significant_associations': [],
                'text_associations': [],
                'cluster_summaries': {},
                'analysis_dataframe': None
            }

        except Exception as e:
            self.logger.error(f"Error in simplified analysis: {e}")
            return {
                'umap_coords': None,
                'best_clustering': None,
                'all_clustering_results': {},
                'significant_associations': [],
                'text_associations': [],
                'cluster_summaries': {},
                'analysis_dataframe': None
            }


# ========================================
# Helper Functions for Plotting
# ========================================

def _isolated_save_plot(plot, filename, title, logger, plot_type):
    """Save plot to file in isolation"""
    try:
        from bokeh.io import output_file, save
        from bokeh.models import Title

        # Add title to plot
        plot.add_layout(Title(text=title, text_font_size="16pt"), 'above')

        # Configure output
        output_file(filename)
        save(plot)
        logger.info(f"Saved {plot_type} to: {filename}")

    except Exception as e:
        logger.error(f"Error saving {plot_type}: {e}")

def _isolated_show_notebook(plot):
    """Show plot in notebook if available"""
    try:
        from bokeh.io import show
        show(plot)
    except Exception:
        pass  # Silently fail if not in notebook