masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/analysis.py
CHANGED
@@ -27,12 +27,14 @@ SKLEARN_AVAILABLE = False
 
 try:
     import umap
+
     UMAP_AVAILABLE = True
 except ImportError:
     pass
 
 try:
     import hdbscan
+
     HDBSCAN_AVAILABLE = True
 except ImportError:
     pass
@@ -41,23 +43,25 @@ try:
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans, DBSCAN
     from sklearn.metrics import silhouette_score
+
     SKLEARN_AVAILABLE = True
 except ImportError:
     pass
 
 # Compiled regex patterns for efficient text processing
-TOKEN_PATTERN = re.compile(r
-ALPHANUMERIC_PATTERN = re.compile(r
+TOKEN_PATTERN = re.compile(r"[_\-\s\|\.]+")
+ALPHANUMERIC_PATTERN = re.compile(r"^[A-Za-z0-9]+$")
 
 # Simple cache for tokenization
 _tokenization_cache = {}
 
+
 def tokenize_text_cached(text):
     """Cached text tokenization for repeated strings - preserves original case."""
     if text in _tokenization_cache:
         return _tokenization_cache[text]
-
-    if pd.isna(text) or text ==
+
+    if pd.isna(text) or text == "" or not isinstance(text, str):
         result = tuple()
     else:
         # Split by common delimiters to create atoms (same as original)
@@ -68,13 +72,13 @@ def tokenize_text_cached(text):
             atom = atom.strip()  # Remove .lower() to preserve case
             if atom and len(atom) > 1:  # Original was > 1, not >= 1
                 meaningful_tokens.append(atom)
-
+
         result = tuple(meaningful_tokens)
-
+
     # Prevent cache from growing too large
     if len(_tokenization_cache) < 10000:
         _tokenization_cache[text] = result
-
+
     return result
 
 
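For reference, the delimiter pattern and bounded cache in the hunk above can be exercised on their own; a minimal sketch (the sample filename is illustrative, not from masster):

import re

TOKEN_PATTERN = re.compile(r"[_\-\s\|\.]+")  # same delimiter set as in the diff
_cache = {}

def tokenize(text):
    # Split on _, -, whitespace, | and ., keep atoms longer than one character.
    if text in _cache:
        return _cache[text]
    atoms = tuple(a.strip() for a in TOKEN_PATTERN.split(text) if len(a.strip()) > 1)
    if len(_cache) < 10000:  # bounded cache, as in the diff
        _cache[text] = atoms
    return atoms

print(tokenize("PooledQC_batch3-run12.mzML"))  # ('PooledQC', 'batch3', 'run12', 'mzML')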
@@ -98,14 +102,14 @@ analyze_umap(
 ):
     """
     Perform UMAP dimensionality reduction followed by clustering analysis with enriched term labeling.
-
+
     This method performs comprehensive cluster analysis on the study's consensus matrix, including:
     - UMAP dimensionality reduction for visualization
     - Automated clustering with multiple algorithms (HDBSCAN, K-means, DBSCAN)
     - Metadata association discovery using statistical tests
     - Text pattern analysis to identify enriched sample characteristics
     - Enhanced visualization with intelligent label positioning for enriched terms
-
+
     The enhanced visualization features cluster-aware enriched term labels with connecting spikes:
     - Terms shared across multiple clusters are positioned at the geometric center with lines to each cluster
    - Terms specific to single clusters are positioned nearby with short spikes
@@ -113,59 +117,59 @@ def analyze_umap(
     - Empty/blank terms are automatically filtered out
     - Label positioning adapts to line direction for optimal text alignment
     - Dashed edges and color-coordinated labels provide visual clarity
-
+
     Unlike plot_samples_umap() which colors by metadata columns, this function performs clustering
     and colors points by cluster assignments, with tooltips showing enrichment information.
-
+
     Parameters
     ----------
     n_neighbors : int, default=15
         Number of neighbors for UMAP embedding. Higher values preserve more global structure,
         lower values preserve more local structure.
-
+
     min_dist : float, default=0.1
         Minimum distance parameter for UMAP. Controls how tightly points are packed in the
         embedding. Values closer to 0 result in tighter clusters.
-
+
     metric : str, default="euclidean"
         Distance metric for UMAP. Options include 'euclidean', 'manhattan', 'cosine', etc.
-
+
     random_state : int, default=42
         Random seed for reproducibility of UMAP embedding and clustering.
-
+
     cluster_methods : list, default=["hdbscan", "kmeans", "dbscan"]
         Clustering algorithms to evaluate. Available options:
        - 'hdbscan': Hierarchical density-based clustering (requires hdbscan package)
        - 'kmeans': K-means clustering with multiple k values
        - 'dbscan': Density-based spatial clustering with multiple eps values
-
+
     n_clusters_range : tuple, default=(2, 8)
         Range of cluster numbers to test for K-means (min_clusters, max_clusters).
-
+
     min_cluster_size : int, default=3
         Minimum cluster size for HDBSCAN and DBSCAN algorithms.
-
+
     significance_threshold : float, default=0.05
         P-value threshold for statistical significance of metadata associations.
-
+
     plot_results : bool, default=True
         Whether to generate interactive Bokeh plots with enhanced labeling.
         When False, only returns analysis results without visualization.
-
+
     filename : str, optional
         If provided, saves the interactive plot to this HTML file.
-
+
     markersize : int, default=4
         Size of scatter plot markers representing samples.
-
+
     Returns
     -------
     dict
         Comprehensive results dictionary containing:
-
+
         - **umap_coords** : numpy.ndarray
           2D UMAP coordinates for all samples (n_samples x 2)
-
+
         - **best_clustering** : dict
          Best clustering result based on silhouette score, containing:
          - 'labels': cluster assignments for each sample
@@ -173,48 +177,48 @@ def analyze_umap(
          - 'n_clusters': number of identified clusters
          - 'n_noise': number of noise points (outliers)
          - 'method': clustering algorithm used
-
+
         - **all_clustering_results** : dict
          Results from all tested clustering configurations, keyed by method name
-
+
         - **significant_associations** : list
          All statistically significant associations (both numeric and text), sorted by
          cluster presence percentage. Each association includes:
          - Statistical test results (p-value, effect size)
          - Cluster-specific enrichment information
          - Interpretation of effect size magnitude
-
+
         - **text_associations** : list
          Subset of associations specifically for text pattern enrichment, ranked by
          presence percentage within clusters rather than statistical enrichment
-
+
         - **cluster_summaries** : dict
          Summary information for each cluster:
          - 'n_samples': number of samples in cluster
          - 'sample_names': list of sample names in cluster
-
+
         - **analysis_dataframe** : pandas.DataFrame
          Complete dataframe with UMAP coordinates, cluster assignments, and all
          sample metadata used for association analysis
-
+
     Raises
     ------
     ImportError
         If required dependencies (umap-learn, scikit-learn) are not installed
-
+
     ValueError
         If consensus matrix is empty or samples data is unavailable
-
+
     Examples
     --------
     Basic UMAP analysis with default parameters:
-
+
     >>> results = study.analyze_umap()
     >>> print(f"Found {results['best_clustering']['n_clusters']} clusters")
     >>> print(f"Silhouette score: {results['best_clustering']['score']:.3f}")
-
+
     Custom analysis with specific clustering and enhanced visualization:
-
+
     >>> results = study.analyze_umap(
     ...     n_neighbors=20,
    ...     min_dist=0.05,
@@ -222,41 +226,41 @@ def analyze_umap(
     ...     significance_threshold=0.01,
     ...     filename="cluster_analysis.html"
     ... )
-
+
     Fast analysis for large datasets:
-
+
     >>> results = study.analyze_umap(
     ...     cluster_methods=["hdbscan"]
     ... )
-
+
     Notes
     -----
     The enhanced visualization automatically identifies and labels enriched terms based on:
-
+
     1. **Presence-based ranking**: Terms are ranked by their prevalence within clusters
        rather than statistical enrichment, favoring terms common across cluster members
-
-    2. **Intelligent positioning**:
+
+    2. **Intelligent positioning**:
       - Shared terms (multiple clusters) positioned at geometric center with connecting lines
       - Individual terms positioned adjacent to their cluster with short spikes
       - Westward lines position labels to the left with right-aligned text
       - Eastward lines position labels to the right with left-aligned text
-
+
     3. **Quality filtering**: Empty terms (variants of 'empty', 'blank', 'qc') are
       automatically excluded from enrichment analysis and visualization
-
+
     4. **Visual styling**: Dashed edges, color-coordinated labels and lines, and
      moderate boundary expansion (5%) create professional, readable plots
-
+
     The method automatically handles missing dependencies by falling back to simplified
     analysis when optional packages (hdbscan) are unavailable.
     """
-
+
     # Check dependencies
     if not UMAP_AVAILABLE:
         self.logger.error("UMAP is required. Install with: pip install umap-learn")
         return None
-
+
     if not SKLEARN_AVAILABLE:
         self.logger.error("scikit-learn is required. Install with: pip install scikit-learn")
         return None
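A minimal sketch of consuming the return structure documented above; the key names follow the docstring, while the study object and its data are assumed:

results = study.analyze_umap(plot_results=False)
if results is not None:
    best = results["best_clustering"]
    print(f"{best['method']}: {best['n_clusters']} clusters, silhouette={best['score']:.3f}")
    for cluster_id, summary in results["cluster_summaries"].items():
        print(cluster_id, summary["n_samples"], summary["sample_names"][:3])
    for assoc in results["significant_associations"][:5]:
        print(assoc.get("column", assoc.get("atom")), assoc["p_value"], assoc["effect_size"])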
@@ -277,12 +281,14 @@ def analyze_umap(
 
     # Prepare data for UMAP
     sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
-
+
     if hasattr(consensus_matrix, "select"):
         matrix_data = consensus_matrix.select(sample_cols).to_numpy()
     else:
         matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
-        matrix_data =
+        matrix_data = (
+            matrix_sample_data.values if hasattr(matrix_sample_data, "values") else np.array(matrix_sample_data)
+        )
 
     # Transpose so samples are rows
     matrix_data = matrix_data.T
@@ -290,16 +296,17 @@ def analyze_umap(
 
     # Standardize data
     from sklearn.preprocessing import StandardScaler
+
     scaler = StandardScaler()
     matrix_scaled = scaler.fit_transform(matrix_data)
 
     # Perform UMAP with optimizations
     self.logger.debug(f"Computing UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}")
     import umap
-
+
     # UMAP optimization: use limited threads to save memory
     n_jobs = 1
-
+
     reducer = umap.UMAP(
         n_components=2,
         n_neighbors=n_neighbors,
@@ -307,110 +314,118 @@ def analyze_umap(
         metric=metric,
         random_state=random_state,
         n_jobs=n_jobs,
-        low_memory=False
+        low_memory=False,
     )
     umap_coords = reducer.fit_transform(matrix_scaled)
 
     # Convert samples_df to pandas for easier analysis
-    samples_pd = samples_df.to_pandas() if hasattr(samples_df,
-
+    samples_pd = samples_df.to_pandas() if hasattr(samples_df, "to_pandas") else samples_df
+
     # Get the actual sample columns present in consensus matrix
     sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
     consensus_sample_names = set(sample_cols)
-
+
     # Filter samples_df to only include samples present in consensus matrix
-    if
+    if "sample_name" in samples_pd.columns:
         # Create a mask for samples present in consensus matrix
-        sample_mask = samples_pd[
-
+        sample_mask = samples_pd["sample_name"].isin(consensus_sample_names)
+
         if sample_mask.sum() != len(samples_pd):
-            missing_samples = set(samples_pd[
-            self.logger.warning(
+            missing_samples = set(samples_pd["sample_name"]) - consensus_sample_names
+            self.logger.warning(
+                f"Filtering out {len(missing_samples)} samples not in consensus matrix: {list(missing_samples)}"
+            )
             samples_pd = samples_pd[sample_mask].copy()
-
+
         # Reorder samples_pd to match the order in consensus matrix sample_cols
-        samples_pd = samples_pd.set_index(
-
+        samples_pd = samples_pd.set_index("sample_name").reindex(sample_cols).reset_index()
+
     # Final check - ensure we have the same number of samples
     if len(samples_pd) != len(umap_coords):
-        self.logger.error(
+        self.logger.error(
+            f"After filtering, still have mismatch: samples_df has {len(samples_pd)} rows, UMAP has {len(umap_coords)} points"
+        )
         return None
-
+
     self.logger.info(f"Using {len(samples_pd)} samples for analysis")
 
     # Try different clustering methods
     clustering_results = {}
-
+
     for method in cluster_methods:
         self.logger.debug(f"Trying clustering method: {method}")
-
+
         if method == "hdbscan" and HDBSCAN_AVAILABLE:
             import hdbscan
-
+
+            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric="euclidean")
             cluster_labels = clusterer.fit_predict(umap_coords)
-
+
             # Calculate silhouette score (excluding noise points for HDBSCAN)
             valid_labels = cluster_labels[cluster_labels != -1]
             valid_coords = umap_coords[cluster_labels != -1]
-
+
             if len(np.unique(valid_labels)) > 1:
                 from sklearn.metrics import silhouette_score
+
                 score = silhouette_score(valid_coords, valid_labels)
                 n_clusters = len(np.unique(valid_labels))
                 n_noise = np.sum(cluster_labels == -1)
-
+
                 clustering_results[f"{method}"] = {
-
-
-
-
-
+                    "labels": cluster_labels,
+                    "score": score,
+                    "n_clusters": n_clusters,
+                    "n_noise": n_noise,
+                    "method": method,
                 }
-
+
         elif method == "kmeans":
             from sklearn.cluster import KMeans
             from sklearn.metrics import silhouette_score
-
+
             for n_clusters in range(n_clusters_range[0], n_clusters_range[1] + 1):
                 kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
                 cluster_labels = kmeans.fit_predict(umap_coords)
                 score = silhouette_score(umap_coords, cluster_labels)
-
+
                 clustering_results[f"{method}_k{n_clusters}"] = {
-
-
-
-
-
+                    "labels": cluster_labels,
+                    "score": score,
+                    "n_clusters": n_clusters,
+                    "n_noise": 0,
+                    "method": f"{method} (k={n_clusters})",
                 }
-
+
         elif method == "dbscan":
             from sklearn.cluster import DBSCAN
+
             # Standard DBSCAN eps values for exploration
             eps_values = [0.3, 0.5, 0.7, 1.0, 1.5]
-
+
             for eps in eps_values:
                 dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size, n_jobs=-1)
                 cluster_labels = dbscan.fit_predict(umap_coords)
-
+
                 n_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
                 n_noise = np.sum(cluster_labels == -1)
-
+
                 # Only consider valid clusterings
                 if n_clusters > 1:
                     from sklearn.metrics import silhouette_score
+
                     valid_labels = cluster_labels[cluster_labels != -1]
                     valid_coords = umap_coords[cluster_labels != -1]
-
+
                     if len(valid_coords) > 0 and len(np.unique(valid_labels)) > 1:
                         score = silhouette_score(valid_coords, valid_labels)
-
+
                         clustering_results[f"{method}_eps{eps}"] = {
-
-
-
-
-
+                            "labels": cluster_labels,
+                            "score": score,
+                            "n_clusters": n_clusters,
+                            "n_noise": n_noise,
+                            "method": f"{method} (eps={eps})",
                        }
 
     if not clustering_results:
@@ -418,52 +433,53 @@ def analyze_umap(
         return None
 
     # Select best clustering based on silhouette score
-    best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k][
+    best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k]["score"])
     best_clustering = clustering_results[best_key]
-
-    self.logger.info(
-
+
+    self.logger.info(
+        f"Best clustering: {best_clustering['method']} with {best_clustering['n_clusters']} clusters, "
+        f"silhouette score: {best_clustering['score']:.3f}"
+    )
 
     # Analyze associations between clusters and sample metadata
-    cluster_labels = best_clustering[
-
+    cluster_labels = best_clustering["labels"]
+
     # Add cluster labels to samples dataframe for analysis
     analysis_df = samples_pd.copy()
-    analysis_df[
-
+    analysis_df["cluster"] = cluster_labels
+
     # Remove noise points (label -1) for association analysis
-    analysis_df_clean = analysis_df[analysis_df[
-
+    analysis_df_clean = analysis_df[analysis_df["cluster"] != -1].copy()
+
     if len(analysis_df_clean) == 0:
         self.logger.error("No samples assigned to clusters (all noise)")
         return None
 
     # Analyze associations with specific columns only
     significant_associations = []
-
+
     # Define which columns to analyze for associations (non-text)
-    association_cols = {
-
+    association_cols = {"sample_sequence", "num_features"}
+
     # Define which columns to analyze for text patterns - include all relevant text columns
-    text_pattern_cols = {
-
-
+    text_pattern_cols = {"sample_name", "sample_group", "sample_batch", "sample_type"}
+
     for col in samples_pd.columns:
         if col not in association_cols:
             continue
-
+
         try:
             # Check if column has enough variation
             col_data = analysis_df_clean[col].dropna()
             if len(col_data.unique()) < 2:
                 continue
-
+
             # Determine if column is numeric or categorical
             if pd.api.types.is_numeric_dtype(col_data):
                 # Numeric variable - use ANOVA or Kruskal-Wallis
-                cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby(
+                cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby("cluster")]
                 cluster_groups = [group for group in cluster_groups if len(group) > 0]
-
+
                 if len(cluster_groups) > 1:
                     # Try ANOVA first
                     try:
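The selection rule in the hunk above can be sketched on its own: run several clustering configurations and keep the one with the highest silhouette score. The data and parameter grid below are illustrative, not taken from masster:

import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

coords, _ = make_blobs(n_samples=120, centers=4, random_state=42)
results = {}
for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(coords)
    results[f"kmeans_k{k}"] = {"labels": labels, "score": silhouette_score(coords, labels)}
for eps in [0.3, 0.5, 0.7, 1.0, 1.5]:
    labels = DBSCAN(eps=eps, min_samples=3).fit_predict(coords)
    kept = labels != -1  # silhouette ignores noise points, as in the diff
    if len(np.unique(labels[kept])) > 1:
        results[f"dbscan_eps{eps}"] = {"labels": labels, "score": silhouette_score(coords[kept], labels[kept])}

best_key = max(results, key=lambda k: results[k]["score"])
print(best_key, round(results[best_key]["score"], 3))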
@@ -474,67 +490,81 @@ def analyze_umap(
                         h_stat, p_value = stats.kruskal(*cluster_groups)
                         test_name = "Kruskal-Wallis"
                         f_stat = h_stat
-
+
                     if p_value < significance_threshold:
                         # Calculate effect size (eta-squared approximation)
-                        ss_between = sum(
-
+                        ss_between = sum(
+                            len(group) * (np.mean(group) - np.mean(col_data)) ** 2 for group in cluster_groups
+                        )
+                        ss_total = np.sum((col_data - np.mean(col_data)) ** 2)
                         eta_squared = ss_between / ss_total if ss_total > 0 else 0
-
+
                         significant_associations.append({
-
-
-
-
-
-
-
+                            "column": col,
+                            "variable_type": "numeric",
+                            "test": test_name,
+                            "statistic": f_stat,
+                            "p_value": p_value,
+                            "effect_size": eta_squared,
+                            "interpretation": "Large effect"
+                            if eta_squared > 0.14
+                            else "Medium effect"
+                            if eta_squared > 0.06
+                            else "Small effect",
                         })
-
+
             else:
                 # Categorical variable - use Chi-square test
-                contingency_table = pd.crosstab(analysis_df_clean[
-
+                contingency_table = pd.crosstab(analysis_df_clean["cluster"], analysis_df_clean[col])
+
                 # Only test if we have enough observations
-                if
+                if (
+                    contingency_table.sum().sum() > 10
+                    and contingency_table.shape[0] > 1
+                    and contingency_table.shape[1] > 1
+                ):
                     try:
                         chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
-
+
                         if p_value < significance_threshold:
                             # Calculate Cramer's V (effect size for chi-square)
                             n = contingency_table.sum().sum()
                             cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
-
+
                             significant_associations.append({
-
-
-
-
-
-
-
-
+                                "column": col,
+                                "variable_type": "categorical",
+                                "test": "Chi-square",
+                                "statistic": chi2,
+                                "p_value": p_value,
+                                "effect_size": cramers_v,
+                                "interpretation": "Large effect"
+                                if cramers_v > 0.5
+                                else "Medium effect"
+                                if cramers_v > 0.3
+                                else "Small effect",
+                                "contingency_table": contingency_table,
                             })
                     except Exception:
                         continue
-
+
         except Exception as e:
             self.logger.debug(f"Error analyzing column {col}: {e}")
             continue
 
     # Sort by effect size (descending)
-    significant_associations.sort(key=lambda x: x[
+    significant_associations.sort(key=lambda x: x["effect_size"], reverse=True)
 
     # Enhanced cluster-centric text analysis - analyze what makes each cluster unique
     self.logger.debug("Performing cluster-centric enrichment analysis...")
-
+
     text_associations = []
-
+
     # Optimized text tokenization using cached function
     def tokenize_text_optimized(text):
         """Optimized text tokenization with caching"""
         return tokenize_text_cached(text)
-
+
     # Collect all atoms from specified string columns only
     string_columns = []
     for col in text_pattern_cols:
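A small worked sketch of the two effect sizes referenced in this hunk, on made-up numbers: eta-squared is SS_between / SS_total for a numeric variable across clusters, and Cramer's V is sqrt(chi2 / (n * (min(rows, cols) - 1))) for a categorical variable:

import numpy as np
from scipy import stats

groups = [np.array([4.0, 5.0, 6.0]), np.array([9.0, 10.0, 11.0])]  # numeric values per cluster
pooled = np.concatenate(groups)
ss_between = sum(len(g) * (g.mean() - pooled.mean()) ** 2 for g in groups)
ss_total = ((pooled - pooled.mean()) ** 2).sum()
print("eta^2 =", round(ss_between / ss_total, 3))  # ~0.90, a large effect by the thresholds above

table = np.array([[12, 3], [2, 13]])  # cluster x category counts
chi2, p, dof, _ = stats.chi2_contingency(table)
cramers_v = np.sqrt(chi2 / (table.sum() * (min(table.shape) - 1)))
print("Cramer's V =", round(cramers_v, 3), "p =", round(p, 4))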
@@ -543,15 +573,15 @@ def analyze_umap(
         if len(col_data) > 0 and not pd.api.types.is_numeric_dtype(col_data):
             if len(col_data.astype(str).unique()) > 1:  # Has variation
                 string_columns.append(col)
-
+
     if string_columns:
         # Text analysis for string columns
         self.logger.debug(f"Analyzing cluster enrichments in {len(string_columns)} string columns")
-
+
         # Build cluster-centric atom analysis using cached tokenization
         cluster_atoms = {}  # cluster_id -> {atom -> count}
         global_atom_counts = {}  # atom -> total_count_across_all_samples
-
+
         # Pre-tokenize all text data once for efficiency with column prefixes
         sample_atom_sets = {}
         for idx, row in analysis_df_clean.iterrows():
@@ -559,61 +589,63 @@ def analyze_umap(
             for col in string_columns:
                 atoms = tokenize_text_optimized(row[col])
                 # Add column prefix to distinguish where tokens come from
-                col_prefix = col.replace(
+                col_prefix = col.replace("sample_", "") + ":"  # e.g., "name:", "group:", "batch:", "type:"
                 prefixed_atoms = [f"{col_prefix}{atom}" for atom in atoms]
                 sample_atoms.update(prefixed_atoms)
             sample_atom_sets[idx] = sample_atoms
-
+
         # Collect atoms by cluster
         for idx, row in analysis_df_clean.iterrows():
-            cluster_id = row[
+            cluster_id = row["cluster"]
             if cluster_id not in cluster_atoms:
                 cluster_atoms[cluster_id] = {}
-
+
             # Use pre-tokenized atoms
             sample_atoms = sample_atom_sets[idx]
-
+
             # Count atoms for this cluster and globally
             for atom in sample_atoms:
                 cluster_atoms[cluster_id][atom] = cluster_atoms[cluster_id].get(atom, 0) + 1
                 global_atom_counts[atom] = global_atom_counts.get(atom, 0) + 1
-
+
         # Calculate cluster enrichments using hypergeometric test (same for both modes)
         if string_columns:
             n_total_samples = len(analysis_df_clean)
-
+
             # For each cluster, find significantly enriched terms
             for cluster_id, cluster_atom_counts in cluster_atoms.items():
-                cluster_size = len(analysis_df_clean[analysis_df_clean[
-
+                cluster_size = len(analysis_df_clean[analysis_df_clean["cluster"] == cluster_id])
+
                 for atom, cluster_count in cluster_atom_counts.items():
                     global_count = global_atom_counts[atom]
-
+
                     # Skip empty terms from enrichment analysis and plotting
-                    if (
-                        atom
-                        atom.
-
-                        atom.lower()
-
-                        atom.lower()
+                    if (
+                        atom == "<empty>"
+                        or atom.lower() == "empty"
+                        or atom.strip() == ""
+                        or ":empty" in atom.lower()
+                        or atom.lower().endswith("empty")
+                        or ":blank" in atom.lower()
+                        or atom.lower().endswith("blank")
+                    ):
                         continue
-
+
                     # Skip atoms with low frequency
                     if global_count < 2:
                         continue
-
+
                     # Skip terms that occur in fewer than 5 samples within this cluster
                     if cluster_count < 5:
                         continue
-
+
                     # IMPORTANT: Skip atoms that appear in too many clusters (not cluster-specific)
                     # Count how many clusters this atom appears in
                     clusters_with_atom = set()
                     for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
                         if atom in other_cluster_atom_counts:
                             clusters_with_atom.add(other_cluster_id)
-
+
                     total_clusters = len(cluster_atoms)
                     cluster_specificity = len(clusters_with_atom) / total_clusters if total_clusters > 0 else 1
 
@@ -621,106 +653,109 @@ def analyze_umap(
                     if cluster_specificity > 0.5:
                         # Note: logger not available in standalone function, would need to pass self
                         continue
-
+
                     # Additional check: ensure this cluster has significantly more of this atom than others
-                    #max_other_cluster_count = 0
-                    #for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
+                    # max_other_cluster_count = 0
+                    # for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
                     #     if other_cluster_id != cluster_id and atom in other_cluster_atom_counts:
                     #         max_other_cluster_count = max(max_other_cluster_count, other_cluster_atom_counts[atom])
-
+
                     # Skip if current cluster doesn't have significantly more instances than the next highest
-                    #if cluster_count <= max_other_cluster_count * 1.5:
-
+                    # if cluster_count <= max_other_cluster_count * 1.5:
+                    #     Note: logger not available in standalone function, would need to pass self
                     #     continue
-
+
                     # Calculate enrichment using hypergeometric test
                     try:
                         from scipy.stats import hypergeom
-
+
                         M = n_total_samples
                         n = global_count
                         N = cluster_size
                         k = cluster_count
-
+
                         # Calculate p-value (probability of observing k or more successes)
-                        p_value = hypergeom.sf(k-1, M, n, N)
-
+                        p_value = hypergeom.sf(k - 1, M, n, N)
+
                         # Calculate enrichment ratio
                         expected_freq = (n / M) * N
-                        enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float(
-
+                        enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float("inf")
+
                         # Only consider significantly enriched terms (p < threshold and enrichment > 1.5x)
                         if p_value < significance_threshold and enrichment_ratio > 1.5:
-
                             # Calculate percentage of cluster samples with this atom
                             cluster_percentage = (cluster_count / cluster_size) * 100
                             global_percentage = (global_count / n_total_samples) * 100
-
+
                             text_associations.append({
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                                "atom": atom,
+                                "cluster_id": cluster_id,
+                                "type": "cluster_enrichment",
+                                "test": "Hypergeometric",
+                                "p_value": p_value,
+                                "enrichment_ratio": enrichment_ratio,
+                                "effect_size": enrichment_ratio,  # Use enrichment ratio as effect size
+                                "interpretation": "Large enrichment"
+                                if enrichment_ratio > 3
+                                else "Medium enrichment"
+                                if enrichment_ratio > 2
+                                else "Small enrichment",
+                                "cluster_count": cluster_count,
+                                "cluster_size": cluster_size,
+                                "cluster_percentage": cluster_percentage,
+                                "global_count": global_count,
+                                "global_percentage": global_percentage,
+                                "cluster_samples_with_atom": cluster_count,
+                                "total_samples_with_atom": global_count,
                             })
-
+
                     except Exception as e:
                         self.logger.debug(f"Error analyzing enrichment of '{atom}' in cluster {cluster_id}: {e}")
                         continue
-
+
     # Sort text associations by cluster presence percentage (favors common terms in clusters)
-    text_associations.sort(key=lambda x: x[
-
+    text_associations.sort(key=lambda x: x["cluster_percentage"], reverse=True)
+
     # Combine regular and text associations
     all_associations = significant_associations + text_associations
     # Sort by cluster percentage for text associations, effect size for others
-    all_associations.sort(key=lambda x: x.get(
+    all_associations.sort(key=lambda x: x.get("cluster_percentage", x.get("effect_size", 0)), reverse=True)
 
     # Generate cluster summaries
     cluster_summaries = {}
-    for cluster_id in analysis_df_clean[
-        cluster_data = analysis_df_clean[analysis_df_clean[
+    for cluster_id in analysis_df_clean["cluster"].unique():
+        cluster_data = analysis_df_clean[analysis_df_clean["cluster"] == cluster_id]
         cluster_summaries[cluster_id] = {
-
-
+            "n_samples": len(cluster_data),
+            "sample_names": cluster_data["sample_name"].tolist() if "sample_name" in cluster_data else [],
         }
 
     # Create results dictionary
     results = {
-
-
-
-
-
-
-
+        "umap_coords": umap_coords,
+        "best_clustering": best_clustering,
+        "all_clustering_results": clustering_results,
+        "significant_associations": all_associations,
+        "text_associations": text_associations,
+        "cluster_summaries": cluster_summaries,
+        "analysis_dataframe": analysis_df_clean,
     }
 
     # Create sample-specific enrichment tooltips with optimization
     sample_enrichments = {}
-
+
     # For each sample, find which text atoms it contains that are significant
     if text_associations:
         max_check_terms = 10  # Standard limit for tooltip calculation
-
+
         for idx, row in analysis_df_clean.iterrows():
-            sample_name = row.get(
+            sample_name = row.get("sample_name", f"sample_{idx}")
             sample_enrichments[sample_name] = []
-
+
             # Check which significant atoms this sample contains
             for assoc in text_associations[:max_check_terms]:  # Check fewer terms in fast mode
-                atom = assoc[
-
+                atom = assoc["atom"]
+
                 # Check if this sample contains this atom in any of the text columns
                 sample_has_atom = False
                 for col in text_pattern_cols:
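A self-contained sketch of the enrichment test used in this hunk: given M samples in total, n of which carry a term, the hypergeometric survival function gives the chance that a cluster of N samples contains k or more of them by luck alone. The numbers below are illustrative:

from scipy.stats import hypergeom

M, n, N, k = 60, 12, 15, 9  # total samples, term carriers, cluster size, carriers in cluster
p_value = hypergeom.sf(k - 1, M, n, N)  # P(X >= k)
expected = (n / M) * N  # expected carriers under the null
enrichment_ratio = k / expected
print(f"p={p_value:.2e}, expected={expected:.1f}, enrichment={enrichment_ratio:.1f}x")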
@@ -729,49 +764,53 @@ def analyze_umap(
                     if atom.lower() in text_value.lower():
                         sample_has_atom = True
                         break
-
+
                 if sample_has_atom:
                     sample_enrichments[sample_name].append(f"{atom} ({assoc['p_value']:.3f})")
                     if len(sample_enrichments[sample_name]) >= 3:  # Only show top 3 per sample
                         break
-
+
     # Create embedded plots if requested
     if plot_results:
         plots = {}
-
+
         # Plot 1: Enhanced UMAP with clusters and enriched term labels (EMBEDDED PLOTTING)
         from bokeh.models import ColumnDataSource, HoverTool, LabelSet, LegendItem, Legend
         from bokeh.plotting import figure
         from collections import defaultdict
-
+
         # Create cluster plot with enhanced size
         p1 = figure(
-            width=900,
+            width=900,
+            height=700,
             title=f"UMAP Clusters with Enriched Terms ({best_clustering['method']})",
-            tools="pan,wheel_zoom,box_zoom,reset,save"
+            tools="pan,wheel_zoom,box_zoom,reset,save",
         )
         p1.xaxis.axis_label = "UMAP1"
         p1.yaxis.axis_label = "UMAP2"
-
+
         # Remove grid
         p1.grid.visible = False
-
+
         # Color points by cluster
         unique_clusters = np.unique(cluster_labels)
         n_clusters = len(unique_clusters)
-
+
         # Handle color mapping for many clusters - use turbo colormap
         if n_clusters <= 10:
             from bokeh.palettes import turbo
+
             colors = turbo(max(10, n_clusters))[:n_clusters]
         elif n_clusters <= 20:
             from bokeh.palettes import turbo
+
             colors = turbo(20)[:n_clusters]
         else:
             # For many clusters, use a continuous colormap
             from bokeh.palettes import turbo
+
             colors = turbo(min(256, n_clusters))
-
+
         # Calculate cluster centers and plot points
         cluster_centers = {}
         for i, cluster_id in enumerate(unique_clusters):
@@ -782,147 +821,153 @@ def analyze_umap(
             else:
                 color = colors[i % len(colors)]
                 label = f"Cluster {cluster_id}"
-
+
             cluster_coords = umap_coords[mask]
-
+
             # Calculate cluster center
             if len(cluster_coords) > 0:
                 center_x = np.mean(cluster_coords[:, 0])
                 center_y = np.mean(cluster_coords[:, 1])
                 cluster_centers[cluster_id] = (center_x, center_y)
-
+
             cluster_samples = samples_pd[mask] if len(samples_pd) == len(mask) else None
-            sample_names =
-
-
+            sample_names = (
+                cluster_samples["sample_name"].tolist()
+                if cluster_samples is not None and "sample_name" in cluster_samples
+                else [f"Sample_{j}" for j in range(np.sum(mask))]
+            )
+            sample_uids = (
+                cluster_samples["sample_uid"].tolist()
+                if cluster_samples is not None and "sample_uid" in cluster_samples
+                else [f"UID_{j}" for j in range(np.sum(mask))]
+            )
+
             # Create enrichment tooltip text for this cluster
-            cluster_associations = [assoc for assoc in text_associations if assoc.get(
-
+            cluster_associations = [assoc for assoc in text_associations if assoc.get("cluster_id") == cluster_id]
+
             # Get the top enrichments for this cluster (not individual samples)
             cluster_enrichments = []
             for assoc in cluster_associations[:3]:  # Top 3 enrichments for this cluster
-                atom = assoc[
+                atom = assoc["atom"]
                 # Skip color codes and other non-meaningful atoms
-                if not ((atom.startswith(
+                if not ((atom.startswith("#") and len(atom) == 7) or atom in ["nan", "None", "null"]):
                     cluster_enrichments.append(atom)
-
+
             # Create the same enrichment text for ALL samples in this cluster
             if cluster_enrichments:
                 cluster_enrichment_text = "; ".join(cluster_enrichments)
             else:
                 cluster_enrichment_text = "No enrichments found"
-
+
             # Apply the same enrichment text to all samples in this cluster
             sample_enrichment_texts = [cluster_enrichment_text] * np.sum(mask)
-
+
             source = ColumnDataSource({
-
-
-
-
-
-
+                "x": umap_coords[mask, 0],
+                "y": umap_coords[mask, 1],
+                "cluster": [cluster_id] * np.sum(mask),
+                "sample_name": sample_names[: np.sum(mask)],
+                "sample_uid": sample_uids[: np.sum(mask)],
+                "enrichments": sample_enrichment_texts[: np.sum(mask)],
             })
-
-            p1.scatter(
-
-
+
+            p1.scatter("x", "y", size=markersize, color=color, alpha=0.7, source=source)
+
         # Enhanced enriched term visualization
         max_terms_per_cluster = 2
         min_enrichment = 2.0
-
+
         # Process enriched terms - group by cluster and filter
         cluster_terms = defaultdict(list)
         for assoc in text_associations:
             # Skip empty terms from plotting
-            atom = assoc.get(
-            if (
-                atom
-                atom.
-
-                atom.lower()
-
-                atom.lower()
+            atom = assoc.get("atom", "")
+            if (
+                atom == "<empty>"
+                or atom.lower() == "empty"
+                or atom.strip() == ""
+                or ":empty" in atom.lower()
+                or atom.lower().endswith("empty")
+                or ":blank" in atom.lower()
+                or atom.lower().endswith("blank")
+            ):
                 continue
-
-            if
-                assoc[
-
-
+
+            if assoc["enrichment_ratio"] >= min_enrichment and assoc["cluster_id"] in cluster_centers:
+                cluster_terms[assoc["cluster_id"]].append(assoc)
+
         # Limit terms per cluster and sort by cluster presence percentage (favors common terms)
         for cluster_id in cluster_terms:
             cluster_terms[cluster_id] = sorted(
-                cluster_terms[cluster_id],
-                key=lambda x: x['cluster_percentage'],
-                reverse=True
+                cluster_terms[cluster_id], key=lambda x: x["cluster_percentage"], reverse=True
             )[:max_terms_per_cluster]
-
+
         # Collect all unique terms for shared term handling
         all_terms = {}
         for cluster_id, terms in cluster_terms.items():
             for term in terms:
-                atom = term[
+                atom = term["atom"]
                 if atom not in all_terms:
                     all_terms[atom] = []
                 all_terms[atom].append(cluster_id)
-
+
         # Separate terms into shared vs cluster-specific
         shared_terms = {atom: clusters for atom, clusters in all_terms.items() if len(clusters) > 1}
         specific_terms = {atom: clusters[0] for atom, clusters in all_terms.items() if len(clusters) == 1}
-
+
         # Merge overlapping terms that refer to the same concept
         # E.g., "type:qc" and "name:PooledQC" both refer to QC samples
         def should_merge_terms(term1, term2):
             """Check if two terms should be merged based on semantic overlap"""
             # Extract the actual values (remove prefixes)
-            val1 = term1.replace(
-            val2 = term2.replace(
-
+            val1 = term1.replace("name:", "").replace("type:", "").replace("group:", "").replace("batch:", "").lower()
+            val2 = term2.replace("name:", "").replace("type:", "").replace("group:", "").replace("batch:", "").lower()
+
             # Define known overlapping concepts
-            qc_terms = {
-            blank_terms = {
-
+            qc_terms = {"qc", "pooledqc", "pooled_qc", "quality_control", "qualitycontrol"}
+            blank_terms = {"blank", "blk", "empty", "background"}
+
             # Check if both terms belong to the same concept group
             if val1 in qc_terms and val2 in qc_terms:
                 return True
             if val1 in blank_terms and val2 in blank_terms:
                 return True
-
+
             # Also check for direct string similarity (e.g., case variations)
             if val1 == val2:
                 return True
-
+
             return False
-
+
         def merge_overlapping_terms(shared_terms, specific_terms):
             """Merge terms that refer to the same concept"""
             all_atoms = list(shared_terms.keys()) + list(specific_terms.keys())
             merged_groups = []
             used_atoms = set()
-
+
             for i, atom1 in enumerate(all_atoms):
                 if atom1 in used_atoms:
                     continue
-
+
                 group = [atom1]
                 used_atoms.add(atom1)
-
+
                 # Find all atoms that should be merged with this one
-                for j, atom2 in enumerate(all_atoms[i+1:], i+1):
+                for j, atom2 in enumerate(all_atoms[i + 1 :], i + 1):
                     if atom2 in used_atoms:
                         continue
                     if should_merge_terms(atom1, atom2):
                         group.append(atom2)
                         used_atoms.add(atom2)
-
+
                 if len(group) > 1:
                     merged_groups.append(group)
-
+
             return merged_groups
-
+
         # Find terms that should be merged
         merged_groups = merge_overlapping_terms(shared_terms, specific_terms)
-
+
         # Apply merging: create new combined terms and remove originals
         for group in merged_groups:
             # Determine the combined clusters for this group
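A minimal sketch of the concept-merging rule described in this hunk: prefixed terms that point at the same concept (for example QC samples) are collapsed into one label. The term values below are illustrative:

QC_TERMS = {"qc", "pooledqc", "pooled_qc", "quality_control", "qualitycontrol"}

def strip_prefix(term):
    for prefix in ("name:", "type:", "group:", "batch:"):
        term = term.replace(prefix, "")
    return term.lower()

def should_merge(term1, term2):
    v1, v2 = strip_prefix(term1), strip_prefix(term2)
    return v1 == v2 or (v1 in QC_TERMS and v2 in QC_TERMS)

print(should_merge("type:qc", "name:PooledQC"))        # True  -> drawn as one combined label
print(should_merge("group:control", "group:treated"))  # False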
@@ -932,28 +977,28 @@ def analyze_umap(
                     combined_clusters.update(shared_terms[atom])
                 elif atom in specific_terms:
                     combined_clusters.add(specific_terms[atom])
-
+
             # Create a new combined term name using newlines
             # Keep the original prefixes and atom names
-            combined_atom =
-
+            combined_atom = "\n".join(group)
+
             # Remove original terms from both dictionaries
             for atom in group:
                 shared_terms.pop(atom, None)
                 specific_terms.pop(atom, None)
-
+
             # Add the combined term to appropriate dictionary
             combined_clusters_list = list(combined_clusters)
             if len(combined_clusters_list) > 1:
                 shared_terms[combined_atom] = combined_clusters_list
             else:
                 specific_terms[combined_atom] = combined_clusters_list[0]
-
+
         # Create label sources for enriched terms
         label_sources = {}
         line_sources = {}
         line_cluster_mapping = {}  # Track which cluster each line belongs to
-
+
         # Handle shared terms (place at center of all clusters that share it, but in empty areas)
         for atom, clusters in shared_terms.items():
             if len(clusters) > 1:
@@ -962,7 +1007,7 @@ def analyze_umap(
                 if cluster_coords_list:
                     center_x = np.mean([coord[0] for coord in cluster_coords_list])
                     center_y = np.mean([coord[1] for coord in cluster_coords_list])
-
+
                     # Calculate data bounds using simple approach
                     all_x = [pt[0] for pt in umap_coords]
                     all_y = [pt[1] for pt in umap_coords]
@@ -970,57 +1015,57 @@ def analyze_umap(
                     y_min, y_max = min(all_y), max(all_y)
                     data_range_x = x_max - x_min
                     data_range_y = y_max - y_min
-
+
                     # Find empty area around the center
                     best_distance = 0
                     best_position = None
-
+
                     for distance_factor in [1.0, 1.5, 2.0]:
                         offset_distance = distance_factor * max(data_range_x, data_range_y) * 0.1
-
-                        for angle in np.linspace(0, 2*np.pi, 8):
+
+                        for angle in np.linspace(0, 2 * np.pi, 8):
                             label_x = center_x + offset_distance * np.cos(angle)
                             label_y = center_y + offset_distance * np.sin(angle)
-
+
                             # Calculate minimum distance to any data point
-                            distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
+                            distances = [np.sqrt((pt[0] - label_x) ** 2 + (pt[1] - label_y) ** 2) for pt in umap_coords]
                             min_distance = min(distances)
-
+
                             if min_distance > best_distance:
                                 best_distance = min_distance
                                 best_position = (label_x, label_y)
-
+
                     # Use best position or fallback to center
                     if best_position is not None:
                         label_x, label_y = best_position
                     else:
                         label_x, label_y = center_x, center_y
-
+
                     # Check if label would be outside plot bounds and adjust
                     label_margin = max(data_range_x, data_range_y) * 0.05
                     if label_x < x_min - label_margin:
                         label_x = x_min - label_margin
                     elif label_x > x_max + label_margin:
                         label_x = x_max + label_margin
-
+
                     if label_y < y_min - label_margin:
                         label_y = y_min - label_margin
                     elif label_y > y_max + label_margin:
                         label_y = y_max + label_margin
-
+
                     # Keep the original atom name with prefixes for display
                     display_atom = atom  # Keep prefixes like name:, group:, batch:, type:
-
+
                     # Create label source with center alignment for shared terms
                     label_source = ColumnDataSource({
-
-
-
-
-
+                        "x": [label_x],
+                        "y": [label_y],
+                        "text": [display_atom],
+                        "atom": [atom],
+                        "text_align": ["center"],
                     })
                     label_sources[atom] = label_source
-
+
                     # Create lines to each cluster center
                     line_x = []
                     line_y = []
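A compact sketch of the placement search used in this hunk: try candidate points on rings around a cluster center and keep the one farthest from the data, so the text label lands in empty space. The coordinates are illustrative, not UMAP output:

import numpy as np

rng = np.random.default_rng(0)
points = rng.normal(size=(200, 2))   # stand-in for the UMAP embedding
center = points[:50].mean(axis=0)    # stand-in for one cluster center
span = points.max(axis=0) - points.min(axis=0)

best_pos, best_dist = None, 0.0
for factor in (1.0, 1.5, 2.0):
    radius = factor * span.max() * 0.1
    for angle in np.linspace(0, 2 * np.pi, 8, endpoint=False):
        candidate = center + radius * np.array([np.cos(angle), np.sin(angle)])
        dist = np.min(np.linalg.norm(points - candidate, axis=1))
        if dist > best_dist:
            best_dist, best_pos = dist, candidate
print(best_pos, round(best_dist, 3))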
@@ -1029,20 +1074,17 @@ def analyze_umap(
|
|
|
1029
1074
|
cx, cy = cluster_centers[cluster_id]
|
|
1030
1075
|
line_x.extend([label_x, cx, np.nan]) # nan to break line
|
|
1031
1076
|
line_y.extend([label_y, cy, np.nan])
|
|
1032
|
-
|
|
1033
|
-
line_source = ColumnDataSource({
|
|
1034
|
-
'x': line_x,
|
|
1035
|
-
'y': line_y
|
|
1036
|
-
})
|
|
1077
|
+
|
|
1078
|
+
line_source = ColumnDataSource({"x": line_x, "y": line_y})
|
|
1037
1079
|
line_sources[atom] = line_source
|
|
1038
|
-
line_cluster_mapping[atom] =
|
|
1039
|
-
|
|
1080
|
+
line_cluster_mapping[atom] = "shared"
|
|
1081
|
+
|
|
1040
1082
|
# Handle cluster-specific terms (arrange multiple terms per cluster to avoid overlap)
|
|
1041
1083
|
# Group specific terms by cluster to handle multiple terms per cluster
|
|
1042
1084
|
cluster_specific_terms = defaultdict(list)
|
|
1043
1085
|
for atom, cluster_id in specific_terms.items():
|
|
1044
1086
|
cluster_specific_terms[cluster_id].append(atom)
|
|
1045
|
-
|
|
1087
|
+
|
|
1046
1088
|
# Calculate data bounds once
|
|
1047
1089
|
all_x = [pt[0] for pt in umap_coords]
|
|
1048
1090
|
all_y = [pt[1] for pt in umap_coords]
|
|
@@ -1050,7 +1092,7 @@ def analyze_umap(
|
|
|
1050
1092
|
y_min, y_max = min(all_y), max(all_y)
|
|
1051
1093
|
data_range_x = x_max - x_min
|
|
1052
1094
|
data_range_y = y_max - y_min
|
|
1053
|
-
|
|
1095
|
+
|
|
1054
1096
|
# Expand plot ranges to accommodate labels (add 15% margin on all sides)
|
|
1055
1097
|
margin = 0.15
|
|
1056
1098
|
x_margin = data_range_x * margin
|
|
@@ -1059,61 +1101,63 @@ def analyze_umap(
plot_x_max = x_max + x_margin
plot_y_min = y_min - y_margin
plot_y_max = y_max + y_margin
-
+
# Set expanded plot ranges
p1.x_range.start = plot_x_min
p1.x_range.end = plot_x_max
p1.y_range.start = plot_y_min
p1.y_range.end = plot_y_max
-
+
# Process each cluster that has specific terms
for cluster_id, cluster_atoms in cluster_specific_terms.items():
if cluster_id not in cluster_centers:
continue
-
+
cx, cy = cluster_centers[cluster_id]
n_terms = len(cluster_atoms)
-
+
if n_terms == 1:
# Single term - use smart positioning with shorter distances
atom = cluster_atoms[0]
-
+
# Try multiple candidate positions with shorter distances and more angles
best_distance = 0
best_position = None
-
+
# Use shorter base distance and test many angles
base_distance = max(data_range_x, data_range_y) * 0.08 # Much shorter base distance
-
+
# Test positions at different angles and short distances
for distance_factor in [0.8, 1.0, 1.3]: # Shorter distance factors
offset_distance = base_distance * distance_factor
-
- for angle in np.linspace(0, 2*np.pi, 24): # More angles (24 directions)
+
+ for angle in np.linspace(0, 2 * np.pi, 24): # More angles (24 directions)
label_x = cx + offset_distance * np.cos(angle)
label_y = cy + offset_distance * np.sin(angle)
-
+
# Calculate minimum distance to any data point
- distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
+ distances = [np.sqrt((pt[0] - label_x) ** 2 + (pt[1] - label_y) ** 2) for pt in umap_coords]
min_distance = min(distances)
-
+
# Check distance to other labels to avoid overlap
- min_label_distance = float(
+ min_label_distance = float("inf")
for other_atom, other_source in label_sources.items():
if other_atom != atom:
other_data = other_source.data
- if other_data[
- other_x, other_y = other_data[
- label_distance = np.sqrt((label_x - other_x)**2 + (label_y - other_y)**2)
+ if other_data["x"] and other_data["y"]:
+ other_x, other_y = other_data["x"][0], other_data["y"][0]
+ label_distance = np.sqrt((label_x - other_x) ** 2 + (label_y - other_y) ** 2)
min_label_distance = min(min_label_distance, label_distance)
-
+
# Prefer positions that are reasonably far from data points and other labels
- combined_distance = min(
-
+ combined_distance = min(
+ min_distance, min_label_distance if min_label_distance != float("inf") else min_distance
+ )
+
if combined_distance > best_distance:
best_distance = combined_distance
best_position = (label_x, label_y)
-
+
# Use best position found, or fallback to simple short offset
if best_position is not None:
label_x, label_y = best_position
@@ -1124,54 +1168,51 @@ def analyze_umap(
angle_rad = np.radians(angle)
label_x = cx + offset_distance * np.cos(angle_rad)
label_y = cy + offset_distance * np.sin(angle_rad)
-
+
# Check if label would be outside plot bounds and adjust
label_margin = max(data_range_x, data_range_y) * 0.05
-
+
# Instead of clamping to bounds, let labels go outside and plot bounds will be expanded later
# Only apply minimal adjustments to prevent labels from being extremely far out
extreme_margin = max(data_range_x, data_range_y) * 0.25 # Allow 25% outside data range
-
+
if label_x < x_min - extreme_margin:
label_x = x_min - extreme_margin
elif label_x > x_max + extreme_margin:
label_x = x_max + extreme_margin
-
+
if label_y < y_min - extreme_margin:
label_y = y_min - extreme_margin
elif label_y > y_max + extreme_margin:
label_y = y_max + extreme_margin
-
+
# Determine text alignment based on position relative to cluster
- text_align =
-
+ text_align = "right" if label_x > cx else "left"
+
# Clean up atom name for display but keep prefixes
display_atom = atom # Keep prefixes like name:, group:, batch:, type:
-
+
# Create label source with alignment
label_source = ColumnDataSource({
-
-
-
-
-
+ "x": [label_x],
+ "y": [label_y],
+ "text": [display_atom],
+ "atom": [atom],
+ "text_align": [text_align],
})
label_sources[atom] = label_source
-
+
# Create spike line from cluster center to label
- line_source = ColumnDataSource({
- 'x': [cx, label_x],
- 'y': [cy, label_y]
- })
+ line_source = ColumnDataSource({"x": [cx, label_x], "y": [cy, label_y]})
line_sources[atom] = line_source
line_cluster_mapping[atom] = cluster_id
-
+
else:
# Multiple terms - stack them vertically with one line to cluster center
# Determine if this cluster has shared vs non-shared terms to adjust positioning
has_shared = any(atom in shared_terms for atom in cluster_atoms)
has_specific = any(atom in specific_terms for atom in cluster_atoms)
-
+
# Adjust base distance: put non-shared (cluster-specific) labels further out
if has_specific and not has_shared:
# Pure cluster-specific terms - place further from center to reduce overlap
@@ -1181,137 +1222,145 @@ def analyze_umap(
base_distance = max(data_range_x, data_range_y) * 0.08 # Closer
else:
# Mixed terms - use intermediate distance
- base_distance = max(data_range_x, data_range_y) * 0.1
-
+ base_distance = max(data_range_x, data_range_y) * 0.1 # Standard distance
+
# Calculate a good angle for the stack based on cluster position and available space
# For non-shared terms, prefer angles that point away from plot center
best_angle = None
best_distance = 0
-
+
# Get plot center for reference
plot_center_x = (x_min + x_max) / 2
plot_center_y = (y_min + y_max) / 2
-
+
# Calculate angle from plot center to cluster center
center_to_cluster_angle = np.arctan2(cy - plot_center_y, cx - plot_center_x)
-
+
if has_specific and not has_shared:
# For non-shared terms, prefer angles that point away from plot center
# Create angles around the center-to-cluster direction
base_angle = center_to_cluster_angle
preferred_angles = [
- base_angle,
- base_angle + np.pi/4,
- base_angle - np.pi/4,
- base_angle + np.pi/6,
- base_angle - np.pi/6,
- base_angle + np.pi/3,
- base_angle - np.pi/3,
- base_angle + np.pi/2,
- base_angle - np.pi/2
+ base_angle, # Directly away from center
+ base_angle + np.pi / 4, # 45° offset
+ base_angle - np.pi / 4, # -45° offset
+ base_angle + np.pi / 6, # 30° offset
+ base_angle - np.pi / 6, # -30° offset
+ base_angle + np.pi / 3, # 60° offset
+ base_angle - np.pi / 3, # -60° offset
+ base_angle + np.pi / 2, # 90° offset
+ base_angle - np.pi / 2, # -90° offset
]
else:
# For shared terms or mixed, use the original preferred angles
- preferred_angles = [
-
-
-
+ preferred_angles = [
+ np.pi / 4,
+ 3 * np.pi / 4,
+ 5 * np.pi / 4,
+ 7 * np.pi / 4, # 45°, 135°, 225°, 315°
+ np.pi / 6,
+ np.pi / 3,
+ 2 * np.pi / 3,
+ 5 * np.pi / 6, # 30°, 60°, 120°, 150°
+ 7 * np.pi / 6,
+ 4 * np.pi / 3,
+ 5 * np.pi / 3,
+ 11 * np.pi / 6,
+ ] # 210°, 240°, 300°, 330°
+
for test_angle in preferred_angles:
test_x = cx + base_distance * np.cos(test_angle)
test_y = cy + base_distance * np.sin(test_angle)
-
+
# Calculate minimum distance to any data point
- distances = [np.sqrt((pt[0] - test_x)**2 + (pt[1] - test_y)**2) for pt in umap_coords]
+ distances = [np.sqrt((pt[0] - test_x) ** 2 + (pt[1] - test_y) ** 2) for pt in umap_coords]
min_distance = min(distances)
-
+
if min_distance > best_distance:
best_distance = min_distance
best_angle = test_angle
-
+
# Use the best angle found, or fallback to 45°
if best_angle is not None:
stack_angle = best_angle
else:
# Fallback: use 45° based on cluster
- angle_options = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4]
+ angle_options = [np.pi / 4, 3 * np.pi / 4, 5 * np.pi / 4, 7 * np.pi / 4]
stack_angle = angle_options[cluster_id % len(angle_options)]
-
+
# Position for the end of the line (before labels start)
line_end_x = cx + base_distance * np.cos(stack_angle)
line_end_y = cy + base_distance * np.sin(stack_angle)
-
+
# Simplified approach: center labels at line end, then add 20pt offset in same direction
# Calculate 20pt offset in the same direction as the line
label_offset_distance = 20 # 20 points in the same direction
-
+
# Convert 20 points to data coordinates (approximate)
# Assuming typical plot size, 20pt ≈ 1-2% of data range
data_range = max(data_range_x, data_range_y)
offset_in_data_coords = data_range * 0.02 # 2% of data range for 20pt
-
+
# Add offset in direction based on line orientation for better text placement
# For westward lines: place label LEFT of endpoint with RIGHT alignment
# For eastward lines: place label RIGHT of endpoint with LEFT alignment
-
+
angle_degrees = (stack_angle * 180 / np.pi) % 360
if 90 < angle_degrees < 270:
# Line goes LEFT (westward) - place label to the LEFT of line end
label_center_x = line_end_x - offset_in_data_coords # SUBTRACT to go left
label_center_y = line_end_y # Keep same Y position
- text_align =
+ text_align = "right" # Right-align so text ends near line endpoint
else:
- # Line goes RIGHT (eastward) - place label to the RIGHT of line end
+ # Line goes RIGHT (eastward) - place label to the RIGHT of line end
label_center_x = line_end_x + offset_in_data_coords # ADD to go right
label_center_y = line_end_y # Keep same Y position
- text_align =
-
+ text_align = "left" # Left-align so text starts near line endpoint
+
# Calculate consistent vertical spacing for stacked labels
# BETTER APPROACH: Use single LabelSet with newline characters
-
+
# Create a single multi-line text string with all terms
display_atoms = [atom for atom in cluster_atoms] # Keep original atom names with prefixes
- combined_text =
-
+ combined_text = "\n".join(display_atoms)
+
# Check if label would be outside plot bounds and adjust
label_margin = max(data_range_x, data_range_y) * 0.05
label_x = label_center_x
label_y = label_center_y
-
+
if label_x < x_min - label_margin:
label_x = x_min - label_margin
- text_align =
+ text_align = "left"
elif label_x > x_max + label_margin:
label_x = x_max + label_margin
- text_align =
-
+ text_align = "right"
+
if label_y < y_min - label_margin:
label_y = y_min - label_margin
elif label_y > y_max + label_margin:
label_y = y_max + label_margin
-
+
# Create single label source with multi-line text and alignment
label_source = ColumnDataSource({
-
-
-
-
-
+ "x": [label_x],
+ "y": [label_y],
+ "text": [combined_text],
+ "atoms": [cluster_atoms], # Store all atoms for reference
+ "text_align": [text_align],
})
-
+
# Store this single label source using a unique key for the cluster stack
stack_label_key = f"cluster_{cluster_id}_labels"
label_sources[stack_label_key] = label_source
-
+
# Create single line from cluster center to line end (before labels)
- stack_line_source = ColumnDataSource({
- 'x': [cx, line_end_x],
- 'y': [cy, line_end_y]
- })
+ stack_line_source = ColumnDataSource({"x": [cx, line_end_x], "y": [cy, line_end_y]})
# Use a unique key for the stack line
stack_key = f"cluster_{cluster_id}_stack"
line_sources[stack_key] = stack_line_source
line_cluster_mapping[stack_key] = cluster_id
-
+
# Add lines (spikes) to plot with matching cluster colors
line_renderers = {}
for line_key, line_source in line_sources.items():
@@ -1321,163 +1370,172 @@ def analyze_umap(
# Use a neutral color or the color of the first cluster it appears in
first_cluster_id = list(shared_terms[line_key])[0]
if first_cluster_id == -1:
- line_color =
+ line_color = "gray"
else:
- cluster_idx =
+ cluster_idx = (
+ list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
+ )
line_color = colors[cluster_idx % len(colors)]
- line_dash =
+ line_dash = "dashed" # Use dashed for all edges
elif line_key in specific_terms:
# For cluster-specific terms, use the cluster's color
cluster_id = specific_terms[line_key]
if cluster_id == -1:
- line_color =
+ line_color = "gray"
else:
cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
line_color = colors[cluster_idx % len(colors)]
- line_dash =
+ line_dash = "dashed" # Use dashed for all edges
elif line_key in line_cluster_mapping:
# For stack lines, use the cluster's color
cluster_info = line_cluster_mapping[line_key]
- if cluster_info ==
+ if cluster_info == "shared":
# For shared stacks, use a neutral color or first cluster color
- line_color =
- line_dash =
+ line_color = "black"
+ line_dash = "dashed" # Use dashed for all edges
else:
cluster_id = cluster_info
if cluster_id == -1:
- line_color =
+ line_color = "gray"
else:
cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
line_color = colors[cluster_idx % len(colors)]
- line_dash =
+ line_dash = "dashed" # Use dashed for all edges
else:
# Fallback
- line_color =
- line_dash =
-
- line_renderer = p1.line(
-
-
+ line_color = "gray"
+ line_dash = "dashed" # Use dashed for all edges
+
+ line_renderer = p1.line(
+ "x", "y", source=line_source, line_color=line_color, line_width=2, alpha=0.8, line_dash=line_dash
+ )
line_renderers[line_key] = line_renderer
-
+
# Add labels to plot (simple and direct approach)
label_renderers = {} # Store label renderers for legend control
for label_key, label_source in label_sources.items():
# Determine color and style based on label key type
- if label_key.startswith(
+ if label_key.startswith("cluster_") and label_key.endswith("_labels"):
# This is a cluster stack with multiple terms
- cluster_id = int(label_key.split(
+ cluster_id = int(label_key.split("_")[1])
if cluster_id == -1:
- text_color =
+ text_color = "gray"
else:
cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
text_color = colors[cluster_idx % len(colors)]
- text_font_style =
+ text_font_style = "bold"
elif label_key in shared_terms:
# Shared term - use same color as edge (first cluster's color)
first_cluster_id = list(shared_terms[label_key])[0]
if first_cluster_id == -1:
- text_color =
+ text_color = "gray"
else:
- cluster_idx =
+ cluster_idx = (
+ list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
+ )
text_color = colors[cluster_idx % len(colors)]
- text_font_style =
+ text_font_style = "bold"
elif label_key in specific_terms:
# Individual cluster-specific term
cluster_id = specific_terms[label_key]
if cluster_id == -1:
- text_color =
+ text_color = "gray"
else:
cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
text_color = colors[cluster_idx % len(colors)]
- text_font_style =
+ text_font_style = "bold"
else:
# Fallback
- text_color =
- text_font_style =
-
+ text_color = "black"
+ text_font_style = "bold"
+
# Get text alignment from label source, default to center
label_data = label_source.data
- text_align = label_data.get(
-
+ text_align = label_data.get("text_align", ["center"])[0] if "text_align" in label_data else "center"
+
label_set = LabelSet(
- x=
+ x="x",
+ y="y",
+ text="text",
source=label_source,
- text_font_size=
+ text_font_size="11pt",
text_color=text_color,
text_font_style=text_font_style,
text_align=text_align,
- text_baseline=
+ text_baseline="middle",
)
p1.add_layout(label_set)
label_renderers[label_key] = label_set # Store for legend control
-
+
# Check if any labels are close to plot boundaries and expand if needed
if label_sources:
# Collect all label positions
all_label_positions = []
for source in label_sources.values():
data = source.data
- if
- all_label_positions.extend(zip(data[
-
+ if "x" in data and "y" in data and data["x"] and data["y"]:
+ all_label_positions.extend(zip(data["x"], data["y"]))
+
if all_label_positions:
# Check if any labels are close to current plot boundaries
current_x_min, current_x_max = p1.x_range.start, p1.x_range.end
current_y_min, current_y_max = p1.y_range.start, p1.y_range.end
-
+
# Define "close to boundary" as within 5% of the plot range
x_range = current_x_max - current_x_min
y_range = current_y_max - current_y_min
boundary_threshold_x = x_range * 0.05
boundary_threshold_y = y_range * 0.05
-
+
needs_expansion = False
for label_x, label_y in all_label_positions:
- if (
- label_x
-
- label_y
+ if (
+ label_x < current_x_min + boundary_threshold_x
+ or label_x > current_x_max - boundary_threshold_x
+ or label_y < current_y_min + boundary_threshold_y
+ or label_y > current_y_max - boundary_threshold_y
+ ):
needs_expansion = True
break
-
+
# If labels are close to boundaries, expand plot by 5% (reduced from 10%)
if needs_expansion:
expansion_factor = 0.05 # 5% expansion (half of previous 10%)
x_expansion = x_range * expansion_factor
y_expansion = y_range * expansion_factor
-
+
p1.x_range.start = current_x_min - x_expansion
p1.x_range.end = current_x_max + x_expansion
p1.y_range.start = current_y_min - y_expansion
p1.y_range.end = current_y_max + y_expansion
-
-
+
# Add hover tool with enrichment information
- hover = HoverTool(
-
-
-
-
-
+ hover = HoverTool(
+ tooltips=[
+ ("Cluster", "@cluster"),
+ ("Sample", "@sample_name"),
+ ("Sample UID", "@sample_uid"),
+ ("Enrichments", "@enrichments"),
+ ]
+ )
p1.add_tools(hover)
-
+
# Remove cluster legend labels from scatter plots (already done above)
# But keep any existing legend structure for now
-
+
# Create custom legend for enrichment terms (line/label pairs) ONLY
if line_renderers and (shared_terms or specific_terms):
legend_items = []
renderer_to_terms = {} # Group terms by their renderer
-
+
# Get all enriched terms and group them by their line renderer
all_enriched_atoms = set(shared_terms.keys()) | set(specific_terms.keys())
-
+
# First pass: map each term to its renderer
for atom in all_enriched_atoms:
renderer = None
renderer_key = None
-
+
if atom in shared_terms:
# Shared term
if atom in line_renderers:
@@ -1491,7 +1549,7 @@ def analyze_umap(
renderer = line_renderers[stack_key]
renderer_key = stack_key
break
-
+
elif atom in specific_terms:
# Cluster-specific term
cluster_id = specific_terms[atom]
@@ -1503,134 +1561,137 @@ def analyze_umap(
if stack_key in line_renderers:
renderer = line_renderers[stack_key]
renderer_key = stack_key
-
+
# Group terms by renderer
if renderer and renderer_key:
if renderer_key not in renderer_to_terms:
renderer_to_terms[renderer_key] = {
-
-
-
-
+ "renderer": renderer,
+ "shared_terms": [],
+ "specific_terms": [],
+ "cluster_id": None,
}
-
+
if atom in shared_terms:
- renderer_to_terms[renderer_key][
+ renderer_to_terms[renderer_key]["shared_terms"].append(atom)
else:
- renderer_to_terms[renderer_key][
- renderer_to_terms[renderer_key][
-
+ renderer_to_terms[renderer_key]["specific_terms"].append(atom)
+ renderer_to_terms[renderer_key]["cluster_id"] = specific_terms[atom]
+
# Second pass: create legend entries, one per renderer
for renderer_key, term_info in renderer_to_terms.items():
- shared_list = term_info[
- specific_list = term_info[
- line_renderer = term_info[
-
+ shared_list = term_info["shared_terms"]
+ specific_list = term_info["specific_terms"]
+ line_renderer = term_info["renderer"]
+
# For now, legend can only control the line renderer
# Label visibility will be handled via JavaScript callback if needed
# (Note: LabelSet cannot be directly controlled by Bokeh legends)
-
+
# Create combined label text
if shared_list:
# Shared terms - remove "Shared:" prefix and just show the terms
- clean_terms = [
-
+ clean_terms = [
+ atom.replace("name:", "").replace("group:", "").replace("batch:", "").replace("type:", "")
+ for atom in shared_list
+ ]
if len(clean_terms) == 1:
label_text = clean_terms[0]
else:
- label_text =
-
+ label_text = ", ".join(clean_terms)
+
elif specific_list:
# Cluster-specific terms
- cluster_id = term_info[
- clean_terms = [
-
+ cluster_id = term_info["cluster_id"]
+ clean_terms = [
+ atom.replace("name:", "").replace("group:", "").replace("batch:", "").replace("type:", "")
+ for atom in specific_list
+ ]
if len(clean_terms) == 1:
label_text = f"C{cluster_id}: {clean_terms[0]}"
else:
label_text = f"C{cluster_id}: {', '.join(clean_terms)}"
-
+
# Add single legend entry for the line renderer only
# (Labels cannot be controlled by Bokeh legends directly)
- legend_items.append(
-
- )
-
+ legend_items.append(LegendItem(label=label_text, renderers=[line_renderer]))
+
# Hide cluster legend after we've created our enrichment legend
- if hasattr(p1,
+ if hasattr(p1, "legend") and p1.legend:
if isinstance(p1.legend, list):
for legend in p1.legend:
legend.visible = False
else:
p1.legend.visible = False
-
+
# Create and add the custom enrichment legend
if legend_items:
- enrichment_legend = Legend(
-
-
-
-
- p1.add_layout(enrichment_legend, 'right')
-
- plots['cluster_plot'] = p1
-
+ enrichment_legend = Legend(items=legend_items, location="center_right", click_policy="hide")
+ p1.add_layout(enrichment_legend, "right")
+
+ plots["cluster_plot"] = p1
+
# Save cluster plot if filename provided
if filename:
# Handle filename extension properly
- if filename.endswith(
+ if filename.endswith(".html"):
base_filename = filename[:-5] # Remove .html extension
cluster_filename = f"{base_filename}_clusters.html"
else:
cluster_filename = f"{filename}_clusters.html"
-
- if not filename.startswith(
+
+ if not filename.startswith("/") and not filename[1:3] == ":\\":
cluster_filename = f"{self.folder}/{cluster_filename}"
_isolated_save_plot(p1, cluster_filename, cluster_filename, self.logger, "UMAP Cluster Plot")
else:
_isolated_show_notebook(p1)
-
- results[
+
+ results["plots"] = plots

# Print summary
self.logger.debug("\n=== UMAP Cluster Analysis Summary ===")
self.logger.debug(f"Best clustering: {best_clustering['method']}")
self.logger.debug(f"Number of clusters: {best_clustering['n_clusters']}")
self.logger.debug(f"Silhouette score: {best_clustering['score']:.3f}")
- if best_clustering[
+ if best_clustering["n_noise"] > 0:
self.logger.debug(f"Noise points: {best_clustering['n_noise']}")

self.logger.info(f"\nFound {len(all_associations)} total significant associations:")

# Show regular column associations
- regular_assocs = [a for a in all_associations if
+ regular_assocs = [a for a in all_associations if "column" in a]
if regular_assocs:
self.logger.info(f" {len(regular_assocs)} column-level associations:")
for assoc in regular_assocs[:3]: # Show top 3
- self.logger.info(
-
-
-
-
+ self.logger.info(
+ f" {assoc['column']} ({assoc['variable_type']}): {assoc['test']} p={assoc['p_value']:.4f}, "
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']})"
+ )
+
+ # Show text atom associations
+ text_assocs = [a for a in all_associations if "atom" in a]
if text_assocs:
self.logger.info(f" {len(text_assocs)} text pattern associations:")
for assoc in text_assocs[:3]: # Show top 3
- freq = assoc.get(
+ freq = assoc.get("atom_frequency", 0)
percentage = (freq / len(analysis_df_clean)) * 100 if len(analysis_df_clean) > 0 else 0
-
- self.logger.info(
-
-
-
+
+ self.logger.info(
+ f" '{assoc['atom']}' ({assoc['type']}): p={assoc['p_value']:.4f}, "
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']}) "
+ f"[{freq} samples, {percentage:.1f}%]"
+ )
+
if len(all_associations) > 20:
self.logger.info(f" ... and {len(all_associations) - 20} more associations")

return results

+
def _analyze_umap_simplified(
self,
n_neighbors=15,
- min_dist=0.1,
+ min_dist=0.1,
metric="euclidean",
random_state=42,
cluster_methods=["hdbscan", "kmeans"],
@@ -1641,94 +1702,94 @@ def _analyze_umap_simplified(
filename=None,
):
"""Simplified fallback version of UMAP analysis."""
-
+
self.logger.info("Starting simplified UMAP analysis...")
-
+
# Check dependencies
if not UMAP_AVAILABLE or not HDBSCAN_AVAILABLE:
self.logger.error("Required dependencies not available")
return {
-
-
-
-
-
-
-
+ "umap_coords": None,
+ "best_clustering": None,
+ "all_clustering_results": {},
+ "significant_associations": [],
+ "text_associations": [],
+ "cluster_summaries": {},
+ "analysis_dataframe": None,
}
-
+
try:
# Get data
consensus_matrix = self.get_consensus_matrix()
samples_df = self.samples_df
-
+
if consensus_matrix is None or samples_df is None:
self.logger.error("No data available")
return {
-
-
-
-
-
-
-
+ "umap_coords": None,
+ "best_clustering": None,
+ "all_clustering_results": {},
+ "significant_associations": [],
+ "text_associations": [],
+ "cluster_summaries": {},
+ "analysis_dataframe": None,
}
-
+
# Basic UMAP
sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
-
+
if hasattr(consensus_matrix, "select"):
matrix_data = consensus_matrix.select(sample_cols).to_numpy()
else:
matrix_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore").values
-
+
matrix_data = matrix_data.T
matrix_data = np.nan_to_num(matrix_data)
-
+
scaler = StandardScaler()
matrix_scaled = scaler.fit_transform(matrix_data)
-
+
# Import dependencies locally
import umap
import hdbscan
-
+
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state)
umap_coords = reducer.fit_transform(matrix_scaled)
-
+
# Simple clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
cluster_labels = clusterer.fit_predict(umap_coords)
-
+
best_clustering = {
-
-
-
-
-
+ "labels": cluster_labels,
+ "n_clusters": len(np.unique(cluster_labels[cluster_labels != -1])),
+ "n_noise": np.sum(cluster_labels == -1),
+ "silhouette_score": 0.5, # Placeholder
+ "method": "hdbscan",
}
-
+
self.logger.info(f"Simplified analysis found {best_clustering['n_clusters']} clusters")
-
+
return {
-
-
-
-
-
-
-
+ "umap_coords": umap_coords,
+ "best_clustering": best_clustering,
+ "all_clustering_results": {"hdbscan": best_clustering},
+ "significant_associations": [],
+ "text_associations": [],
+ "cluster_summaries": {},
+ "analysis_dataframe": None,
}
-
+
except Exception as e:
self.logger.error(f"Error in simplified analysis: {e}")
return {
-
-
-
-
-
-
-
+ "umap_coords": None,
+ "best_clustering": None,
+ "all_clustering_results": {},
+ "significant_associations": [],
+ "text_associations": [],
+ "cluster_summaries": {},
+ "analysis_dataframe": None,
}

@@ -1736,27 +1797,30 @@ def _analyze_umap_simplified(
# Helper Functions for Plotting
# ========================================

+
def _isolated_save_plot(plot, filename, title, logger, plot_type):
"""Save plot to file in isolation"""
try:
from bokeh.io import output_file, save
from bokeh.models import Title
-
+
# Add title to plot
- plot.add_layout(Title(text=title, text_font_size="16pt"),
-
+ plot.add_layout(Title(text=title, text_font_size="16pt"), "above")
+
# Configure output
output_file(filename)
save(plot)
logger.info(f"Saved {plot_type} to: {filename}")
-
+
except Exception as e:
logger.error(f"Error saving {plot_type}: {e}")

+
def _isolated_show_notebook(plot):
"""Show plot in notebook if available"""
try:
from bokeh.io import show
+
show(plot)
except Exception:
pass # Silently fail if not in notebook