masster 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

@@ -0,0 +1,1762 @@
1
+ """
2
+ analysis.py
3
+
4
+ Advanced analytical methods for mass spectrometry study data including
5
+ UMAP clustering, statistical association testing, and text pattern analysis.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ """
11
+ Optimized analysis module for mass spectrometry data.
12
+ """
13
+ import warnings
14
+ import re
15
+ import numpy as np
16
+ import pandas as pd
17
+ from scipy import stats
18
+
19
+ # Suppress sklearn deprecation warnings
20
+ warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
21
+ warnings.filterwarnings("ignore", category=DeprecationWarning, module="sklearn")
22
+
23
+ # Check for optional dependencies
24
+ UMAP_AVAILABLE = False
25
+ HDBSCAN_AVAILABLE = False
26
+ SKLEARN_AVAILABLE = False
27
+
28
+ try:
29
+ import umap
30
+ UMAP_AVAILABLE = True
31
+ except ImportError:
32
+ pass
33
+
34
+ try:
35
+ import hdbscan
36
+ HDBSCAN_AVAILABLE = True
37
+ except ImportError:
38
+ pass
39
+
40
+ try:
41
+ from sklearn.preprocessing import StandardScaler
42
+ from sklearn.cluster import KMeans, DBSCAN
43
+ from sklearn.metrics import silhouette_score
44
+ SKLEARN_AVAILABLE = True
45
+ except ImportError:
46
+ pass
47
+
48
+ # Compiled regex patterns for efficient text processing
49
+ TOKEN_PATTERN = re.compile(r'[_\-\s\|\.]+')
50
+ ALPHANUMERIC_PATTERN = re.compile(r'^[A-Za-z0-9]+$')
51
+
52
+ # Simple cache for tokenization
53
+ _tokenization_cache = {}
54
+
55
+ def tokenize_text_cached(text):
56
+ """Cached text tokenization for repeated strings - preserves original case."""
57
+ if text in _tokenization_cache:
58
+ return _tokenization_cache[text]
59
+
60
+ if pd.isna(text) or text == '' or not isinstance(text, str):
61
+ result = tuple()
62
+ else:
63
+ # Split by common delimiters to create atoms (same as original)
64
+ atoms = TOKEN_PATTERN.split(str(text).strip())
65
+ # Clean and filter atoms - preserve original case
66
+ meaningful_tokens = []
67
+ for atom in atoms:
68
+ atom = atom.strip() # Remove .lower() to preserve case
69
+ if atom and len(atom) > 1: # Original was > 1, not >= 1
70
+ meaningful_tokens.append(atom)
71
+
72
+ result = tuple(meaningful_tokens)
73
+
74
+ # Prevent cache from growing too large
75
+ if len(_tokenization_cache) < 10000:
76
+ _tokenization_cache[text] = result
77
+
78
+ return result
79
+
80
+
81
+ # Clear cache to ensure fresh start
82
+ _tokenization_cache.clear()
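+
+ # Illustrative behaviour of the tokenizer above:
+ #   tokenize_text_cached("QC_pool-01|rep2") -> ("QC", "pool", "01", "rep2")
+ #   tokenize_text_cached("blank.1")         -> ("blank",)   # single-character atoms are dropped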
83
+
84
+
85
+ def analyze_umap(
86
+ self,
87
+ n_neighbors=15,
88
+ min_dist=0.1,
89
+ metric="euclidean",
90
+ random_state=42,
91
+ cluster_methods=["hdbscan", "kmeans", "dbscan"],
92
+ n_clusters_range=(2, 8),
93
+ min_cluster_size=3,
94
+ significance_threshold=0.01,
95
+ plot_results=True,
96
+ filename=None,
97
+ markersize=4,
98
+ ):
99
+ """
100
+ Perform UMAP dimensionality reduction followed by clustering analysis with enriched term labeling.
101
+
102
+ This method performs comprehensive cluster analysis on the study's consensus matrix, including:
103
+ - UMAP dimensionality reduction for visualization
104
+ - Automated clustering with multiple algorithms (HDBSCAN, K-means, DBSCAN)
105
+ - Metadata association discovery using statistical tests
106
+ - Text pattern analysis to identify enriched sample characteristics
107
+ - Enhanced visualization with intelligent label positioning for enriched terms
108
+
109
+ The enhanced visualization features cluster-aware enriched term labels with connecting spikes:
110
+ - Terms shared across multiple clusters are positioned at the geometric center with lines to each cluster
111
+ - Terms specific to single clusters are positioned nearby with short spikes
112
+ - Terms are ranked by presence percentage within clusters (favoring common terms)
113
+ - Empty/blank terms are automatically filtered out
114
+ - Label positioning adapts to line direction for optimal text alignment
115
+ - Dashed edges and color-coordinated labels provide visual clarity
116
+
117
+ Unlike plot_samples_umap() which colors by metadata columns, this function performs clustering
118
+ and colors points by cluster assignments, with tooltips showing enrichment information.
119
+
120
+ Parameters
121
+ ----------
122
+ n_neighbors : int, default=15
123
+ Number of neighbors for UMAP embedding. Higher values preserve more global structure,
124
+ lower values preserve more local structure.
125
+
126
+ min_dist : float, default=0.1
127
+ Minimum distance parameter for UMAP. Controls how tightly points are packed in the
128
+ embedding. Values closer to 0 result in tighter clusters.
129
+
130
+ metric : str, default="euclidean"
131
+ Distance metric for UMAP. Options include 'euclidean', 'manhattan', 'cosine', etc.
132
+
133
+ random_state : int, default=42
134
+ Random seed for reproducibility of UMAP embedding and clustering.
135
+
136
+ cluster_methods : list, default=["hdbscan", "kmeans", "dbscan"]
137
+ Clustering algorithms to evaluate. Available options:
138
+ - 'hdbscan': Hierarchical density-based clustering (requires hdbscan package)
139
+ - 'kmeans': K-means clustering with multiple k values
140
+ - 'dbscan': Density-based spatial clustering with multiple eps values
141
+
142
+ n_clusters_range : tuple, default=(2, 8)
143
+ Range of cluster numbers to test for K-means (min_clusters, max_clusters).
144
+
145
+ min_cluster_size : int, default=3
146
+ Minimum cluster size for HDBSCAN and DBSCAN algorithms.
147
+
148
+ significance_threshold : float, default=0.01
149
+ P-value threshold for statistical significance of metadata associations.
150
+
151
+ plot_results : bool, default=True
152
+ Whether to generate interactive Bokeh plots with enhanced labeling.
153
+ When False, only returns analysis results without visualization.
154
+
155
+ filename : str, optional
156
+ If provided, saves the interactive plot to this HTML file.
157
+
158
+ markersize : int, default=4
159
+ Size of scatter plot markers representing samples.
160
+
161
+ Returns
162
+ -------
163
+ dict
164
+ Comprehensive results dictionary containing:
165
+
166
+ - **umap_coords** : numpy.ndarray
167
+ 2D UMAP coordinates for all samples (n_samples x 2)
168
+
169
+ - **best_clustering** : dict
170
+ Best clustering result based on silhouette score, containing:
171
+ - 'labels': cluster assignments for each sample
172
+ - 'score': silhouette score (quality metric)
173
+ - 'n_clusters': number of identified clusters
174
+ - 'n_noise': number of noise points (outliers)
175
+ - 'method': clustering algorithm used
176
+
177
+ - **all_clustering_results** : dict
178
+ Results from all tested clustering configurations, keyed by method name
179
+
180
+ - **significant_associations** : list
181
+ All statistically significant associations (both numeric and text), sorted by
182
+ cluster presence percentage. Each association includes:
183
+ - Statistical test results (p-value, effect size)
184
+ - Cluster-specific enrichment information
185
+ - Interpretation of effect size magnitude
186
+
187
+ - **text_associations** : list
188
+ Subset of associations specifically for text pattern enrichment, ranked by
189
+ presence percentage within clusters rather than statistical enrichment
190
+
191
+ - **cluster_summaries** : dict
192
+ Summary information for each cluster:
193
+ - 'n_samples': number of samples in cluster
194
+ - 'sample_names': list of sample names in cluster
195
+
196
+ - **analysis_dataframe** : pandas.DataFrame
197
+ Complete dataframe with UMAP coordinates, cluster assignments, and all
198
+ sample metadata used for association analysis
199
+
200
+ Raises
201
+ ------
202
+ Nothing. Missing dependencies (umap-learn, scikit-learn), an empty
203
+ consensus matrix, or unavailable samples data do not raise; the error is
204
+ logged and the method returns None instead of the results dictionary.
205
+
206
+
207
+
208
+ Examples
209
+ --------
210
+ Basic UMAP analysis with default parameters:
211
+
212
+ >>> results = study.analyze_umap()
213
+ >>> print(f"Found {results['best_clustering']['n_clusters']} clusters")
214
+ >>> print(f"Silhouette score: {results['best_clustering']['score']:.3f}")
215
+
216
+ Custom analysis with specific clustering and enhanced visualization:
217
+
218
+ >>> results = study.analyze_umap(
219
+ ... n_neighbors=20,
220
+ ... min_dist=0.05,
221
+ ... cluster_methods=["hdbscan", "dbscan"],
222
+ ... significance_threshold=0.01,
223
+ ... filename="cluster_analysis.html"
224
+ ... )
225
+
226
+ Fast analysis for large datasets:
227
+
228
+ >>> results = study.analyze_umap(
229
+ ... cluster_methods=["hdbscan"]
230
+ ... )
231
+
232
+ Notes
233
+ -----
234
+ The enhanced visualization automatically identifies and labels enriched terms based on:
235
+
236
+ 1. **Presence-based ranking**: Terms are ranked by their prevalence within clusters
237
+ rather than statistical enrichment, favoring terms common across cluster members
238
+
239
+ 2. **Intelligent positioning**:
240
+ - Shared terms (multiple clusters) positioned at geometric center with connecting lines
241
+ - Individual terms positioned adjacent to their cluster with short spikes
242
+ - Westward lines position labels to the left with right-aligned text
243
+ - Eastward lines position labels to the right with left-aligned text
244
+
245
+ 3. **Quality filtering**: Empty terms (variants of 'empty' and 'blank') are
246
+ automatically excluded from enrichment analysis and visualization
247
+
248
+ 4. **Visual styling**: Dashed edges, color-coordinated labels and lines, and
249
+ moderate boundary expansion (5%) create professional, readable plots
250
+
251
+ The method automatically handles missing dependencies by falling back to simplified
252
+ analysis when optional packages (hdbscan) are unavailable.
253
+ """
254
+
255
+ # Check dependencies
256
+ if not UMAP_AVAILABLE:
257
+ self.logger.error("UMAP is required. Install with: pip install umap-learn")
258
+ return None
259
+
260
+ if not SKLEARN_AVAILABLE:
261
+ self.logger.error("scikit-learn is required. Install with: pip install scikit-learn")
262
+ return None
263
+
264
+ self.logger.info("Starting UMAP cluster analysis...")
265
+
266
+ # Get data
267
+ consensus_matrix = self.get_consensus_matrix()
268
+ samples_df = self.samples_df
269
+
270
+ if consensus_matrix is None or consensus_matrix.shape[0] == 0:
271
+ self.logger.error("No consensus matrix available. Run feature detection first.")
272
+ return None
273
+
274
+ if samples_df is None or len(samples_df) == 0:
275
+ self.logger.error("No samples data available.")
276
+ return None
277
+
278
+ # Prepare data for UMAP
279
+ sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
280
+
281
+ if hasattr(consensus_matrix, "select"):
282
+ matrix_data = consensus_matrix.select(sample_cols).to_numpy()
283
+ else:
284
+ matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
285
+ matrix_data = matrix_sample_data.values if hasattr(matrix_sample_data, "values") else np.array(matrix_sample_data)
286
+
287
+ # Transpose so samples are rows
288
+ matrix_data = matrix_data.T
289
+ matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)
290
+
291
+ # Standardize data
292
+ from sklearn.preprocessing import StandardScaler
293
+ scaler = StandardScaler()
294
+ matrix_scaled = scaler.fit_transform(matrix_data)
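+ # matrix_scaled is samples x features: one row per sample column of the
+ # consensus matrix, with each feature z-scored across samples.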
295
+
296
+ # Perform UMAP with optimizations
297
+ self.logger.debug(f"Computing UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}")
298
+ import umap
299
+
300
+ # UMAP optimization: use limited threads to save memory
301
+ n_jobs = 1
302
+
303
+ reducer = umap.UMAP(
304
+ n_components=2,
305
+ n_neighbors=n_neighbors,
306
+ min_dist=min_dist,
307
+ metric=metric,
308
+ random_state=random_state,
309
+ n_jobs=n_jobs,
310
+ low_memory=False
311
+ )
312
+ umap_coords = reducer.fit_transform(matrix_scaled)
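+ # umap_coords has shape (n_samples, 2); rows follow the order of the sample
+ # columns in the consensus matrix.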
313
+
314
+ # Convert samples_df to pandas for easier analysis
315
+ samples_pd = samples_df.to_pandas() if hasattr(samples_df, 'to_pandas') else samples_df
316
+
317
+ # Get the actual sample columns present in consensus matrix
318
+ sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
319
+ consensus_sample_names = set(sample_cols)
320
+
321
+ # Filter samples_df to only include samples present in consensus matrix
322
+ if 'sample_name' in samples_pd.columns:
323
+ # Create a mask for samples present in consensus matrix
324
+ sample_mask = samples_pd['sample_name'].isin(consensus_sample_names)
325
+
326
+ if sample_mask.sum() != len(samples_pd):
327
+ missing_samples = set(samples_pd['sample_name']) - consensus_sample_names
328
+ self.logger.warning(f"Filtering out {len(missing_samples)} samples not in consensus matrix: {list(missing_samples)}")
329
+ samples_pd = samples_pd[sample_mask].copy()
330
+
331
+ # Reorder samples_pd to match the order in consensus matrix sample_cols
332
+ samples_pd = samples_pd.set_index('sample_name').reindex(sample_cols).reset_index()
333
+
334
+ # Final check - ensure we have the same number of samples
335
+ if len(samples_pd) != len(umap_coords):
336
+ self.logger.error(f"After filtering, still have mismatch: samples_df has {len(samples_pd)} rows, UMAP has {len(umap_coords)} points")
337
+ return None
338
+
339
+ self.logger.info(f"Using {len(samples_pd)} samples for analysis")
340
+
341
+ # Try different clustering methods
342
+ clustering_results = {}
343
+
344
+ for method in cluster_methods:
345
+ self.logger.debug(f"Trying clustering method: {method}")
346
+
347
+ if method == "hdbscan" and HDBSCAN_AVAILABLE:
348
+ import hdbscan
349
+ clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')
350
+ cluster_labels = clusterer.fit_predict(umap_coords)
351
+
352
+ # Calculate silhouette score (excluding noise points for HDBSCAN)
353
+ valid_labels = cluster_labels[cluster_labels != -1]
354
+ valid_coords = umap_coords[cluster_labels != -1]
355
+
356
+ if len(np.unique(valid_labels)) > 1:
357
+ from sklearn.metrics import silhouette_score
358
+ score = silhouette_score(valid_coords, valid_labels)
359
+ n_clusters = len(np.unique(valid_labels))
360
+ n_noise = np.sum(cluster_labels == -1)
361
+
362
+ clustering_results[f"{method}"] = {
363
+ 'labels': cluster_labels,
364
+ 'score': score,
365
+ 'n_clusters': n_clusters,
366
+ 'n_noise': n_noise,
367
+ 'method': method
368
+ }
369
+
370
+ elif method == "kmeans":
371
+ from sklearn.cluster import KMeans
372
+ from sklearn.metrics import silhouette_score
373
+
374
+ for n_clusters in range(n_clusters_range[0], n_clusters_range[1] + 1):
375
+ kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
376
+ cluster_labels = kmeans.fit_predict(umap_coords)
377
+ score = silhouette_score(umap_coords, cluster_labels)
378
+
379
+ clustering_results[f"{method}_k{n_clusters}"] = {
380
+ 'labels': cluster_labels,
381
+ 'score': score,
382
+ 'n_clusters': n_clusters,
383
+ 'n_noise': 0,
384
+ 'method': f"{method} (k={n_clusters})"
385
+ }
386
+
387
+ elif method == "dbscan":
388
+ from sklearn.cluster import DBSCAN
389
+ # Standard DBSCAN eps values for exploration
390
+ eps_values = [0.3, 0.5, 0.7, 1.0, 1.5]
391
+
392
+ for eps in eps_values:
393
+ dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size, n_jobs=-1)
394
+ cluster_labels = dbscan.fit_predict(umap_coords)
395
+
396
+ n_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
397
+ n_noise = np.sum(cluster_labels == -1)
398
+
399
+ # Only consider valid clusterings
400
+ if n_clusters > 1:
401
+ from sklearn.metrics import silhouette_score
402
+ valid_labels = cluster_labels[cluster_labels != -1]
403
+ valid_coords = umap_coords[cluster_labels != -1]
404
+
405
+ if len(valid_coords) > 0 and len(np.unique(valid_labels)) > 1:
406
+ score = silhouette_score(valid_coords, valid_labels)
407
+
408
+ clustering_results[f"{method}_eps{eps}"] = {
409
+ 'labels': cluster_labels,
410
+ 'score': score,
411
+ 'n_clusters': n_clusters,
412
+ 'n_noise': n_noise,
413
+ 'method': f"{method} (eps={eps})"
414
+ }
415
+
416
+ if not clustering_results:
417
+ self.logger.error("No valid clustering results found")
418
+ return None
419
+
420
+ # Select best clustering based on silhouette score
421
+ best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k]['score'])
422
+ best_clustering = clustering_results[best_key]
423
+
424
+ self.logger.info(f"Best clustering: {best_clustering['method']} with {best_clustering['n_clusters']} clusters, "
425
+ f"silhouette score: {best_clustering['score']:.3f}")
426
+
427
+ # Analyze associations between clusters and sample metadata
428
+ cluster_labels = best_clustering['labels']
429
+
430
+ # Add cluster labels to samples dataframe for analysis
431
+ analysis_df = samples_pd.copy()
432
+ analysis_df['cluster'] = cluster_labels
433
+
434
+ # Remove noise points (label -1) for association analysis
435
+ analysis_df_clean = analysis_df[analysis_df['cluster'] != -1].copy()
436
+
437
+ if len(analysis_df_clean) == 0:
438
+ self.logger.error("No samples assigned to clusters (all noise)")
439
+ return None
440
+
441
+ # Analyze associations with specific columns only
442
+ significant_associations = []
443
+
444
+ # Define which columns to analyze for associations (non-text)
445
+ association_cols = {'sample_sequence', 'num_features'}
446
+
447
+ # Define which columns to analyze for text patterns - include all relevant text columns
448
+ text_pattern_cols = {'sample_name', 'sample_group', 'sample_batch', 'sample_type'}
449
+
450
+
451
+ for col in samples_pd.columns:
452
+ if col not in association_cols:
453
+ continue
454
+
455
+ try:
456
+ # Check if column has enough variation
457
+ col_data = analysis_df_clean[col].dropna()
458
+ if len(col_data.unique()) < 2:
459
+ continue
460
+
461
+ # Determine if column is numeric or categorical
462
+ if pd.api.types.is_numeric_dtype(col_data):
463
+ # Numeric variable - use ANOVA or Kruskal-Wallis
464
+ cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby('cluster')]
465
+ cluster_groups = [group for group in cluster_groups if len(group) > 0]
466
+
467
+ if len(cluster_groups) > 1:
468
+ # Try ANOVA first
469
+ try:
470
+ f_stat, p_value = stats.f_oneway(*cluster_groups)
471
+ test_name = "ANOVA"
472
+ except Exception:
473
+ # Fall back to Kruskal-Wallis (non-parametric)
474
+ h_stat, p_value = stats.kruskal(*cluster_groups)
475
+ test_name = "Kruskal-Wallis"
476
+ f_stat = h_stat
477
+
478
+ if p_value < significance_threshold:
479
+ # Calculate effect size (eta-squared approximation)
480
+ ss_between = sum(len(group) * (np.mean(group) - np.mean(col_data))**2 for group in cluster_groups)
481
+ ss_total = np.sum((col_data - np.mean(col_data))**2)
482
+ eta_squared = ss_between / ss_total if ss_total > 0 else 0
483
+
484
+ significant_associations.append({
485
+ 'column': col,
486
+ 'variable_type': 'numeric',
487
+ 'test': test_name,
488
+ 'statistic': f_stat,
489
+ 'p_value': p_value,
490
+ 'effect_size': eta_squared,
491
+ 'interpretation': 'Large effect' if eta_squared > 0.14 else 'Medium effect' if eta_squared > 0.06 else 'Small effect'
492
+ })
493
+
494
+ else:
495
+ # Categorical variable - use Chi-square test
496
+ contingency_table = pd.crosstab(analysis_df_clean['cluster'], analysis_df_clean[col])
497
+
498
+ # Only test if we have enough observations
499
+ if contingency_table.sum().sum() > 10 and contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
500
+ try:
501
+ chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
502
+
503
+ if p_value < significance_threshold:
504
+ # Calculate Cramer's V (effect size for chi-square)
505
+ n = contingency_table.sum().sum()
506
+ cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
507
+
508
+ significant_associations.append({
509
+ 'column': col,
510
+ 'variable_type': 'categorical',
511
+ 'test': 'Chi-square',
512
+ 'statistic': chi2,
513
+ 'p_value': p_value,
514
+ 'effect_size': cramers_v,
515
+ 'interpretation': 'Large effect' if cramers_v > 0.5 else 'Medium effect' if cramers_v > 0.3 else 'Small effect',
516
+ 'contingency_table': contingency_table
517
+ })
518
+ except Exception:
519
+ continue
520
+
521
+ except Exception as e:
522
+ self.logger.debug(f"Error analyzing column {col}: {e}")
523
+ continue
524
+
525
+ # Sort by effect size (descending)
526
+ significant_associations.sort(key=lambda x: x['effect_size'], reverse=True)
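+ # The effect-size labels above follow common conventions: eta-squared of
+ # 0.06 / 0.14 and Cramer's V of 0.3 / 0.5 mark medium / large effects.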
527
+
528
+ # Enhanced cluster-centric text analysis - analyze what makes each cluster unique
529
+ self.logger.debug("Performing cluster-centric enrichment analysis...")
530
+
531
+ text_associations = []
532
+
533
+ # Optimized text tokenization using cached function
534
+ def tokenize_text_optimized(text):
535
+ """Optimized text tokenization with caching"""
536
+ return tokenize_text_cached(text)
537
+
538
+ # Collect all atoms from specified string columns only
539
+ string_columns = []
540
+ for col in text_pattern_cols:
541
+ if col in analysis_df_clean.columns:
542
+ col_data = analysis_df_clean[col].dropna()
543
+ if len(col_data) > 0 and not pd.api.types.is_numeric_dtype(col_data):
544
+ if len(col_data.astype(str).unique()) > 1: # Has variation
545
+ string_columns.append(col)
546
+
547
+ if string_columns:
548
+ # Text analysis for string columns
549
+ self.logger.debug(f"Analyzing cluster enrichments in {len(string_columns)} string columns")
550
+
551
+ # Build cluster-centric atom analysis using cached tokenization
552
+ cluster_atoms = {} # cluster_id -> {atom -> count}
553
+ global_atom_counts = {} # atom -> total_count_across_all_samples
554
+
555
+ # Pre-tokenize all text data once for efficiency with column prefixes
556
+ sample_atom_sets = {}
557
+ for idx, row in analysis_df_clean.iterrows():
558
+ sample_atoms = set()
559
+ for col in string_columns:
560
+ atoms = tokenize_text_optimized(row[col])
561
+ # Add column prefix to distinguish where tokens come from
562
+ col_prefix = col.replace('sample_', '') + ':' # e.g., "name:", "group:", "batch:", "type:"
563
+ prefixed_atoms = [f"{col_prefix}{atom}" for atom in atoms]
564
+ sample_atoms.update(prefixed_atoms)
565
+ sample_atom_sets[idx] = sample_atoms
566
+
567
+ # Collect atoms by cluster
568
+ for idx, row in analysis_df_clean.iterrows():
569
+ cluster_id = row['cluster']
570
+ if cluster_id not in cluster_atoms:
571
+ cluster_atoms[cluster_id] = {}
572
+
573
+ # Use pre-tokenized atoms
574
+ sample_atoms = sample_atom_sets[idx]
575
+
576
+ # Count atoms for this cluster and globally
577
+ for atom in sample_atoms:
578
+ cluster_atoms[cluster_id][atom] = cluster_atoms[cluster_id].get(atom, 0) + 1
579
+ global_atom_counts[atom] = global_atom_counts.get(atom, 0) + 1
580
+
581
+ # Calculate cluster enrichments using a hypergeometric test
582
+ if string_columns:
583
+ n_total_samples = len(analysis_df_clean)
584
+
585
+ # For each cluster, find significantly enriched terms
586
+ for cluster_id, cluster_atom_counts in cluster_atoms.items():
587
+ cluster_size = len(analysis_df_clean[analysis_df_clean['cluster'] == cluster_id])
588
+
589
+ for atom, cluster_count in cluster_atom_counts.items():
590
+ global_count = global_atom_counts[atom]
591
+
592
+ # Skip empty terms from enrichment analysis and plotting
593
+ if (atom == '<empty>' or
594
+ atom.lower() == 'empty' or
595
+ atom.strip() == '' or
596
+ ':empty' in atom.lower() or
597
+ atom.lower().endswith('empty') or
598
+ ':blank' in atom.lower() or
599
+ atom.lower().endswith('blank')):
600
+ continue
601
+
602
+ # Skip atoms with low frequency
603
+ if global_count < 2:
604
+ continue
605
+
606
+ # Skip terms that occur in fewer than 5 samples within this cluster
607
+ if cluster_count < 5:
608
+ continue
609
+
610
+ # IMPORTANT: Skip atoms that appear in too many clusters (not cluster-specific)
611
+ # Count how many clusters this atom appears in
612
+ clusters_with_atom = set()
613
+ for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
614
+ if atom in other_cluster_atom_counts:
615
+ clusters_with_atom.add(other_cluster_id)
616
+
617
+ total_clusters = len(cluster_atoms)
618
+ cluster_specificity = len(clusters_with_atom) / total_clusters if total_clusters > 0 else 1
619
+
620
+ # Skip if atom appears in more than 50% of clusters (not specific enough)
621
+ if cluster_specificity > 0.5:
622
+ # Too widespread to be cluster-specific; skip it
623
+ continue
624
+
625
+ # Additional check: ensure this cluster has significantly more of this atom than others
626
+ #max_other_cluster_count = 0
627
+ #for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
628
+ # if other_cluster_id != cluster_id and atom in other_cluster_atom_counts:
629
+ # max_other_cluster_count = max(max_other_cluster_count, other_cluster_atom_counts[atom])
630
+
631
+ # Skip if current cluster doesn't have significantly more instances than the next highest
632
+ #if cluster_count <= max_other_cluster_count * 1.5:
633
+ # Note: logger not available in standalone function, would need to pass self
634
+ # continue
635
+
636
+ # Calculate enrichment using hypergeometric test
637
+ try:
638
+ from scipy.stats import hypergeom
639
+
640
+ M = n_total_samples
641
+ n = global_count
642
+ N = cluster_size
643
+ k = cluster_count
644
+
645
+ # Calculate p-value (probability of observing k or more successes)
646
+ p_value = hypergeom.sf(k-1, M, n, N)
647
+
648
+ # Calculate enrichment ratio
649
+ expected_freq = (n / M) * N
650
+ enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float('inf')
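+ # Illustrative numbers: with M=60 samples in total, n=12 of them containing
+ # the term, a cluster of N=15 samples and k=10 hits inside it,
+ # hypergeom.sf(9, 60, 12, 15) is the enrichment p-value and the expected
+ # frequency is (12/60)*15 = 3, giving an enrichment ratio of 10/3 ≈ 3.3.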
651
+
652
+ # Only consider significantly enriched terms (p < threshold and enrichment > 1.5x)
653
+ if p_value < significance_threshold and enrichment_ratio > 1.5:
654
+
655
+ # Calculate percentage of cluster samples with this atom
656
+ cluster_percentage = (cluster_count / cluster_size) * 100
657
+ global_percentage = (global_count / n_total_samples) * 100
658
+
659
+ text_associations.append({
660
+ 'atom': atom,
661
+ 'cluster_id': cluster_id,
662
+ 'type': 'cluster_enrichment',
663
+ 'test': 'Hypergeometric',
664
+ 'p_value': p_value,
665
+ 'enrichment_ratio': enrichment_ratio,
666
+ 'effect_size': enrichment_ratio, # Use enrichment ratio as effect size
667
+ 'interpretation': 'Large enrichment' if enrichment_ratio > 3 else 'Medium enrichment' if enrichment_ratio > 2 else 'Small enrichment',
668
+ 'cluster_count': cluster_count,
669
+ 'cluster_size': cluster_size,
670
+ 'cluster_percentage': cluster_percentage,
671
+ 'global_count': global_count,
672
+ 'global_percentage': global_percentage,
673
+ 'cluster_samples_with_atom': cluster_count,
674
+ 'total_samples_with_atom': global_count
675
+ })
676
+
677
+ except Exception as e:
678
+ self.logger.debug(f"Error analyzing enrichment of '{atom}' in cluster {cluster_id}: {e}")
679
+ continue
680
+
681
+ # Sort text associations by cluster presence percentage (favors common terms in clusters)
682
+ text_associations.sort(key=lambda x: x['cluster_percentage'], reverse=True)
683
+
684
+ # Combine regular and text associations
685
+ all_associations = significant_associations + text_associations
686
+ # Sort by cluster percentage for text associations, effect size for others
687
+ all_associations.sort(key=lambda x: x.get('cluster_percentage', x.get('effect_size', 0)), reverse=True)
688
+
689
+ # Generate cluster summaries
690
+ cluster_summaries = {}
691
+ for cluster_id in analysis_df_clean['cluster'].unique():
692
+ cluster_data = analysis_df_clean[analysis_df_clean['cluster'] == cluster_id]
693
+ cluster_summaries[cluster_id] = {
694
+ 'n_samples': len(cluster_data),
695
+ 'sample_names': cluster_data['sample_name'].tolist() if 'sample_name' in cluster_data else [],
696
+ }
697
+
698
+ # Create results dictionary
699
+ results = {
700
+ 'umap_coords': umap_coords,
701
+ 'best_clustering': best_clustering,
702
+ 'all_clustering_results': clustering_results,
703
+ 'significant_associations': all_associations,
704
+ 'text_associations': text_associations,
705
+ 'cluster_summaries': cluster_summaries,
706
+ 'analysis_dataframe': analysis_df_clean
707
+ }
708
+
709
+ # Create sample-specific enrichment tooltips with optimization
710
+ sample_enrichments = {}
711
+
712
+ # For each sample, find which text atoms it contains that are significant
713
+ if text_associations:
714
+ max_check_terms = 10 # Standard limit for tooltip calculation
715
+
716
+ for idx, row in analysis_df_clean.iterrows():
717
+ sample_name = row.get('sample_name', f'sample_{idx}')
718
+ sample_enrichments[sample_name] = []
719
+
720
+ # Check which significant atoms this sample contains
721
+ for assoc in text_associations[:max_check_terms]: # Only check the top-ranked terms
722
+ atom = assoc['atom']
723
+
724
+ # Check if this sample contains this atom in any of the text columns
725
+ sample_has_atom = False
726
+ for col in text_pattern_cols:
727
+ if col in row:
728
+ text_value = str(row[col]) if not pd.isna(row[col]) else ""
729
+ # Atoms carry a column prefix (e.g. "name:QC"), so compare only the bare value
+ if atom.split(':', 1)[-1].lower() in text_value.lower():
730
+ sample_has_atom = True
731
+ break
732
+
733
+ if sample_has_atom:
734
+ sample_enrichments[sample_name].append(f"{atom} ({assoc['p_value']:.3f})")
735
+ if len(sample_enrichments[sample_name]) >= 3: # Only show top 3 per sample
736
+ break
737
+
738
+ # Create embedded plots if requested
739
+ if plot_results:
740
+ plots = {}
741
+
742
+ # Plot 1: Enhanced UMAP with clusters and enriched term labels (EMBEDDED PLOTTING)
743
+ from bokeh.models import ColumnDataSource, HoverTool, LabelSet, LegendItem, Legend
744
+ from bokeh.plotting import figure
745
+ from collections import defaultdict
746
+
747
+ # Create cluster plot with enhanced size
748
+ p1 = figure(
749
+ width=900, height=700,
750
+ title=f"UMAP Clusters with Enriched Terms ({best_clustering['method']})",
751
+ tools="pan,wheel_zoom,box_zoom,reset,save"
752
+ )
753
+ p1.xaxis.axis_label = "UMAP1"
754
+ p1.yaxis.axis_label = "UMAP2"
755
+
756
+ # Remove grid
757
+ p1.grid.visible = False
758
+
759
+ # Color points by cluster
760
+ unique_clusters = np.unique(cluster_labels)
761
+ n_clusters = len(unique_clusters)
762
+
763
+ # Handle color mapping for many clusters - use turbo colormap
764
+ if n_clusters <= 10:
765
+ from bokeh.palettes import turbo
766
+ colors = turbo(max(10, n_clusters))[:n_clusters]
767
+ elif n_clusters <= 20:
768
+ from bokeh.palettes import turbo
769
+ colors = turbo(20)[:n_clusters]
770
+ else:
771
+ # For many clusters, use a continuous colormap
772
+ from bokeh.palettes import turbo
773
+ colors = turbo(min(256, n_clusters))
774
+
775
+ # Calculate cluster centers and plot points
776
+ cluster_centers = {}
777
+ for i, cluster_id in enumerate(unique_clusters):
778
+ mask = cluster_labels == cluster_id
779
+ if cluster_id == -1:
780
+ color = "gray"
781
+ label = "Noise"
782
+ else:
783
+ color = colors[i % len(colors)]
784
+ label = f"Cluster {cluster_id}"
785
+
786
+ cluster_coords = umap_coords[mask]
787
+
788
+ # Calculate cluster center
789
+ if len(cluster_coords) > 0:
790
+ center_x = np.mean(cluster_coords[:, 0])
791
+ center_y = np.mean(cluster_coords[:, 1])
792
+ cluster_centers[cluster_id] = (center_x, center_y)
793
+
794
+ cluster_samples = samples_pd[mask] if len(samples_pd) == len(mask) else None
795
+ sample_names = cluster_samples['sample_name'].tolist() if cluster_samples is not None and 'sample_name' in cluster_samples else [f"Sample_{j}" for j in range(np.sum(mask))]
796
+ sample_uids = cluster_samples['sample_uid'].tolist() if cluster_samples is not None and 'sample_uid' in cluster_samples else [f"UID_{j}" for j in range(np.sum(mask))]
797
+
798
+ # Create enrichment tooltip text for this cluster
799
+ cluster_associations = [assoc for assoc in text_associations if assoc.get('cluster_id') == cluster_id]
800
+
801
+ # Get the top enrichments for this cluster (not individual samples)
802
+ cluster_enrichments = []
803
+ for assoc in cluster_associations[:3]: # Top 3 enrichments for this cluster
804
+ atom = assoc['atom']
805
+ # Skip color codes and other non-meaningful atoms
806
+ if not ((atom.startswith('#') and len(atom) == 7) or atom in ['nan', 'None', 'null']):
807
+ cluster_enrichments.append(atom)
808
+
809
+ # Create the same enrichment text for ALL samples in this cluster
810
+ if cluster_enrichments:
811
+ cluster_enrichment_text = "; ".join(cluster_enrichments)
812
+ else:
813
+ cluster_enrichment_text = "No enrichments found"
814
+
815
+ # Apply the same enrichment text to all samples in this cluster
816
+ sample_enrichment_texts = [cluster_enrichment_text] * np.sum(mask)
817
+
818
+ source = ColumnDataSource({
819
+ 'x': umap_coords[mask, 0],
820
+ 'y': umap_coords[mask, 1],
821
+ 'cluster': [cluster_id] * np.sum(mask),
822
+ 'sample_name': sample_names[:np.sum(mask)],
823
+ 'sample_uid': sample_uids[:np.sum(mask)],
824
+ 'enrichments': sample_enrichment_texts[:np.sum(mask)]
825
+ })
826
+
827
+ p1.scatter('x', 'y', size=markersize, color=color, alpha=0.7,
828
+ source=source)
829
+
830
+ # Enhanced enriched term visualization
831
+ max_terms_per_cluster = 2
832
+ min_enrichment = 2.0
833
+
834
+ # Process enriched terms - group by cluster and filter
835
+ cluster_terms = defaultdict(list)
836
+ for assoc in text_associations:
837
+ # Skip empty terms from plotting
838
+ atom = assoc.get('atom', '')
839
+ if (atom == '<empty>' or
840
+ atom.lower() == 'empty' or
841
+ atom.strip() == '' or
842
+ ':empty' in atom.lower() or
843
+ atom.lower().endswith('empty') or
844
+ ':blank' in atom.lower() or
845
+ atom.lower().endswith('blank')):
846
+ continue
847
+
848
+ if (assoc['enrichment_ratio'] >= min_enrichment and
849
+ assoc['cluster_id'] in cluster_centers):
850
+ cluster_terms[assoc['cluster_id']].append(assoc)
851
+
852
+ # Limit terms per cluster and sort by cluster presence percentage (favors common terms)
853
+ for cluster_id in cluster_terms:
854
+ cluster_terms[cluster_id] = sorted(
855
+ cluster_terms[cluster_id],
856
+ key=lambda x: x['cluster_percentage'],
857
+ reverse=True
858
+ )[:max_terms_per_cluster]
859
+
860
+ # Collect all unique terms for shared term handling
861
+ all_terms = {}
862
+ for cluster_id, terms in cluster_terms.items():
863
+ for term in terms:
864
+ atom = term['atom']
865
+ if atom not in all_terms:
866
+ all_terms[atom] = []
867
+ all_terms[atom].append(cluster_id)
868
+
869
+ # Separate terms into shared vs cluster-specific
870
+ shared_terms = {atom: clusters for atom, clusters in all_terms.items() if len(clusters) > 1}
871
+ specific_terms = {atom: clusters[0] for atom, clusters in all_terms.items() if len(clusters) == 1}
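+ # e.g. shared_terms = {"type:qc": [0, 2]} (term enriched in clusters 0 and 2)
+ # and specific_terms = {"group:plasma": 1} (term enriched only in cluster 1);
+ # the values here are only illustrative.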
872
+
873
+ # Merge overlapping terms that refer to the same concept
874
+ # E.g., "type:qc" and "name:PooledQC" both refer to QC samples
875
+ def should_merge_terms(term1, term2):
876
+ """Check if two terms should be merged based on semantic overlap"""
877
+ # Extract the actual values (remove prefixes)
878
+ val1 = term1.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
879
+ val2 = term2.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
880
+
881
+ # Define known overlapping concepts
882
+ qc_terms = {'qc', 'pooledqc', 'pooled_qc', 'quality_control', 'qualitycontrol'}
883
+ blank_terms = {'blank', 'blk', 'empty', 'background'}
884
+
885
+ # Check if both terms belong to the same concept group
886
+ if val1 in qc_terms and val2 in qc_terms:
887
+ return True
888
+ if val1 in blank_terms and val2 in blank_terms:
889
+ return True
890
+
891
+ # Also check for direct string similarity (e.g., case variations)
892
+ if val1 == val2:
893
+ return True
894
+
895
+ return False
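+
+ # e.g. should_merge_terms("type:qc", "name:PooledQC") -> True, since both map
+ # to the QC concept after the prefixes are stripped.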
896
+
897
+ def merge_overlapping_terms(shared_terms, specific_terms):
898
+ """Merge terms that refer to the same concept"""
899
+ all_atoms = list(shared_terms.keys()) + list(specific_terms.keys())
900
+ merged_groups = []
901
+ used_atoms = set()
902
+
903
+ for i, atom1 in enumerate(all_atoms):
904
+ if atom1 in used_atoms:
905
+ continue
906
+
907
+ group = [atom1]
908
+ used_atoms.add(atom1)
909
+
910
+ # Find all atoms that should be merged with this one
911
+ for j, atom2 in enumerate(all_atoms[i+1:], i+1):
912
+ if atom2 in used_atoms:
913
+ continue
914
+ if should_merge_terms(atom1, atom2):
915
+ group.append(atom2)
916
+ used_atoms.add(atom2)
917
+
918
+ if len(group) > 1:
919
+ merged_groups.append(group)
920
+
921
+ return merged_groups
922
+
923
+ # Find terms that should be merged
924
+ merged_groups = merge_overlapping_terms(shared_terms, specific_terms)
925
+
926
+ # Apply merging: create new combined terms and remove originals
927
+ for group in merged_groups:
928
+ # Determine the combined clusters for this group
929
+ combined_clusters = set()
930
+ for atom in group:
931
+ if atom in shared_terms:
932
+ combined_clusters.update(shared_terms[atom])
933
+ elif atom in specific_terms:
934
+ combined_clusters.add(specific_terms[atom])
935
+
936
+ # Create a new combined term name using newlines
937
+ # Keep the original prefixes and atom names
938
+ combined_atom = '\n'.join(group)
939
+
940
+ # Remove original terms from both dictionaries
941
+ for atom in group:
942
+ shared_terms.pop(atom, None)
943
+ specific_terms.pop(atom, None)
944
+
945
+ # Add the combined term to appropriate dictionary
946
+ combined_clusters_list = list(combined_clusters)
947
+ if len(combined_clusters_list) > 1:
948
+ shared_terms[combined_atom] = combined_clusters_list
949
+ else:
950
+ specific_terms[combined_atom] = combined_clusters_list[0]
951
+
952
+ # Create label sources for enriched terms
953
+ label_sources = {}
954
+ line_sources = {}
955
+ line_cluster_mapping = {} # Track which cluster each line belongs to
956
+
957
+ # Handle shared terms (place at center of all clusters that share it, but in empty areas)
958
+ for atom, clusters in shared_terms.items():
959
+ if len(clusters) > 1:
960
+ # Calculate center of all clusters sharing this term
961
+ cluster_coords_list = [cluster_centers[cid] for cid in clusters if cid in cluster_centers]
962
+ if cluster_coords_list:
963
+ center_x = np.mean([coord[0] for coord in cluster_coords_list])
964
+ center_y = np.mean([coord[1] for coord in cluster_coords_list])
965
+
966
+ # Calculate data bounds using simple approach
967
+ all_x = [pt[0] for pt in umap_coords]
968
+ all_y = [pt[1] for pt in umap_coords]
969
+ x_min, x_max = min(all_x), max(all_x)
970
+ y_min, y_max = min(all_y), max(all_y)
971
+ data_range_x = x_max - x_min
972
+ data_range_y = y_max - y_min
973
+
974
+ # Find empty area around the center
975
+ best_distance = 0
976
+ best_position = None
977
+
978
+ for distance_factor in [1.0, 1.5, 2.0]:
979
+ offset_distance = distance_factor * max(data_range_x, data_range_y) * 0.1
980
+
981
+ for angle in np.linspace(0, 2*np.pi, 8):
982
+ label_x = center_x + offset_distance * np.cos(angle)
983
+ label_y = center_y + offset_distance * np.sin(angle)
984
+
985
+ # Calculate minimum distance to any data point
986
+ distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
987
+ min_distance = min(distances)
988
+
989
+ if min_distance > best_distance:
990
+ best_distance = min_distance
991
+ best_position = (label_x, label_y)
992
+
993
+ # Use best position or fallback to center
994
+ if best_position is not None:
995
+ label_x, label_y = best_position
996
+ else:
997
+ label_x, label_y = center_x, center_y
998
+
999
+ # Check if label would be outside plot bounds and adjust
1000
+ label_margin = max(data_range_x, data_range_y) * 0.05
1001
+ if label_x < x_min - label_margin:
1002
+ label_x = x_min - label_margin
1003
+ elif label_x > x_max + label_margin:
1004
+ label_x = x_max + label_margin
1005
+
1006
+ if label_y < y_min - label_margin:
1007
+ label_y = y_min - label_margin
1008
+ elif label_y > y_max + label_margin:
1009
+ label_y = y_max + label_margin
1010
+
1011
+ # Keep the original atom name with prefixes for display
1012
+ display_atom = atom # Keep prefixes like name:, group:, batch:, type:
1013
+
1014
+ # Create label source with center alignment for shared terms
1015
+ label_source = ColumnDataSource({
1016
+ 'x': [label_x],
1017
+ 'y': [label_y],
1018
+ 'text': [display_atom],
1019
+ 'atom': [atom],
1020
+ 'text_align': ['center']
1021
+ })
1022
+ label_sources[atom] = label_source
1023
+
1024
+ # Create lines to each cluster center
1025
+ line_x = []
1026
+ line_y = []
1027
+ for cluster_id in clusters:
1028
+ if cluster_id in cluster_centers:
1029
+ cx, cy = cluster_centers[cluster_id]
1030
+ line_x.extend([label_x, cx, np.nan]) # nan to break line
1031
+ line_y.extend([label_y, cy, np.nan])
1032
+
1033
+ line_source = ColumnDataSource({
1034
+ 'x': line_x,
1035
+ 'y': line_y
1036
+ })
1037
+ line_sources[atom] = line_source
1038
+ line_cluster_mapping[atom] = 'shared'
1039
+
1040
+ # Handle cluster-specific terms (arrange multiple terms per cluster to avoid overlap)
1041
+ # Group specific terms by cluster to handle multiple terms per cluster
1042
+ cluster_specific_terms = defaultdict(list)
1043
+ for atom, cluster_id in specific_terms.items():
1044
+ cluster_specific_terms[cluster_id].append(atom)
1045
+
1046
+ # Calculate data bounds once
1047
+ all_x = [pt[0] for pt in umap_coords]
1048
+ all_y = [pt[1] for pt in umap_coords]
1049
+ x_min, x_max = min(all_x), max(all_x)
1050
+ y_min, y_max = min(all_y), max(all_y)
1051
+ data_range_x = x_max - x_min
1052
+ data_range_y = y_max - y_min
1053
+
1054
+ # Expand plot ranges to accommodate labels (add 15% margin on all sides)
1055
+ margin = 0.15
1056
+ x_margin = data_range_x * margin
1057
+ y_margin = data_range_y * margin
1058
+ plot_x_min = x_min - x_margin
1059
+ plot_x_max = x_max + x_margin
1060
+ plot_y_min = y_min - y_margin
1061
+ plot_y_max = y_max + y_margin
1062
+
1063
+ # Set expanded plot ranges
1064
+ p1.x_range.start = plot_x_min
1065
+ p1.x_range.end = plot_x_max
1066
+ p1.y_range.start = plot_y_min
1067
+ p1.y_range.end = plot_y_max
1068
+
1069
+ # Process each cluster that has specific terms
1070
+ for cluster_id, cluster_atoms in cluster_specific_terms.items():
1071
+ if cluster_id not in cluster_centers:
1072
+ continue
1073
+
1074
+ cx, cy = cluster_centers[cluster_id]
1075
+ n_terms = len(cluster_atoms)
1076
+
1077
+ if n_terms == 1:
1078
+ # Single term - use smart positioning with shorter distances
1079
+ atom = cluster_atoms[0]
1080
+
1081
+ # Try multiple candidate positions with shorter distances and more angles
1082
+ best_distance = 0
1083
+ best_position = None
1084
+
1085
+ # Use shorter base distance and test many angles
1086
+ base_distance = max(data_range_x, data_range_y) * 0.08 # Much shorter base distance
1087
+
1088
+ # Test positions at different angles and short distances
1089
+ for distance_factor in [0.8, 1.0, 1.3]: # Shorter distance factors
1090
+ offset_distance = base_distance * distance_factor
1091
+
1092
+ for angle in np.linspace(0, 2*np.pi, 24): # More angles (24 directions)
1093
+ label_x = cx + offset_distance * np.cos(angle)
1094
+ label_y = cy + offset_distance * np.sin(angle)
1095
+
1096
+ # Calculate minimum distance to any data point
1097
+ distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
1098
+ min_distance = min(distances)
1099
+
1100
+ # Check distance to other labels to avoid overlap
1101
+ min_label_distance = float('inf')
1102
+ for other_atom, other_source in label_sources.items():
1103
+ if other_atom != atom:
1104
+ other_data = other_source.data
1105
+ if other_data['x'] and other_data['y']:
1106
+ other_x, other_y = other_data['x'][0], other_data['y'][0]
1107
+ label_distance = np.sqrt((label_x - other_x)**2 + (label_y - other_y)**2)
1108
+ min_label_distance = min(min_label_distance, label_distance)
1109
+
1110
+ # Prefer positions that are reasonably far from data points and other labels
1111
+ combined_distance = min(min_distance, min_label_distance if min_label_distance != float('inf') else min_distance)
1112
+
1113
+ if combined_distance > best_distance:
1114
+ best_distance = combined_distance
1115
+ best_position = (label_x, label_y)
1116
+
1117
+ # Use best position found, or fallback to simple short offset
1118
+ if best_position is not None:
1119
+ label_x, label_y = best_position
1120
+ else:
1121
+ # Fallback: simple short radial offset
1122
+ offset_distance = base_distance
1123
+ angle = (cluster_id * 45) % 360 # Deterministic angle based on cluster
1124
+ angle_rad = np.radians(angle)
1125
+ label_x = cx + offset_distance * np.cos(angle_rad)
1126
+ label_y = cy + offset_distance * np.sin(angle_rad)
1127
+
1128
+ # Check if label would be outside plot bounds and adjust
1129
+ label_margin = max(data_range_x, data_range_y) * 0.05
1130
+
1131
+ # Instead of clamping to bounds, let labels go outside and plot bounds will be expanded later
1132
+ # Only apply minimal adjustments to prevent labels from being extremely far out
1133
+ extreme_margin = max(data_range_x, data_range_y) * 0.25 # Allow 25% outside data range
1134
+
1135
+ if label_x < x_min - extreme_margin:
1136
+ label_x = x_min - extreme_margin
1137
+ elif label_x > x_max + extreme_margin:
1138
+ label_x = x_max + extreme_margin
1139
+
1140
+ if label_y < y_min - extreme_margin:
1141
+ label_y = y_min - extreme_margin
1142
+ elif label_y > y_max + extreme_margin:
1143
+ label_y = y_max + extreme_margin
1144
+
1145
+ # Determine text alignment based on position relative to cluster
1146
+ text_align = 'right' if label_x > cx else 'left'
1147
+
1148
+ # Clean up atom name for display but keep prefixes
1149
+ display_atom = atom # Keep prefixes like name:, group:, batch:, type:
1150
+
1151
+ # Create label source with alignment
1152
+ label_source = ColumnDataSource({
1153
+ 'x': [label_x],
1154
+ 'y': [label_y],
1155
+ 'text': [display_atom],
1156
+ 'atom': [atom],
1157
+ 'text_align': [text_align]
1158
+ })
1159
+ label_sources[atom] = label_source
1160
+
1161
+ # Create spike line from cluster center to label
1162
+ line_source = ColumnDataSource({
1163
+ 'x': [cx, label_x],
1164
+ 'y': [cy, label_y]
1165
+ })
1166
+ line_sources[atom] = line_source
1167
+ line_cluster_mapping[atom] = cluster_id
1168
+
1169
+ else:
1170
+ # Multiple terms - stack them vertically with one line to cluster center
1171
+ # Determine if this cluster has shared vs non-shared terms to adjust positioning
1172
+ has_shared = any(atom in shared_terms for atom in cluster_atoms)
1173
+ has_specific = any(atom in specific_terms for atom in cluster_atoms)
1174
+
1175
+ # Adjust base distance: put non-shared (cluster-specific) labels further out
1176
+ if has_specific and not has_shared:
1177
+ # Pure cluster-specific terms - place further from center to reduce overlap
1178
+ base_distance = max(data_range_x, data_range_y) * 0.15 # Further out
1179
+ elif has_shared and not has_specific:
1180
+ # Pure shared terms - place closer to center
1181
+ base_distance = max(data_range_x, data_range_y) * 0.08 # Closer
1182
+ else:
1183
+ # Mixed terms - use intermediate distance
1184
+ base_distance = max(data_range_x, data_range_y) * 0.1 # Standard distance
1185
+
1186
+ # Calculate a good angle for the stack based on cluster position and available space
1187
+ # For non-shared terms, prefer angles that point away from plot center
1188
+ best_angle = None
1189
+ best_distance = 0
1190
+
1191
+ # Get plot center for reference
1192
+ plot_center_x = (x_min + x_max) / 2
1193
+ plot_center_y = (y_min + y_max) / 2
1194
+
1195
+ # Calculate angle from plot center to cluster center
1196
+ center_to_cluster_angle = np.arctan2(cy - plot_center_y, cx - plot_center_x)
1197
+
1198
+ if has_specific and not has_shared:
1199
+ # For non-shared terms, prefer angles that point away from plot center
1200
+ # Create angles around the center-to-cluster direction
1201
+ base_angle = center_to_cluster_angle
1202
+ preferred_angles = [
1203
+ base_angle, # Directly away from center
1204
+ base_angle + np.pi/4, # 45° offset
1205
+ base_angle - np.pi/4, # -45° offset
1206
+ base_angle + np.pi/6, # 30° offset
1207
+ base_angle - np.pi/6, # -30° offset
1208
+ base_angle + np.pi/3, # 60° offset
1209
+ base_angle - np.pi/3, # -60° offset
1210
+ base_angle + np.pi/2, # 90° offset
1211
+ base_angle - np.pi/2 # -90° offset
1212
+ ]
1213
+ else:
1214
+ # For shared terms or mixed, use the original preferred angles
1215
+ preferred_angles = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4, # 45°, 135°, 225°, 315°
1216
+ np.pi/6, np.pi/3, 2*np.pi/3, 5*np.pi/6, # 30°, 60°, 120°, 150°
1217
+ 7*np.pi/6, 4*np.pi/3, 5*np.pi/3, 11*np.pi/6] # 210°, 240°, 300°, 330°
1218
+
1219
+ for test_angle in preferred_angles:
1220
+ test_x = cx + base_distance * np.cos(test_angle)
1221
+ test_y = cy + base_distance * np.sin(test_angle)
1222
+
1223
+ # Calculate minimum distance to any data point
1224
+ distances = [np.sqrt((pt[0] - test_x)**2 + (pt[1] - test_y)**2) for pt in umap_coords]
1225
+ min_distance = min(distances)
1226
+
1227
+ if min_distance > best_distance:
1228
+ best_distance = min_distance
1229
+ best_angle = test_angle
1230
+
1231
+ # Use the best angle found, or fallback to 45°
1232
+ if best_angle is not None:
1233
+ stack_angle = best_angle
1234
+ else:
1235
+ # Fallback: use 45° based on cluster
1236
+ angle_options = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4]
1237
+ stack_angle = angle_options[cluster_id % len(angle_options)]
1238
+
1239
+ # Position for the end of the line (before labels start)
1240
+ line_end_x = cx + base_distance * np.cos(stack_angle)
1241
+ line_end_y = cy + base_distance * np.sin(stack_angle)
1242
+
1243
+ # Simplified approach: center labels at line end, then add 20pt offset in same direction
1244
+ # Calculate 20pt offset in the same direction as the line
1245
+ label_offset_distance = 20 # 20 points in the same direction
1246
+
1247
+ # Convert 20 points to data coordinates (approximate)
1248
+ # Assuming typical plot size, 20pt ≈ 1-2% of data range
1249
+ data_range = max(data_range_x, data_range_y)
1250
+ offset_in_data_coords = data_range * 0.02 # 2% of data range for 20pt
1251
+
1252
+ # Add offset in direction based on line orientation for better text placement
1253
+ # For westward lines: place label LEFT of endpoint with RIGHT alignment
1254
+ # For eastward lines: place label RIGHT of endpoint with LEFT alignment
1255
+
1256
+ angle_degrees = (stack_angle * 180 / np.pi) % 360
1257
+ if 90 < angle_degrees < 270:
1258
+ # Line goes LEFT (westward) - place label to the LEFT of line end
1259
+ label_center_x = line_end_x - offset_in_data_coords # SUBTRACT to go left
1260
+ label_center_y = line_end_y # Keep same Y position
1261
+ text_align = 'right' # Right-align so text ends near line endpoint
1262
+ else:
1263
+ # Line goes RIGHT (eastward) - place label to the RIGHT of line end
1264
+ label_center_x = line_end_x + offset_in_data_coords # ADD to go right
1265
+ label_center_y = line_end_y # Keep same Y position
1266
+ text_align = 'left' # Left-align so text starts near line endpoint
1267
+
1268
+ # Calculate consistent vertical spacing for stacked labels
1269
+ # BETTER APPROACH: Use single LabelSet with newline characters
1270
+
1271
+ # Create a single multi-line text string with all terms
1272
+ display_atoms = [atom for atom in cluster_atoms] # Keep original atom names with prefixes
1273
+ combined_text = '\n'.join(display_atoms)
1274
+
1275
+ # Check if label would be outside plot bounds and adjust
1276
+ label_margin = max(data_range_x, data_range_y) * 0.05
1277
+ label_x = label_center_x
1278
+ label_y = label_center_y
1279
+
1280
+ if label_x < x_min - label_margin:
1281
+ label_x = x_min - label_margin
1282
+ text_align = 'left'
1283
+ elif label_x > x_max + label_margin:
1284
+ label_x = x_max + label_margin
1285
+ text_align = 'right'
1286
+
1287
+ if label_y < y_min - label_margin:
1288
+ label_y = y_min - label_margin
1289
+ elif label_y > y_max + label_margin:
1290
+ label_y = y_max + label_margin
1291
+
1292
+ # Create single label source with multi-line text and alignment
1293
+ label_source = ColumnDataSource({
1294
+ 'x': [label_x],
1295
+ 'y': [label_y],
1296
+ 'text': [combined_text],
1297
+ 'atoms': [cluster_atoms], # Store all atoms for reference
1298
+ 'text_align': [text_align]
1299
+ })
1300
+
1301
+ # Store this single label source using a unique key for the cluster stack
1302
+ stack_label_key = f"cluster_{cluster_id}_labels"
1303
+ label_sources[stack_label_key] = label_source
1304
+
1305
+ # Create single line from cluster center to line end (before labels)
1306
+ stack_line_source = ColumnDataSource({
1307
+ 'x': [cx, line_end_x],
1308
+ 'y': [cy, line_end_y]
1309
+ })
1310
+ # Use a unique key for the stack line
1311
+ stack_key = f"cluster_{cluster_id}_stack"
1312
+ line_sources[stack_key] = stack_line_source
1313
+ line_cluster_mapping[stack_key] = cluster_id
1314
+
1315
+ # Add lines (spikes) to plot with matching cluster colors
1316
+ line_renderers = {}
1317
+ for line_key, line_source in line_sources.items():
1318
+ # Get the cluster color for this line
1319
+ if line_key in shared_terms:
1320
+ # For shared terms, use the same style as cluster-specific terms
1321
+ # Use a neutral color or the color of the first cluster it appears in
1322
+ first_cluster_id = list(shared_terms[line_key])[0]
1323
+ if first_cluster_id == -1:
1324
+ line_color = 'gray'
1325
+ else:
1326
+ cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1327
+ line_color = colors[cluster_idx % len(colors)]
1328
+ line_dash = 'dashed' # Use dashed for all edges
1329
+ elif line_key in specific_terms:
1330
+ # For cluster-specific terms, use the cluster's color
1331
+ cluster_id = specific_terms[line_key]
1332
+ if cluster_id == -1:
1333
+ line_color = 'gray'
1334
+ else:
1335
+ cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1336
+ line_color = colors[cluster_idx % len(colors)]
1337
+ line_dash = 'dashed' # Use dashed for all edges
1338
+ elif line_key in line_cluster_mapping:
1339
+ # For stack lines, use the cluster's color
1340
+ cluster_info = line_cluster_mapping[line_key]
1341
+ if cluster_info == 'shared':
1342
+ # For shared stacks, use a neutral color or first cluster color
1343
+ line_color = 'black'
1344
+ line_dash = 'dashed' # Use dashed for all edges
1345
+ else:
1346
+ cluster_id = cluster_info
1347
+ if cluster_id == -1:
1348
+ line_color = 'gray'
1349
+ else:
1350
+ cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1351
+ line_color = colors[cluster_idx % len(colors)]
1352
+ line_dash = 'dashed' # Use dashed for all edges
1353
+ else:
1354
+ # Fallback
1355
+ line_color = 'gray'
1356
+ line_dash = 'dashed' # Use dashed for all edges
1357
+
1358
+ line_renderer = p1.line('x', 'y', source=line_source,
1359
+ line_color=line_color, line_width=2,
1360
+ alpha=0.8, line_dash=line_dash)
1361
+ line_renderers[line_key] = line_renderer
1362
+
1363
+ # Add labels to plot (simple and direct approach)
1364
+ label_renderers = {} # Store label renderers for legend control
1365
+ for label_key, label_source in label_sources.items():
1366
+ # Determine color and style based on label key type
1367
+ if label_key.startswith('cluster_') and label_key.endswith('_labels'):
1368
+ # This is a cluster stack with multiple terms
1369
+ cluster_id = int(label_key.split('_')[1])
1370
+ if cluster_id == -1:
1371
+ text_color = 'gray'
1372
+ else:
1373
+ cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1374
+ text_color = colors[cluster_idx % len(colors)]
1375
+ text_font_style = 'bold'
1376
+ elif label_key in shared_terms:
1377
+ # Shared term - use same color as edge (first cluster's color)
1378
+ first_cluster_id = list(shared_terms[label_key])[0]
1379
+ if first_cluster_id == -1:
1380
+ text_color = 'gray'
1381
+ else:
1382
+ cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1383
+ text_color = colors[cluster_idx % len(colors)]
1384
+ text_font_style = 'bold'
1385
+ elif label_key in specific_terms:
1386
+ # Individual cluster-specific term
1387
+ cluster_id = specific_terms[label_key]
1388
+ if cluster_id == -1:
1389
+ text_color = 'gray'
1390
+ else:
1391
+ cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1392
+ text_color = colors[cluster_idx % len(colors)]
1393
+ text_font_style = 'bold'
1394
+ else:
1395
+ # Fallback
1396
+ text_color = 'black'
1397
+ text_font_style = 'bold'
1398
+
1399
+ # Get text alignment from label source, default to center
1400
+ label_data = label_source.data
1401
+ text_align = label_data['text_align'][0] if label_data.get('text_align') else 'center'
1402
+
1403
+ label_set = LabelSet(
1404
+ x='x', y='y', text='text',
1405
+ source=label_source,
1406
+ text_font_size='11pt',
1407
+ text_color=text_color,
1408
+ text_font_style=text_font_style,
1409
+ text_align=text_align,
1410
+ text_baseline='middle'
1411
+ )
1412
+ p1.add_layout(label_set)
1413
+ label_renderers[label_key] = label_set # Store for legend control
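+ # label_renderers maps each label key to its LabelSet. LabelSets are annotations
+ # rather than glyph renderers, so they cannot be listed in LegendItem.renderers;
+ # the mapping is kept in case label visibility needs to be synced separately
+ # (see the note in the legend-building code below).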
1414
+
1415
+ # Check if any labels are close to plot boundaries and expand if needed
1416
+ if label_sources:
1417
+ # Collect all label positions
1418
+ all_label_positions = []
1419
+ for source in label_sources.values():
1420
+ data = source.data
1421
+ if 'x' in data and 'y' in data and data['x'] and data['y']:
1422
+ all_label_positions.extend(zip(data['x'], data['y']))
1423
+
1424
+ if all_label_positions:
1425
+ # Check if any labels are close to current plot boundaries
1426
+ current_x_min, current_x_max = p1.x_range.start, p1.x_range.end
1427
+ current_y_min, current_y_max = p1.y_range.start, p1.y_range.end
1428
+
1429
+ # Define "close to boundary" as within 5% of the plot range
1430
+ x_range = current_x_max - current_x_min
1431
+ y_range = current_y_max - current_y_min
1432
+ boundary_threshold_x = x_range * 0.05
1433
+ boundary_threshold_y = y_range * 0.05
1434
+
1435
+ needs_expansion = False
1436
+ for label_x, label_y in all_label_positions:
1437
+ if (label_x < current_x_min + boundary_threshold_x or
1438
+ label_x > current_x_max - boundary_threshold_x or
1439
+ label_y < current_y_min + boundary_threshold_y or
1440
+ label_y > current_y_max - boundary_threshold_y):
1441
+ needs_expansion = True
1442
+ break
1443
+
1444
+ # If labels are close to boundaries, expand plot by 5% (reduced from 10%)
1445
+ if needs_expansion:
1446
+ expansion_factor = 0.05 # 5% expansion (half of previous 10%)
1447
+ x_expansion = x_range * expansion_factor
1448
+ y_expansion = y_range * expansion_factor
1449
+
1450
+ p1.x_range.start = current_x_min - x_expansion
1451
+ p1.x_range.end = current_x_max + x_expansion
1452
+ p1.y_range.start = current_y_min - y_expansion
1453
+ p1.y_range.end = current_y_max + y_expansion
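+ # Worked example of the thresholds above: if the x range spans -10 to 10
+ # (range 20), labels within 1.0 of either edge (5%) trigger an expansion of
+ # 1.0 on both sides, giving a new x range of -11 to 11.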
1454
+
1455
+
1456
+ # Add hover tool with enrichment information
1457
+ hover = HoverTool(tooltips=[
1458
+ ("Cluster", "@cluster"),
1459
+ ("Sample", "@sample_name"),
1460
+ ("Sample UID", "@sample_uid"),
1461
+ ("Enrichments", "@enrichments")
1462
+ ])
1463
+ p1.add_tools(hover)
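+ # The tooltip fields (@cluster, @sample_name, @sample_uid, @enrichments) are
+ # assumed to be columns of the scatter ColumnDataSource assembled earlier in
+ # this method.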
1464
+
1465
+ # Remove cluster legend labels from scatter plots (already done above)
1466
+ # But keep any existing legend structure for now
1467
+
1468
+ # Create custom legend for enrichment terms (line/label pairs) ONLY
1469
+ if line_renderers and (shared_terms or specific_terms):
1470
+ legend_items = []
1471
+ renderer_to_terms = {} # Group terms by their renderer
1472
+
1473
+ # Get all enriched terms and group them by their line renderer
1474
+ all_enriched_atoms = set(shared_terms.keys()) | set(specific_terms.keys())
1475
+
1476
+ # First pass: map each term to its renderer
1477
+ for atom in all_enriched_atoms:
1478
+ renderer = None
1479
+ renderer_key = None
1480
+
1481
+ if atom in shared_terms:
1482
+ # Shared term
1483
+ if atom in line_renderers:
1484
+ renderer = line_renderers[atom]
1485
+ renderer_key = atom
1486
+ else:
1487
+ # Look for any stack renderer from clusters that have this shared term
1488
+ for cluster_id in shared_terms[atom]:
1489
+ stack_key = f"cluster_{cluster_id}_stack"
1490
+ if stack_key in line_renderers:
1491
+ renderer = line_renderers[stack_key]
1492
+ renderer_key = stack_key
1493
+ break
1494
+
1495
+ elif atom in specific_terms:
1496
+ # Cluster-specific term
1497
+ cluster_id = specific_terms[atom]
1498
+ if atom in line_renderers:
1499
+ renderer = line_renderers[atom]
1500
+ renderer_key = atom
1501
+ else:
1502
+ stack_key = f"cluster_{cluster_id}_stack"
1503
+ if stack_key in line_renderers:
1504
+ renderer = line_renderers[stack_key]
1505
+ renderer_key = stack_key
1506
+
1507
+ # Group terms by renderer
1508
+ if renderer and renderer_key:
1509
+ if renderer_key not in renderer_to_terms:
1510
+ renderer_to_terms[renderer_key] = {
1511
+ 'renderer': renderer,
1512
+ 'shared_terms': [],
1513
+ 'specific_terms': [],
1514
+ 'cluster_id': None
1515
+ }
1516
+
1517
+ if atom in shared_terms:
1518
+ renderer_to_terms[renderer_key]['shared_terms'].append(atom)
1519
+ else:
1520
+ renderer_to_terms[renderer_key]['specific_terms'].append(atom)
1521
+ renderer_to_terms[renderer_key]['cluster_id'] = specific_terms[atom]
1522
+
1523
+ # Second pass: create legend entries, one per renderer
1524
+ for renderer_key, term_info in renderer_to_terms.items():
1525
+ shared_list = term_info['shared_terms']
1526
+ specific_list = term_info['specific_terms']
1527
+ line_renderer = term_info['renderer']
1528
+
1529
+ # For now, legend can only control the line renderer
1530
+ # Label visibility will be handled via JavaScript callback if needed
1531
+ # (Note: LabelSet cannot be directly controlled by Bokeh legends)
1532
+
1533
+ # Create combined label text
1534
+ if shared_list:
1535
+ # Shared terms - remove "Shared:" prefix and just show the terms
1536
+ clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
1537
+ for atom in shared_list]
1538
+ if len(clean_terms) == 1:
1539
+ label_text = clean_terms[0]
1540
+ else:
1541
+ label_text = ', '.join(clean_terms)
1542
+
1543
+ elif specific_list:
1544
+ # Cluster-specific terms
1545
+ cluster_id = term_info['cluster_id']
1546
+ clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
1547
+ for atom in specific_list]
1548
+ if len(clean_terms) == 1:
1549
+ label_text = f"C{cluster_id}: {clean_terms[0]}"
1550
+ else:
1551
+ label_text = f"C{cluster_id}: {', '.join(clean_terms)}"
1552
+
1553
+ # Add single legend entry for the line renderer only
1554
+ # (Labels cannot be controlled by Bokeh legends directly)
1555
+ legend_items.append(
1556
+ LegendItem(label=label_text, renderers=[line_renderer])
1557
+ )
1558
+
1559
+ # Hide cluster legend after we've created our enrichment legend
1560
+ if hasattr(p1, 'legend') and p1.legend:
1561
+ if isinstance(p1.legend, list):
1562
+ for legend in p1.legend:
1563
+ legend.visible = False
1564
+ else:
1565
+ p1.legend.visible = False
1566
+
1567
+ # Create and add the custom enrichment legend
1568
+ if legend_items:
1569
+ enrichment_legend = Legend(
1570
+ items=legend_items,
1571
+ location="center_right",
1572
+ click_policy="hide"
1573
+ )
1574
+ p1.add_layout(enrichment_legend, 'right')
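+ # Sketch (not part of the original code): if label visibility should follow the
+ # legend toggles, a CustomJS callback could mirror each edge renderer's state
+ # onto its matching LabelSet, e.g. for a (line_renderer, label_set) pair:
+ # from bokeh.models import CustomJS
+ # line_renderer.js_on_change('visible', CustomJS(
+ #     args=dict(labels=label_set), code="labels.visible = cb_obj.visible"))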
1575
+
1576
+ plots['cluster_plot'] = p1
1577
+
1578
+ # Save cluster plot if filename provided
1579
+ if filename:
1580
+ # Handle filename extension properly
1581
+ if filename.endswith('.html'):
1582
+ base_filename = filename[:-5] # Remove .html extension
1583
+ cluster_filename = f"{base_filename}_clusters.html"
1584
+ else:
1585
+ cluster_filename = f"{filename}_clusters.html"
1586
+
1587
+ if not (filename.startswith('/') or filename[1:3] == ':\\'): # relative path: prefix with self.folder
1588
+ cluster_filename = f"{self.folder}/{cluster_filename}"
1589
+ _isolated_save_plot(p1, cluster_filename, cluster_filename, self.logger, "UMAP Cluster Plot")
1590
+ else:
1591
+ _isolated_show_notebook(p1)
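+ # Example of the path handling above (hypothetical values): with
+ # filename="umap.html" and self.folder="results", the cluster plot is written
+ # to "results/umap_clusters.html"; absolute POSIX ('/...') or Windows drive
+ # ('C:\...') paths are used as given.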
1592
+
1593
+ results['plots'] = plots
1594
+
1595
+ # Print summary
1596
+ self.logger.debug("\n=== UMAP Cluster Analysis Summary ===")
1597
+ self.logger.debug(f"Best clustering: {best_clustering['method']}")
1598
+ self.logger.debug(f"Number of clusters: {best_clustering['n_clusters']}")
1599
+ self.logger.debug(f"Silhouette score: {best_clustering['score']:.3f}")
1600
+ if best_clustering['n_noise'] > 0:
1601
+ self.logger.debug(f"Noise points: {best_clustering['n_noise']}")
1602
+
1603
+ self.logger.info(f"\nFound {len(all_associations)} total significant associations:")
1604
+
1605
+ # Show regular column associations
1606
+ regular_assocs = [a for a in all_associations if 'column' in a]
1607
+ if regular_assocs:
1608
+ self.logger.info(f" {len(regular_assocs)} column-level associations:")
1609
+ for assoc in regular_assocs[:3]: # Show top 3
1610
+ self.logger.info(f" {assoc['column']} ({assoc['variable_type']}): {assoc['test']} p={assoc['p_value']:.4f}, "
1611
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']})")
1612
+
1613
+ # Show text atom associations
1614
+ text_assocs = [a for a in all_associations if 'atom' in a]
1615
+ if text_assocs:
1616
+ self.logger.info(f" {len(text_assocs)} text pattern associations:")
1617
+ for assoc in text_assocs[:3]: # Show top 3
1618
+ freq = assoc.get('atom_frequency', 0)
1619
+ percentage = (freq / len(analysis_df_clean)) * 100 if len(analysis_df_clean) > 0 else 0
1620
+
1621
+ self.logger.info(f" '{assoc['atom']}' ({assoc['type']}): p={assoc['p_value']:.4f}, "
1622
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']}) "
1623
+ f"[{freq} samples, {percentage:.1f}%]")
1624
+
1625
+ shown = len(regular_assocs[:3]) + len(text_assocs[:3])
+ if shown and len(all_associations) > shown:
1626
+ self.logger.info(f" ... and {len(all_associations) - shown} more associations")
1627
+
1628
+ return results
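+ # Illustrative call, assuming this function is bound as a method of the study
+ # object (the variable name `study` is hypothetical):
+ #   results = study.analyze_umap(min_cluster_size=5, filename="umap.html")
+ #   cluster_plot = results['plots']['cluster_plot']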
1629
+
1630
+ def _analyze_umap_simplified(
1631
+ self,
1632
+ n_neighbors=15,
1633
+ min_dist=0.1,
1634
+ metric="euclidean",
1635
+ random_state=42,
1636
+ cluster_methods=["hdbscan", "kmeans"],
1637
+ n_clusters_range=(2, 8),
1638
+ min_cluster_size=3,
1639
+ significance_threshold=0.05,
1640
+ plot_results=True,
1641
+ filename=None,
1642
+ ):
1643
+ """Simplified fallback version of UMAP analysis."""
1644
+
1645
+ self.logger.info("Starting simplified UMAP analysis...")
1646
+
1647
+ # Check dependencies
1648
+ if not UMAP_AVAILABLE or not HDBSCAN_AVAILABLE or not SKLEARN_AVAILABLE:
1649
+ self.logger.error("Required dependencies (umap-learn, hdbscan, scikit-learn) not available")
1650
+ return {
1651
+ 'umap_coords': None,
1652
+ 'best_clustering': None,
1653
+ 'all_clustering_results': {},
1654
+ 'significant_associations': [],
1655
+ 'text_associations': [],
1656
+ 'cluster_summaries': {},
1657
+ 'analysis_dataframe': None
1658
+ }
1659
+
1660
+ try:
1661
+ # Get data
1662
+ consensus_matrix = self.get_consensus_matrix()
1663
+ samples_df = self.samples_df
1664
+
1665
+ if consensus_matrix is None or samples_df is None:
1666
+ self.logger.error("No data available")
1667
+ return {
1668
+ 'umap_coords': None,
1669
+ 'best_clustering': None,
1670
+ 'all_clustering_results': {},
1671
+ 'significant_associations': [],
1672
+ 'text_associations': [],
1673
+ 'cluster_summaries': {},
1674
+ 'analysis_dataframe': None
1675
+ }
1676
+
1677
+ # Basic UMAP
1678
+ sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
1679
+
1680
+ if hasattr(consensus_matrix, "select"):
1681
+ matrix_data = consensus_matrix.select(sample_cols).to_numpy()
1682
+ else:
1683
+ matrix_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore").values
1684
+
1685
+ matrix_data = matrix_data.T
1686
+ matrix_data = np.nan_to_num(matrix_data)
1687
+
1688
+ scaler = StandardScaler()
1689
+ matrix_scaled = scaler.fit_transform(matrix_data)
1690
+
1691
+ # Import dependencies locally
1692
+ import umap
1693
+ import hdbscan
1694
+
1695
+ reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state)
1696
+ umap_coords = reducer.fit_transform(matrix_scaled)
1697
+
1698
+ # Simple clustering
1699
+ clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
1700
+ cluster_labels = clusterer.fit_predict(umap_coords)
1701
+
1702
+ best_clustering = {
1703
+ 'labels': cluster_labels,
1704
+ 'n_clusters': len(np.unique(cluster_labels[cluster_labels != -1])),
1705
+ 'n_noise': np.sum(cluster_labels == -1),
1706
+ 'silhouette_score': 0.5, # placeholder; the fallback does not compute a real silhouette score
1707
+ 'method': 'hdbscan'
1708
+ }
1709
+
1710
+ self.logger.info(f"Simplified analysis found {best_clustering['n_clusters']} clusters")
1711
+
1712
+ return {
1713
+ 'umap_coords': umap_coords,
1714
+ 'best_clustering': best_clustering,
1715
+ 'all_clustering_results': {'hdbscan': best_clustering},
1716
+ 'significant_associations': [],
1717
+ 'text_associations': [],
1718
+ 'cluster_summaries': {},
1719
+ 'analysis_dataframe': None
1720
+ }
1721
+
1722
+ except Exception as e:
1723
+ self.logger.error(f"Error in simplified analysis: {e}")
1724
+ return {
1725
+ 'umap_coords': None,
1726
+ 'best_clustering': None,
1727
+ 'all_clustering_results': {},
1728
+ 'significant_associations': [],
1729
+ 'text_associations': [],
1730
+ 'cluster_summaries': {},
1731
+ 'analysis_dataframe': None
1732
+ }
1733
+
1734
+
1735
+ # ========================================
1736
+ # Helper Functions for Plotting
1737
+ # ========================================
1738
+
1739
+ def _isolated_save_plot(plot, filename, title, logger, plot_type):
1740
+ """Save plot to file in isolation"""
1741
+ try:
1742
+ from bokeh.io import output_file, save
1743
+ from bokeh.models import Title
1744
+
1745
+ # Add title to plot
1746
+ plot.add_layout(Title(text=title, text_font_size="16pt"), 'above')
1747
+
1748
+ # Configure output
1749
+ output_file(filename)
1750
+ save(plot)
1751
+ logger.info(f"Saved {plot_type} to: {filename}")
1752
+
1753
+ except Exception as e:
1754
+ logger.error(f"Error saving {plot_type}: {e}")
1755
+
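+ # Note: output_file() sets Bokeh's current output target and save() writes the
+ # plot to it; any failure is logged and swallowed so that saving a plot never
+ # aborts the calling analysis.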
1756
+ def _isolated_show_notebook(plot):
1757
+ """Show plot in notebook if available"""
1758
+ try:
1759
+ from bokeh.io import show
1760
+ show(plot)
1761
+ except Exception:
1762
+ pass # Silently fail if not in notebook
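+ # Usage sketch (illustrative, mirroring the calls made in the plotting code
+ # above): when a filename is supplied the cluster figure is exported via
+ #   _isolated_save_plot(p1, cluster_filename, cluster_filename, self.logger,
+ #                       "UMAP Cluster Plot")
+ # and otherwise displayed inline with _isolated_show_notebook(p1).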