masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/analysis.py CHANGED
@@ -27,12 +27,14 @@ SKLEARN_AVAILABLE = False
 
 try:
     import umap
+
     UMAP_AVAILABLE = True
 except ImportError:
     pass
 
 try:
     import hdbscan
+
     HDBSCAN_AVAILABLE = True
 except ImportError:
     pass
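The only changes in this hunk are Black-style blank lines around the optional-import guards. For readers unfamiliar with the pattern, a minimal standalone sketch (not taken from the package) of how these availability flags are meant to be used looks like this:

# Illustrative sketch only: the optional-dependency pattern used by
# masster/study/analysis.py, with a guarded fallback when umap-learn is missing.
UMAP_AVAILABLE = False

try:
    import umap  # optional dependency: umap-learn

    UMAP_AVAILABLE = True
except ImportError:
    pass


def embed(matrix):
    """Return a 2D embedding, or None when umap-learn is not installed."""
    if not UMAP_AVAILABLE:
        print("UMAP is required. Install with: pip install umap-learn")
        return None
    return umap.UMAP(n_components=2).fit_transform(matrix)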
@@ -41,23 +43,25 @@ try:
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans, DBSCAN
     from sklearn.metrics import silhouette_score
+
     SKLEARN_AVAILABLE = True
 except ImportError:
     pass
 
 # Compiled regex patterns for efficient text processing
-TOKEN_PATTERN = re.compile(r'[_\-\s\|\.]+')
-ALPHANUMERIC_PATTERN = re.compile(r'^[A-Za-z0-9]+$')
+TOKEN_PATTERN = re.compile(r"[_\-\s\|\.]+")
+ALPHANUMERIC_PATTERN = re.compile(r"^[A-Za-z0-9]+$")
 
 # Simple cache for tokenization
 _tokenization_cache = {}
 
+
 def tokenize_text_cached(text):
     """Cached text tokenization for repeated strings - preserves original case."""
     if text in _tokenization_cache:
         return _tokenization_cache[text]
-
-    if pd.isna(text) or text == '' or not isinstance(text, str):
+
+    if pd.isna(text) or text == "" or not isinstance(text, str):
         result = tuple()
     else:
         # Split by common delimiters to create atoms (same as original)
@@ -68,13 +72,13 @@ def tokenize_text_cached(text):
             atom = atom.strip()  # Remove .lower() to preserve case
             if atom and len(atom) > 1:  # Original was > 1, not >= 1
                 meaningful_tokens.append(atom)
-
+
         result = tuple(meaningful_tokens)
-
+
     # Prevent cache from growing too large
     if len(_tokenization_cache) < 10000:
         _tokenization_cache[text] = result
-
+
     return result
 
 
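The tokenizer touched in the hunk above caches results per input string and caps the cache at 10,000 entries. A simplified, self-contained sketch of that behaviour (pandas NA handling omitted; the helper name `tokenize` is hypothetical):

import re

# Illustrative sketch: split on common delimiters, keep case, drop 1-character
# fragments, and memoize results in a bounded module-level cache.
TOKEN_PATTERN = re.compile(r"[_\-\s\|\.]+")
_cache = {}

def tokenize(text):
    if text in _cache:
        return _cache[text]
    if not isinstance(text, str) or text == "":
        result = tuple()
    else:
        result = tuple(t.strip() for t in TOKEN_PATTERN.split(text) if len(t.strip()) > 1)
    if len(_cache) < 10000:  # cap the cache so it cannot grow unbounded
        _cache[text] = result
    return result

print(tokenize("QC_pool-01.mzML"))  # -> ('QC', 'pool', '01', 'mzML')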
@@ -98,14 +102,14 @@ def analyze_umap(
 ):
     """
     Perform UMAP dimensionality reduction followed by clustering analysis with enriched term labeling.
-
+
     This method performs comprehensive cluster analysis on the study's consensus matrix, including:
     - UMAP dimensionality reduction for visualization
     - Automated clustering with multiple algorithms (HDBSCAN, K-means, DBSCAN)
     - Metadata association discovery using statistical tests
     - Text pattern analysis to identify enriched sample characteristics
     - Enhanced visualization with intelligent label positioning for enriched terms
-
+
     The enhanced visualization features cluster-aware enriched term labels with connecting spikes:
     - Terms shared across multiple clusters are positioned at the geometric center with lines to each cluster
     - Terms specific to single clusters are positioned nearby with short spikes
@@ -113,59 +117,59 @@ def analyze_umap(
     - Empty/blank terms are automatically filtered out
     - Label positioning adapts to line direction for optimal text alignment
     - Dashed edges and color-coordinated labels provide visual clarity
-
+
     Unlike plot_samples_umap() which colors by metadata columns, this function performs clustering
     and colors points by cluster assignments, with tooltips showing enrichment information.
-
+
     Parameters
     ----------
     n_neighbors : int, default=15
         Number of neighbors for UMAP embedding. Higher values preserve more global structure,
         lower values preserve more local structure.
-
+
     min_dist : float, default=0.1
         Minimum distance parameter for UMAP. Controls how tightly points are packed in the
         embedding. Values closer to 0 result in tighter clusters.
-
+
     metric : str, default="euclidean"
         Distance metric for UMAP. Options include 'euclidean', 'manhattan', 'cosine', etc.
-
+
     random_state : int, default=42
         Random seed for reproducibility of UMAP embedding and clustering.
-
+
     cluster_methods : list, default=["hdbscan", "kmeans", "dbscan"]
         Clustering algorithms to evaluate. Available options:
        - 'hdbscan': Hierarchical density-based clustering (requires hdbscan package)
        - 'kmeans': K-means clustering with multiple k values
        - 'dbscan': Density-based spatial clustering with multiple eps values
-
+
     n_clusters_range : tuple, default=(2, 8)
         Range of cluster numbers to test for K-means (min_clusters, max_clusters).
-
+
     min_cluster_size : int, default=3
         Minimum cluster size for HDBSCAN and DBSCAN algorithms.
-
+
     significance_threshold : float, default=0.05
         P-value threshold for statistical significance of metadata associations.
-
+
     plot_results : bool, default=True
         Whether to generate interactive Bokeh plots with enhanced labeling.
         When False, only returns analysis results without visualization.
-
+
     filename : str, optional
         If provided, saves the interactive plot to this HTML file.
-
+
     markersize : int, default=4
         Size of scatter plot markers representing samples.
-
+
     Returns
     -------
     dict
         Comprehensive results dictionary containing:
-
+
         - **umap_coords** : numpy.ndarray
          2D UMAP coordinates for all samples (n_samples x 2)
-
+
         - **best_clustering** : dict
          Best clustering result based on silhouette score, containing:
          - 'labels': cluster assignments for each sample
@@ -173,48 +177,48 @@ def analyze_umap(
          - 'n_clusters': number of identified clusters
          - 'n_noise': number of noise points (outliers)
          - 'method': clustering algorithm used
-
+
         - **all_clustering_results** : dict
          Results from all tested clustering configurations, keyed by method name
-
+
         - **significant_associations** : list
          All statistically significant associations (both numeric and text), sorted by
          cluster presence percentage. Each association includes:
          - Statistical test results (p-value, effect size)
          - Cluster-specific enrichment information
          - Interpretation of effect size magnitude
-
+
         - **text_associations** : list
          Subset of associations specifically for text pattern enrichment, ranked by
          presence percentage within clusters rather than statistical enrichment
-
+
         - **cluster_summaries** : dict
          Summary information for each cluster:
          - 'n_samples': number of samples in cluster
          - 'sample_names': list of sample names in cluster
-
+
         - **analysis_dataframe** : pandas.DataFrame
          Complete dataframe with UMAP coordinates, cluster assignments, and all
          sample metadata used for association analysis
-
+
     Raises
     ------
     ImportError
         If required dependencies (umap-learn, scikit-learn) are not installed
-
+
     ValueError
         If consensus matrix is empty or samples data is unavailable
-
+
     Examples
     --------
     Basic UMAP analysis with default parameters:
-
+
     >>> results = study.analyze_umap()
     >>> print(f"Found {results['best_clustering']['n_clusters']} clusters")
     >>> print(f"Silhouette score: {results['best_clustering']['score']:.3f}")
-
+
     Custom analysis with specific clustering and enhanced visualization:
-
+
     >>> results = study.analyze_umap(
     ...     n_neighbors=20,
     ...     min_dist=0.05,
@@ -222,41 +226,41 @@ def analyze_umap(
     ...     significance_threshold=0.01,
     ...     filename="cluster_analysis.html"
     ... )
-
+
     Fast analysis for large datasets:
-
+
     >>> results = study.analyze_umap(
     ...     cluster_methods=["hdbscan"]
     ... )
-
+
     Notes
     -----
     The enhanced visualization automatically identifies and labels enriched terms based on:
-
+
     1. **Presence-based ranking**: Terms are ranked by their prevalence within clusters
        rather than statistical enrichment, favoring terms common across cluster members
-
-    2. **Intelligent positioning**: 
+
+    2. **Intelligent positioning**:
       - Shared terms (multiple clusters) positioned at geometric center with connecting lines
       - Individual terms positioned adjacent to their cluster with short spikes
       - Westward lines position labels to the left with right-aligned text
       - Eastward lines position labels to the right with left-aligned text
-
+
     3. **Quality filtering**: Empty terms (variants of 'empty', 'blank', 'qc') are
        automatically excluded from enrichment analysis and visualization
-
+
     4. **Visual styling**: Dashed edges, color-coordinated labels and lines, and
        moderate boundary expansion (5%) create professional, readable plots
-
+
     The method automatically handles missing dependencies by falling back to simplified
     analysis when optional packages (hdbscan) are unavailable.
     """
-
+
     # Check dependencies
     if not UMAP_AVAILABLE:
         self.logger.error("UMAP is required. Install with: pip install umap-learn")
         return None
-
+
     if not SKLEARN_AVAILABLE:
         self.logger.error("scikit-learn is required. Install with: pip install scikit-learn")
         return None
@@ -277,12 +281,14 @@ def analyze_umap(
 
     # Prepare data for UMAP
     sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
-
+
     if hasattr(consensus_matrix, "select"):
         matrix_data = consensus_matrix.select(sample_cols).to_numpy()
     else:
         matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
-        matrix_data = matrix_sample_data.values if hasattr(matrix_sample_data, "values") else np.array(matrix_sample_data)
+        matrix_data = (
+            matrix_sample_data.values if hasattr(matrix_sample_data, "values") else np.array(matrix_sample_data)
+        )
 
     # Transpose so samples are rows
     matrix_data = matrix_data.T
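The reflowed block above duck-types the consensus table: an object exposing `select`/`to_numpy` is treated as Polars-style, otherwise the pandas path is taken. A rough sketch under that assumption (the `to_matrix` helper is illustrative, not part of masster):

import numpy as np
import pandas as pd

def to_matrix(consensus_matrix, sample_cols):
    # Polars-style API: select the sample columns and convert to a NumPy array.
    if hasattr(consensus_matrix, "select"):
        return consensus_matrix.select(sample_cols).to_numpy()
    # pandas (or array-like) fallback: drop the ID column if present.
    data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
    return data.values if hasattr(data, "values") else np.array(data)

df = pd.DataFrame({"consensus_uid": [1, 2], "s1": [0.1, 0.2], "s2": [0.3, 0.4]})
print(to_matrix(df, ["s1", "s2"]).T.shape)  # transpose so samples are rows -> (2, 2)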
@@ -290,16 +296,17 @@ def analyze_umap(
 
     # Standardize data
     from sklearn.preprocessing import StandardScaler
+
     scaler = StandardScaler()
     matrix_scaled = scaler.fit_transform(matrix_data)
 
     # Perform UMAP with optimizations
     self.logger.debug(f"Computing UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}")
     import umap
-
+
     # UMAP optimization: use limited threads to save memory
     n_jobs = 1
-
+
     reducer = umap.UMAP(
         n_components=2,
         n_neighbors=n_neighbors,
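This hunk only adds blank lines and a trailing comma, but the surrounding logic standardizes the sample-by-feature matrix and embeds it with UMAP. A minimal sketch of that flow, assuming umap-learn is installed and using made-up data:

import numpy as np
from sklearn.preprocessing import StandardScaler
import umap  # optional dependency (umap-learn)

rng = np.random.default_rng(0)
matrix_data = rng.normal(size=(40, 200))     # 40 samples x 200 consensus features

# Standardize features, then embed to 2D with the same kind of settings as analyze_umap().
matrix_scaled = StandardScaler().fit_transform(matrix_data)
reducer = umap.UMAP(
    n_components=2, n_neighbors=15, min_dist=0.1,
    random_state=42, n_jobs=1, low_memory=False,
)
umap_coords = reducer.fit_transform(matrix_scaled)
print(umap_coords.shape)                     # (40, 2)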
@@ -307,110 +314,118 @@ def analyze_umap(
307
314
  metric=metric,
308
315
  random_state=random_state,
309
316
  n_jobs=n_jobs,
310
- low_memory=False
317
+ low_memory=False,
311
318
  )
312
319
  umap_coords = reducer.fit_transform(matrix_scaled)
313
320
 
314
321
  # Convert samples_df to pandas for easier analysis
315
- samples_pd = samples_df.to_pandas() if hasattr(samples_df, 'to_pandas') else samples_df
316
-
322
+ samples_pd = samples_df.to_pandas() if hasattr(samples_df, "to_pandas") else samples_df
323
+
317
324
  # Get the actual sample columns present in consensus matrix
318
325
  sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
319
326
  consensus_sample_names = set(sample_cols)
320
-
327
+
321
328
  # Filter samples_df to only include samples present in consensus matrix
322
- if 'sample_name' in samples_pd.columns:
329
+ if "sample_name" in samples_pd.columns:
323
330
  # Create a mask for samples present in consensus matrix
324
- sample_mask = samples_pd['sample_name'].isin(consensus_sample_names)
325
-
331
+ sample_mask = samples_pd["sample_name"].isin(consensus_sample_names)
332
+
326
333
  if sample_mask.sum() != len(samples_pd):
327
- missing_samples = set(samples_pd['sample_name']) - consensus_sample_names
328
- self.logger.warning(f"Filtering out {len(missing_samples)} samples not in consensus matrix: {list(missing_samples)}")
334
+ missing_samples = set(samples_pd["sample_name"]) - consensus_sample_names
335
+ self.logger.warning(
336
+ f"Filtering out {len(missing_samples)} samples not in consensus matrix: {list(missing_samples)}"
337
+ )
329
338
  samples_pd = samples_pd[sample_mask].copy()
330
-
339
+
331
340
  # Reorder samples_pd to match the order in consensus matrix sample_cols
332
- samples_pd = samples_pd.set_index('sample_name').reindex(sample_cols).reset_index()
333
-
341
+ samples_pd = samples_pd.set_index("sample_name").reindex(sample_cols).reset_index()
342
+
334
343
  # Final check - ensure we have the same number of samples
335
344
  if len(samples_pd) != len(umap_coords):
336
- self.logger.error(f"After filtering, still have mismatch: samples_df has {len(samples_pd)} rows, UMAP has {len(umap_coords)} points")
345
+ self.logger.error(
346
+ f"After filtering, still have mismatch: samples_df has {len(samples_pd)} rows, UMAP has {len(umap_coords)} points"
347
+ )
337
348
  return None
338
-
349
+
339
350
  self.logger.info(f"Using {len(samples_pd)} samples for analysis")
340
351
 
341
352
  # Try different clustering methods
342
353
  clustering_results = {}
343
-
354
+
344
355
  for method in cluster_methods:
345
356
  self.logger.debug(f"Trying clustering method: {method}")
346
-
357
+
347
358
  if method == "hdbscan" and HDBSCAN_AVAILABLE:
348
359
  import hdbscan
349
- clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')
360
+
361
+ clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric="euclidean")
350
362
  cluster_labels = clusterer.fit_predict(umap_coords)
351
-
363
+
352
364
  # Calculate silhouette score (excluding noise points for HDBSCAN)
353
365
  valid_labels = cluster_labels[cluster_labels != -1]
354
366
  valid_coords = umap_coords[cluster_labels != -1]
355
-
367
+
356
368
  if len(np.unique(valid_labels)) > 1:
357
369
  from sklearn.metrics import silhouette_score
370
+
358
371
  score = silhouette_score(valid_coords, valid_labels)
359
372
  n_clusters = len(np.unique(valid_labels))
360
373
  n_noise = np.sum(cluster_labels == -1)
361
-
374
+
362
375
  clustering_results[f"{method}"] = {
363
- 'labels': cluster_labels,
364
- 'score': score,
365
- 'n_clusters': n_clusters,
366
- 'n_noise': n_noise,
367
- 'method': method
376
+ "labels": cluster_labels,
377
+ "score": score,
378
+ "n_clusters": n_clusters,
379
+ "n_noise": n_noise,
380
+ "method": method,
368
381
  }
369
-
382
+
370
383
  elif method == "kmeans":
371
384
  from sklearn.cluster import KMeans
372
385
  from sklearn.metrics import silhouette_score
373
-
386
+
374
387
  for n_clusters in range(n_clusters_range[0], n_clusters_range[1] + 1):
375
388
  kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
376
389
  cluster_labels = kmeans.fit_predict(umap_coords)
377
390
  score = silhouette_score(umap_coords, cluster_labels)
378
-
391
+
379
392
  clustering_results[f"{method}_k{n_clusters}"] = {
380
- 'labels': cluster_labels,
381
- 'score': score,
382
- 'n_clusters': n_clusters,
383
- 'n_noise': 0,
384
- 'method': f"{method} (k={n_clusters})"
393
+ "labels": cluster_labels,
394
+ "score": score,
395
+ "n_clusters": n_clusters,
396
+ "n_noise": 0,
397
+ "method": f"{method} (k={n_clusters})",
385
398
  }
386
-
399
+
387
400
  elif method == "dbscan":
388
401
  from sklearn.cluster import DBSCAN
402
+
389
403
  # Standard DBSCAN eps values for exploration
390
404
  eps_values = [0.3, 0.5, 0.7, 1.0, 1.5]
391
-
405
+
392
406
  for eps in eps_values:
393
407
  dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size, n_jobs=-1)
394
408
  cluster_labels = dbscan.fit_predict(umap_coords)
395
-
409
+
396
410
  n_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
397
411
  n_noise = np.sum(cluster_labels == -1)
398
-
412
+
399
413
  # Only consider valid clusterings
400
414
  if n_clusters > 1:
401
415
  from sklearn.metrics import silhouette_score
416
+
402
417
  valid_labels = cluster_labels[cluster_labels != -1]
403
418
  valid_coords = umap_coords[cluster_labels != -1]
404
-
419
+
405
420
  if len(valid_coords) > 0 and len(np.unique(valid_labels)) > 1:
406
421
  score = silhouette_score(valid_coords, valid_labels)
407
-
422
+
408
423
  clustering_results[f"{method}_eps{eps}"] = {
409
- 'labels': cluster_labels,
410
- 'score': score,
411
- 'n_clusters': n_clusters,
412
- 'n_noise': n_noise,
413
- 'method': f"{method} (eps={eps})"
424
+ "labels": cluster_labels,
425
+ "score": score,
426
+ "n_clusters": n_clusters,
427
+ "n_noise": n_noise,
428
+ "method": f"{method} (eps={eps})",
414
429
  }
415
430
 
416
431
  if not clustering_results:
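The clustering loop in the hunk above collects one result per configuration, and analyze_umap then keeps the configuration with the highest silhouette score. A small self-contained sketch of that selection step (K-means only, synthetic data; not the package's code):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(42)
coords = rng.normal(size=(60, 2))            # stand-in for the 2D UMAP embedding

results = {}
for k in range(2, 6):                        # mirrors n_clusters_range on a smaller range
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(coords)
    results[f"kmeans_k{k}"] = {"labels": labels, "score": silhouette_score(coords, labels)}

# Pick the configuration with the best silhouette score, as the code below does.
best_key = max(results, key=lambda key: results[key]["score"])
print(best_key, round(results[best_key]["score"], 3))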
@@ -418,52 +433,53 @@ def analyze_umap(
418
433
  return None
419
434
 
420
435
  # Select best clustering based on silhouette score
421
- best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k]['score'])
436
+ best_key = max(clustering_results.keys(), key=lambda k: clustering_results[k]["score"])
422
437
  best_clustering = clustering_results[best_key]
423
-
424
- self.logger.info(f"Best clustering: {best_clustering['method']} with {best_clustering['n_clusters']} clusters, "
425
- f"silhouette score: {best_clustering['score']:.3f}")
438
+
439
+ self.logger.info(
440
+ f"Best clustering: {best_clustering['method']} with {best_clustering['n_clusters']} clusters, "
441
+ f"silhouette score: {best_clustering['score']:.3f}"
442
+ )
426
443
 
427
444
  # Analyze associations between clusters and sample metadata
428
- cluster_labels = best_clustering['labels']
429
-
445
+ cluster_labels = best_clustering["labels"]
446
+
430
447
  # Add cluster labels to samples dataframe for analysis
431
448
  analysis_df = samples_pd.copy()
432
- analysis_df['cluster'] = cluster_labels
433
-
449
+ analysis_df["cluster"] = cluster_labels
450
+
434
451
  # Remove noise points (label -1) for association analysis
435
- analysis_df_clean = analysis_df[analysis_df['cluster'] != -1].copy()
436
-
452
+ analysis_df_clean = analysis_df[analysis_df["cluster"] != -1].copy()
453
+
437
454
  if len(analysis_df_clean) == 0:
438
455
  self.logger.error("No samples assigned to clusters (all noise)")
439
456
  return None
440
457
 
441
458
  # Analyze associations with specific columns only
442
459
  significant_associations = []
443
-
460
+
444
461
  # Define which columns to analyze for associations (non-text)
445
- association_cols = {'sample_sequence', 'num_features'}
446
-
462
+ association_cols = {"sample_sequence", "num_features"}
463
+
447
464
  # Define which columns to analyze for text patterns - include all relevant text columns
448
- text_pattern_cols = {'sample_name', 'sample_group', 'sample_batch', 'sample_type'}
449
-
450
-
465
+ text_pattern_cols = {"sample_name", "sample_group", "sample_batch", "sample_type"}
466
+
451
467
  for col in samples_pd.columns:
452
468
  if col not in association_cols:
453
469
  continue
454
-
470
+
455
471
  try:
456
472
  # Check if column has enough variation
457
473
  col_data = analysis_df_clean[col].dropna()
458
474
  if len(col_data.unique()) < 2:
459
475
  continue
460
-
476
+
461
477
  # Determine if column is numeric or categorical
462
478
  if pd.api.types.is_numeric_dtype(col_data):
463
479
  # Numeric variable - use ANOVA or Kruskal-Wallis
464
- cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby('cluster')]
480
+ cluster_groups = [group[col].dropna().values for name, group in analysis_df_clean.groupby("cluster")]
465
481
  cluster_groups = [group for group in cluster_groups if len(group) > 0]
466
-
482
+
467
483
  if len(cluster_groups) > 1:
468
484
  # Try ANOVA first
469
485
  try:
@@ -474,67 +490,81 @@ def analyze_umap(
474
490
  h_stat, p_value = stats.kruskal(*cluster_groups)
475
491
  test_name = "Kruskal-Wallis"
476
492
  f_stat = h_stat
477
-
493
+
478
494
  if p_value < significance_threshold:
479
495
  # Calculate effect size (eta-squared approximation)
480
- ss_between = sum(len(group) * (np.mean(group) - np.mean(col_data))**2 for group in cluster_groups)
481
- ss_total = np.sum((col_data - np.mean(col_data))**2)
496
+ ss_between = sum(
497
+ len(group) * (np.mean(group) - np.mean(col_data)) ** 2 for group in cluster_groups
498
+ )
499
+ ss_total = np.sum((col_data - np.mean(col_data)) ** 2)
482
500
  eta_squared = ss_between / ss_total if ss_total > 0 else 0
483
-
501
+
484
502
  significant_associations.append({
485
- 'column': col,
486
- 'variable_type': 'numeric',
487
- 'test': test_name,
488
- 'statistic': f_stat,
489
- 'p_value': p_value,
490
- 'effect_size': eta_squared,
491
- 'interpretation': 'Large effect' if eta_squared > 0.14 else 'Medium effect' if eta_squared > 0.06 else 'Small effect'
503
+ "column": col,
504
+ "variable_type": "numeric",
505
+ "test": test_name,
506
+ "statistic": f_stat,
507
+ "p_value": p_value,
508
+ "effect_size": eta_squared,
509
+ "interpretation": "Large effect"
510
+ if eta_squared > 0.14
511
+ else "Medium effect"
512
+ if eta_squared > 0.06
513
+ else "Small effect",
492
514
  })
493
-
515
+
494
516
  else:
495
517
  # Categorical variable - use Chi-square test
496
- contingency_table = pd.crosstab(analysis_df_clean['cluster'], analysis_df_clean[col])
497
-
518
+ contingency_table = pd.crosstab(analysis_df_clean["cluster"], analysis_df_clean[col])
519
+
498
520
  # Only test if we have enough observations
499
- if contingency_table.sum().sum() > 10 and contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
521
+ if (
522
+ contingency_table.sum().sum() > 10
523
+ and contingency_table.shape[0] > 1
524
+ and contingency_table.shape[1] > 1
525
+ ):
500
526
  try:
501
527
  chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
502
-
528
+
503
529
  if p_value < significance_threshold:
504
530
  # Calculate Cramer's V (effect size for chi-square)
505
531
  n = contingency_table.sum().sum()
506
532
  cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
507
-
533
+
508
534
  significant_associations.append({
509
- 'column': col,
510
- 'variable_type': 'categorical',
511
- 'test': 'Chi-square',
512
- 'statistic': chi2,
513
- 'p_value': p_value,
514
- 'effect_size': cramers_v,
515
- 'interpretation': 'Large effect' if cramers_v > 0.5 else 'Medium effect' if cramers_v > 0.3 else 'Small effect',
516
- 'contingency_table': contingency_table
535
+ "column": col,
536
+ "variable_type": "categorical",
537
+ "test": "Chi-square",
538
+ "statistic": chi2,
539
+ "p_value": p_value,
540
+ "effect_size": cramers_v,
541
+ "interpretation": "Large effect"
542
+ if cramers_v > 0.5
543
+ else "Medium effect"
544
+ if cramers_v > 0.3
545
+ else "Small effect",
546
+ "contingency_table": contingency_table,
517
547
  })
518
548
  except Exception:
519
549
  continue
520
-
550
+
521
551
  except Exception as e:
522
552
  self.logger.debug(f"Error analyzing column {col}: {e}")
523
553
  continue
524
554
 
525
555
  # Sort by effect size (descending)
526
- significant_associations.sort(key=lambda x: x['effect_size'], reverse=True)
556
+ significant_associations.sort(key=lambda x: x["effect_size"], reverse=True)
527
557
 
528
558
  # Enhanced cluster-centric text analysis - analyze what makes each cluster unique
529
559
  self.logger.debug("Performing cluster-centric enrichment analysis...")
530
-
560
+
531
561
  text_associations = []
532
-
562
+
533
563
  # Optimized text tokenization using cached function
534
564
  def tokenize_text_optimized(text):
535
565
  """Optimized text tokenization with caching"""
536
566
  return tokenize_text_cached(text)
537
-
567
+
538
568
  # Collect all atoms from specified string columns only
539
569
  string_columns = []
540
570
  for col in text_pattern_cols:
@@ -543,15 +573,15 @@ def analyze_umap(
543
573
  if len(col_data) > 0 and not pd.api.types.is_numeric_dtype(col_data):
544
574
  if len(col_data.astype(str).unique()) > 1: # Has variation
545
575
  string_columns.append(col)
546
-
576
+
547
577
  if string_columns:
548
578
  # Text analysis for string columns
549
579
  self.logger.debug(f"Analyzing cluster enrichments in {len(string_columns)} string columns")
550
-
580
+
551
581
  # Build cluster-centric atom analysis using cached tokenization
552
582
  cluster_atoms = {} # cluster_id -> {atom -> count}
553
583
  global_atom_counts = {} # atom -> total_count_across_all_samples
554
-
584
+
555
585
  # Pre-tokenize all text data once for efficiency with column prefixes
556
586
  sample_atom_sets = {}
557
587
  for idx, row in analysis_df_clean.iterrows():
@@ -559,61 +589,63 @@ def analyze_umap(
559
589
  for col in string_columns:
560
590
  atoms = tokenize_text_optimized(row[col])
561
591
  # Add column prefix to distinguish where tokens come from
562
- col_prefix = col.replace('sample_', '') + ':' # e.g., "name:", "group:", "batch:", "type:"
592
+ col_prefix = col.replace("sample_", "") + ":" # e.g., "name:", "group:", "batch:", "type:"
563
593
  prefixed_atoms = [f"{col_prefix}{atom}" for atom in atoms]
564
594
  sample_atoms.update(prefixed_atoms)
565
595
  sample_atom_sets[idx] = sample_atoms
566
-
596
+
567
597
  # Collect atoms by cluster
568
598
  for idx, row in analysis_df_clean.iterrows():
569
- cluster_id = row['cluster']
599
+ cluster_id = row["cluster"]
570
600
  if cluster_id not in cluster_atoms:
571
601
  cluster_atoms[cluster_id] = {}
572
-
602
+
573
603
  # Use pre-tokenized atoms
574
604
  sample_atoms = sample_atom_sets[idx]
575
-
605
+
576
606
  # Count atoms for this cluster and globally
577
607
  for atom in sample_atoms:
578
608
  cluster_atoms[cluster_id][atom] = cluster_atoms[cluster_id].get(atom, 0) + 1
579
609
  global_atom_counts[atom] = global_atom_counts.get(atom, 0) + 1
580
-
610
+
581
611
  # Calculate cluster enrichments using hypergeometric test (same for both modes)
582
612
  if string_columns:
583
613
  n_total_samples = len(analysis_df_clean)
584
-
614
+
585
615
  # For each cluster, find significantly enriched terms
586
616
  for cluster_id, cluster_atom_counts in cluster_atoms.items():
587
- cluster_size = len(analysis_df_clean[analysis_df_clean['cluster'] == cluster_id])
588
-
617
+ cluster_size = len(analysis_df_clean[analysis_df_clean["cluster"] == cluster_id])
618
+
589
619
  for atom, cluster_count in cluster_atom_counts.items():
590
620
  global_count = global_atom_counts[atom]
591
-
621
+
592
622
  # Skip empty terms from enrichment analysis and plotting
593
- if (atom == '<empty>' or
594
- atom.lower() == 'empty' or
595
- atom.strip() == '' or
596
- ':empty' in atom.lower() or
597
- atom.lower().endswith('empty') or
598
- ':blank' in atom.lower() or
599
- atom.lower().endswith('blank')):
623
+ if (
624
+ atom == "<empty>"
625
+ or atom.lower() == "empty"
626
+ or atom.strip() == ""
627
+ or ":empty" in atom.lower()
628
+ or atom.lower().endswith("empty")
629
+ or ":blank" in atom.lower()
630
+ or atom.lower().endswith("blank")
631
+ ):
600
632
  continue
601
-
633
+
602
634
  # Skip atoms with low frequency
603
635
  if global_count < 2:
604
636
  continue
605
-
637
+
606
638
  # Skip terms that occur in fewer than 5 samples within this cluster
607
639
  if cluster_count < 5:
608
640
  continue
609
-
641
+
610
642
  # IMPORTANT: Skip atoms that appear in too many clusters (not cluster-specific)
611
643
  # Count how many clusters this atom appears in
612
644
  clusters_with_atom = set()
613
645
  for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
614
646
  if atom in other_cluster_atom_counts:
615
647
  clusters_with_atom.add(other_cluster_id)
616
-
648
+
617
649
  total_clusters = len(cluster_atoms)
618
650
  cluster_specificity = len(clusters_with_atom) / total_clusters if total_clusters > 0 else 1
619
651
 
@@ -621,106 +653,109 @@ def analyze_umap(
621
653
  if cluster_specificity > 0.5:
622
654
  # Note: logger not available in standalone function, would need to pass self
623
655
  continue
624
-
656
+
625
657
  # Additional check: ensure this cluster has significantly more of this atom than others
626
- #max_other_cluster_count = 0
627
- #for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
658
+ # max_other_cluster_count = 0
659
+ # for other_cluster_id, other_cluster_atom_counts in cluster_atoms.items():
628
660
  # if other_cluster_id != cluster_id and atom in other_cluster_atom_counts:
629
661
  # max_other_cluster_count = max(max_other_cluster_count, other_cluster_atom_counts[atom])
630
-
662
+
631
663
  # Skip if current cluster doesn't have significantly more instances than the next highest
632
- #if cluster_count <= max_other_cluster_count * 1.5:
633
- # Note: logger not available in standalone function, would need to pass self
664
+ # if cluster_count <= max_other_cluster_count * 1.5:
665
+ # Note: logger not available in standalone function, would need to pass self
634
666
  # continue
635
-
667
+
636
668
  # Calculate enrichment using hypergeometric test
637
669
  try:
638
670
  from scipy.stats import hypergeom
639
-
671
+
640
672
  M = n_total_samples
641
673
  n = global_count
642
674
  N = cluster_size
643
675
  k = cluster_count
644
-
676
+
645
677
  # Calculate p-value (probability of observing k or more successes)
646
- p_value = hypergeom.sf(k-1, M, n, N)
647
-
678
+ p_value = hypergeom.sf(k - 1, M, n, N)
679
+
648
680
  # Calculate enrichment ratio
649
681
  expected_freq = (n / M) * N
650
- enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float('inf')
651
-
682
+ enrichment_ratio = cluster_count / expected_freq if expected_freq > 0 else float("inf")
683
+
652
684
  # Only consider significantly enriched terms (p < threshold and enrichment > 1.5x)
653
685
  if p_value < significance_threshold and enrichment_ratio > 1.5:
654
-
655
686
  # Calculate percentage of cluster samples with this atom
656
687
  cluster_percentage = (cluster_count / cluster_size) * 100
657
688
  global_percentage = (global_count / n_total_samples) * 100
658
-
689
+
659
690
  text_associations.append({
660
- 'atom': atom,
661
- 'cluster_id': cluster_id,
662
- 'type': 'cluster_enrichment',
663
- 'test': 'Hypergeometric',
664
- 'p_value': p_value,
665
- 'enrichment_ratio': enrichment_ratio,
666
- 'effect_size': enrichment_ratio, # Use enrichment ratio as effect size
667
- 'interpretation': 'Large enrichment' if enrichment_ratio > 3 else 'Medium enrichment' if enrichment_ratio > 2 else 'Small enrichment',
668
- 'cluster_count': cluster_count,
669
- 'cluster_size': cluster_size,
670
- 'cluster_percentage': cluster_percentage,
671
- 'global_count': global_count,
672
- 'global_percentage': global_percentage,
673
- 'cluster_samples_with_atom': cluster_count,
674
- 'total_samples_with_atom': global_count
691
+ "atom": atom,
692
+ "cluster_id": cluster_id,
693
+ "type": "cluster_enrichment",
694
+ "test": "Hypergeometric",
695
+ "p_value": p_value,
696
+ "enrichment_ratio": enrichment_ratio,
697
+ "effect_size": enrichment_ratio, # Use enrichment ratio as effect size
698
+ "interpretation": "Large enrichment"
699
+ if enrichment_ratio > 3
700
+ else "Medium enrichment"
701
+ if enrichment_ratio > 2
702
+ else "Small enrichment",
703
+ "cluster_count": cluster_count,
704
+ "cluster_size": cluster_size,
705
+ "cluster_percentage": cluster_percentage,
706
+ "global_count": global_count,
707
+ "global_percentage": global_percentage,
708
+ "cluster_samples_with_atom": cluster_count,
709
+ "total_samples_with_atom": global_count,
675
710
  })
676
-
711
+
677
712
  except Exception as e:
678
713
  self.logger.debug(f"Error analyzing enrichment of '{atom}' in cluster {cluster_id}: {e}")
679
714
  continue
680
-
715
+
681
716
  # Sort text associations by cluster presence percentage (favors common terms in clusters)
682
- text_associations.sort(key=lambda x: x['cluster_percentage'], reverse=True)
683
-
717
+ text_associations.sort(key=lambda x: x["cluster_percentage"], reverse=True)
718
+
684
719
  # Combine regular and text associations
685
720
  all_associations = significant_associations + text_associations
686
721
  # Sort by cluster percentage for text associations, effect size for others
687
- all_associations.sort(key=lambda x: x.get('cluster_percentage', x.get('effect_size', 0)), reverse=True)
722
+ all_associations.sort(key=lambda x: x.get("cluster_percentage", x.get("effect_size", 0)), reverse=True)
688
723
 
689
724
  # Generate cluster summaries
690
725
  cluster_summaries = {}
691
- for cluster_id in analysis_df_clean['cluster'].unique():
692
- cluster_data = analysis_df_clean[analysis_df_clean['cluster'] == cluster_id]
726
+ for cluster_id in analysis_df_clean["cluster"].unique():
727
+ cluster_data = analysis_df_clean[analysis_df_clean["cluster"] == cluster_id]
693
728
  cluster_summaries[cluster_id] = {
694
- 'n_samples': len(cluster_data),
695
- 'sample_names': cluster_data['sample_name'].tolist() if 'sample_name' in cluster_data else [],
729
+ "n_samples": len(cluster_data),
730
+ "sample_names": cluster_data["sample_name"].tolist() if "sample_name" in cluster_data else [],
696
731
  }
697
732
 
698
733
  # Create results dictionary
699
734
  results = {
700
- 'umap_coords': umap_coords,
701
- 'best_clustering': best_clustering,
702
- 'all_clustering_results': clustering_results,
703
- 'significant_associations': all_associations,
704
- 'text_associations': text_associations,
705
- 'cluster_summaries': cluster_summaries,
706
- 'analysis_dataframe': analysis_df_clean
735
+ "umap_coords": umap_coords,
736
+ "best_clustering": best_clustering,
737
+ "all_clustering_results": clustering_results,
738
+ "significant_associations": all_associations,
739
+ "text_associations": text_associations,
740
+ "cluster_summaries": cluster_summaries,
741
+ "analysis_dataframe": analysis_df_clean,
707
742
  }
708
743
 
709
744
  # Create sample-specific enrichment tooltips with optimization
710
745
  sample_enrichments = {}
711
-
746
+
712
747
  # For each sample, find which text atoms it contains that are significant
713
748
  if text_associations:
714
749
  max_check_terms = 10 # Standard limit for tooltip calculation
715
-
750
+
716
751
  for idx, row in analysis_df_clean.iterrows():
717
- sample_name = row.get('sample_name', f'sample_{idx}')
752
+ sample_name = row.get("sample_name", f"sample_{idx}")
718
753
  sample_enrichments[sample_name] = []
719
-
754
+
720
755
  # Check which significant atoms this sample contains
721
756
  for assoc in text_associations[:max_check_terms]: # Check fewer terms in fast mode
722
- atom = assoc['atom']
723
-
757
+ atom = assoc["atom"]
758
+
724
759
  # Check if this sample contains this atom in any of the text columns
725
760
  sample_has_atom = False
726
761
  for col in text_pattern_cols:
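The enrichment analysis in the hunk above scores each term with a one-sided hypergeometric test (`hypergeom.sf(k - 1, M, n, N)`) and an enrichment ratio against the expected count. A worked sketch with made-up numbers:

from scipy.stats import hypergeom

M = 96   # total samples in the analysis
n = 12   # samples anywhere whose metadata contains the term
N = 10   # samples in the cluster under test
k = 6    # samples in that cluster containing the term

p_value = hypergeom.sf(k - 1, M, n, N)   # P(X >= k): chance of seeing k or more by luck
expected = (n / M) * N                   # expected count if the term were not enriched
enrichment_ratio = k / expected
print(f"p={p_value:.4g}, enrichment={enrichment_ratio:.1f}x")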
@@ -729,49 +764,53 @@ def analyze_umap(
729
764
  if atom.lower() in text_value.lower():
730
765
  sample_has_atom = True
731
766
  break
732
-
767
+
733
768
  if sample_has_atom:
734
769
  sample_enrichments[sample_name].append(f"{atom} ({assoc['p_value']:.3f})")
735
770
  if len(sample_enrichments[sample_name]) >= 3: # Only show top 3 per sample
736
771
  break
737
-
772
+
738
773
  # Create embedded plots if requested
739
774
  if plot_results:
740
775
  plots = {}
741
-
776
+
742
777
  # Plot 1: Enhanced UMAP with clusters and enriched term labels (EMBEDDED PLOTTING)
743
778
  from bokeh.models import ColumnDataSource, HoverTool, LabelSet, LegendItem, Legend
744
779
  from bokeh.plotting import figure
745
780
  from collections import defaultdict
746
-
781
+
747
782
  # Create cluster plot with enhanced size
748
783
  p1 = figure(
749
- width=900, height=700,
784
+ width=900,
785
+ height=700,
750
786
  title=f"UMAP Clusters with Enriched Terms ({best_clustering['method']})",
751
- tools="pan,wheel_zoom,box_zoom,reset,save"
787
+ tools="pan,wheel_zoom,box_zoom,reset,save",
752
788
  )
753
789
  p1.xaxis.axis_label = "UMAP1"
754
790
  p1.yaxis.axis_label = "UMAP2"
755
-
791
+
756
792
  # Remove grid
757
793
  p1.grid.visible = False
758
-
794
+
759
795
  # Color points by cluster
760
796
  unique_clusters = np.unique(cluster_labels)
761
797
  n_clusters = len(unique_clusters)
762
-
798
+
763
799
  # Handle color mapping for many clusters - use turbo colormap
764
800
  if n_clusters <= 10:
765
801
  from bokeh.palettes import turbo
802
+
766
803
  colors = turbo(max(10, n_clusters))[:n_clusters]
767
804
  elif n_clusters <= 20:
768
805
  from bokeh.palettes import turbo
806
+
769
807
  colors = turbo(20)[:n_clusters]
770
808
  else:
771
809
  # For many clusters, use a continuous colormap
772
810
  from bokeh.palettes import turbo
811
+
773
812
  colors = turbo(min(256, n_clusters))
774
-
813
+
775
814
  # Calculate cluster centers and plot points
776
815
  cluster_centers = {}
777
816
  for i, cluster_id in enumerate(unique_clusters):
@@ -782,147 +821,153 @@ def analyze_umap(
782
821
  else:
783
822
  color = colors[i % len(colors)]
784
823
  label = f"Cluster {cluster_id}"
785
-
824
+
786
825
  cluster_coords = umap_coords[mask]
787
-
826
+
788
827
  # Calculate cluster center
789
828
  if len(cluster_coords) > 0:
790
829
  center_x = np.mean(cluster_coords[:, 0])
791
830
  center_y = np.mean(cluster_coords[:, 1])
792
831
  cluster_centers[cluster_id] = (center_x, center_y)
793
-
832
+
794
833
  cluster_samples = samples_pd[mask] if len(samples_pd) == len(mask) else None
795
- sample_names = cluster_samples['sample_name'].tolist() if cluster_samples is not None and 'sample_name' in cluster_samples else [f"Sample_{j}" for j in range(np.sum(mask))]
796
- sample_uids = cluster_samples['sample_uid'].tolist() if cluster_samples is not None and 'sample_uid' in cluster_samples else [f"UID_{j}" for j in range(np.sum(mask))]
797
-
834
+ sample_names = (
835
+ cluster_samples["sample_name"].tolist()
836
+ if cluster_samples is not None and "sample_name" in cluster_samples
837
+ else [f"Sample_{j}" for j in range(np.sum(mask))]
838
+ )
839
+ sample_uids = (
840
+ cluster_samples["sample_uid"].tolist()
841
+ if cluster_samples is not None and "sample_uid" in cluster_samples
842
+ else [f"UID_{j}" for j in range(np.sum(mask))]
843
+ )
844
+
798
845
  # Create enrichment tooltip text for this cluster
799
- cluster_associations = [assoc for assoc in text_associations if assoc.get('cluster_id') == cluster_id]
800
-
846
+ cluster_associations = [assoc for assoc in text_associations if assoc.get("cluster_id") == cluster_id]
847
+
801
848
  # Get the top enrichments for this cluster (not individual samples)
802
849
  cluster_enrichments = []
803
850
  for assoc in cluster_associations[:3]: # Top 3 enrichments for this cluster
804
- atom = assoc['atom']
851
+ atom = assoc["atom"]
805
852
  # Skip color codes and other non-meaningful atoms
806
- if not ((atom.startswith('#') and len(atom) == 7) or atom in ['nan', 'None', 'null']):
853
+ if not ((atom.startswith("#") and len(atom) == 7) or atom in ["nan", "None", "null"]):
807
854
  cluster_enrichments.append(atom)
808
-
855
+
809
856
  # Create the same enrichment text for ALL samples in this cluster
810
857
  if cluster_enrichments:
811
858
  cluster_enrichment_text = "; ".join(cluster_enrichments)
812
859
  else:
813
860
  cluster_enrichment_text = "No enrichments found"
814
-
861
+
815
862
  # Apply the same enrichment text to all samples in this cluster
816
863
  sample_enrichment_texts = [cluster_enrichment_text] * np.sum(mask)
817
-
864
+
818
865
  source = ColumnDataSource({
819
- 'x': umap_coords[mask, 0],
820
- 'y': umap_coords[mask, 1],
821
- 'cluster': [cluster_id] * np.sum(mask),
822
- 'sample_name': sample_names[:np.sum(mask)],
823
- 'sample_uid': sample_uids[:np.sum(mask)],
824
- 'enrichments': sample_enrichment_texts[:np.sum(mask)]
866
+ "x": umap_coords[mask, 0],
867
+ "y": umap_coords[mask, 1],
868
+ "cluster": [cluster_id] * np.sum(mask),
869
+ "sample_name": sample_names[: np.sum(mask)],
870
+ "sample_uid": sample_uids[: np.sum(mask)],
871
+ "enrichments": sample_enrichment_texts[: np.sum(mask)],
825
872
  })
826
-
827
- p1.scatter('x', 'y', size=markersize, color=color, alpha=0.7,
828
- source=source)
829
-
873
+
874
+ p1.scatter("x", "y", size=markersize, color=color, alpha=0.7, source=source)
875
+
830
876
  # Enhanced enriched term visualization
831
877
  max_terms_per_cluster = 2
832
878
  min_enrichment = 2.0
833
-
879
+
834
880
  # Process enriched terms - group by cluster and filter
835
881
  cluster_terms = defaultdict(list)
836
882
  for assoc in text_associations:
837
883
  # Skip empty terms from plotting
838
- atom = assoc.get('atom', '')
839
- if (atom == '<empty>' or
840
- atom.lower() == 'empty' or
841
- atom.strip() == '' or
842
- ':empty' in atom.lower() or
843
- atom.lower().endswith('empty') or
844
- ':blank' in atom.lower() or
845
- atom.lower().endswith('blank')):
884
+ atom = assoc.get("atom", "")
885
+ if (
886
+ atom == "<empty>"
887
+ or atom.lower() == "empty"
888
+ or atom.strip() == ""
889
+ or ":empty" in atom.lower()
890
+ or atom.lower().endswith("empty")
891
+ or ":blank" in atom.lower()
892
+ or atom.lower().endswith("blank")
893
+ ):
846
894
  continue
847
-
848
- if (assoc['enrichment_ratio'] >= min_enrichment and
849
- assoc['cluster_id'] in cluster_centers):
850
- cluster_terms[assoc['cluster_id']].append(assoc)
851
-
895
+
896
+ if assoc["enrichment_ratio"] >= min_enrichment and assoc["cluster_id"] in cluster_centers:
897
+ cluster_terms[assoc["cluster_id"]].append(assoc)
898
+
852
899
  # Limit terms per cluster and sort by cluster presence percentage (favors common terms)
853
900
  for cluster_id in cluster_terms:
854
901
  cluster_terms[cluster_id] = sorted(
855
- cluster_terms[cluster_id],
856
- key=lambda x: x['cluster_percentage'],
857
- reverse=True
902
+ cluster_terms[cluster_id], key=lambda x: x["cluster_percentage"], reverse=True
858
903
  )[:max_terms_per_cluster]
859
-
904
+
860
905
  # Collect all unique terms for shared term handling
861
906
  all_terms = {}
862
907
  for cluster_id, terms in cluster_terms.items():
863
908
  for term in terms:
864
- atom = term['atom']
909
+ atom = term["atom"]
865
910
  if atom not in all_terms:
866
911
  all_terms[atom] = []
867
912
  all_terms[atom].append(cluster_id)
868
-
913
+
869
914
  # Separate terms into shared vs cluster-specific
870
915
  shared_terms = {atom: clusters for atom, clusters in all_terms.items() if len(clusters) > 1}
871
916
  specific_terms = {atom: clusters[0] for atom, clusters in all_terms.items() if len(clusters) == 1}
872
-
917
+
873
918
  # Merge overlapping terms that refer to the same concept
874
919
  # E.g., "type:qc" and "name:PooledQC" both refer to QC samples
875
920
  def should_merge_terms(term1, term2):
876
921
  """Check if two terms should be merged based on semantic overlap"""
877
922
  # Extract the actual values (remove prefixes)
878
- val1 = term1.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
879
- val2 = term2.replace('name:', '').replace('type:', '').replace('group:', '').replace('batch:', '').lower()
880
-
923
+ val1 = term1.replace("name:", "").replace("type:", "").replace("group:", "").replace("batch:", "").lower()
924
+ val2 = term2.replace("name:", "").replace("type:", "").replace("group:", "").replace("batch:", "").lower()
925
+
881
926
  # Define known overlapping concepts
882
- qc_terms = {'qc', 'pooledqc', 'pooled_qc', 'quality_control', 'qualitycontrol'}
883
- blank_terms = {'blank', 'blk', 'empty', 'background'}
884
-
927
+ qc_terms = {"qc", "pooledqc", "pooled_qc", "quality_control", "qualitycontrol"}
928
+ blank_terms = {"blank", "blk", "empty", "background"}
929
+
885
930
  # Check if both terms belong to the same concept group
886
931
  if val1 in qc_terms and val2 in qc_terms:
887
932
  return True
888
933
  if val1 in blank_terms and val2 in blank_terms:
889
934
  return True
890
-
935
+
891
936
  # Also check for direct string similarity (e.g., case variations)
892
937
  if val1 == val2:
893
938
  return True
894
-
939
+
895
940
  return False
896
-
941
+
897
942
  def merge_overlapping_terms(shared_terms, specific_terms):
898
943
  """Merge terms that refer to the same concept"""
899
944
  all_atoms = list(shared_terms.keys()) + list(specific_terms.keys())
900
945
  merged_groups = []
901
946
  used_atoms = set()
902
-
947
+
903
948
  for i, atom1 in enumerate(all_atoms):
904
949
  if atom1 in used_atoms:
905
950
  continue
906
-
951
+
907
952
  group = [atom1]
908
953
  used_atoms.add(atom1)
909
-
954
+
910
955
  # Find all atoms that should be merged with this one
911
- for j, atom2 in enumerate(all_atoms[i+1:], i+1):
956
+ for j, atom2 in enumerate(all_atoms[i + 1 :], i + 1):
912
957
  if atom2 in used_atoms:
913
958
  continue
914
959
  if should_merge_terms(atom1, atom2):
915
960
  group.append(atom2)
916
961
  used_atoms.add(atom2)
917
-
962
+
918
963
  if len(group) > 1:
919
964
  merged_groups.append(group)
920
-
965
+
921
966
  return merged_groups
922
-
967
+
923
968
  # Find terms that should be merged
924
969
  merged_groups = merge_overlapping_terms(shared_terms, specific_terms)
925
-
970
+
926
971
  # Apply merging: create new combined terms and remove originals
927
972
  for group in merged_groups:
928
973
  # Determine the combined clusters for this group
@@ -932,28 +977,28 @@ def analyze_umap(
932
977
  combined_clusters.update(shared_terms[atom])
933
978
  elif atom in specific_terms:
934
979
  combined_clusters.add(specific_terms[atom])
935
-
980
+
936
981
  # Create a new combined term name using newlines
937
982
  # Keep the original prefixes and atom names
938
- combined_atom = '\n'.join(group)
939
-
983
+ combined_atom = "\n".join(group)
984
+
940
985
  # Remove original terms from both dictionaries
941
986
  for atom in group:
942
987
  shared_terms.pop(atom, None)
943
988
  specific_terms.pop(atom, None)
944
-
989
+
945
990
  # Add the combined term to appropriate dictionary
946
991
  combined_clusters_list = list(combined_clusters)
947
992
  if len(combined_clusters_list) > 1:
948
993
  shared_terms[combined_atom] = combined_clusters_list
949
994
  else:
950
995
  specific_terms[combined_atom] = combined_clusters_list[0]
951
-
996
+
952
997
  # Create label sources for enriched terms
953
998
  label_sources = {}
954
999
  line_sources = {}
955
1000
  line_cluster_mapping = {} # Track which cluster each line belongs to
956
-
1001
+
957
1002
  # Handle shared terms (place at center of all clusters that share it, but in empty areas)
958
1003
  for atom, clusters in shared_terms.items():
959
1004
  if len(clusters) > 1:
@@ -962,7 +1007,7 @@ def analyze_umap(
962
1007
  if cluster_coords_list:
963
1008
  center_x = np.mean([coord[0] for coord in cluster_coords_list])
964
1009
  center_y = np.mean([coord[1] for coord in cluster_coords_list])
965
-
1010
+
966
1011
  # Calculate data bounds using simple approach
967
1012
  all_x = [pt[0] for pt in umap_coords]
968
1013
  all_y = [pt[1] for pt in umap_coords]
@@ -970,57 +1015,57 @@ def analyze_umap(
970
1015
  y_min, y_max = min(all_y), max(all_y)
971
1016
  data_range_x = x_max - x_min
972
1017
  data_range_y = y_max - y_min
973
-
1018
+
974
1019
  # Find empty area around the center
975
1020
  best_distance = 0
976
1021
  best_position = None
977
-
1022
+
978
1023
  for distance_factor in [1.0, 1.5, 2.0]:
979
1024
  offset_distance = distance_factor * max(data_range_x, data_range_y) * 0.1
980
-
981
- for angle in np.linspace(0, 2*np.pi, 8):
1025
+
1026
+ for angle in np.linspace(0, 2 * np.pi, 8):
982
1027
  label_x = center_x + offset_distance * np.cos(angle)
983
1028
  label_y = center_y + offset_distance * np.sin(angle)
984
-
1029
+
985
1030
  # Calculate minimum distance to any data point
986
- distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
1031
+ distances = [np.sqrt((pt[0] - label_x) ** 2 + (pt[1] - label_y) ** 2) for pt in umap_coords]
987
1032
  min_distance = min(distances)
988
-
1033
+
989
1034
  if min_distance > best_distance:
990
1035
  best_distance = min_distance
991
1036
  best_position = (label_x, label_y)
992
-
1037
+
993
1038
  # Use best position or fallback to center
994
1039
  if best_position is not None:
995
1040
  label_x, label_y = best_position
996
1041
  else:
997
1042
  label_x, label_y = center_x, center_y
998
-
1043
+
999
1044
  # Check if label would be outside plot bounds and adjust
1000
1045
  label_margin = max(data_range_x, data_range_y) * 0.05
1001
1046
  if label_x < x_min - label_margin:
1002
1047
  label_x = x_min - label_margin
1003
1048
  elif label_x > x_max + label_margin:
1004
1049
  label_x = x_max + label_margin
1005
-
1050
+
1006
1051
  if label_y < y_min - label_margin:
1007
1052
  label_y = y_min - label_margin
1008
1053
  elif label_y > y_max + label_margin:
1009
1054
  label_y = y_max + label_margin
1010
-
1055
+
1011
1056
  # Keep the original atom name with prefixes for display
1012
1057
  display_atom = atom # Keep prefixes like name:, group:, batch:, type:
1013
-
1058
+
1014
1059
  # Create label source with center alignment for shared terms
1015
1060
  label_source = ColumnDataSource({
1016
- 'x': [label_x],
1017
- 'y': [label_y],
1018
- 'text': [display_atom],
1019
- 'atom': [atom],
1020
- 'text_align': ['center']
1061
+ "x": [label_x],
1062
+ "y": [label_y],
1063
+ "text": [display_atom],
1064
+ "atom": [atom],
1065
+ "text_align": ["center"],
1021
1066
  })
1022
1067
  label_sources[atom] = label_source
1023
-
1068
+
1024
1069
  # Create lines to each cluster center
1025
1070
  line_x = []
1026
1071
  line_y = []
@@ -1029,20 +1074,17 @@ def analyze_umap(
1029
1074
  cx, cy = cluster_centers[cluster_id]
1030
1075
  line_x.extend([label_x, cx, np.nan]) # nan to break line
1031
1076
  line_y.extend([label_y, cy, np.nan])
1032
-
1033
- line_source = ColumnDataSource({
1034
- 'x': line_x,
1035
- 'y': line_y
1036
- })
1077
+
1078
+ line_source = ColumnDataSource({"x": line_x, "y": line_y})
1037
1079
  line_sources[atom] = line_source
1038
- line_cluster_mapping[atom] = 'shared'
1039
-
1080
+ line_cluster_mapping[atom] = "shared"
1081
+
1040
1082
  # Handle cluster-specific terms (arrange multiple terms per cluster to avoid overlap)
1041
1083
  # Group specific terms by cluster to handle multiple terms per cluster
1042
1084
  cluster_specific_terms = defaultdict(list)
1043
1085
  for atom, cluster_id in specific_terms.items():
1044
1086
  cluster_specific_terms[cluster_id].append(atom)
1045
-
1087
+
1046
1088
  # Calculate data bounds once
1047
1089
  all_x = [pt[0] for pt in umap_coords]
1048
1090
  all_y = [pt[1] for pt in umap_coords]
@@ -1050,7 +1092,7 @@ def analyze_umap(
1050
1092
  y_min, y_max = min(all_y), max(all_y)
1051
1093
  data_range_x = x_max - x_min
1052
1094
  data_range_y = y_max - y_min
1053
-
1095
+
1054
1096
  # Expand plot ranges to accommodate labels (add 15% margin on all sides)
1055
1097
  margin = 0.15
1056
1098
  x_margin = data_range_x * margin
@@ -1059,61 +1101,63 @@ def analyze_umap(
1059
1101
  plot_x_max = x_max + x_margin
1060
1102
  plot_y_min = y_min - y_margin
1061
1103
  plot_y_max = y_max + y_margin
1062
-
1104
+
1063
1105
  # Set expanded plot ranges
1064
1106
  p1.x_range.start = plot_x_min
1065
1107
  p1.x_range.end = plot_x_max
1066
1108
  p1.y_range.start = plot_y_min
1067
1109
  p1.y_range.end = plot_y_max
1068
-
1110
+
1069
1111
  # Process each cluster that has specific terms
1070
1112
  for cluster_id, cluster_atoms in cluster_specific_terms.items():
1071
1113
  if cluster_id not in cluster_centers:
1072
1114
  continue
1073
-
1115
+
1074
1116
  cx, cy = cluster_centers[cluster_id]
1075
1117
  n_terms = len(cluster_atoms)
1076
-
1118
+
1077
1119
  if n_terms == 1:
1078
1120
  # Single term - use smart positioning with shorter distances
1079
1121
  atom = cluster_atoms[0]
1080
-
1122
+
1081
1123
  # Try multiple candidate positions with shorter distances and more angles
1082
1124
  best_distance = 0
1083
1125
  best_position = None
1084
-
1126
+
1085
1127
  # Use shorter base distance and test many angles
1086
1128
  base_distance = max(data_range_x, data_range_y) * 0.08 # Much shorter base distance
1087
-
1129
+
1088
1130
  # Test positions at different angles and short distances
1089
1131
  for distance_factor in [0.8, 1.0, 1.3]: # Shorter distance factors
1090
1132
  offset_distance = base_distance * distance_factor
1091
-
1092
- for angle in np.linspace(0, 2*np.pi, 24): # More angles (24 directions)
1133
+
1134
+ for angle in np.linspace(0, 2 * np.pi, 24): # More angles (24 directions)
1093
1135
  label_x = cx + offset_distance * np.cos(angle)
1094
1136
  label_y = cy + offset_distance * np.sin(angle)
1095
-
1137
+
1096
1138
  # Calculate minimum distance to any data point
1097
- distances = [np.sqrt((pt[0] - label_x)**2 + (pt[1] - label_y)**2) for pt in umap_coords]
1139
+ distances = [np.sqrt((pt[0] - label_x) ** 2 + (pt[1] - label_y) ** 2) for pt in umap_coords]
1098
1140
  min_distance = min(distances)
1099
-
1141
+
1100
1142
  # Check distance to other labels to avoid overlap
1101
- min_label_distance = float('inf')
1143
+ min_label_distance = float("inf")
1102
1144
  for other_atom, other_source in label_sources.items():
1103
1145
  if other_atom != atom:
1104
1146
  other_data = other_source.data
1105
- if other_data['x'] and other_data['y']:
1106
- other_x, other_y = other_data['x'][0], other_data['y'][0]
1107
- label_distance = np.sqrt((label_x - other_x)**2 + (label_y - other_y)**2)
1147
+ if other_data["x"] and other_data["y"]:
1148
+ other_x, other_y = other_data["x"][0], other_data["y"][0]
1149
+ label_distance = np.sqrt((label_x - other_x) ** 2 + (label_y - other_y) ** 2)
1108
1150
  min_label_distance = min(min_label_distance, label_distance)
1109
-
1151
+
1110
1152
  # Prefer positions that are reasonably far from data points and other labels
1111
- combined_distance = min(min_distance, min_label_distance if min_label_distance != float('inf') else min_distance)
1112
-
1153
+ combined_distance = min(
1154
+ min_distance, min_label_distance if min_label_distance != float("inf") else min_distance
1155
+ )
1156
+
1113
1157
  if combined_distance > best_distance:
1114
1158
  best_distance = combined_distance
1115
1159
  best_position = (label_x, label_y)
1116
-
1160
+
1117
1161
  # Use best position found, or fallback to simple short offset
1118
1162
  if best_position is not None:
1119
1163
  label_x, label_y = best_position
@@ -1124,54 +1168,51 @@ def analyze_umap(
1124
1168
  angle_rad = np.radians(angle)
1125
1169
  label_x = cx + offset_distance * np.cos(angle_rad)
1126
1170
  label_y = cy + offset_distance * np.sin(angle_rad)
1127
-
1171
+
1128
1172
  # Check if label would be outside plot bounds and adjust
1129
1173
  label_margin = max(data_range_x, data_range_y) * 0.05
1130
-
1174
+
1131
1175
  # Instead of clamping to bounds, let labels go outside and plot bounds will be expanded later
1132
1176
  # Only apply minimal adjustments to prevent labels from being extremely far out
1133
1177
  extreme_margin = max(data_range_x, data_range_y) * 0.25 # Allow 25% outside data range
1134
-
1178
+
1135
1179
  if label_x < x_min - extreme_margin:
1136
1180
  label_x = x_min - extreme_margin
1137
1181
  elif label_x > x_max + extreme_margin:
1138
1182
  label_x = x_max + extreme_margin
1139
-
1183
+
1140
1184
  if label_y < y_min - extreme_margin:
1141
1185
  label_y = y_min - extreme_margin
1142
1186
  elif label_y > y_max + extreme_margin:
1143
1187
  label_y = y_max + extreme_margin
1144
-
1188
+
1145
1189
  # Determine text alignment based on position relative to cluster
1146
- text_align = 'right' if label_x > cx else 'left'
1147
-
1190
+ text_align = "right" if label_x > cx else "left"
1191
+
1148
1192
  # Clean up atom name for display but keep prefixes
1149
1193
  display_atom = atom # Keep prefixes like name:, group:, batch:, type:
1150
-
1194
+
1151
1195
  # Create label source with alignment
1152
1196
  label_source = ColumnDataSource({
1153
- 'x': [label_x],
1154
- 'y': [label_y],
1155
- 'text': [display_atom],
1156
- 'atom': [atom],
1157
- 'text_align': [text_align]
1197
+ "x": [label_x],
1198
+ "y": [label_y],
1199
+ "text": [display_atom],
1200
+ "atom": [atom],
1201
+ "text_align": [text_align],
1158
1202
  })
1159
1203
  label_sources[atom] = label_source
1160
-
1204
+
1161
1205
  # Create spike line from cluster center to label
1162
- line_source = ColumnDataSource({
1163
- 'x': [cx, label_x],
1164
- 'y': [cy, label_y]
1165
- })
1206
+ line_source = ColumnDataSource({"x": [cx, label_x], "y": [cy, label_y]})
1166
1207
  line_sources[atom] = line_source
1167
1208
  line_cluster_mapping[atom] = cluster_id
1168
-
1209
+
1169
1210
  else:
1170
1211
  # Multiple terms - stack them vertically with one line to cluster center
1171
1212
  # Determine if this cluster has shared vs non-shared terms to adjust positioning
1172
1213
  has_shared = any(atom in shared_terms for atom in cluster_atoms)
1173
1214
  has_specific = any(atom in specific_terms for atom in cluster_atoms)
1174
-
1215
+
1175
1216
  # Adjust base distance: put non-shared (cluster-specific) labels further out
1176
1217
  if has_specific and not has_shared:
1177
1218
  # Pure cluster-specific terms - place further from center to reduce overlap
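
Each single enriched term ends up as two Bokeh data sources, one for the text label and one for the spike connecting it to the cluster centre. A stripped-down sketch with hypothetical example values (the alignment rule mirrors the code above):

from bokeh.models import ColumnDataSource

cx, cy = 2.0, -1.0        # cluster centre in UMAP space (example values)
lx, ly = 3.1, -0.4        # label position chosen by the offset search (example values)
atom = "group:plasma"     # enriched term, prefix kept for display

text_align = "right" if lx > cx else "left"   # same alignment rule as above
label_source = ColumnDataSource({
    "x": [lx], "y": [ly], "text": [atom],
    "atom": [atom], "text_align": [text_align],
})
line_source = ColumnDataSource({"x": [cx, lx], "y": [cy, ly]})  # spike from centre to label
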
@@ -1181,137 +1222,145 @@ def analyze_umap(
1181
1222
  base_distance = max(data_range_x, data_range_y) * 0.08 # Closer
1182
1223
  else:
1183
1224
  # Mixed terms - use intermediate distance
1184
- base_distance = max(data_range_x, data_range_y) * 0.1 # Standard distance
1185
-
1225
+ base_distance = max(data_range_x, data_range_y) * 0.1 # Standard distance
1226
+
1186
1227
  # Calculate a good angle for the stack based on cluster position and available space
1187
1228
  # For non-shared terms, prefer angles that point away from plot center
1188
1229
  best_angle = None
1189
1230
  best_distance = 0
1190
-
1231
+
1191
1232
  # Get plot center for reference
1192
1233
  plot_center_x = (x_min + x_max) / 2
1193
1234
  plot_center_y = (y_min + y_max) / 2
1194
-
1235
+
1195
1236
  # Calculate angle from plot center to cluster center
1196
1237
  center_to_cluster_angle = np.arctan2(cy - plot_center_y, cx - plot_center_x)
1197
-
1238
+
1198
1239
  if has_specific and not has_shared:
1199
1240
  # For non-shared terms, prefer angles that point away from plot center
1200
1241
  # Create angles around the center-to-cluster direction
1201
1242
  base_angle = center_to_cluster_angle
1202
1243
  preferred_angles = [
1203
- base_angle, # Directly away from center
1204
- base_angle + np.pi/4, # 45° offset
1205
- base_angle - np.pi/4, # -45° offset
1206
- base_angle + np.pi/6, # 30° offset
1207
- base_angle - np.pi/6, # -30° offset
1208
- base_angle + np.pi/3, # 60° offset
1209
- base_angle - np.pi/3, # -60° offset
1210
- base_angle + np.pi/2, # 90° offset
1211
- base_angle - np.pi/2 # -90° offset
1244
+ base_angle, # Directly away from center
1245
+ base_angle + np.pi / 4, # 45° offset
1246
+ base_angle - np.pi / 4, # -45° offset
1247
+ base_angle + np.pi / 6, # 30° offset
1248
+ base_angle - np.pi / 6, # -30° offset
1249
+ base_angle + np.pi / 3, # 60° offset
1250
+ base_angle - np.pi / 3, # -60° offset
1251
+ base_angle + np.pi / 2, # 90° offset
1252
+ base_angle - np.pi / 2, # -90° offset
1212
1253
  ]
1213
1254
  else:
1214
1255
  # For shared terms or mixed, use the original preferred angles
1215
- preferred_angles = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4, # 45°, 135°, 225°, 315°
1216
- np.pi/6, np.pi/3, 2*np.pi/3, 5*np.pi/6, # 30°, 60°, 120°, 150°
1217
- 7*np.pi/6, 4*np.pi/3, 5*np.pi/3, 11*np.pi/6] # 210°, 240°, 300°, 330°
1218
-
1256
+ preferred_angles = [
1257
+ np.pi / 4,
1258
+ 3 * np.pi / 4,
1259
+ 5 * np.pi / 4,
1260
+ 7 * np.pi / 4, # 45°, 135°, 225°, 315°
1261
+ np.pi / 6,
1262
+ np.pi / 3,
1263
+ 2 * np.pi / 3,
1264
+ 5 * np.pi / 6, # 30°, 60°, 120°, 150°
1265
+ 7 * np.pi / 6,
1266
+ 4 * np.pi / 3,
1267
+ 5 * np.pi / 3,
1268
+ 11 * np.pi / 6,
1269
+ ] # 210°, 240°, 300°, 330°
1270
+
1219
1271
  for test_angle in preferred_angles:
1220
1272
  test_x = cx + base_distance * np.cos(test_angle)
1221
1273
  test_y = cy + base_distance * np.sin(test_angle)
1222
-
1274
+
1223
1275
  # Calculate minimum distance to any data point
1224
- distances = [np.sqrt((pt[0] - test_x)**2 + (pt[1] - test_y)**2) for pt in umap_coords]
1276
+ distances = [np.sqrt((pt[0] - test_x) ** 2 + (pt[1] - test_y) ** 2) for pt in umap_coords]
1225
1277
  min_distance = min(distances)
1226
-
1278
+
1227
1279
  if min_distance > best_distance:
1228
1280
  best_distance = min_distance
1229
1281
  best_angle = test_angle
1230
-
1282
+
1231
1283
  # Use the best angle found, or fallback to 45°
1232
1284
  if best_angle is not None:
1233
1285
  stack_angle = best_angle
1234
1286
  else:
1235
1287
  # Fallback: use 45° based on cluster
1236
- angle_options = [np.pi/4, 3*np.pi/4, 5*np.pi/4, 7*np.pi/4]
1288
+ angle_options = [np.pi / 4, 3 * np.pi / 4, 5 * np.pi / 4, 7 * np.pi / 4]
1237
1289
  stack_angle = angle_options[cluster_id % len(angle_options)]
1238
-
1290
+
1239
1291
  # Position for the end of the line (before labels start)
1240
1292
  line_end_x = cx + base_distance * np.cos(stack_angle)
1241
1293
  line_end_y = cy + base_distance * np.sin(stack_angle)
1242
-
1294
+
1243
1295
  # Simplified approach: center labels at line end, then add 20pt offset in same direction
1244
1296
  # Calculate 20pt offset in the same direction as the line
1245
1297
  label_offset_distance = 20 # 20 points in the same direction
1246
-
1298
+
1247
1299
  # Convert 20 points to data coordinates (approximate)
1248
1300
  # Assuming typical plot size, 20pt ≈ 1-2% of data range
1249
1301
  data_range = max(data_range_x, data_range_y)
1250
1302
  offset_in_data_coords = data_range * 0.02 # 2% of data range for 20pt
1251
-
1303
+
1252
1304
  # Add offset in direction based on line orientation for better text placement
1253
1305
  # For westward lines: place label LEFT of endpoint with RIGHT alignment
1254
1306
  # For eastward lines: place label RIGHT of endpoint with LEFT alignment
1255
-
1307
+
1256
1308
  angle_degrees = (stack_angle * 180 / np.pi) % 360
1257
1309
  if 90 < angle_degrees < 270:
1258
1310
  # Line goes LEFT (westward) - place label to the LEFT of line end
1259
1311
  label_center_x = line_end_x - offset_in_data_coords # SUBTRACT to go left
1260
1312
  label_center_y = line_end_y # Keep same Y position
1261
- text_align = 'right' # Right-align so text ends near line endpoint
1313
+ text_align = "right" # Right-align so text ends near line endpoint
1262
1314
  else:
1263
- # Line goes RIGHT (eastward) - place label to the RIGHT of line end
1315
+ # Line goes RIGHT (eastward) - place label to the RIGHT of line end
1264
1316
  label_center_x = line_end_x + offset_in_data_coords # ADD to go right
1265
1317
  label_center_y = line_end_y # Keep same Y position
1266
- text_align = 'left' # Left-align so text starts near line endpoint
1267
-
1318
+ text_align = "left" # Left-align so text starts near line endpoint
1319
+
1268
1320
  # Calculate consistent vertical spacing for stacked labels
1269
1321
  # BETTER APPROACH: Use single LabelSet with newline characters
1270
-
1322
+
1271
1323
  # Create a single multi-line text string with all terms
1272
1324
  display_atoms = [atom for atom in cluster_atoms] # Keep original atom names with prefixes
1273
- combined_text = '\n'.join(display_atoms)
1274
-
1325
+ combined_text = "\n".join(display_atoms)
1326
+
1275
1327
  # Check if label would be outside plot bounds and adjust
1276
1328
  label_margin = max(data_range_x, data_range_y) * 0.05
1277
1329
  label_x = label_center_x
1278
1330
  label_y = label_center_y
1279
-
1331
+
1280
1332
  if label_x < x_min - label_margin:
1281
1333
  label_x = x_min - label_margin
1282
- text_align = 'left'
1334
+ text_align = "left"
1283
1335
  elif label_x > x_max + label_margin:
1284
1336
  label_x = x_max + label_margin
1285
- text_align = 'right'
1286
-
1337
+ text_align = "right"
1338
+
1287
1339
  if label_y < y_min - label_margin:
1288
1340
  label_y = y_min - label_margin
1289
1341
  elif label_y > y_max + label_margin:
1290
1342
  label_y = y_max + label_margin
1291
-
1343
+
1292
1344
  # Create single label source with multi-line text and alignment
1293
1345
  label_source = ColumnDataSource({
1294
- 'x': [label_x],
1295
- 'y': [label_y],
1296
- 'text': [combined_text],
1297
- 'atoms': [cluster_atoms], # Store all atoms for reference
1298
- 'text_align': [text_align]
1346
+ "x": [label_x],
1347
+ "y": [label_y],
1348
+ "text": [combined_text],
1349
+ "atoms": [cluster_atoms], # Store all atoms for reference
1350
+ "text_align": [text_align],
1299
1351
  })
1300
-
1352
+
1301
1353
  # Store this single label source using a unique key for the cluster stack
1302
1354
  stack_label_key = f"cluster_{cluster_id}_labels"
1303
1355
  label_sources[stack_label_key] = label_source
1304
-
1356
+
1305
1357
  # Create single line from cluster center to line end (before labels)
1306
- stack_line_source = ColumnDataSource({
1307
- 'x': [cx, line_end_x],
1308
- 'y': [cy, line_end_y]
1309
- })
1358
+ stack_line_source = ColumnDataSource({"x": [cx, line_end_x], "y": [cy, line_end_y]})
1310
1359
  # Use a unique key for the stack line
1311
1360
  stack_key = f"cluster_{cluster_id}_stack"
1312
1361
  line_sources[stack_key] = stack_line_source
1313
1362
  line_cluster_mapping[stack_key] = cluster_id
1314
-
1363
+
1315
1364
  # Add lines (spikes) to plot with matching cluster colors
1316
1365
  line_renderers = {}
1317
1366
  for line_key, line_source in line_sources.items():
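
When a cluster has several enriched terms, the hunk above stacks them into one multi-line label placed just past the end of the spike, with the horizontal side and the text alignment depending on whether the spike points east or west. A small self-contained sketch with assumed example values:

import numpy as np

cx, cy = 0.0, 0.0                      # cluster centre (example values)
stack_angle = np.deg2rad(200)          # chosen spike direction (example value)
base_distance, data_range = 1.2, 10.0  # example values
cluster_atoms = ["name:QC", "batch:B02"]

line_end_x = cx + base_distance * np.cos(stack_angle)
line_end_y = cy + base_distance * np.sin(stack_angle)
offset = data_range * 0.02             # ~20 pt expressed as 2% of the data range

if 90 < np.degrees(stack_angle) % 360 < 270:
    label_x, text_align = line_end_x - offset, "right"  # westward spike: text ends at the line
else:
    label_x, text_align = line_end_x + offset, "left"   # eastward spike: text starts at the line
label_y = line_end_y
combined_text = "\n".join(cluster_atoms)                 # one multi-line label per cluster
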
@@ -1321,163 +1370,172 @@ def analyze_umap(
1321
1370
  # Use a neutral color or the color of the first cluster it appears in
1322
1371
  first_cluster_id = list(shared_terms[line_key])[0]
1323
1372
  if first_cluster_id == -1:
1324
- line_color = 'gray'
1373
+ line_color = "gray"
1325
1374
  else:
1326
- cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1375
+ cluster_idx = (
1376
+ list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1377
+ )
1327
1378
  line_color = colors[cluster_idx % len(colors)]
1328
- line_dash = 'dashed' # Use dashed for all edges
1379
+ line_dash = "dashed" # Use dashed for all edges
1329
1380
  elif line_key in specific_terms:
1330
1381
  # For cluster-specific terms, use the cluster's color
1331
1382
  cluster_id = specific_terms[line_key]
1332
1383
  if cluster_id == -1:
1333
- line_color = 'gray'
1384
+ line_color = "gray"
1334
1385
  else:
1335
1386
  cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1336
1387
  line_color = colors[cluster_idx % len(colors)]
1337
- line_dash = 'dashed' # Use dashed for all edges
1388
+ line_dash = "dashed" # Use dashed for all edges
1338
1389
  elif line_key in line_cluster_mapping:
1339
1390
  # For stack lines, use the cluster's color
1340
1391
  cluster_info = line_cluster_mapping[line_key]
1341
- if cluster_info == 'shared':
1392
+ if cluster_info == "shared":
1342
1393
  # For shared stacks, use a neutral color or first cluster color
1343
- line_color = 'black'
1344
- line_dash = 'dashed' # Use dashed for all edges
1394
+ line_color = "black"
1395
+ line_dash = "dashed" # Use dashed for all edges
1345
1396
  else:
1346
1397
  cluster_id = cluster_info
1347
1398
  if cluster_id == -1:
1348
- line_color = 'gray'
1399
+ line_color = "gray"
1349
1400
  else:
1350
1401
  cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1351
1402
  line_color = colors[cluster_idx % len(colors)]
1352
- line_dash = 'dashed' # Use dashed for all edges
1403
+ line_dash = "dashed" # Use dashed for all edges
1353
1404
  else:
1354
1405
  # Fallback
1355
- line_color = 'gray'
1356
- line_dash = 'dashed' # Use dashed for all edges
1357
-
1358
- line_renderer = p1.line('x', 'y', source=line_source,
1359
- line_color=line_color, line_width=2,
1360
- alpha=0.8, line_dash=line_dash)
1406
+ line_color = "gray"
1407
+ line_dash = "dashed" # Use dashed for all edges
1408
+
1409
+ line_renderer = p1.line(
1410
+ "x", "y", source=line_source, line_color=line_color, line_width=2, alpha=0.8, line_dash=line_dash
1411
+ )
1361
1412
  line_renderers[line_key] = line_renderer
1362
-
1413
+
1363
1414
  # Add labels to plot (simple and direct approach)
1364
1415
  label_renderers = {} # Store label renderers for legend control
1365
1416
  for label_key, label_source in label_sources.items():
1366
1417
  # Determine color and style based on label key type
1367
- if label_key.startswith('cluster_') and label_key.endswith('_labels'):
1418
+ if label_key.startswith("cluster_") and label_key.endswith("_labels"):
1368
1419
  # This is a cluster stack with multiple terms
1369
- cluster_id = int(label_key.split('_')[1])
1420
+ cluster_id = int(label_key.split("_")[1])
1370
1421
  if cluster_id == -1:
1371
- text_color = 'gray'
1422
+ text_color = "gray"
1372
1423
  else:
1373
1424
  cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1374
1425
  text_color = colors[cluster_idx % len(colors)]
1375
- text_font_style = 'bold'
1426
+ text_font_style = "bold"
1376
1427
  elif label_key in shared_terms:
1377
1428
  # Shared term - use same color as edge (first cluster's color)
1378
1429
  first_cluster_id = list(shared_terms[label_key])[0]
1379
1430
  if first_cluster_id == -1:
1380
- text_color = 'gray'
1431
+ text_color = "gray"
1381
1432
  else:
1382
- cluster_idx = list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1433
+ cluster_idx = (
1434
+ list(unique_clusters).index(first_cluster_id) if first_cluster_id in unique_clusters else 0
1435
+ )
1383
1436
  text_color = colors[cluster_idx % len(colors)]
1384
- text_font_style = 'bold'
1437
+ text_font_style = "bold"
1385
1438
  elif label_key in specific_terms:
1386
1439
  # Individual cluster-specific term
1387
1440
  cluster_id = specific_terms[label_key]
1388
1441
  if cluster_id == -1:
1389
- text_color = 'gray'
1442
+ text_color = "gray"
1390
1443
  else:
1391
1444
  cluster_idx = list(unique_clusters).index(cluster_id) if cluster_id in unique_clusters else 0
1392
1445
  text_color = colors[cluster_idx % len(colors)]
1393
- text_font_style = 'bold'
1446
+ text_font_style = "bold"
1394
1447
  else:
1395
1448
  # Fallback
1396
- text_color = 'black'
1397
- text_font_style = 'bold'
1398
-
1449
+ text_color = "black"
1450
+ text_font_style = "bold"
1451
+
1399
1452
  # Get text alignment from label source, default to center
1400
1453
  label_data = label_source.data
1401
- text_align = label_data.get('text_align', ['center'])[0] if 'text_align' in label_data else 'center'
1402
-
1454
+ text_align = label_data.get("text_align", ["center"])[0] if "text_align" in label_data else "center"
1455
+
1403
1456
  label_set = LabelSet(
1404
- x='x', y='y', text='text',
1457
+ x="x",
1458
+ y="y",
1459
+ text="text",
1405
1460
  source=label_source,
1406
- text_font_size='11pt',
1461
+ text_font_size="11pt",
1407
1462
  text_color=text_color,
1408
1463
  text_font_style=text_font_style,
1409
1464
  text_align=text_align,
1410
- text_baseline='middle'
1465
+ text_baseline="middle",
1411
1466
  )
1412
1467
  p1.add_layout(label_set)
1413
1468
  label_renderers[label_key] = label_set # Store for legend control
1414
-
1469
+
1415
1470
  # Check if any labels are close to plot boundaries and expand if needed
1416
1471
  if label_sources:
1417
1472
  # Collect all label positions
1418
1473
  all_label_positions = []
1419
1474
  for source in label_sources.values():
1420
1475
  data = source.data
1421
- if 'x' in data and 'y' in data and data['x'] and data['y']:
1422
- all_label_positions.extend(zip(data['x'], data['y']))
1423
-
1476
+ if "x" in data and "y" in data and data["x"] and data["y"]:
1477
+ all_label_positions.extend(zip(data["x"], data["y"]))
1478
+
1424
1479
  if all_label_positions:
1425
1480
  # Check if any labels are close to current plot boundaries
1426
1481
  current_x_min, current_x_max = p1.x_range.start, p1.x_range.end
1427
1482
  current_y_min, current_y_max = p1.y_range.start, p1.y_range.end
1428
-
1483
+
1429
1484
  # Define "close to boundary" as within 5% of the plot range
1430
1485
  x_range = current_x_max - current_x_min
1431
1486
  y_range = current_y_max - current_y_min
1432
1487
  boundary_threshold_x = x_range * 0.05
1433
1488
  boundary_threshold_y = y_range * 0.05
1434
-
1489
+
1435
1490
  needs_expansion = False
1436
1491
  for label_x, label_y in all_label_positions:
1437
- if (label_x < current_x_min + boundary_threshold_x or
1438
- label_x > current_x_max - boundary_threshold_x or
1439
- label_y < current_y_min + boundary_threshold_y or
1440
- label_y > current_y_max - boundary_threshold_y):
1492
+ if (
1493
+ label_x < current_x_min + boundary_threshold_x
1494
+ or label_x > current_x_max - boundary_threshold_x
1495
+ or label_y < current_y_min + boundary_threshold_y
1496
+ or label_y > current_y_max - boundary_threshold_y
1497
+ ):
1441
1498
  needs_expansion = True
1442
1499
  break
1443
-
1500
+
1444
1501
  # If labels are close to boundaries, expand plot by 5% (reduced from 10%)
1445
1502
  if needs_expansion:
1446
1503
  expansion_factor = 0.05 # 5% expansion (half of previous 10%)
1447
1504
  x_expansion = x_range * expansion_factor
1448
1505
  y_expansion = y_range * expansion_factor
1449
-
1506
+
1450
1507
  p1.x_range.start = current_x_min - x_expansion
1451
1508
  p1.x_range.end = current_x_max + x_expansion
1452
1509
  p1.y_range.start = current_y_min - y_expansion
1453
1510
  p1.y_range.end = current_y_max + y_expansion
1454
-
1455
-
1511
+
1456
1512
  # Add hover tool with enrichment information
1457
- hover = HoverTool(tooltips=[
1458
- ("Cluster", "@cluster"),
1459
- ("Sample", "@sample_name"),
1460
- ("Sample UID", "@sample_uid"),
1461
- ("Enrichments", "@enrichments")
1462
- ])
1513
+ hover = HoverTool(
1514
+ tooltips=[
1515
+ ("Cluster", "@cluster"),
1516
+ ("Sample", "@sample_name"),
1517
+ ("Sample UID", "@sample_uid"),
1518
+ ("Enrichments", "@enrichments"),
1519
+ ]
1520
+ )
1463
1521
  p1.add_tools(hover)
1464
-
1522
+
1465
1523
  # Remove cluster legend labels from scatter plots (already done above)
1466
1524
  # But keep any existing legend structure for now
1467
-
1525
+
1468
1526
  # Create custom legend for enrichment terms (line/label pairs) ONLY
1469
1527
  if line_renderers and (shared_terms or specific_terms):
1470
1528
  legend_items = []
1471
1529
  renderer_to_terms = {} # Group terms by their renderer
1472
-
1530
+
1473
1531
  # Get all enriched terms and group them by their line renderer
1474
1532
  all_enriched_atoms = set(shared_terms.keys()) | set(specific_terms.keys())
1475
-
1533
+
1476
1534
  # First pass: map each term to its renderer
1477
1535
  for atom in all_enriched_atoms:
1478
1536
  renderer = None
1479
1537
  renderer_key = None
1480
-
1538
+
1481
1539
  if atom in shared_terms:
1482
1540
  # Shared term
1483
1541
  if atom in line_renderers:
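
The rendering step above attaches, per enriched term or stack, a dashed line glyph coloured like its cluster plus a LabelSet, and a hover tool on the sample scatter. A compact Bokeh sketch of the same pattern, using dummy data:

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet, HoverTool

p = figure()
scatter_source = ColumnDataSource({"x": [0.2, 0.4], "y": [0.1, 0.6],
                                   "cluster": [0, 0], "sample_name": ["s1", "s2"]})
points = p.scatter("x", "y", source=scatter_source, size=8)

line_source = ColumnDataSource({"x": [0.0, 1.5], "y": [0.0, 1.0]})
p.line("x", "y", source=line_source, line_color="navy",
       line_width=2, alpha=0.8, line_dash="dashed")      # spike to the label

label_source = ColumnDataSource({"x": [1.6], "y": [1.0], "text": ["group:plasma"]})
labels = LabelSet(x="x", y="y", text="text", source=label_source,
                  text_font_size="11pt", text_font_style="bold",
                  text_align="left", text_baseline="middle")
p.add_layout(labels)

p.add_tools(HoverTool(renderers=[points],
                      tooltips=[("Cluster", "@cluster"), ("Sample", "@sample_name")]))
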
@@ -1491,7 +1549,7 @@ def analyze_umap(
1491
1549
  renderer = line_renderers[stack_key]
1492
1550
  renderer_key = stack_key
1493
1551
  break
1494
-
1552
+
1495
1553
  elif atom in specific_terms:
1496
1554
  # Cluster-specific term
1497
1555
  cluster_id = specific_terms[atom]
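
With each term mapped to a line renderer, the next hunk groups terms per renderer and builds one legend entry whose text strips the metadata prefixes. A small sketch of that string handling with hypothetical inputs:

PREFIXES = ("name:", "group:", "batch:", "type:")

def clean_terms(atoms):
    """Strip the metadata prefixes used on enrichment atoms before display."""
    cleaned = []
    for atom in atoms:
        for prefix in PREFIXES:
            atom = atom.replace(prefix, "")
        cleaned.append(atom)
    return cleaned

cluster_id = 3  # example cluster-specific entry
label_text = f"C{cluster_id}: {', '.join(clean_terms(['group:plasma', 'batch:B02']))}"
print(label_text)  # -> C3: plasma, B02

Because Bokeh legends can only toggle glyph renderers, clicking such an entry hides the corresponding spike (click_policy="hide") while the LabelSet text remains visible.
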
@@ -1503,134 +1561,137 @@ def analyze_umap(
1503
1561
  if stack_key in line_renderers:
1504
1562
  renderer = line_renderers[stack_key]
1505
1563
  renderer_key = stack_key
1506
-
1564
+
1507
1565
  # Group terms by renderer
1508
1566
  if renderer and renderer_key:
1509
1567
  if renderer_key not in renderer_to_terms:
1510
1568
  renderer_to_terms[renderer_key] = {
1511
- 'renderer': renderer,
1512
- 'shared_terms': [],
1513
- 'specific_terms': [],
1514
- 'cluster_id': None
1569
+ "renderer": renderer,
1570
+ "shared_terms": [],
1571
+ "specific_terms": [],
1572
+ "cluster_id": None,
1515
1573
  }
1516
-
1574
+
1517
1575
  if atom in shared_terms:
1518
- renderer_to_terms[renderer_key]['shared_terms'].append(atom)
1576
+ renderer_to_terms[renderer_key]["shared_terms"].append(atom)
1519
1577
  else:
1520
- renderer_to_terms[renderer_key]['specific_terms'].append(atom)
1521
- renderer_to_terms[renderer_key]['cluster_id'] = specific_terms[atom]
1522
-
1578
+ renderer_to_terms[renderer_key]["specific_terms"].append(atom)
1579
+ renderer_to_terms[renderer_key]["cluster_id"] = specific_terms[atom]
1580
+
1523
1581
  # Second pass: create legend entries, one per renderer
1524
1582
  for renderer_key, term_info in renderer_to_terms.items():
1525
- shared_list = term_info['shared_terms']
1526
- specific_list = term_info['specific_terms']
1527
- line_renderer = term_info['renderer']
1528
-
1583
+ shared_list = term_info["shared_terms"]
1584
+ specific_list = term_info["specific_terms"]
1585
+ line_renderer = term_info["renderer"]
1586
+
1529
1587
  # For now, legend can only control the line renderer
1530
1588
  # Label visibility will be handled via JavaScript callback if needed
1531
1589
  # (Note: LabelSet cannot be directly controlled by Bokeh legends)
1532
-
1590
+
1533
1591
  # Create combined label text
1534
1592
  if shared_list:
1535
1593
  # Shared terms - remove "Shared:" prefix and just show the terms
1536
- clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
1537
- for atom in shared_list]
1594
+ clean_terms = [
1595
+ atom.replace("name:", "").replace("group:", "").replace("batch:", "").replace("type:", "")
1596
+ for atom in shared_list
1597
+ ]
1538
1598
  if len(clean_terms) == 1:
1539
1599
  label_text = clean_terms[0]
1540
1600
  else:
1541
- label_text = ', '.join(clean_terms)
1542
-
1601
+ label_text = ", ".join(clean_terms)
1602
+
1543
1603
  elif specific_list:
1544
1604
  # Cluster-specific terms
1545
- cluster_id = term_info['cluster_id']
1546
- clean_terms = [atom.replace('name:', '').replace('group:', '').replace('batch:', '').replace('type:', '')
1547
- for atom in specific_list]
1605
+ cluster_id = term_info["cluster_id"]
1606
+ clean_terms = [
1607
+ atom.replace("name:", "").replace("group:", "").replace("batch:", "").replace("type:", "")
1608
+ for atom in specific_list
1609
+ ]
1548
1610
  if len(clean_terms) == 1:
1549
1611
  label_text = f"C{cluster_id}: {clean_terms[0]}"
1550
1612
  else:
1551
1613
  label_text = f"C{cluster_id}: {', '.join(clean_terms)}"
1552
-
1614
+
1553
1615
  # Add single legend entry for the line renderer only
1554
1616
  # (Labels cannot be controlled by Bokeh legends directly)
1555
- legend_items.append(
1556
- LegendItem(label=label_text, renderers=[line_renderer])
1557
- )
1558
-
1617
+ legend_items.append(LegendItem(label=label_text, renderers=[line_renderer]))
1618
+
1559
1619
  # Hide cluster legend after we've created our enrichment legend
1560
- if hasattr(p1, 'legend') and p1.legend:
1620
+ if hasattr(p1, "legend") and p1.legend:
1561
1621
  if isinstance(p1.legend, list):
1562
1622
  for legend in p1.legend:
1563
1623
  legend.visible = False
1564
1624
  else:
1565
1625
  p1.legend.visible = False
1566
-
1626
+
1567
1627
  # Create and add the custom enrichment legend
1568
1628
  if legend_items:
1569
- enrichment_legend = Legend(
1570
- items=legend_items,
1571
- location="center_right",
1572
- click_policy="hide"
1573
- )
1574
- p1.add_layout(enrichment_legend, 'right')
1575
-
1576
- plots['cluster_plot'] = p1
1577
-
1629
+ enrichment_legend = Legend(items=legend_items, location="center_right", click_policy="hide")
1630
+ p1.add_layout(enrichment_legend, "right")
1631
+
1632
+ plots["cluster_plot"] = p1
1633
+
1578
1634
  # Save cluster plot if filename provided
1579
1635
  if filename:
1580
1636
  # Handle filename extension properly
1581
- if filename.endswith('.html'):
1637
+ if filename.endswith(".html"):
1582
1638
  base_filename = filename[:-5] # Remove .html extension
1583
1639
  cluster_filename = f"{base_filename}_clusters.html"
1584
1640
  else:
1585
1641
  cluster_filename = f"{filename}_clusters.html"
1586
-
1587
- if not filename.startswith('/') and not filename[1:3] == ':\\':
1642
+
1643
+ if not filename.startswith("/") and not filename[1:3] == ":\\":
1588
1644
  cluster_filename = f"{self.folder}/{cluster_filename}"
1589
1645
  _isolated_save_plot(p1, cluster_filename, cluster_filename, self.logger, "UMAP Cluster Plot")
1590
1646
  else:
1591
1647
  _isolated_show_notebook(p1)
1592
-
1593
- results['plots'] = plots
1648
+
1649
+ results["plots"] = plots
1594
1650
 
1595
1651
  # Print summary
1596
1652
  self.logger.debug("\n=== UMAP Cluster Analysis Summary ===")
1597
1653
  self.logger.debug(f"Best clustering: {best_clustering['method']}")
1598
1654
  self.logger.debug(f"Number of clusters: {best_clustering['n_clusters']}")
1599
1655
  self.logger.debug(f"Silhouette score: {best_clustering['score']:.3f}")
1600
- if best_clustering['n_noise'] > 0:
1656
+ if best_clustering["n_noise"] > 0:
1601
1657
  self.logger.debug(f"Noise points: {best_clustering['n_noise']}")
1602
1658
 
1603
1659
  self.logger.info(f"\nFound {len(all_associations)} total significant associations:")
1604
1660
 
1605
1661
  # Show regular column associations
1606
- regular_assocs = [a for a in all_associations if 'column' in a]
1662
+ regular_assocs = [a for a in all_associations if "column" in a]
1607
1663
  if regular_assocs:
1608
1664
  self.logger.info(f" {len(regular_assocs)} column-level associations:")
1609
1665
  for assoc in regular_assocs[:3]: # Show top 3
1610
- self.logger.info(f" {assoc['column']} ({assoc['variable_type']}): {assoc['test']} p={assoc['p_value']:.4f}, "
1611
- f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']})")
1612
-
1613
- # Show text atom associations
1614
- text_assocs = [a for a in all_associations if 'atom' in a]
1666
+ self.logger.info(
1667
+ f" {assoc['column']} ({assoc['variable_type']}): {assoc['test']} p={assoc['p_value']:.4f}, "
1668
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']})"
1669
+ )
1670
+
1671
+ # Show text atom associations
1672
+ text_assocs = [a for a in all_associations if "atom" in a]
1615
1673
  if text_assocs:
1616
1674
  self.logger.info(f" {len(text_assocs)} text pattern associations:")
1617
1675
  for assoc in text_assocs[:3]: # Show top 3
1618
- freq = assoc.get('atom_frequency', 0)
1676
+ freq = assoc.get("atom_frequency", 0)
1619
1677
  percentage = (freq / len(analysis_df_clean)) * 100 if len(analysis_df_clean) > 0 else 0
1620
-
1621
- self.logger.info(f" '{assoc['atom']}' ({assoc['type']}): p={assoc['p_value']:.4f}, "
1622
- f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']}) "
1623
- f"[{freq} samples, {percentage:.1f}%]")
1624
-
1678
+
1679
+ self.logger.info(
1680
+ f" '{assoc['atom']}' ({assoc['type']}): p={assoc['p_value']:.4f}, "
1681
+ f"effect_size={assoc['effect_size']:.3f} ({assoc['interpretation']}) "
1682
+ f"[{freq} samples, {percentage:.1f}%]"
1683
+ )
1684
+
1625
1685
  if len(all_associations) > 20:
1626
1686
  self.logger.info(f" ... and {len(all_associations) - 20} more associations")
1627
1687
 
1628
1688
  return results
1629
1689
 
1690
+
1630
1691
  def _analyze_umap_simplified(
1631
1692
  self,
1632
1693
  n_neighbors=15,
1633
- min_dist=0.1,
1694
+ min_dist=0.1,
1634
1695
  metric="euclidean",
1635
1696
  random_state=42,
1636
1697
  cluster_methods=["hdbscan", "kmeans"],
@@ -1641,94 +1702,94 @@ def _analyze_umap_simplified(
1641
1702
  filename=None,
1642
1703
  ):
1643
1704
  """Simplified fallback version of UMAP analysis."""
1644
-
1705
+
1645
1706
  self.logger.info("Starting simplified UMAP analysis...")
1646
-
1707
+
1647
1708
  # Check dependencies
1648
1709
  if not UMAP_AVAILABLE or not HDBSCAN_AVAILABLE:
1649
1710
  self.logger.error("Required dependencies not available")
1650
1711
  return {
1651
- 'umap_coords': None,
1652
- 'best_clustering': None,
1653
- 'all_clustering_results': {},
1654
- 'significant_associations': [],
1655
- 'text_associations': [],
1656
- 'cluster_summaries': {},
1657
- 'analysis_dataframe': None
1712
+ "umap_coords": None,
1713
+ "best_clustering": None,
1714
+ "all_clustering_results": {},
1715
+ "significant_associations": [],
1716
+ "text_associations": [],
1717
+ "cluster_summaries": {},
1718
+ "analysis_dataframe": None,
1658
1719
  }
1659
-
1720
+
1660
1721
  try:
1661
1722
  # Get data
1662
1723
  consensus_matrix = self.get_consensus_matrix()
1663
1724
  samples_df = self.samples_df
1664
-
1725
+
1665
1726
  if consensus_matrix is None or samples_df is None:
1666
1727
  self.logger.error("No data available")
1667
1728
  return {
1668
- 'umap_coords': None,
1669
- 'best_clustering': None,
1670
- 'all_clustering_results': {},
1671
- 'significant_associations': [],
1672
- 'text_associations': [],
1673
- 'cluster_summaries': {},
1674
- 'analysis_dataframe': None
1729
+ "umap_coords": None,
1730
+ "best_clustering": None,
1731
+ "all_clustering_results": {},
1732
+ "significant_associations": [],
1733
+ "text_associations": [],
1734
+ "cluster_summaries": {},
1735
+ "analysis_dataframe": None,
1675
1736
  }
1676
-
1737
+
1677
1738
  # Basic UMAP
1678
1739
  sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
1679
-
1740
+
1680
1741
  if hasattr(consensus_matrix, "select"):
1681
1742
  matrix_data = consensus_matrix.select(sample_cols).to_numpy()
1682
1743
  else:
1683
1744
  matrix_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore").values
1684
-
1745
+
1685
1746
  matrix_data = matrix_data.T
1686
1747
  matrix_data = np.nan_to_num(matrix_data)
1687
-
1748
+
1688
1749
  scaler = StandardScaler()
1689
1750
  matrix_scaled = scaler.fit_transform(matrix_data)
1690
-
1751
+
1691
1752
  # Import dependencies locally
1692
1753
  import umap
1693
1754
  import hdbscan
1694
-
1755
+
1695
1756
  reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state)
1696
1757
  umap_coords = reducer.fit_transform(matrix_scaled)
1697
-
1758
+
1698
1759
  # Simple clustering
1699
1760
  clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
1700
1761
  cluster_labels = clusterer.fit_predict(umap_coords)
1701
-
1762
+
1702
1763
  best_clustering = {
1703
- 'labels': cluster_labels,
1704
- 'n_clusters': len(np.unique(cluster_labels[cluster_labels != -1])),
1705
- 'n_noise': np.sum(cluster_labels == -1),
1706
- 'silhouette_score': 0.5, # Placeholder
1707
- 'method': 'hdbscan'
1764
+ "labels": cluster_labels,
1765
+ "n_clusters": len(np.unique(cluster_labels[cluster_labels != -1])),
1766
+ "n_noise": np.sum(cluster_labels == -1),
1767
+ "silhouette_score": 0.5, # Placeholder
1768
+ "method": "hdbscan",
1708
1769
  }
1709
-
1770
+
1710
1771
  self.logger.info(f"Simplified analysis found {best_clustering['n_clusters']} clusters")
1711
-
1772
+
1712
1773
  return {
1713
- 'umap_coords': umap_coords,
1714
- 'best_clustering': best_clustering,
1715
- 'all_clustering_results': {'hdbscan': best_clustering},
1716
- 'significant_associations': [],
1717
- 'text_associations': [],
1718
- 'cluster_summaries': {},
1719
- 'analysis_dataframe': None
1774
+ "umap_coords": umap_coords,
1775
+ "best_clustering": best_clustering,
1776
+ "all_clustering_results": {"hdbscan": best_clustering},
1777
+ "significant_associations": [],
1778
+ "text_associations": [],
1779
+ "cluster_summaries": {},
1780
+ "analysis_dataframe": None,
1720
1781
  }
1721
-
1782
+
1722
1783
  except Exception as e:
1723
1784
  self.logger.error(f"Error in simplified analysis: {e}")
1724
1785
  return {
1725
- 'umap_coords': None,
1726
- 'best_clustering': None,
1727
- 'all_clustering_results': {},
1728
- 'significant_associations': [],
1729
- 'text_associations': [],
1730
- 'cluster_summaries': {},
1731
- 'analysis_dataframe': None
1786
+ "umap_coords": None,
1787
+ "best_clustering": None,
1788
+ "all_clustering_results": {},
1789
+ "significant_associations": [],
1790
+ "text_associations": [],
1791
+ "cluster_summaries": {},
1792
+ "analysis_dataframe": None,
1732
1793
  }
1733
1794
 
1734
1795
 
@@ -1736,27 +1797,30 @@ def _analyze_umap_simplified(
1736
1797
  # Helper Functions for Plotting
1737
1798
  # ========================================
1738
1799
 
1800
+
1739
1801
  def _isolated_save_plot(plot, filename, title, logger, plot_type):
1740
1802
  """Save plot to file in isolation"""
1741
1803
  try:
1742
1804
  from bokeh.io import output_file, save
1743
1805
  from bokeh.models import Title
1744
-
1806
+
1745
1807
  # Add title to plot
1746
- plot.add_layout(Title(text=title, text_font_size="16pt"), 'above')
1747
-
1808
+ plot.add_layout(Title(text=title, text_font_size="16pt"), "above")
1809
+
1748
1810
  # Configure output
1749
1811
  output_file(filename)
1750
1812
  save(plot)
1751
1813
  logger.info(f"Saved {plot_type} to: {filename}")
1752
-
1814
+
1753
1815
  except Exception as e:
1754
1816
  logger.error(f"Error saving {plot_type}: {e}")
1755
1817
 
1818
+
1756
1819
  def _isolated_show_notebook(plot):
1757
1820
  """Show plot in notebook if available"""
1758
1821
  try:
1759
1822
  from bokeh.io import show
1823
+
1760
1824
  show(plot)
1761
1825
  except Exception:
1762
1826
  pass # Silently fail if not in notebook
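
The helpers above follow the standard Bokeh output_file/save pattern, with a Title annotation added above the plot before writing it to standalone HTML. A minimal sketch of the same idea, independent of the module's logger (example filename):

from bokeh.io import output_file, save
from bokeh.models import Title
from bokeh.plotting import figure

def save_plot(plot, filename, title):
    """Attach a title above the plot and write it to a standalone HTML file."""
    plot.add_layout(Title(text=title, text_font_size="16pt"), "above")
    output_file(filename)
    save(plot)

p = figure()
p.scatter([1, 2, 3], [3, 1, 2], size=10)
save_plot(p, "umap_clusters.html", "UMAP Cluster Plot")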